// File listing metadata: 123 lines, 3.6 KiB, plaintext.
#include <cuda_runtime.h>
|
||
#include <stdio.h>
|
||
#include <chrono>
|
||
|
||
// Evaluates a CUDA runtime call exactly once and terminates the program if the
// call did not return cudaSuccess.
//
// On failure the source location (__FILE__/__LINE__), the numeric error code
// and the text from cudaGetErrorString() are written to stderr, then exit(1)
// is called.
//
// The body is wrapped in do { ... } while (0) so that `CHECK(x);` is a single
// statement and stays correct inside an unbraced if/else. The argument is
// parenthesised, and the local uses a trailing-underscore name so it cannot
// shadow an identifier that appears inside `call`.
#define CHECK(call)                                                         \
    do {                                                                    \
        const cudaError_t check_status_ = (call);                           \
        if (check_status_ != cudaSuccess) {                                 \
            fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);          \
            fprintf(stderr, "code:%d, reason: %s\n", (int)check_status_,    \
                    cudaGetErrorString(check_status_));                     \
            exit(1);                                                        \
        }                                                                   \
    } while (0)
|
||
|
||
// Element-wise integer vector addition: dev_c[i] = dev_a[i] + dev_b[i]
// for every i in [0, N).
//
// Launch layout: 1D grid of 1D blocks (only the .x components are read).
// The loop below strides by the total number of launched threads, so any
// grid size produces a complete result; with a grid of ceil(N / blockDim.x)
// blocks each thread handles at most one element, exactly as before.
//
// Parameters:
//   dev_a, dev_b  input arrays holding at least N ints each
//   dev_c         output array holding at least N ints
//   N             number of elements; values <= 0 make the kernel a no-op
//
// No shared memory is used.
__global__ void add(const int *dev_a, const int *dev_b, int *dev_c, int N)
{
    // A non-positive N must not be converted to a huge unsigned count.
    if (N <= 0) {
        return;
    }

    // Index arithmetic is done in size_t: blockIdx.x * blockDim.x in 32 bits
    // can exceed INT_MAX for large N, wrap to a negative int, and slip past
    // an `i < N` guard.
    const size_t count = (size_t)N;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < count;
         i += stride) {
        dev_c[i] = dev_a[i] + dev_b[i];
    }
}
|
||
|
||
// Runs, times and validates one GPU vector addition of N ints.
//
// Steps: fill two host arrays (a[i] = i, b[i] = 2 * i), copy them to the
// device, launch `add` once as a warm-up, launch it again between two CUDA
// events, print the elapsed time of the timed launch, copy the result back and
// compare it with the host-side sum. All host and device resources are
// released before returning.
//
// Parameters:
//   N                number of elements; must be > 0
//   threadsPerBlock  block size used for both launches; must be > 0
//
// Output: one line "N=<N>, Time=<t> us" on stdout, plus an error line if the
// device result does not match the host reference. Invalid arguments or a
// failed host allocation are reported on stderr and the function returns
// without running the test. Any failing CUDA call terminates the program
// through CHECK.
void vectorAddTest(int N, int threadsPerBlock)
{
    // Guard against a division by zero below and against an empty grid,
    // which the runtime rejects as an invalid launch configuration.
    if (N <= 0 || threadsPerBlock <= 0) {
        fprintf(stderr, "Error: invalid arguments N=%d, threadsPerBlock=%d\n",
                N, threadsPerBlock);
        return;
    }

    // Ceil-division written so the intermediate value cannot overflow int
    // (N + threadsPerBlock - 1 can).
    const int blocksPerGrid = (N - 1) / threadsPerBlock + 1;
    const size_t bytes = (size_t)N * sizeof(int);

    // Allocate host memory.
    int *host_a = (int*)malloc(bytes);
    int *host_b = (int*)malloc(bytes);
    int *host_c = (int*)malloc(bytes);
    if (host_a == NULL || host_b == NULL || host_c == NULL) {
        fprintf(stderr, "Error: host allocation failed for N=%d\n", N);
        free(host_a);  // free(NULL) is a no-op
        free(host_b);
        free(host_c);
        return;
    }

    // Initialise the inputs.
    for (int i = 0; i < N; i++) {
        host_a[i] = i;
        host_b[i] = i << 1;  // same as multiplying by 2
    }

    // Allocate device memory.
    int *dev_a = NULL;
    int *dev_b = NULL;
    int *dev_c = NULL;
    CHECK(cudaMalloc((void**)&dev_a, bytes));
    CHECK(cudaMalloc((void**)&dev_b, bytes));
    CHECK(cudaMalloc((void**)&dev_c, bytes));

    // Copy the inputs to the device.
    CHECK(cudaMemcpy(dev_a, host_a, bytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(dev_b, host_b, bytes, cudaMemcpyHostToDevice));

    // Create the CUDA events used for timing.
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    // Warm-up launch so first-launch overhead is not part of the measurement.
    // A kernel launch returns no status itself: cudaGetLastError() reports a
    // bad launch configuration, and the synchronize reports execution faults.
    add<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, N);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    // Timed launch, bracketed by the two events.
    CHECK(cudaEventRecord(start));
    add<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, N);
    CHECK(cudaEventRecord(stop));
    // Checked only after `stop` is enqueued so the extra host call does not
    // sit between the kernel and the stop event.
    CHECK(cudaGetLastError());
    CHECK(cudaEventSynchronize(stop));

    // cudaEventElapsedTime reports milliseconds; convert to microseconds.
    float elapsedTime_ms = 0.0f;
    CHECK(cudaEventElapsedTime(&elapsedTime_ms, start, stop));
    const float elapsedTime_us = elapsedTime_ms * 1000.0f;

    // Report the result with the unit that matches the printed value.
    printf("N=%d, Time=%.3f us\n", N, elapsedTime_us);

    // Validate the device result against the host-side sum.
    CHECK(cudaMemcpy(host_c, dev_c, bytes, cudaMemcpyDeviceToHost));
    bool success = true;
    for (int i = 0; i < N; i++) {
        if (host_c[i] != host_a[i] + host_b[i]) {
            success = false;
            break;
        }
    }
    if (!success) {
        printf("Error: Computation failed for N=%d\n", N);
    }

    // Release all resources.
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    CHECK(cudaFree(dev_a));
    CHECK(cudaFree(dev_b));
    CHECK(cudaFree(dev_c));
    free(host_a);
    free(host_b);
    free(host_c);
}
|
||
|
||
// Program entry point: prints a banner, runs vectorAddTest once for each
// vector length in a fixed list using a constant block size, then prints a
// closing banner. Always returns 0.
int main(void)
{
    // Block size shared by every test run.
    const int threadsPerBlock = 256;

    // Vector lengths to benchmark.
    // Note: 2048 (a power of two) is used in place of 2056.
    const int testSizes[] = {128, 256, 512, 1024, 2048};

    static const char separator[] =
        "========================================================\n";

    printf("Vector Addition Performance Test (Threads per block: %d)\n", threadsPerBlock);
    printf("%s", separator);

    for (const int length : testSizes) {
        vectorAddTest(length, threadsPerBlock);
    }

    printf("%s", separator);
    printf("All tests completed.\n");

    return 0;
}