hpc-lab-code/lab4/vectoradd.cu
2026-01-21 18:02:30 +08:00

123 lines
3.6 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <chrono>
// CHECK(call): evaluates a CUDA runtime call exactly once and, if it did not
// return cudaSuccess, prints the source location, numeric error code and the
// cudaGetErrorString() text to stderr, then terminates the process with
// exit status 1.
//
// The body is wrapped in do { ... } while (0) so that `CHECK(x);` behaves as a
// single statement everywhere, including un-braced if/else branches. The
// argument is parenthesized, and the local uses a reserved-looking name so it
// cannot collide with an identifier appearing inside `call`.
#define CHECK(call)                                                           \
    do {                                                                      \
        const cudaError_t check_err_ = (call);                                \
        if (check_err_ != cudaSuccess) {                                      \
            fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);            \
            fprintf(stderr, "code:%d, reason: %s\n", (int)check_err_,         \
                    cudaGetErrorString(check_err_));                          \
            exit(1);                                                          \
        }                                                                     \
    } while (0)
// Element-wise integer vector addition: dev_c[k] = dev_a[k] + dev_b[k].
//
// Each thread derives one flat index from its 1D block and thread position
// and processes at most that single element. Threads whose index is not
// below N exit without touching memory, so the launch may contain more
// threads than elements; it must contain at least N threads for every
// element to be written.
//
// Parameters:
//   dev_a, dev_b - input arrays, read at indices [0, N)
//   dev_c        - output array, written at indices [0, N)
//   N            - number of elements to process
__global__ void add(const int *dev_a, const int *dev_b, int *dev_c, int N)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Surplus threads in the last block have nothing to do.
    if (idx >= N) {
        return;
    }
    const int lhs = dev_a[idx];
    const int rhs = dev_b[idx];
    dev_c[idx] = lhs + rhs;
}
// Runs one timed launch of the `add` kernel on two vectors of length N and
// checks the device result against a host-side recomputation.
//
// Parameters:
//   N               - number of elements per vector; must be positive.
//   threadsPerBlock - block size used for the launch; must be positive.
//
// Output:
//   Prints "N=<N>, Time=<t> us" to stdout, where <t> is the kernel time in
//   microseconds measured with CUDA events. Prints an error line if the
//   verification fails. Invalid arguments are reported on stderr and the
//   function returns without running anything. Any CUDA API failure or host
//   allocation failure terminates the process.
void vectorAddTest(int N, int threadsPerBlock)
{
    // Reject sizes that would divide by zero or produce an empty launch.
    if (N <= 0 || threadsPerBlock <= 0) {
        fprintf(stderr, "Error: invalid arguments N=%d, threadsPerBlock=%d\n",
                N, threadsPerBlock);
        return;
    }

    const size_t bytes = (size_t)N * sizeof(int);

    // Number of blocks (ceil-div). Computed in 64-bit so that
    // N + threadsPerBlock - 1 cannot overflow int for large N.
    const int blocksPerGrid =
        (int)(((long long)N + threadsPerBlock - 1) / threadsPerBlock);

    // Allocate host memory.
    int *host_a = (int*)malloc(bytes);
    int *host_b = (int*)malloc(bytes);
    int *host_c = (int*)malloc(bytes);
    if (host_a == NULL || host_b == NULL || host_c == NULL) {
        fprintf(stderr, "Error: host allocation failed for N=%d\n", N);
        free(host_a);
        free(host_b);
        free(host_c);
        exit(1);
    }

    // Initialize input data.
    for (int i = 0; i < N; i++) {
        host_a[i] = i;
        host_b[i] = i << 1; // equivalent to multiplying by 2
    }

    // Allocate device memory.
    int *dev_a = NULL;
    int *dev_b = NULL;
    int *dev_c = NULL;
    CHECK(cudaMalloc((void**)&dev_a, bytes));
    CHECK(cudaMalloc((void**)&dev_b, bytes));
    CHECK(cudaMalloc((void**)&dev_c, bytes));

    // Copy inputs to the device.
    CHECK(cudaMemcpy(dev_a, host_a, bytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(dev_b, host_b, bytes, cudaMemcpyHostToDevice));

    // Create CUDA events used for timing.
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    // Warm-up launch so one-time startup overhead is not part of the
    // measurement. Launch-configuration errors are only visible through
    // cudaGetLastError(); execution errors surface at the synchronize.
    add<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, N);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    // Timed launch, bracketed by the two events.
    CHECK(cudaEventRecord(start));
    add<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, N);
    CHECK(cudaEventRecord(stop));
    // Checked after recording `stop` so the extra host call does not sit
    // between the launch and the end-of-interval event.
    CHECK(cudaGetLastError());
    CHECK(cudaEventSynchronize(stop));

    // cudaEventElapsedTime reports milliseconds; convert to microseconds.
    float elapsedTime_ms = 0.0f;
    CHECK(cudaEventElapsedTime(&elapsedTime_ms, start, stop));
    const float elapsedTime_us = elapsedTime_ms * 1000.0f;

    // Report the result; the unit label matches the converted value.
    printf("N=%d, Time=%.3f us\n", N, elapsedTime_us);

    // Verify the device result against the host inputs.
    CHECK(cudaMemcpy(host_c, dev_c, bytes, cudaMemcpyDeviceToHost));
    bool success = true;
    for (int i = 0; i < N; i++) {
        if (host_c[i] != host_a[i] + host_b[i]) {
            success = false;
            break;
        }
    }
    if (!success) {
        printf("Error: Computation failed for N=%d\n", N);
    }

    // Release resources.
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    CHECK(cudaFree(dev_a));
    CHECK(cudaFree(dev_b));
    CHECK(cudaFree(dev_c));
    free(host_a);
    free(host_b);
    free(host_c);
}
// Program entry point: benchmarks vector addition with a fixed block size
// over a list of vector lengths, printing a banner before and after the
// per-size results. Always returns 0.
int main(void)
{
    // Block size shared by every run.
    const int threadsPerBlock = 256;

    // Vector lengths to benchmark; every entry is a power of two.
    int testSizes[] = {128, 256, 512, 1024, 2048};
    int numTests = sizeof(testSizes) / sizeof(testSizes[0]);

    printf("Vector Addition Performance Test (Threads per block: %d)\n", threadsPerBlock);
    printf("========================================================\n");

    // Walk the size table front to back, one benchmark run per entry.
    const int *const sizesEnd = testSizes + numTests;
    for (const int *len = testSizes; len != sizesEnd; ++len) {
        vectorAddTest(*len, threadsPerBlock);
    }

    printf("========================================================\n");
    printf("All tests completed.\n");
    return 0;
}