// hpc-lab-code/submit/lab4/vectoradd.cu
// 2026-01-21 18:02:30 +08:00
//
// 92 lines
// 3.0 KiB
// Plaintext

#include <cuda_runtime.h>
#include <stdio.h>
#include <chrono>
// Evaluates a CUDA runtime call exactly once. If the result is anything other
// than cudaSuccess, prints the source file, line, numeric error code and the
// cudaGetErrorString() text, then terminates the process with exit status 1.
//
// The body is wrapped in do { ... } while (0) so that `CHECK(x);` expands to a
// single statement and remains correct inside unbraced if/else bodies. The
// argument is parenthesized and the local uses a macro-private name so it
// cannot collide with identifiers appearing inside `call`.
#define CHECK(call) \
do { \
    const cudaError_t check_error_ = (call); \
    if (check_error_ != cudaSuccess) \
    { \
        printf("Error: %s:%d, ", __FILE__, __LINE__); \
        printf("code:%d, reason: %s\n", (int)check_error_, \
               cudaGetErrorString(check_error_)); \
        exit(1); \
    } \
} while (0)
// Vector-addition kernel.
// Each thread computes its global index from its block and thread indices and
// writes dev_c[idx] = dev_a[idx] + dev_b[idx]. Threads whose index is not
// below N return without touching memory.
__global__ void add(const int *dev_a, const int *dev_b, int *dev_c, int N)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= N) {
        return;  // thread lies past the end of the vectors
    }
    dev_c[idx] = dev_a[idx] + dev_b[idx];
}
// Runs one vector-addition test on N elements and prints the kernel time.
//
// Steps: fill two host vectors (a[i] = i, b[i] = i << 1), copy them to the
// device, launch `add` once untimed, launch it again between two CUDA events,
// print the elapsed time in microseconds, then copy the result back and
// compare every element against the host-side sum.
//
// Parameters:
//   N               - number of elements in each vector; must be > 0.
//   threadsPerBlock - block size used for the launch; must be > 0.
//
// Non-positive arguments are rejected with a message on stderr and the
// function returns without doing any work. A host allocation failure or any
// CUDA error terminates the process with exit status 1.
void vectorAddTest(int N, int threadsPerBlock)
{
    // A zero/negative size would produce a zero-block launch, and a
    // non-positive block size would divide by zero below.
    if (N <= 0 || threadsPerBlock <= 0) {
        fprintf(stderr, "Error: invalid arguments N=%d, threadsPerBlock=%d\n",
                N, threadsPerBlock);
        return;
    }

    const size_t bytes = (size_t)N * sizeof(int);
    // Ceil-division so the grid covers all N elements; done in 64-bit so
    // N + threadsPerBlock - 1 cannot overflow int.
    const int blocksPerGrid =
        (int)(((long long)N + threadsPerBlock - 1) / threadsPerBlock);

    int *host_a = (int*)malloc(bytes);
    int *host_b = (int*)malloc(bytes);
    int *host_c = (int*)malloc(bytes);
    if (host_a == NULL || host_b == NULL || host_c == NULL) {
        fprintf(stderr, "Error: host allocation failed for N=%d\n", N);
        free(host_a);
        free(host_b);
        free(host_c);
        exit(1);
    }
    for (int i = 0; i < N; i++) {
        host_a[i] = i;
        host_b[i] = i << 1;
    }

    int *dev_a = NULL;
    int *dev_b = NULL;
    int *dev_c = NULL;
    CHECK(cudaMalloc((void**)&dev_a, bytes));
    CHECK(cudaMalloc((void**)&dev_b, bytes));
    CHECK(cudaMalloc((void**)&dev_c, bytes));
    CHECK(cudaMemcpy(dev_a, host_a, bytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(dev_b, host_b, bytes, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    // Untimed first launch, so one-time launch overhead is kept out of the
    // measured run. Launch and execution errors are both checked here.
    add<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, N);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    // Timed launch. The launch-error check is placed after the stop event is
    // recorded so no extra host call sits inside the measured region.
    CHECK(cudaEventRecord(start));
    add<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, N);
    CHECK(cudaEventRecord(stop));
    CHECK(cudaGetLastError());
    CHECK(cudaEventSynchronize(stop));

    float elapsedTime_ms = 0;
    CHECK(cudaEventElapsedTime(&elapsedTime_ms, start, stop));
    // cudaEventElapsedTime reports milliseconds; convert to microseconds and
    // label the output accordingly.
    const float elapsedTime_us = elapsedTime_ms * 1000.0f;
    printf("N=%d, Time=%.3f us\n", N, elapsedTime_us);

    CHECK(cudaMemcpy(host_c, dev_c, bytes, cudaMemcpyDeviceToHost));

    // Verify the device result against the host-computed sum.
    bool success = true;
    for (int i = 0; i < N; i++) {
        if (host_c[i] != host_a[i] + host_b[i]) {
            success = false;
            break;
        }
    }
    if (!success) {
        printf("Error: Computation failed for N=%d\n", N);
    }

    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    CHECK(cudaFree(dev_a));
    CHECK(cudaFree(dev_b));
    CHECK(cudaFree(dev_c));
    free(host_a);
    free(host_b);
    free(host_c);
}
// Program entry point: prints a banner, runs vectorAddTest once for each
// vector length in a fixed list using 256 threads per block, then prints a
// closing banner and returns 0.
int main(void)
{
    const int threadsPerBlock = 256;
    int testSizes[] = {128, 256, 512, 1024, 2048};
    // One-past-the-end pointer derived from the array's element count.
    const int *const sizesEnd = testSizes + sizeof(testSizes) / sizeof(testSizes[0]);

    printf("Vector Addition Performance Test (Threads per block: %d)\n", threadsPerBlock);
    printf("========================================================\n");

    // Walk the size list in order, one test run per entry.
    for (const int *size = testSizes; size != sizesEnd; ++size) {
        vectorAddTest(*size, threadsPerBlock);
    }

    printf("========================================================\n");
    printf("All tests completed.\n");
    return 0;
}