#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Evaluates a CUDA runtime call exactly once. On failure, prints the source
// location, the numeric error code and the runtime's error string, then
// terminates the process with exit code 1.
// Wrapped in do { } while (0) so that `CHECK(x);` behaves as a single
// statement (safe inside an unbraced if/else).
#define CHECK(call)                                                        \
    do                                                                     \
    {                                                                      \
        const cudaError_t error = (call);                                  \
        if (error != cudaSuccess)                                          \
        {                                                                  \
            printf("Error: %s:%d, ", __FILE__, __LINE__);                  \
            printf("code:%d, reason: %s\n", (int)error,                    \
                   cudaGetErrorString(error));                             \
            exit(1);                                                       \
        }                                                                  \
    } while (0)

// Vector addition kernel: dev_c[i] = dev_a[i] + dev_b[i] for 0 <= i < N.
//
// Launch layout: 1D grid of 1D blocks. Each thread handles at most one
// element, so the launch must supply at least N threads in total; threads
// whose global index is >= N do nothing.
__global__ void add(const int *dev_a, const int *dev_b, int *dev_c, int N)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < N)
    {
        dev_c[i] = dev_a[i] + dev_b[i];
    }
}

// Runs one vector-addition test of N elements and prints the kernel time.
//
// Steps: fill two host vectors (a[i] = i, b[i] = 2*i), copy them to the
// device, run the kernel once untimed as a warm-up, run it again between two
// CUDA events, print the elapsed time in microseconds, copy the result back
// and compare it with a host-computed sum.
//
// Parameters:
//   N               - number of elements; must be > 0.
//   threadsPerBlock - block size used for the launch; must be > 0.
//
// An invalid configuration is reported and the test is skipped. Any CUDA
// error or host allocation failure terminates the process.
void vectorAddTest(int N, int threadsPerBlock)
{
    if (N <= 0 || threadsPerBlock <= 0)
    {
        // A zero/negative size would yield a zero-block launch (invalid
        // configuration) or a division by zero below.
        printf("Error: invalid test configuration (N=%d, threadsPerBlock=%d)\n",
               N, threadsPerBlock);
        return;
    }

    const size_t bytes = (size_t)N * sizeof(int);
    // Ceil-division written so that it cannot overflow int for large N.
    const int blocksPerGrid = (N - 1) / threadsPerBlock + 1;

    int *host_a = (int *)malloc(bytes);
    int *host_b = (int *)malloc(bytes);
    int *host_c = (int *)malloc(bytes);
    if (host_a == NULL || host_b == NULL || host_c == NULL)
    {
        printf("Error: host allocation failed for N=%d\n", N);
        free(host_a);
        free(host_b);
        free(host_c);
        exit(1);
    }

    for (int i = 0; i < N; i++)
    {
        host_a[i] = i;
        host_b[i] = i << 1;
    }

    int *dev_a = NULL;
    int *dev_b = NULL;
    int *dev_c = NULL;
    CHECK(cudaMalloc((void **)&dev_a, bytes));
    CHECK(cudaMalloc((void **)&dev_b, bytes));
    CHECK(cudaMalloc((void **)&dev_c, bytes));

    CHECK(cudaMemcpy(dev_a, host_a, bytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(dev_b, host_b, bytes, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    // Warm-up launch (not timed) so one-time startup cost is excluded from
    // the measurement. Launch errors only surface via cudaGetLastError().
    add<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, N);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    // Timed launch, bracketed by events on the default stream.
    CHECK(cudaEventRecord(start));
    add<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, N);
    CHECK(cudaEventRecord(stop));
    // Checked after recording `stop` to keep host work out of the timed span.
    CHECK(cudaGetLastError());
    CHECK(cudaEventSynchronize(stop));

    float elapsedTime_ms = 0.0f;
    CHECK(cudaEventElapsedTime(&elapsedTime_ms, start, stop));
    // cudaEventElapsedTime reports milliseconds; the value printed below is
    // converted to microseconds and labelled accordingly.
    const float elapsedTime_us = elapsedTime_ms * 1000.0f;
    printf("N=%d, Time=%.3f us\n", N, elapsedTime_us);

    CHECK(cudaMemcpy(host_c, dev_c, bytes, cudaMemcpyDeviceToHost));

    // Verify the device result against the host-computed sum.
    bool success = true;
    for (int i = 0; i < N; i++)
    {
        if (host_c[i] != host_a[i] + host_b[i])
        {
            success = false;
            break;
        }
    }
    if (!success)
    {
        printf("Error: Computation failed for N=%d\n", N);
    }

    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    CHECK(cudaFree(dev_a));
    CHECK(cudaFree(dev_b));
    CHECK(cudaFree(dev_c));

    free(host_a);
    free(host_b);
    free(host_c);
}

// Runs vectorAddTest for a fixed list of vector sizes with 256 threads per
// block and prints a banner before and after the results.
int main(void)
{
    const int threadsPerBlock = 256;
    int testSizes[] = {128, 256, 512, 1024, 2048};
    int numTests = sizeof(testSizes) / sizeof(testSizes[0]);

    printf("Vector Addition Performance Test (Threads per block: %d)\n", threadsPerBlock);
    printf("========================================================\n");

    for (int i = 0; i < numTests; i++)
    {
        vectorAddTest(testSizes[i], threadsPerBlock);
    }

    printf("========================================================\n");
    printf("All tests completed.\n");

    return 0;
}