92 lines
3.0 KiB
Plaintext
92 lines
3.0 KiB
Plaintext
#include <cuda_runtime.h>
|
|
#include <stdio.h>
|
|
#include <chrono>
|
|
|
|
// Evaluates a CUDA runtime call exactly once and terminates the program with
// exit(1) if it did not return cudaSuccess, after reporting the source
// location, numeric error code and cudaGetErrorString() text on stderr.
//
// The body is wrapped in do { ... } while (0) so that `CHECK(x);` behaves as a
// single statement (safe inside an unbraced if/else), and the argument is
// parenthesized so that any expression can be passed.
#define CHECK(call)                                                          \
    do {                                                                     \
        const cudaError_t check_status_ = (call);                            \
        if (check_status_ != cudaSuccess) {                                  \
            fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);           \
            fprintf(stderr, "code:%d, reason: %s\n", (int)check_status_,     \
                    cudaGetErrorString(check_status_));                      \
            exit(1);                                                         \
        }                                                                    \
    } while (0)
|
|
|
|
// Vector addition kernel: dev_c[i] = dev_a[i] + dev_b[i] for every i in [0, N).
//
// Launch layout: 1D grid of 1D blocks (only the .x dimensions are used).
// Each thread starts at its global index and advances by the total number of
// launched threads, so the result is correct for any grid size, including
// grids that launch fewer than N threads. No shared memory is used.
//
// Parameters:
//   dev_a, dev_b  input arrays, read only; each must hold at least N ints
//   dev_c         output array; must hold at least N ints
//   N             number of elements to process; values <= 0 do nothing
__global__ void add(const int *dev_a, const int *dev_b, int *dev_c, int N)
{
    // Indices are computed in size_t so blockIdx.x * blockDim.x cannot wrap
    // around in 32-bit arithmetic for large grids.
    const size_t count = (N > 0) ? (size_t)N : 0;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride) {
        dev_c[i] = dev_a[i] + dev_b[i];
    }
}
|
|
|
|
// Runs one vector-addition test of N elements and reports the kernel time.
//
// Steps: fill two host arrays (a[i] = i, b[i] = 2 * i), copy them to the
// device, run the add kernel once as an untimed warm-up, run it again between
// two CUDA events, print the elapsed time, copy the result back and compare it
// against a host-side reference.
//
// Parameters:
//   N                number of elements; must be > 0
//   threadsPerBlock  block size used for the launch; must be > 0
//
// Any CUDA failure terminates the program through CHECK. Invalid arguments are
// reported and the test is skipped.
void vectorAddTest(int N, int threadsPerBlock)
{
    // A non-positive size or block size would give a zero-block launch or a
    // division by zero below, so reject it up front.
    if (N <= 0 || threadsPerBlock <= 0) {
        printf("Error: invalid arguments N=%d, threadsPerBlock=%d\n", N, threadsPerBlock);
        return;
    }

    const size_t bytes = (size_t)N * sizeof(int);
    // Ceiling division written so that it cannot overflow int for large N.
    const int blocksPerGrid = (N - 1) / threadsPerBlock + 1;

    int *host_a = (int*)malloc(bytes);
    int *host_b = (int*)malloc(bytes);
    int *host_c = (int*)malloc(bytes);
    if (host_a == NULL || host_b == NULL || host_c == NULL) {
        printf("Error: host allocation of %zu bytes failed for N=%d\n", bytes, N);
        free(host_a);
        free(host_b);
        free(host_c);
        exit(1);
    }

    for (int i = 0; i < N; i++) {
        host_a[i] = i;
        host_b[i] = i << 1;
    }

    int *dev_a = NULL;
    int *dev_b = NULL;
    int *dev_c = NULL;
    CHECK(cudaMalloc((void**)&dev_a, bytes));
    CHECK(cudaMalloc((void**)&dev_b, bytes));
    CHECK(cudaMalloc((void**)&dev_c, bytes));
    CHECK(cudaMemcpy(dev_a, host_a, bytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(dev_b, host_b, bytes, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    // Warm-up launch so that one-time startup costs are not part of the
    // measurement. Launch errors are only visible through cudaGetLastError().
    add<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, N);
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());

    // Timed launch, bracketed by events recorded on the default stream.
    CHECK(cudaEventRecord(start));
    add<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, N);
    CHECK(cudaGetLastError());
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));

    // cudaEventElapsedTime reports milliseconds; convert to microseconds and
    // label the printed value accordingly.
    float elapsedTime_ms = 0;
    CHECK(cudaEventElapsedTime(&elapsedTime_ms, start, stop));
    const float elapsedTime_us = elapsedTime_ms * 1000.0f;
    printf("N=%d, Time=%.3f us\n", N, elapsedTime_us);

    CHECK(cudaMemcpy(host_c, dev_c, bytes, cudaMemcpyDeviceToHost));

    // Verify against the host reference; stop at the first mismatch.
    bool success = true;
    for (int i = 0; i < N; i++) {
        if (host_c[i] != host_a[i] + host_b[i]) {
            success = false;
            break;
        }
    }
    if (!success) {
        printf("Error: Computation failed for N=%d\n", N);
    }

    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    CHECK(cudaFree(dev_a));
    CHECK(cudaFree(dev_b));
    CHECK(cudaFree(dev_c));
    free(host_a);
    free(host_b);
    free(host_c);
}
|
|
|
|
// Program entry point: runs the vector-addition test once for each problem
// size in a fixed list, using 256 threads per block, and prints a banner
// before and after the results.
int main(void)
{
    const int threadsPerBlock = 256;
    const int testSizes[] = {128, 256, 512, 1024, 2048};

    printf("Vector Addition Performance Test (Threads per block: %d)\n", threadsPerBlock);
    printf("========================================================\n");

    // One test per size, in the order listed above.
    for (const int elementCount : testSizes) {
        vectorAddTest(elementCount, threadsPerBlock);
    }

    printf("========================================================\n");
    printf("All tests completed.\n");

    return 0;
}
|