// hpc-lab-code/submit/lab4/vectoradd.cu

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <chrono>

// CHECK(call): evaluates a CUDA runtime call exactly once and, if it did not
// return cudaSuccess, prints the source location, the numeric error code and
// cudaGetErrorString() to stderr, then terminates the process with exit(1).
//
// The body is wrapped in do { ... } while (0) so that `CHECK(x);` behaves as a
// single statement everywhere, including inside an unbraced if/else. The
// argument is parenthesized, and the local uses a reserved-looking name so it
// cannot shadow an identifier that appears inside `call`.
#define CHECK(call) \
do { \
    const cudaError_t check_err_ = (call); \
    if (check_err_ != cudaSuccess) \
    { \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \
        fprintf(stderr, "code:%d, reason: %s\n", (int)check_err_, cudaGetErrorString(check_err_)); \
        exit(1); \
    } \
} while (0)

// Vector-addition kernel: dev_c[i] = dev_a[i] + dev_b[i] for every 0 <= i < N.
//
// Parameters:
//   dev_a, dev_b  input arrays, each holding at least N ints
//   dev_c         output array, holding at least N ints
//   N             number of elements to process; N <= 0 does nothing
//
// Launch layout: indexes with the x dimension only (1D grid of 1D blocks).
// Each thread starts at its global x index and advances by the total number
// of launched threads, so any grid size covers all N elements; with
// ceil(N / blockDim.x) blocks every thread handles at most one element.
// No shared memory is used.
__global__ void add(const int *dev_a, const int *dev_b, int *dev_c, int N)
{
    // Index math is done in 64 bits: the unsigned product
    // blockIdx.x * blockDim.x narrowed to int can turn negative when N is
    // close to INT_MAX, which would pass the bounds check and go out of range.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride) {
        dev_c[i] = dev_a[i] + dev_b[i];
    }
}

// Runs one vector-addition test and times it.
//
// Fills two host arrays of N ints (a[i] = i, b[i] = 2*i), copies them to the
// device, launches the `add` kernel once untimed as a warm-up, then launches
// it again between two CUDA events and prints the elapsed time of that second
// launch in microseconds. The result is copied back and compared element by
// element with a[i] + b[i]; a mismatch is reported on stdout.
//
// Parameters:
//   N                number of elements; must be > 0
//   threadsPerBlock  block size used for both launches; must be > 0
//
// An invalid N / threadsPerBlock is reported on stderr and the test is
// skipped. A failed host allocation or any failing CUDA call terminates the
// process with exit(1).
void vectorAddTest(int N, int threadsPerBlock)
{
    if (N <= 0 || threadsPerBlock <= 0) {
        fprintf(stderr, "Error: invalid test configuration (N=%d, threadsPerBlock=%d)\n",
                N, threadsPerBlock);
        return;
    }

    const size_t bytes = (size_t)N * sizeof(int);
    // Ceiling division written so that it cannot overflow int for large N
    // (N + threadsPerBlock - 1 can).
    const int blocksPerGrid = N / threadsPerBlock + ((N % threadsPerBlock != 0) ? 1 : 0);

    // Host buffers.
    int *host_a = (int*)malloc(bytes);
    int *host_b = (int*)malloc(bytes);
    int *host_c = (int*)malloc(bytes);
    if (host_a == NULL || host_b == NULL || host_c == NULL) {
        fprintf(stderr, "Error: host allocation of %zu bytes failed for N=%d\n", bytes, N);
        free(host_a);
        free(host_b);
        free(host_c);
        exit(1);
    }
    for (int i = 0; i < N; i++) {
        host_a[i] = i;
        host_b[i] = i << 1;
    }

    // Device buffers and input upload.
    int *dev_a = NULL;
    int *dev_b = NULL;
    int *dev_c = NULL;
    CHECK(cudaMalloc((void**)&dev_a, bytes));
    CHECK(cudaMalloc((void**)&dev_b, bytes));
    CHECK(cudaMalloc((void**)&dev_c, bytes));
    CHECK(cudaMemcpy(dev_a, host_a, bytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(dev_b, host_b, bytes, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    // Warm-up launch, excluded from the measurement.
    add<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, N);
    CHECK(cudaGetLastError());        // launch-configuration errors
    CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran

    // Timed launch. The launch-error query is issued only after the stop
    // event has been recorded so that it does not sit inside the timed region.
    CHECK(cudaEventRecord(start));
    add<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, N);
    CHECK(cudaEventRecord(stop));
    CHECK(cudaGetLastError());
    CHECK(cudaEventSynchronize(stop));

    float elapsedTime_ms = 0;
    CHECK(cudaEventElapsedTime(&elapsedTime_ms, start, stop));
    // cudaEventElapsedTime reports milliseconds; the value printed is
    // converted to microseconds, so the unit label must say "us".
    const float elapsedTime_us = elapsedTime_ms * 1000.0f;
    printf("N=%d, Time=%.3f us\n", N, elapsedTime_us);

    // Download and verify against the host-side reference.
    CHECK(cudaMemcpy(host_c, dev_c, bytes, cudaMemcpyDeviceToHost));
    bool success = true;
    for (int i = 0; i < N; i++) {
        if (host_c[i] != host_a[i] + host_b[i]) {
            success = false;
            break;
        }
    }
    if (!success) {
        printf("Error: Computation failed for N=%d\n", N);
    }

    // Cleanup.
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    CHECK(cudaFree(dev_a));
    CHECK(cudaFree(dev_b));
    CHECK(cudaFree(dev_c));
    free(host_a);
    free(host_b);
    free(host_c);
}

// Entry point: prints a banner, runs vectorAddTest once for each vector
// length in a fixed list using 256 threads per block, prints a footer and
// returns 0.
int main(void)
{
    const int threadsPerBlock = 256;
    const int testSizes[] = {128, 256, 512, 1024, 2048};

    printf("Vector Addition Performance Test (Threads per block: %d)\n", threadsPerBlock);
    printf("========================================================\n");

    // Sizes are visited in the order they are listed above.
    for (const int vectorLength : testSizes) {
        vectorAddTest(vectorLength, threadsPerBlock);
    }

    printf("========================================================\n");
    printf("All tests completed.\n");
    return 0;
}