#include <iostream>
#include <iomanip>
#include <vector>
#include <cstdlib>
#include <cuda_runtime.h>

// Block sizes to test
std::vector<int> block_sizes = {4, 8, 16, 32};
// Matrix sizes to test
std::vector<int> matrix_sizes = {256, 512, 1024, 2048};

// Tiled matrix multiplication kernel using shared memory,
// templated on the tile width BLOCK_SIZE
template <int BLOCK_SIZE>
__global__ void matMultKernel(const float* A, const float* B, float* C,
                              int M, int N, int K) {
    __shared__ float shared_A[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float shared_B[BLOCK_SIZE][BLOCK_SIZE];

    int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    float sum = 0.0f;

    for (int t = 0; t < (N + BLOCK_SIZE - 1) / BLOCK_SIZE; ++t) {
        // Load tiles of A and B into shared memory (zero-pad out-of-bounds elements)
        if (row < M && t * BLOCK_SIZE + threadIdx.x < N)
            shared_A[threadIdx.y][threadIdx.x] = A[row * N + t * BLOCK_SIZE + threadIdx.x];
        else
            shared_A[threadIdx.y][threadIdx.x] = 0.0f;

        if (col < K && t * BLOCK_SIZE + threadIdx.y < N)
            shared_B[threadIdx.y][threadIdx.x] = B[(t * BLOCK_SIZE + threadIdx.y) * K + col];
        else
            shared_B[threadIdx.y][threadIdx.x] = 0.0f;

        __syncthreads();

        // Accumulate the product for the current tile
        for (int i = 0; i < BLOCK_SIZE; ++i)
            sum += shared_A[threadIdx.y][i] * shared_B[i][threadIdx.x];

        __syncthreads();
    }

    if (row < M && col < K) {
        C[row * K + col] = sum;
    }
}

void runTest() {
    std::cout << "BLOCK_SIZE impact on CUDA matrix multiplication performance\n";
    std::cout << "========================================\n";
    std::cout << std::setw(10) << "Matrix" << std::setw(12) << "Block"
              << std::setw(15) << "Time(ms)" << std::setw(15) << "GFLOPS" << std::endl;
    std::cout << "----------------------------------------\n";

    // Run the benchmark for each matrix size
    for (int mat_size : matrix_sizes) {
        int M = mat_size, N = mat_size, K = mat_size;

        // Allocate host memory
        float *A = new float[M * N];
        float *B = new float[N * K];
        float *C = new float[M * K];

        // Initialize inputs with random values in [0, 1)
        for (int i = 0; i < M * N; ++i) A[i] = (rand() % 100) / 100.0f;
        for (int i = 0; i < N * K; ++i) B[i] = (rand() % 100) / 100.0f;

        // Allocate device memory and copy inputs over
        float *d_A, *d_B, *d_C;
        cudaMalloc(&d_A, M * N * sizeof(float));
        cudaMalloc(&d_B, N * K * sizeof(float));
        cudaMalloc(&d_C, M * K * sizeof(float));
        cudaMemcpy(d_A, A, M * N * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, B, N * K * sizeof(float), cudaMemcpyHostToDevice);

        // Benchmark each block size
        for (int block_size : block_sizes) {
            dim3 blockDim(block_size, block_size);
            dim3 gridDim((K + block_size - 1) / block_size,
                         (M + block_size - 1) / block_size);

            // Warm-up launch (not timed)
            if (block_size == 4)
                matMultKernel<4><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
            else if (block_size == 8)
                matMultKernel<8><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
            else if (block_size == 16)
                matMultKernel<16><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
            else if (block_size == 32)
                matMultKernel<32><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
            cudaDeviceSynchronize();

            // Create CUDA events for timing
            cudaEvent_t start, stop;
            cudaEventCreate(&start);
            cudaEventCreate(&stop);

            // Timed launch
            cudaEventRecord(start);
            if (block_size == 4)
                matMultKernel<4><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
            else if (block_size == 8)
                matMultKernel<8><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
            else if (block_size == 16)
                matMultKernel<16><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
            else if (block_size == 32)
                matMultKernel<32><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
            cudaEventRecord(stop);
            cudaEventSynchronize(stop);

            // Elapsed time in milliseconds
            float milliseconds = 0;
            cudaEventElapsedTime(&milliseconds, start, stop);

            // Throughput: one multiply and one add per inner-product step,
            // so 2*M*N*K floating-point operations in total
            double total_flops = 2.0 * M * N * K;
            double gflops = total_flops / (milliseconds * 1e6);

            // Print one result row
            std::cout << std::setw(10) << mat_size << "x" << mat_size
                      << std::setw(12) << block_size << "x" << block_size
                      << std::setw(15) << std::fixed << std::setprecision(3) << milliseconds
                      << std::setw(15) << std::fixed << std::setprecision(2) << gflops
                      << std::endl;

            cudaEventDestroy(start);
            cudaEventDestroy(stop);
        }

        // Free device and host memory
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);
        delete[] A;
        delete[] B;
        delete[] C;
        std::cout << "----------------------------------------\n";
    }
}

int main() {
    runTest();
    return 0;
}
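// Build and run, as a rough sketch (the file name block_size_test.cu is an
// assumption, and the -arch value should be changed to match the target GPU):
//
//   nvcc -O2 -arch=sm_70 -o block_size_test block_size_test.cu
//   ./block_size_test
//
// Note that BLOCK_SIZE=32 gives 32x32 = 1024 threads per block, the maximum
// allowed on current NVIDIA GPUs, so larger tile widths would fail to launch.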