// hpc-lab-code/lab4/MatrixMul_kernel2.cu
// 2026-01-21 18:02:30 +08:00
// 115 lines, 4.1 KiB

#include <chrono>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>
#define TILE_WIDTH 4
// Tiled matrix multiplication C = A * B using shared memory.
// A is M x N, B is N x K, C is M x K, all dense row-major.
//
// Expected launch configuration:
//   blockDim = (TILE_WIDTH, TILE_WIDTH)
//   gridDim  = (ceil(K / TILE_WIDTH), ceil(M / TILE_WIDTH))
//
// Each block computes one TILE_WIDTH x TILE_WIDTH tile of C by marching
// over the shared N dimension one tile at a time, staging the A and B
// tiles in shared memory so each global element is loaded once per block.
__global__ void matMultCUDAKernel2(const float* __restrict__ A,
                                   const float* __restrict__ B,
                                   float* __restrict__ C,
                                   int M, int N, int K) {
    // +1 padding on the inner dimension keeps column accesses
    // (shared_B[i][threadIdx.x]) free of shared-memory bank conflicts.
    __shared__ float shared_A[TILE_WIDTH][TILE_WIDTH + 1];
    __shared__ float shared_B[TILE_WIDTH][TILE_WIDTH + 1];

    int row = blockIdx.y * blockDim.y + threadIdx.y;  // row of C this thread owns
    int col = blockIdx.x * blockDim.x + threadIdx.x;  // column of C this thread owns
    float sum = 0.0f;

    // Walk across the N dimension in TILE_WIDTH-sized steps.
    for (int t = 0; t < (N + TILE_WIDTH - 1) / TILE_WIDTH; ++t) {
        // Stage one tile of A; out-of-range lanes load 0 so the inner
        // product is unaffected at the matrix edges.
        if (row < M && t * TILE_WIDTH + threadIdx.x < N)
            shared_A[threadIdx.y][threadIdx.x] = A[row * N + t * TILE_WIDTH + threadIdx.x];
        else
            shared_A[threadIdx.y][threadIdx.x] = 0.0f;

        // Stage one tile of B, likewise zero-padded at the edges.
        if (col < K && t * TILE_WIDTH + threadIdx.y < N)
            shared_B[threadIdx.y][threadIdx.x] = B[(t * TILE_WIDTH + threadIdx.y) * K + col];
        else
            shared_B[threadIdx.y][threadIdx.x] = 0.0f;

        // All lanes must finish writing the tiles before anyone reads them.
        __syncthreads();

        // Fixed trip count: let the compiler fully unroll.
        #pragma unroll
        for (int i = 0; i < TILE_WIDTH; ++i)
            sum += shared_A[threadIdx.y][i] * shared_B[i][threadIdx.x];

        // Do not overwrite the tiles until every lane has consumed them.
        __syncthreads();
    }

    // Guard the store: the grid may overhang the matrix at the edges.
    if (row < M && col < K) {
        C[row * K + col] = sum;
    }
}
// Abort with a diagnostic if a CUDA runtime call failed. Without this,
// a sticky error (bad alloc, bad launch) silently corrupts every later
// call and the reported timings become meaningless.
static void checkCuda(cudaError_t err, const char* what) {
    if (err != cudaSuccess) {
        std::cerr << "CUDA error (" << what << "): "
                  << cudaGetErrorString(err) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Benchmark the shared-memory matmul kernel over several square sizes
// and print seconds, milliseconds, and achieved GFLOPS per size.
int main() {
    std::vector<int> sizes = {512, 1024, 2048, 4096};
    std::vector<float> times;

    for (std::size_t idx = 0; idx < sizes.size(); ++idx) {
        const int M = sizes[idx];
        const int N = sizes[idx];
        const int K = sizes[idx];

        // RAII host buffers (no manual delete[]); small random integer
        // values keep the accumulation exactly representable in float.
        std::vector<float> A(static_cast<std::size_t>(M) * N);
        std::vector<float> B(static_cast<std::size_t>(N) * K);
        std::vector<float> C(static_cast<std::size_t>(M) * K);
        for (float& v : A) v = static_cast<float>(rand() % 10);
        for (float& v : B) v = static_cast<float>(rand() % 10);

        float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
        checkCuda(cudaMalloc(&d_A, A.size() * sizeof(float)), "cudaMalloc d_A");
        checkCuda(cudaMalloc(&d_B, B.size() * sizeof(float)), "cudaMalloc d_B");
        checkCuda(cudaMalloc(&d_C, C.size() * sizeof(float)), "cudaMalloc d_C");
        checkCuda(cudaMemcpy(d_A, A.data(), A.size() * sizeof(float),
                             cudaMemcpyHostToDevice), "cudaMemcpy A");
        checkCuda(cudaMemcpy(d_B, B.data(), B.size() * sizeof(float),
                             cudaMemcpyHostToDevice), "cudaMemcpy B");

        dim3 blockSize(TILE_WIDTH, TILE_WIDTH);
        dim3 gridSize((K + TILE_WIDTH - 1) / TILE_WIDTH,
                      (M + TILE_WIDTH - 1) / TILE_WIDTH);

        // Warm-up launch so one-time initialization cost stays out of the
        // timed run.
        matMultCUDAKernel2<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N, K);
        checkCuda(cudaGetLastError(), "warm-up launch");
        checkCuda(cudaDeviceSynchronize(), "warm-up sync");

        // Timed run; the sync makes the wall-clock span cover the whole
        // (asynchronous) kernel execution.
        auto start = std::chrono::high_resolution_clock::now();
        matMultCUDAKernel2<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N, K);
        checkCuda(cudaGetLastError(), "timed launch");
        checkCuda(cudaDeviceSynchronize(), "timed sync");
        auto end = std::chrono::high_resolution_clock::now();

        checkCuda(cudaMemcpy(C.data(), d_C, C.size() * sizeof(float),
                             cudaMemcpyDeviceToHost), "cudaMemcpy C");

        std::chrono::duration<float> duration = end - start;
        times.push_back(duration.count());

        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);
    }

    std::cout << "CUDA Kernel2 (共享内存优化) 矩阵乘法性能测试结果" << std::endl;
    std::cout << "=================================" << std::endl;
    std::cout << std::setw(12) << "Matrix Size"
              << std::setw(15) << "Time(s)"
              << std::setw(15) << "Time(ms)"
              << std::setw(15) << "GFLOPS" << std::endl;
    std::cout << "---------------------------------" << std::endl;
    for (std::size_t i = 0; i < sizes.size(); ++i) {
        int size = sizes[i];
        double total_flops = 2.0 * size * size * size;  // FLOP count for an n^3 matmul
        double gflops = total_flops / (times[i] * 1e9); // convert to GFLOPS
        double time_ms = times[i] * 1000.0;             // convert to milliseconds
        std::cout << std::setw(8) << size << "x" << std::setw(3) << size
                  << std::setw(15) << std::fixed << std::setprecision(6) << times[i]
                  << std::setw(15) << std::fixed << std::setprecision(3) << time_ms
                  << std::setw(15) << std::fixed << std::setprecision(2) << gflops << std::endl;
    }
    std::cout << "=================================" << std::endl;
    return 0;
}