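// Benchmark: effect of BLOCK_SIZE (tile edge length) on the performance of a
// shared-memory tiled CUDA matrix multiplication, across several matrix sizes.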
#include <chrono>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>

// Tile edge lengths (BLOCK_SIZE values) exercised by the benchmark.
std::vector<int> block_sizes{4, 8, 16, 32};

// Matrix dimensions exercised by the benchmark (each is used for M, N and K).
std::vector<int> matrix_sizes{256, 512, 1024, 2048};

// Shared-memory tiled matrix multiplication kernel template: C = A * B.
//
// Indexing treats A as M x N, B as N x K and C as M x K, all row-major.
// BLOCK_SIZE is the tile edge length. Global row/column positions are derived
// from BLOCK_SIZE rather than blockDim, so the kernel expects to be launched
// with BLOCK_SIZE x BLOCK_SIZE threads per block; blockIdx.x walks columns of
// C and blockIdx.y walks rows of C.
template<int BLOCK_SIZE>
__global__ void matMultKernel(const float* A, const float* B, float* C, int M, int N, int K) {
    __shared__ float tileA[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float tileB[BLOCK_SIZE][BLOCK_SIZE];

    // Kept unsigned so that index arithmetic promotes the same way as when
    // threadIdx is used directly.
    const unsigned int tx = threadIdx.x;
    const unsigned int ty = threadIdx.y;

    const int row = blockIdx.y * BLOCK_SIZE + ty;
    const int col = blockIdx.x * BLOCK_SIZE + tx;

    // Number of tiles needed to cover the shared dimension N (rounded up).
    const int numTiles = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;

    float acc = 0.0f;

    for (int tile = 0; tile < numTiles; ++tile) {
        const unsigned int aCol = tile * BLOCK_SIZE + tx;  // column of A loaded by this thread
        const unsigned int bRow = tile * BLOCK_SIZE + ty;  // row of B loaded by this thread

        // Stage one tile of A and one tile of B; out-of-range elements are
        // zero-filled so they contribute nothing to the dot product.
        tileA[ty][tx] = (row < M && aCol < N) ? A[row * N + aCol] : 0.0f;
        tileB[ty][tx] = (col < K && bRow < N) ? B[bRow * K + col] : 0.0f;

        // All loads must land before any thread reads the tiles.
        __syncthreads();

        // Partial dot product over the current tile.
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            acc += tileA[ty][k] * tileB[k][tx];
        }

        // All reads must finish before the next iteration overwrites the tiles.
        __syncthreads();
    }

    // Threads that fall outside C have nothing to store.
    if (row >= M || col >= K) {
        return;
    }
    C[row * K + col] = acc;
}

// Terminates the program with a diagnostic if a CUDA runtime call failed.
// `what` names the operation for the error message.
static void checkCuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        std::cerr << "CUDA error during " << what << ": "
                  << cudaGetErrorString(status) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Launches the matMultKernel instantiation whose BLOCK_SIZE equals block_size,
// using a block_size x block_size thread block and a grid that covers the
// M x K output. Returns false (and launches nothing) when no instantiation
// exists for block_size. Launch errors are NOT checked here so that callers
// can keep the check outside a timed region.
static bool launchMatMult(int block_size, const float* d_A, const float* d_B, float* d_C,
                          int M, int N, int K) {
    const dim3 block(block_size, block_size);
    const dim3 grid((K + block_size - 1) / block_size, (M + block_size - 1) / block_size);

    switch (block_size) {
        case 4:  matMultKernel<4><<<grid, block>>>(d_A, d_B, d_C, M, N, K);  return true;
        case 8:  matMultKernel<8><<<grid, block>>>(d_A, d_B, d_C, M, N, K);  return true;
        case 16: matMultKernel<16><<<grid, block>>>(d_A, d_B, d_C, M, N, K); return true;
        case 32: matMultKernel<32><<<grid, block>>>(d_A, d_B, d_C, M, N, K); return true;
        default: return false;
    }
}

// Benchmarks matMultKernel for every combination of matrix_sizes and
// block_sizes and prints one table row (matrix size, block size, elapsed
// milliseconds, GFLOPS) per combination to stdout.
//
// Each configuration is run once as a warm-up and once timed with CUDA events.
// Any CUDA runtime failure aborts the program with a message on stderr; a
// block size with no kernel instantiation is skipped with a warning instead of
// producing a meaningless timing row.
void runTest() {
    std::cout << "BLOCK_SIZE对CUDA矩阵乘法性能影响测试\n";
    std::cout << "========================================\n";
    std::cout << std::setw(10) << "Matrix"
              << std::setw(12) << "Block"
              << std::setw(15) << "Time(ms)"
              << std::setw(15) << "FLOPS(G)" << std::endl;
    std::cout << "----------------------------------------\n";

    // Timing events are reusable, so create them once for the whole sweep.
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // Sweep over matrix sizes.
    for (int mat_size : matrix_sizes) {
        const int M = mat_size, N = mat_size, K = mat_size;

        // Element counts are computed in size_t so the byte sizes below cannot
        // overflow int arithmetic.
        const size_t elemsA = static_cast<size_t>(M) * N;
        const size_t elemsB = static_cast<size_t>(N) * K;
        const size_t elemsC = static_cast<size_t>(M) * K;

        // Host inputs, filled with pseudo-random values in [0, 1).
        std::vector<float> A(elemsA);
        std::vector<float> B(elemsB);
        for (float& value : A) value = (std::rand() % 100) / 100.0f;
        for (float& value : B) value = (std::rand() % 100) / 100.0f;

        // Device buffers.
        float *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
        checkCuda(cudaMalloc(&d_A, elemsA * sizeof(float)), "cudaMalloc(d_A)");
        checkCuda(cudaMalloc(&d_B, elemsB * sizeof(float)), "cudaMalloc(d_B)");
        checkCuda(cudaMalloc(&d_C, elemsC * sizeof(float)), "cudaMalloc(d_C)");

        checkCuda(cudaMemcpy(d_A, A.data(), elemsA * sizeof(float), cudaMemcpyHostToDevice),
                  "cudaMemcpy(d_A)");
        checkCuda(cudaMemcpy(d_B, B.data(), elemsB * sizeof(float), cudaMemcpyHostToDevice),
                  "cudaMemcpy(d_B)");

        // Sweep over block sizes.
        for (int block_size : block_sizes) {
            // Warm-up run (not timed).
            if (!launchMatMult(block_size, d_A, d_B, d_C, M, N, K)) {
                std::cerr << "Skipping unsupported BLOCK_SIZE " << block_size << std::endl;
                continue;
            }
            checkCuda(cudaGetLastError(), "matMultKernel warm-up launch");
            checkCuda(cudaDeviceSynchronize(), "matMultKernel warm-up execution");

            // Timed run, bracketed by events.
            checkCuda(cudaEventRecord(start), "cudaEventRecord(start)");
            launchMatMult(block_size, d_A, d_B, d_C, M, N, K);
            checkCuda(cudaEventRecord(stop), "cudaEventRecord(stop)");
            checkCuda(cudaGetLastError(), "matMultKernel timed launch");
            checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

            float milliseconds = 0;
            checkCuda(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

            // One multiply and one add per inner-product term.
            const double total_flops = 2.0 * M * N * K;
            // Guard against a zero elapsed time so the table never shows inf.
            const double gflops =
                milliseconds > 0.0f ? total_flops / (milliseconds * 1e6) : 0.0;

            std::cout << std::setw(10) << mat_size << "x" << mat_size
                      << std::setw(12) << block_size << "x" << block_size
                      << std::setw(15) << std::fixed << std::setprecision(3) << milliseconds
                      << std::setw(15) << std::fixed << std::setprecision(2) << gflops << std::endl;
        }

        // Release device buffers; host vectors clean up automatically.
        checkCuda(cudaFree(d_A), "cudaFree(d_A)");
        checkCuda(cudaFree(d_B), "cudaFree(d_B)");
        checkCuda(cudaFree(d_C), "cudaFree(d_C)");

        std::cout << "----------------------------------------\n";
    }

    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
}

// Program entry point: runs the complete benchmark sweep, then exits with
// status 0.
int main() {
    runTest();
    return 0;
}