#include <iostream>
#include <chrono>
#include <cuda_runtime.h>
#include <vector>
#include <iomanip>
#include <cstdlib>  // rand()

// Naive matrix multiplication kernel: each thread computes one element of
// C (M x K) = A (M x N) * B (N x K); all matrices are stored in row-major order.
__global__ void matMultCUDAKernel1(const float* A, const float* B, float* C, int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < M && col < K) {
        float sum = 0.0f;
        for (int i = 0; i < N; ++i) {
            sum += A[row * N + i] * B[i * K + col];
        }
        C[row * K + col] = sum;
    }
}

int main() {
    std::vector<int> sizes = {512, 1024, 2048, 4096};
    std::vector<float> times;

    // Iterate over all matrix sizes
    for (size_t idx = 0; idx < sizes.size(); ++idx) {
        int M = sizes[idx];
        int N = sizes[idx];
        int K = sizes[idx];

        // Allocate host memory
        float *A = new float[M * N];
        float *B = new float[N * K];
        float *C = new float[M * K];

        // Initialize input data with small random values
        for (int i = 0; i < M * N; ++i) A[i] = rand() % 10;
        for (int i = 0; i < N * K; ++i) B[i] = rand() % 10;

        // Allocate device memory
        float *d_A, *d_B, *d_C;
        cudaMalloc(&d_A, M * N * sizeof(float));
        cudaMalloc(&d_B, N * K * sizeof(float));
        cudaMalloc(&d_C, M * K * sizeof(float));

        // Copy input data to the device
        cudaMemcpy(d_A, A, M * N * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, B, N * K * sizeof(float), cudaMemcpyHostToDevice);

        // Configure thread block and grid dimensions
        dim3 blockSize(16, 16);
        dim3 gridSize((K + blockSize.x - 1) / blockSize.x,
                      (M + blockSize.y - 1) / blockSize.y);

        // Warm-up launch (optional)
        matMultCUDAKernel1<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N, K);
        cudaDeviceSynchronize();

        // Start timing
        auto start = std::chrono::high_resolution_clock::now();

        // Launch the kernel
        matMultCUDAKernel1<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N, K);
        cudaDeviceSynchronize();

        // Stop timing
        auto end = std::chrono::high_resolution_clock::now();

        // Copy the result back to the host
        cudaMemcpy(C, d_C, M * K * sizeof(float), cudaMemcpyDeviceToHost);

        // Record the elapsed time (seconds)
        std::chrono::duration<float> duration = end - start;
        times.push_back(duration.count());

        // Free device memory
        cudaFree(d_A);
        cudaFree(d_B);
        cudaFree(d_C);

        // Free host memory
        delete[] A;
        delete[] B;
        delete[] C;
    }

    // Print the results
    std::cout << "CUDA Kernel1 matrix multiplication benchmark results" << std::endl;
    std::cout << "=================================" << std::endl;
    std::cout << std::setw(12) << "Matrix Size"
              << std::setw(15) << "Time(s)"
              << std::setw(15) << "Time(ms)"
              << std::setw(15) << "GFLOPS" << std::endl;
    std::cout << "---------------------------------" << std::endl;

    for (size_t i = 0; i < sizes.size(); ++i) {
        int size = sizes[i];
        double total_flops = 2.0 * size * size * size;   // floating-point operation count for the matrix multiply
        double gflops = total_flops / (times[i] * 1e9);  // convert to GFLOPS
        double time_ms = times[i] * 1000.0;              // convert to milliseconds

        std::cout << std::setw(8) << size << "x" << std::setw(3) << size
                  << std::setw(15) << std::fixed << std::setprecision(6) << times[i]
                  << std::setw(15) << std::fixed << std::setprecision(3) << time_ms
                  << std::setw(15) << std::fixed << std::setprecision(2) << gflops << std::endl;
    }
    std::cout << "=================================" << std::endl;

    return 0;
}
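
// Build/run sketch (the filename matmul_kernel1.cu is assumed; add -arch=sm_XX
// for your target GPU if needed):
//   nvcc -O3 -o matmul_kernel1 matmul_kernel1.cu
//   ./matmul_kernel1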