hpc-lab-code/submit/lab4/MatrixMul_kernel1.cu
2026-01-21 18:02:30 +08:00

83 lines
3.2 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <cuda_runtime.h>

#include <chrono>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <vector>
// Simplified CUDA matrix-multiplication kernel (direct multiply-accumulate).
//
// Computes C = A * B with one thread per output element. As fixed by the
// indexing below, A is read as an M x N row-major matrix, B as N x K
// row-major, and C is written as M x K row-major.
//
// Launch layout: a 2D grid of 2D blocks in which the x dimension covers the
// columns of C (K) and the y dimension covers the rows of C (M). Threads that
// fall outside the M x K range do nothing, so the grid may overshoot the
// matrix. No shared memory is used.
//
// Parameters:
//   A, B  - input matrices; must be dereferenceable from device code.
//   C     - output matrix; must be writable from device code.
//   M     - number of rows of A and C.
//   N     - number of columns of A / rows of B (the reduction dimension).
//   K     - number of columns of B and C.
__global__ void matMultCUDAKernel1(const float* A, const float* B, float* C, int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < M && col < K) {
        // Flat offsets are computed in size_t: the products row * N, i * K and
        // row * K overflow a 32-bit int once a matrix holds more than
        // 2^31 - 1 elements.
        const size_t strideB = static_cast<size_t>(K);
        const size_t rowOffsetA = static_cast<size_t>(row) * static_cast<size_t>(N);
        const size_t rowOffsetC = static_cast<size_t>(row) * strideB;

        float sum = 0.0f;
        for (int i = 0; i < N; ++i) {
            sum += A[rowOffsetA + i] * B[static_cast<size_t>(i) * strideB + col];
        }
        C[rowOffsetC + col] = sum;
    }
}
// Benchmark driver for matMultCUDAKernel1.
//
// For each square matrix size in a fixed list, fills A and B with
// pseudo-random integers in [0, 9], copies them to the device, runs the kernel
// once untimed and once timed, and records the wall-clock time of the timed
// run. Afterwards prints a table of time (s), time (ms) and achieved GFLOPS,
// where the flop count is taken as 2 * size^3.
//
// Returns 0 on success. If any CUDA runtime call or kernel launch fails, the
// CUDA error string is written to stderr and the process exits with
// EXIT_FAILURE instead of printing timings derived from a failed run.
int main() {
    // Turns a failing CUDA status into an immediate, readable abort.
    auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            std::cerr << "CUDA error during " << what << ": "
                      << cudaGetErrorString(status) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    const std::vector<int> sizes = {512, 1024, 2048, 4096};
    std::vector<float> times;
    times.reserve(sizes.size());

    for (int size : sizes) {
        const int M = size;
        const int N = size;
        const int K = size;

        // Element and byte counts are computed in size_t so the products
        // cannot overflow int before being widened.
        const std::size_t elemsA = static_cast<std::size_t>(M) * N;
        const std::size_t elemsB = static_cast<std::size_t>(N) * K;
        const std::size_t elemsC = static_cast<std::size_t>(M) * K;
        const std::size_t bytesA = elemsA * sizeof(float);
        const std::size_t bytesB = elemsB * sizeof(float);
        const std::size_t bytesC = elemsC * sizeof(float);

        // Host buffers are owned by vectors so they are released automatically.
        std::vector<float> A(elemsA);
        std::vector<float> B(elemsB);
        std::vector<float> C(elemsC);
        for (float& value : A) value = static_cast<float>(std::rand() % 10);
        for (float& value : B) value = static_cast<float>(std::rand() % 10);

        float* d_A = nullptr;
        float* d_B = nullptr;
        float* d_C = nullptr;
        check(cudaMalloc(&d_A, bytesA), "cudaMalloc(d_A)");
        check(cudaMalloc(&d_B, bytesB), "cudaMalloc(d_B)");
        check(cudaMalloc(&d_C, bytesC), "cudaMalloc(d_C)");
        check(cudaMemcpy(d_A, A.data(), bytesA, cudaMemcpyHostToDevice), "cudaMemcpy(A -> d_A)");
        check(cudaMemcpy(d_B, B.data(), bytesB, cudaMemcpyHostToDevice), "cudaMemcpy(B -> d_B)");

        // x spans the columns of C (K), y spans the rows of C (M); ceil-div so
        // sizes that are not a multiple of the block edge are still covered.
        const dim3 blockSize(16, 16);
        const dim3 gridSize((K + blockSize.x - 1) / blockSize.x,
                            (M + blockSize.y - 1) / blockSize.y);

        // Untimed first launch so one-time startup cost is kept out of the measurement.
        matMultCUDAKernel1<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N, K);
        check(cudaGetLastError(), "warm-up kernel launch");
        check(cudaDeviceSynchronize(), "warm-up kernel execution");

        // Timed launch. The synchronize is inside the timed region because the
        // launch itself returns before the kernel finishes; its status is
        // inspected only after the clock is stopped.
        const auto start = std::chrono::high_resolution_clock::now();
        matMultCUDAKernel1<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N, K);
        const cudaError_t syncStatus = cudaDeviceSynchronize();
        const auto end = std::chrono::high_resolution_clock::now();
        check(cudaGetLastError(), "timed kernel launch");
        check(syncStatus, "timed kernel execution");

        // The result is copied back but not inspected further by this driver.
        check(cudaMemcpy(C.data(), d_C, bytesC, cudaMemcpyDeviceToHost), "cudaMemcpy(d_C -> C)");

        const std::chrono::duration<float> duration = end - start;
        times.push_back(duration.count());

        check(cudaFree(d_A), "cudaFree(d_A)");
        check(cudaFree(d_B), "cudaFree(d_B)");
        check(cudaFree(d_C), "cudaFree(d_C)");
    }

    std::cout << "CUDA Kernel1 矩阵乘法性能测试结果" << std::endl;
    std::cout << "=================================" << std::endl;
    std::cout << std::setw(12) << "Matrix Size"
              << std::setw(15) << "Time(s)"
              << std::setw(15) << "Time(ms)"
              << std::setw(15) << "GFLOPS" << std::endl;
    std::cout << "---------------------------------" << std::endl;

    for (std::size_t i = 0; i < sizes.size(); ++i) {
        const int size = sizes[i];
        // One multiply and one add per inner-loop step: 2 * size^3 flops.
        const double total_flops = 2.0 * size * size * size;
        const double gflops = total_flops / (times[i] * 1e9);
        const double time_ms = times[i] * 1000.0;
        std::cout << std::setw(8) << size << "x" << std::setw(3) << size
                  << std::setw(15) << std::fixed << std::setprecision(6) << times[i]
                  << std::setw(15) << std::fixed << std::setprecision(3) << time_ms
                  << std::setw(15) << std::fixed << std::setprecision(2) << gflops << std::endl;
    }
    std::cout << "=================================" << std::endl;
    return 0;
}