hpc-lab-code/lab4/MatrixMul_cpu.cu
2026-01-21 18:30:58 +08:00

110 lines
4.3 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <omp.h>

#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>
/**
 * @brief Computes the dense matrix product C = A * B on the CPU.
 *
 * All three matrices are stored row-major in flat arrays. The outer (row)
 * loop is annotated with an OpenMP `parallel for`; when the file is built
 * without OpenMP the pragma is ignored and the loop runs serially.
 *
 * @param A            Input matrix, M x N; element (i, k) is read from A[i * N + k].
 * @param B            Input matrix, N x K; element (k, j) is read from B[k * K + j].
 * @param C            Output matrix, M x K; every element is overwritten.
 * @param M            Number of rows of A and C.
 * @param N            Number of columns of A and rows of B.
 * @param K            Number of columns of B and C.
 * @param num_threads  Requested OpenMP team size. Values below 1 are treated
 *                     as 1, because the `num_threads` clause requires a
 *                     positive value.
 */
void matrixMultiplyCPU(const float* A, const float* B, float* C, int M, int N, int K, int num_threads) {
    // Clamp so that a zero or negative request cannot reach the OpenMP runtime.
    const int team_size = num_threads > 0 ? num_threads : 1;
    (void)team_size;  // Only referenced by the pragma; unused in non-OpenMP builds.

    // Flat offsets are computed in std::size_t: `i * N` evaluated in int
    // overflows (undefined behaviour) once a matrix exceeds 2^31 elements.
    const std::size_t inner_dim = static_cast<std::size_t>(N);
    const std::size_t out_cols = static_cast<std::size_t>(K);

#pragma omp parallel for num_threads(team_size)
    for (int i = 0; i < M; ++i) {
        const std::size_t a_row = static_cast<std::size_t>(i) * inner_dim;
        const std::size_t c_row = static_cast<std::size_t>(i) * out_cols;
        for (int j = 0; j < K; ++j) {
            float sum = 0.0f;
            for (int k = 0; k < N; ++k) {
                sum += A[a_row + static_cast<std::size_t>(k)] *
                       B[static_cast<std::size_t>(k) * out_cols + static_cast<std::size_t>(j)];
            }
            C[c_row + static_cast<std::size_t>(j)] = sum;
        }
    }
}
/**
 * @brief Benchmarks matrixMultiplyCPU and prints a result table to stdout.
 *
 * For each square matrix size in {256, 512, 1024, 2048} the function fills
 * A and B with pseudo-random values in [0, 1), times one single-threaded
 * multiplication as the baseline, then times one multiplication for each
 * thread count in {8, 64, 256}. Each table row reports the wall time in
 * milliseconds, the achieved GFLOPS (2*M*N*K floating-point operations) and
 * the speedup relative to the single-threaded baseline.
 *
 * Each configuration is measured exactly once, without a warm-up run.
 */
void runCPUTest() {
    // steady_clock is monotonic, so intervals cannot be distorted by system
    // clock adjustments (high_resolution_clock gives no such guarantee).
    using Clock = std::chrono::steady_clock;

    const std::vector<int> matrix_sizes = {256, 512, 1024, 2048};
    const std::vector<int> thread_counts = {8, 64, 256};

    std::cout << "CPU矩阵乘法性能测试 (OpenMP多线程)\n";
    std::cout << "=================================================================\n";
    std::cout << std::setw(12) << "Matrix"
              << std::setw(12) << "Threads"
              << std::setw(15) << "Time(ms)"
              << std::setw(15) << "FLOPS(G)"
              << std::setw(15) << "Speedup" << std::endl;
    std::cout << "-----------------------------------------------------------------\n";

    for (const int size : matrix_sizes) {
        const int M = size, N = size, K = size;

        // Vectors own the buffers, so they are released on every exit path.
        // Element counts are computed in std::size_t to avoid int overflow.
        std::vector<float> A(static_cast<std::size_t>(M) * static_cast<std::size_t>(N));
        std::vector<float> B(static_cast<std::size_t>(N) * static_cast<std::size_t>(K));
        std::vector<float> C(static_cast<std::size_t>(M) * static_cast<std::size_t>(K));

        // Initialise the inputs with values in [0.00, 0.99].
        for (float& value : A) value = (rand() % 100) / 100.0f;
        for (float& value : B) value = (rand() % 100) / 100.0f;

        // Times a single multiplication with the given thread count (ms).
        const auto time_ms = [&](int threads) {
            const auto start = Clock::now();
            matrixMultiplyCPU(A.data(), B.data(), C.data(), M, N, K, threads);
            const auto stop = Clock::now();
            return std::chrono::duration<double, std::milli>(stop - start).count();
        };

        // Single-threaded run first; it is the baseline for the speedup column.
        const double baseline_ms = time_ms(1);

        // The "NxN" label is built as one string so std::setw pads the whole
        // label and the columns stay aligned with the header row.
        const std::string label = std::to_string(size) + "x" + std::to_string(size);

        // Multi-threaded runs.
        for (const int threads : thread_counts) {
            const double duration = time_ms(threads);

            // GFLOPS: 2*M*N*K operations; duration is in ms, so ms * 1e6 == s * 1e9.
            const double total_flops = 2.0 * M * N * K;
            const double gflops = total_flops / (duration * 1e6);

            // Speedup relative to the single-threaded baseline.
            const double speedup = baseline_ms / duration;

            std::cout << std::setw(12) << label
                      << std::setw(12) << threads
                      << std::setw(15) << std::fixed << std::setprecision(3) << duration
                      << std::setw(15) << std::fixed << std::setprecision(2) << gflops
                      << std::setw(15) << std::fixed << std::setprecision(2) << speedup << std::endl;
        }

        std::cout << "-----------------------------------------------------------------\n";
    }
}
/**
 * @brief Prints the static skeleton of the CPU performance report to stdout.
 *
 * Only fixed headings and chart recommendations are written; no measured
 * data is rendered by this function.
 */
void plotData() {
    static const char* const kReportLines[] = {
        "\n\nASCII图表CPU性能分析\n",
        "=================================================================\n",
        "1. 不同线程数下的加速比趋势\n",
        " Matrix Threads=8 Threads=64 Threads=256\n",
        // Concrete plotting logic could be added here. Because the output is
        // plain text, a bar chart could be drawn with simple ASCII characters.
        "\n2. 不同矩阵规模下的性能趋势\n",
        " Threads 256x256 512x512 1024x1024 2048x2048\n",
        "\n注意完整图表建议使用Python (matplotlib) 生成。\n",
        "推荐生成以下图表:\n",
        "- 折线图:不同线程数下的加速比 vs 矩阵规模\n",
        "- 柱状图不同配置下的GFLOPS对比\n",
        "- 热力图:线程数 × 矩阵规模 的性能分布\n",
    };

    // Emit the lines in order, exactly as listed above.
    for (const char* line : kReportLines) {
        std::cout << line;
    }
}
/**
 * @brief Program entry point: runs the CPU benchmark, then prints the report skeleton.
 *
 * @return 0 (via the implicit return of main).
 */
int main() {
    runCPUTest();  // Timed matrix-multiplication table.
    plotData();    // Static chart headings and recommendations.
}