110 lines
4.3 KiB
Plaintext
110 lines
4.3 KiB
Plaintext
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

#include <omp.h>
|
||
|
||
/// Computes the dense matrix product C = A * B with an OpenMP-parallel loop
/// over the rows of C.
///
/// All three matrices are indexed as flat row-major arrays:
/// A is M x N, B is N x K and C is M x K.
///
/// @param A            Left operand; element (i, k) is read from A[i * N + k].
/// @param B            Right operand; element (k, j) is read from B[k * K + j].
/// @param C            Output; element (i, j) is written to C[i * K + j].
///                     Every element of C is overwritten.
/// @param M            Number of rows of A and C.
/// @param N            Number of columns of A and rows of B (summation length).
/// @param K            Number of columns of B and C.
/// @param num_threads  Requested OpenMP team size. Values <= 0 are treated as 1,
///                     because the num_threads clause needs a positive value.
void matrixMultiplyCPU(const float* A, const float* B, float* C, int M, int N, int K, int num_threads) {
    const int team_size = num_threads > 0 ? num_threads : 1;

    // Do all offset arithmetic in std::size_t: products such as i * N overflow
    // a 32-bit int (undefined behaviour) once the matrices get large enough.
    const std::size_t inner = N > 0 ? static_cast<std::size_t>(N) : 0;
    const std::size_t cols = K > 0 ? static_cast<std::size_t>(K) : 0;

    // The parallel loop index stays a signed int so the loop is in canonical
    // form even for older OpenMP implementations.
#pragma omp parallel for num_threads(team_size)
    for (int i = 0; i < M; ++i) {
        const std::size_t a_row = static_cast<std::size_t>(i) * inner;
        const std::size_t c_row = static_cast<std::size_t>(i) * cols;
        for (std::size_t j = 0; j < cols; ++j) {
            // Accumulate in a local so C is written exactly once per element.
            float sum = 0.0f;
            for (std::size_t k = 0; k < inner; ++k) {
                sum += A[a_row + k] * B[k * cols + j];
            }
            C[c_row + j] = sum;
        }
    }
}
|
||
|
||
void runCPUTest() {
|
||
std::vector<int> matrix_sizes = {256, 512, 1024, 2048};
|
||
std::vector<int> thread_counts = {8, 64, 256};
|
||
|
||
std::cout << "CPU矩阵乘法性能测试 (OpenMP多线程)\n";
|
||
std::cout << "=================================================================\n";
|
||
std::cout << std::setw(12) << "Matrix"
|
||
<< std::setw(12) << "Threads"
|
||
<< std::setw(15) << "Time(ms)"
|
||
<< std::setw(15) << "FLOPS(G)"
|
||
<< std::setw(15) << "Speedup" << std::endl;
|
||
std::cout << "-----------------------------------------------------------------\n";
|
||
|
||
// 存储基准性能(单线程)
|
||
std::vector<double> baseline_times(matrix_sizes.size());
|
||
|
||
for (size_t m = 0; m < matrix_sizes.size(); ++m) {
|
||
int size = matrix_sizes[m];
|
||
int M = size, N = size, K = size;
|
||
|
||
// 分配内存
|
||
float *A = new float[M * N];
|
||
float *B = new float[N * K];
|
||
float *C = new float[M * K];
|
||
|
||
// 初始化数据
|
||
for (int i = 0; i < M * N; ++i) A[i] = (rand() % 100) / 100.0f;
|
||
for (int i = 0; i < N * K; ++i) B[i] = (rand() % 100) / 100.0f;
|
||
|
||
// 首先测试单线程作为基准
|
||
auto start = std::chrono::high_resolution_clock::now();
|
||
matrixMultiplyCPU(A, B, C, M, N, K, 1);
|
||
auto end = std::chrono::high_resolution_clock::now();
|
||
auto single_duration = std::chrono::duration<float, std::milli>(end - start).count();
|
||
baseline_times[m] = single_duration;
|
||
|
||
// 测试多线程
|
||
for (int threads : thread_counts) {
|
||
start = std::chrono::high_resolution_clock::now();
|
||
matrixMultiplyCPU(A, B, C, M, N, K, threads);
|
||
end = std::chrono::high_resolution_clock::now();
|
||
auto duration = std::chrono::duration<float, std::milli>(end - start).count();
|
||
|
||
// 计算FLOPS
|
||
double total_flops = 2.0 * M * N * K;
|
||
double gflops = total_flops / (duration * 1e6);
|
||
|
||
// 计算加速比
|
||
double speedup = baseline_times[m] / duration;
|
||
|
||
std::cout << std::setw(12) << size << "x" << size
|
||
<< std::setw(12) << threads
|
||
<< std::setw(15) << std::fixed << std::setprecision(3) << duration
|
||
<< std::setw(15) << std::fixed << std::setprecision(2) << gflops
|
||
<< std::setw(15) << std::fixed << std::setprecision(2) << speedup << std::endl;
|
||
}
|
||
|
||
delete[] A;
|
||
delete[] B;
|
||
delete[] C;
|
||
|
||
std::cout << "-----------------------------------------------------------------\n";
|
||
}
|
||
}
|
||
|
||
/// Prints the fixed text skeleton of the "ASCII chart" section to stdout:
/// two table headings without data rows, followed by a note recommending
/// which charts to generate with Python (matplotlib).
void plotData() {
    static const char* const kReportLines[] = {
        "\n\nASCII图表:CPU性能分析\n",
        "=================================================================\n",
        "1. 不同线程数下的加速比趋势\n",
        "   Matrix       Threads=8    Threads=64   Threads=256\n",
        // Concrete plotting logic can be added here. Because the output is
        // plain text, a bar chart could be drawn with simple ASCII characters.
        "\n2. 不同矩阵规模下的性能趋势\n",
        "   Threads      256x256      512x512      1024x1024    2048x2048\n",
        "\n注意:完整图表建议使用Python (matplotlib) 生成。\n",
        "推荐生成以下图表:\n",
        "- 折线图:不同线程数下的加速比 vs 矩阵规模\n",
        "- 柱状图:不同配置下的GFLOPS对比\n",
        "- 热力图:线程数 × 矩阵规模 的性能分布\n",
    };

    for (const char* line : kReportLines) {
        std::cout << line;
    }
}
|
||
|
||
/// Program entry point: runs the CPU benchmark table first, then prints the
/// chart placeholder section.
int main() {
    runCPUTest();
    plotData();
    // Falling off the end of main is equivalent to `return 0;`.
}
|