// CPU matrix-multiplication benchmark using OpenMP.
//
// NOTE(review): the header names after `#include` and several template
// argument lists were missing from the reviewed source. The set below is
// what the code in this file actually uses -- confirm against the original.
#include <chrono>
#include <cstddef>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

#ifdef _OPENMP
#include <omp.h>
#endif

/// @brief Computes C = A * B for dense row-major single-precision matrices.
///
/// Rows of C are distributed over OpenMP threads when the file is compiled
/// with OpenMP enabled; otherwise the pragma is ignored and the loop runs
/// serially.
///
/// @param A            Input matrix, M x N, row-major.
/// @param B            Input matrix, N x K, row-major.
/// @param C            Output matrix, M x K, row-major; every element is overwritten.
/// @param M            Number of rows of A and C.
/// @param N            Number of columns of A / rows of B.
/// @param K            Number of columns of B and C.
/// @param num_threads  Requested OpenMP thread count; values below 1 are
///                     treated as 1 because the clause requires a positive value.
void matrixMultiplyCPU(const float* A, const float* B, float* C,
                       int M, int N, int K, int num_threads) {
    const int threads = num_threads > 0 ? num_threads : 1;
    (void)threads;  // silences "unused" when built without OpenMP

    #pragma omp parallel for num_threads(threads)
    for (int i = 0; i < M; ++i) {
        // Widen before multiplying so large matrices cannot overflow int.
        const std::size_t a_row = static_cast<std::size_t>(i) * static_cast<std::size_t>(N);
        const std::size_t c_row = static_cast<std::size_t>(i) * static_cast<std::size_t>(K);
        for (int j = 0; j < K; ++j) {
            float sum = 0.0f;
            for (int k = 0; k < N; ++k) {
                const std::size_t b_idx =
                    static_cast<std::size_t>(k) * static_cast<std::size_t>(K) +
                    static_cast<std::size_t>(j);
                sum += A[a_row + static_cast<std::size_t>(k)] * B[b_idx];
            }
            C[c_row + static_cast<std::size_t>(j)] = sum;
        }
    }
}

namespace {

/// Runs one multiplication and returns its wall-clock duration in milliseconds.
/// steady_clock is used because it is monotonic, which interval timing needs.
double timeMultiplyMs(const std::vector<float>& A, const std::vector<float>& B,
                      std::vector<float>& C, int M, int N, int K, int threads) {
    const auto start = std::chrono::steady_clock::now();
    matrixMultiplyCPU(A.data(), B.data(), C.data(), M, N, K, threads);
    const auto stop = std::chrono::steady_clock::now();
    return std::chrono::duration<double, std::milli>(stop - start).count();
}

}  // namespace

/// @brief Benchmarks matrixMultiplyCPU over several square matrix sizes and
///        thread counts and prints a table of time, GFLOPS and speedup.
///
/// For each size a single-threaded run is timed first and used as the
/// baseline for the speedup column.
void runCPUTest() {
    const std::vector<int> matrix_sizes = {256, 512, 1024, 2048};
    const std::vector<int> thread_counts = {8, 64, 256};

    std::cout << "CPU矩阵乘法性能测试 (OpenMP多线程)\n";
    std::cout << "=================================================================\n";
    std::cout << std::setw(12) << "Matrix"
              << std::setw(12) << "Threads"
              << std::setw(15) << "Time(ms)"
              << std::setw(15) << "FLOPS(G)"
              << std::setw(15) << "Speedup" << std::endl;
    std::cout << "-----------------------------------------------------------------\n";

    for (const int size : matrix_sizes) {
        const int M = size;
        const int N = size;
        const int K = size;
        const std::size_t dim = static_cast<std::size_t>(size);

        // Vectors own the buffers, so they are released on every exit path.
        std::vector<float> A(dim * dim);
        std::vector<float> B(dim * dim);
        std::vector<float> C(dim * dim);

        // Fill the inputs with pseudo-random values in [0, 1).
        for (float& value : A) value = (std::rand() % 100) / 100.0f;
        for (float& value : B) value = (std::rand() % 100) / 100.0f;

        // Single-threaded run: the baseline for the speedup column.
        const double baseline_ms = timeMultiplyMs(A, B, C, M, N, K, 1);

        // Build the label once so setw pads the whole "NxN" text and the
        // row lines up with the header.
        const std::string label = std::to_string(size) + "x" + std::to_string(size);

        // Multiply-add counted as two floating-point operations per inner step.
        const double total_flops = 2.0 * M * N * K;

        for (const int threads : thread_counts) {
            const double duration_ms = timeMultiplyMs(A, B, C, M, N, K, threads);

            // flops / (ms * 1e6) == GFLOP/s; report 0 if the timer saw no elapsed time.
            const double gflops =
                duration_ms > 0.0 ? total_flops / (duration_ms * 1e6) : 0.0;
            const double speedup =
                duration_ms > 0.0 ? baseline_ms / duration_ms : 0.0;

            // std::endl is kept here so each row is flushed as soon as it is measured.
            std::cout << std::setw(12) << label
                      << std::setw(12) << threads
                      << std::setw(15) << std::fixed << std::setprecision(3) << duration_ms
                      << std::setw(15) << std::fixed << std::setprecision(2) << gflops
                      << std::setw(15) << std::fixed << std::setprecision(2) << speedup
                      << std::endl;
        }

        std::cout << "-----------------------------------------------------------------\n";
    }
}

/// @brief Prints the headings of the ASCII "charts" section and a list of
///        recommended external plots. No measured data is printed here.
void plotData() {
    std::cout << "\n\nASCII图表：CPU性能分析\n";
    std::cout << "=================================================================\n";

    std::cout << "1. 不同线程数下的加速比趋势\n";
    std::cout << " Matrix Threads=8 Threads=64 Threads=256\n";

    // Concrete plotting logic can be added here.
    // Since the output is text, simple ASCII characters could draw bar charts.

    std::cout << "\n2. 不同矩阵规模下的性能趋势\n";
    std::cout << " Threads 256x256 512x512 1024x1024 2048x2048\n";

    std::cout << "\n注意：完整图表建议使用Python (matplotlib) 生成。\n";
    std::cout << "推荐生成以下图表：\n";
    std::cout << "- 折线图：不同线程数下的加速比 vs 矩阵规模\n";
    std::cout << "- 柱状图：不同配置下的GFLOPS对比\n";
    std::cout << "- 热力图：线程数 × 矩阵规模的性能分布\n";
}

/// Runs the benchmark, then prints the chart headings.
int main() {
    runCPUTest();
    plotData();
    return 0;
}