#!/usr/bin/env python3 """ 矩阵乘法性能实验数据分析脚本 分析CPU、CUDA Kernel1、CUDA Kernel2的性能对比 以及不同BLOCK_SIZE对性能的影响 """ import matplotlib.pyplot as plt import numpy as np import matplotlib from matplotlib import rcParams # 设置中文字体支持 matplotlib.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans'] matplotlib.rcParams['axes.unicode_minus'] = False # 实验一数据 # CPU (OpenMP) 不同线程数的数据 cpu_data = { '256': { 8: {'time': 86.012, 'flops': 0.39, 'speedup': 1.14}, 64: {'time': 78.420, 'flops': 0.43, 'speedup': 1.25}, 256: {'time': 76.496, 'flops': 0.44, 'speedup': 1.28} }, '512': { 8: {'time': 747.483, 'flops': 0.36, 'speedup': 1.00}, 64: {'time': 743.606, 'flops': 0.36, 'speedup': 1.01}, 256: {'time': 748.649, 'flops': 0.36, 'speedup': 1.00} }, '1024': { 8: {'time': 6033.205, 'flops': 0.36, 'speedup': 1.00}, 64: {'time': 6049.318, 'flops': 0.35, 'speedup': 1.00}, 256: {'time': 6051.757, 'flops': 0.35, 'speedup': 1.00} }, '2048': { 8: {'time': 51065.609, 'flops': 0.34, 'speedup': 1.00}, 64: {'time': 50995.406, 'flops': 0.34, 'speedup': 1.00}, 256: {'time': 51083.363, 'flops': 0.34, 'speedup': 1.00} } } # CUDA Kernel1 数据 cuda_kernel1_data = { '512': {'time': 0.316, 'flops': 849.49}, '1024': {'time': 2.374, 'flops': 904.75}, '2048': {'time': 19.190, 'flops': 895.23}, '4096': {'time': 152.897, 'flops': 898.90} } # CUDA Kernel2 数据 (TILE_WIDTH=4) cuda_kernel2_data = { '512': {'time': 0.827, 'flops': 324.65}, '1024': {'time': 6.484, 'flops': 331.22}, '2048': {'time': 53.599, 'flops': 320.52}, '4096': {'time': 433.242, 'flops': 317.23} } # 实验二数据:不同BLOCK_SIZE的影响 blocksize_data = { '256': { 4: {'time': 0.116, 'flops': 289.26}, 8: {'time': 0.040, 'flops': 838.19}, 16: {'time': 0.029, 'flops': 1170.29}, 32: {'time': 0.026, 'flops': 1292.94} }, '512': { 4: {'time': 0.831, 'flops': 323.04}, 8: {'time': 0.265, 'flops': 1014.10}, 16: {'time': 0.189, 'flops': 1423.49}, 32: {'time': 0.178, 'flops': 1506.57} }, '1024': { 4: {'time': 6.539, 'flops': 328.40}, 8: {'time': 2.022, 'flops': 1061.88}, 16: {'time': 1.397, 'flops': 1536.94}, 32: {'time': 1.364, 'flops': 1574.44} }, '2048': { 4: {'time': 54.023, 'flops': 318.01}, 8: {'time': 16.080, 'flops': 1068.38}, 16: {'time': 11.454, 'flops': 1499.84}, 32: {'time': 11.019, 'flops': 1559.16} } } def print_experiment1_table(): """打印实验一的数据表格""" print("=" * 100) print("实验一:CPU、CUDA Kernel1、CUDA Kernel2 性能对比") print("=" * 100) matrix_sizes = ['512', '1024', '2048', '4096'] thread_counts = [8, 64, 256] for size in matrix_sizes: print(f"\n矩阵规模: {size}x{size}") print("-" * 100) print(f"{'实现方式':<20} {'线程数':<10} {'时间(ms)':<15} {'GFLOPS':<15} {'加速比':<15}") print("-" * 100) # CPU数据 if size in cpu_data: for threads in thread_counts: data = cpu_data[size][threads] print(f"{'CPU (OpenMP)':<20} {threads:<10} {data['time']:<15.3f} {data['flops']:<15.2f} {data['speedup']:<15.2f}") # CUDA Kernel1数据 if size in cuda_kernel1_data: data = cuda_kernel1_data[size] # 计算相对于CPU(8线程)的加速比 cpu_time = cpu_data[size][8]['time'] if size in cpu_data else data['time'] speedup = cpu_time / data['time'] print(f"{'CUDA Kernel1':<20} {'-':<10} {data['time']:<15.3f} {data['flops']:<15.2f} {speedup:<15.2f}") # CUDA Kernel2数据 if size in cuda_kernel2_data: data = cuda_kernel2_data[size] cpu_time = cpu_data[size][8]['time'] if size in cpu_data else data['time'] speedup = cpu_time / data['time'] print(f"{'CUDA Kernel2':<20} {'-':<10} {data['time']:<15.3f} {data['flops']:<15.2f} {speedup:<15.2f}") print("\n" + "=" * 100) def print_experiment2_table(): """打印实验二的数据表格""" print("\n" + "=" * 100) print("实验二:不同BLOCK_SIZE对CUDA程序性能的影响") print("=" * 100) matrix_sizes = ['256', '512', '1024', '2048'] block_sizes = [4, 8, 16, 32] for size in matrix_sizes: print(f"\n矩阵规模: {size}x{size}") print("-" * 80) print(f"{'BLOCK_SIZE':<15} {'时间(ms)':<20} {'GFLOPS':<20} {'相对4x4加速比':<20}") print("-" * 80) baseline_time = blocksize_data[size][4]['time'] for bs in block_sizes: data = blocksize_data[size][bs] speedup = baseline_time / data['time'] print(f"{bs}x{bs:<10} {data['time']:<20.3f} {data['flops']:<20.2f} {speedup:<20.2f}") print("\n" + "=" * 100) def plot_experiment1(): """绘制实验一的图表""" matrix_sizes = ['512', '1024', '2048', '4096'] size_numeric = [int(s) for s in matrix_sizes] # 准备数据 cpu_8_threads = [cpu_data[s][8]['time'] if s in cpu_data else 0 for s in matrix_sizes] cpu_64_threads = [cpu_data[s][64]['time'] if s in cpu_data else 0 for s in matrix_sizes] cpu_256_threads = [cpu_data[s][256]['time'] if s in cpu_data else 0 for s in matrix_sizes] kernel1_times = [cuda_kernel1_data[s]['time'] for s in matrix_sizes] kernel2_times = [cuda_kernel2_data[s]['time'] for s in matrix_sizes] # 创建图表 fig, axes = plt.subplots(2, 2, figsize=(15, 12)) # 图1:执行时间对比(对数坐标) ax1 = axes[0, 0] x = np.arange(len(matrix_sizes)) width = 0.15 ax1.bar(x - 1.5*width, cpu_8_threads, width, label='CPU (8 threads)', color='#1f77b4') ax1.bar(x - 0.5*width, cpu_64_threads, width, label='CPU (64 threads)', color='#ff7f0e') ax1.bar(x + 0.5*width, cpu_256_threads, width, label='CPU (256 threads)', color='#2ca02c') ax1.bar(x + 1.5*width, kernel1_times, width, label='CUDA Kernel1', color='#d62728') ax1.set_xlabel('Matrix Size') ax1.set_ylabel('Time (ms)') ax1.set_title('Execution Time Comparison (Log Scale)') ax1.set_xticks(x) ax1.set_xticklabels([f'{s}x{s}' for s in matrix_sizes]) ax1.set_yscale('log') ax1.legend() ax1.grid(True, alpha=0.3) # 图2:GFLOPS对比 ax2 = axes[0, 1] cpu_8_flops = [cpu_data[s][8]['flops'] if s in cpu_data else 0 for s in matrix_sizes] cpu_64_flops = [cpu_data[s][64]['flops'] if s in cpu_data else 0 for s in matrix_sizes] cpu_256_flops = [cpu_data[s][256]['flops'] if s in cpu_data else 0 for s in matrix_sizes] kernel1_flops = [cuda_kernel1_data[s]['flops'] for s in matrix_sizes] kernel2_flops = [cuda_kernel2_data[s]['flops'] for s in matrix_sizes] ax2.bar(x - 2*width, cpu_8_flops, width, label='CPU (8 threads)', color='#1f77b4') ax2.bar(x - width, cpu_64_flops, width, label='CPU (64 threads)', color='#ff7f0e') ax2.bar(x, cpu_256_flops, width, label='CPU (256 threads)', color='#2ca02c') ax2.bar(x + width, kernel1_flops, width, label='CUDA Kernel1', color='#d62728') ax2.bar(x + 2*width, kernel2_flops, width, label='CUDA Kernel2', color='#9467bd') ax2.set_xlabel('Matrix Size') ax2.set_ylabel('GFLOPS') ax2.set_title('Performance Comparison (GFLOPS)') ax2.set_xticks(x) ax2.set_xticklabels([f'{s}x{s}' for s in matrix_sizes]) ax2.legend() ax2.grid(True, alpha=0.3) # 图3:加速比(相对于CPU 8线程) ax3 = axes[1, 0] kernel1_speedup = [cpu_data[s][8]['time'] / cuda_kernel1_data[s]['time'] if s in cpu_data else 0 for s in matrix_sizes] kernel2_speedup = [cpu_data[s][8]['time'] / cuda_kernel2_data[s]['time'] if s in cpu_data else 0 for s in matrix_sizes] ax3.plot(size_numeric, kernel1_speedup, marker='o', linewidth=2, label='CUDA Kernel1 vs CPU', color='#d62728') ax3.plot(size_numeric, kernel2_speedup, marker='s', linewidth=2, label='CUDA Kernel2 vs CPU', color='#9467bd') ax3.set_xlabel('Matrix Size') ax3.set_ylabel('Speedup') ax3.set_title('Speedup over CPU (8 threads)') ax3.legend() ax3.grid(True, alpha=0.3) # 图4:CUDA Kernel1 vs Kernel2 性能对比 ax4 = axes[1, 1] kernel_kernel_speedup = [cuda_kernel2_data[s]['time'] / cuda_kernel1_data[s]['time'] for s in matrix_sizes] ax4.bar(size_numeric, kernel_kernel_speedup, color='#e377c2', alpha=0.7) ax4.axhline(y=1, color='gray', linestyle='--', linewidth=2) ax4.set_xlabel('Matrix Size') ax4.set_ylabel('Speedup (Kernel2/Kernel1)') ax4.set_title('Kernel2 vs Kernel1 Performance Ratio') ax4.grid(True, alpha=0.3) plt.tight_layout() plt.savefig('/home/yly/dev/hpc-lab-code/lab4/experiment_data/experiment1_analysis.png', dpi=300, bbox_inches='tight') print("\n图表已保存至: experiment_data/experiment1_analysis.png") def plot_experiment2(): """绘制实验二的图表""" matrix_sizes = ['256', '512', '1024', '2048'] block_sizes = [4, 8, 16, 32] fig, axes = plt.subplots(2, 2, figsize=(15, 12)) colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'] markers = ['o', 's', '^', 'd'] # 图1:不同矩阵规模下,BLOCK_SIZE对执行时间的影响 ax1 = axes[0, 0] for i, size in enumerate(matrix_sizes): times = [blocksize_data[size][bs]['time'] for bs in block_sizes] ax1.plot(block_sizes, times, marker=markers[i], linewidth=2, label=f'{size}x{size}', color=colors[i]) ax1.set_xlabel('BLOCK_SIZE') ax1.set_ylabel('Time (ms)') ax1.set_title('Execution Time vs BLOCK_SIZE') ax1.legend() ax1.grid(True, alpha=0.3) # 图2:不同矩阵规模下,BLOCK_SIZE对GFLOPS的影响 ax2 = axes[0, 1] for i, size in enumerate(matrix_sizes): flops = [blocksize_data[size][bs]['flops'] for bs in block_sizes] ax2.plot(block_sizes, flops, marker=markers[i], linewidth=2, label=f'{size}x{size}', color=colors[i]) ax2.set_xlabel('BLOCK_SIZE') ax2.set_ylabel('GFLOPS') ax2.set_title('Performance vs BLOCK_SIZE') ax2.legend() ax2.grid(True, alpha=0.3) # 图3:相对于4x4的加速比 ax3 = axes[1, 0] for i, size in enumerate(matrix_sizes): baseline = blocksize_data[size][4]['time'] speedups = [baseline / blocksize_data[size][bs]['time'] for bs in block_sizes] ax3.plot(block_sizes, speedups, marker=markers[i], linewidth=2, label=f'{size}x{size}', color=colors[i]) ax3.set_xlabel('BLOCK_SIZE') ax3.set_ylabel('Speedup over 4x4') ax3.set_title('Performance Improvement Relative to 4x4') ax3.legend() ax3.grid(True, alpha=0.3) # 图4:性能提升趋势(从4x4到32x32) ax4 = axes[1, 1] size_numeric = [int(s) for s in matrix_sizes] speedup_4_to_32 = [blocksize_data[s][4]['time'] / blocksize_data[s][32]['time'] for s in matrix_sizes] ax4.bar(size_numeric, speedup_4_to_32, color='#9467bd', alpha=0.7) ax4.set_xlabel('Matrix Size') ax4.set_ylabel('Speedup (32x32 / 4x4)') ax4.set_title('Performance Gain: 32x32 vs 4x4') ax4.grid(True, alpha=0.3, axis='y') plt.tight_layout() plt.savefig('/home/yly/dev/hpc-lab-code/lab4/experiment_data/experiment2_analysis.png', dpi=300, bbox_inches='tight') print("图表已保存至: experiment_data/experiment2_analysis.png") def analyze_results(): """分析实验结果""" print("\n" + "=" * 100) print("实验结果分析") print("=" * 100) print("\n【实验一分析】") print("-" * 100) print("\n1. CPU性能分析:") print(" - 在小矩阵规模(256x256)下,增加线程数能带来一定性能提升(最高1.28倍加速比)") print(" - 在中大矩阵规模(512x512及以上)下,增加线程数几乎无性能提升") print(" - 原因:小矩阵数据可以放入CPU缓存,多线程扩展性好;大矩阵受内存带宽限制") print(" - CPU性能始终在0.34-0.44 GFLOPS之间,远低于GPU") print("\n2. CUDA Kernel1性能分析:") print(" - 性能稳定在850-905 GFLOPS之间,不随矩阵规模明显变化") print(" - 相比CPU(8线程)实现了约2000-3000倍的加速比") print(" - 优势:简单的线程映射,良好的内存合并访问") print(" - 劣势:每个线程需要重复访问全局内存,没有数据重用") print("\n3. CUDA Kernel2性能分析:") print(" - 性能稳定在317-331 GFLOPS之间") print(" - 相比Kernel1性能下降了约2.7-2.8倍") print(" - 原因分析:") print(" a) TILE_WIDTH=4太小,共享内存开销大于收益") print(" b) 频繁的__syncthreads()同步开销") print(" c) 小tile导致数据重用率低") print(" - 教训:共享内存优化需要合理的tile size,并非所有情况下都有效") print("\n4. 总体结论:") print(" - GPU相比CPU有巨大的性能优势(2000-3000倍)") print(" - 简单的Kernel1反而优于设计不当的Kernel2") print(" - 优化需要考虑硬件特性,盲目优化可能适得其反") print("\n" + "-" * 100) print("\n【实验二分析】") print("-" * 100) print("\n1. BLOCK_SIZE对性能的影响规律:") print(" - 4x4: 性能最差(289-328 GFLOPS)") print(" - 8x8: 性能提升3倍左右(838-1068 GFLOPS)") print(" - 16x16: 性能进一步提升到1423-1537 GFLOPS") print(" - 32x32: 性能最优,达到1506-1574 GFLOPS") print("\n2. 性能提升原因分析:") print(" a) 共享内存利用率提升:") print(" - 更大的tile意味着更多的数据重用") print(" - 减少了全局内存访问次数") print(" b) 线程级并行提升:") print(" - 更大的block包含更多线程,更好的隐藏延迟") print(" c) 计算与内存访问重叠:") print(" - 大tile使得计算时间与内存访问时间更平衡") print("\n3. 性能饱和现象:") print(" - 从16x16到32x32,性能提升幅度减小") print(" - 原因:") print(" a) 共享内存容量限制(每个SM的共享内存有限)") print(" b) 寄存器压力增加") print(" c) 线程块调度效率下降") print("\n4. 最优BLOCK_SIZE选择:") print(" - 对于当前GPU架构,32x32是最优选择") print(" - 不同GPU架构可能有不同的最优值") print(" - 需要根据具体硬件和问题规模进行调优") print("\n5. 与Kernel1对比:") print(" - Kernel1(无共享内存): ~900 GFLOPS") print(" - Kernel2(32x32共享内存): ~1574 GFLOPS") print(" - 正确的共享内存优化可以带来约1.7倍性能提升") print("\n" + "=" * 100) if __name__ == "__main__": print("\n开始分析实验数据...\n") # 打印数据表格 print_experiment1_table() print_experiment2_table() # 绘制图表 print("\n正在生成图表...") plot_experiment1() plot_experiment2() # 分析结果 analyze_results() print("\n分析完成!")