hpc-lab-code/lab4/analyze_results.py
#!/usr/bin/env python3
"""
矩阵乘法性能实验数据分析脚本
分析CPU、CUDA Kernel1、CUDA Kernel2的性能对比
以及不同BLOCK_SIZE对性能的影响
"""
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
from matplotlib import rcParams
# 设置中文字体支持
matplotlib.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
matplotlib.rcParams['axes.unicode_minus'] = False
# Experiment 1 data
# CPU (OpenMP) results for different thread counts
cpu_data = {
    '256': {
        8: {'time': 86.012, 'flops': 0.39, 'speedup': 1.14},
        64: {'time': 78.420, 'flops': 0.43, 'speedup': 1.25},
        256: {'time': 76.496, 'flops': 0.44, 'speedup': 1.28}
    },
    '512': {
        8: {'time': 747.483, 'flops': 0.36, 'speedup': 1.00},
        64: {'time': 743.606, 'flops': 0.36, 'speedup': 1.01},
        256: {'time': 748.649, 'flops': 0.36, 'speedup': 1.00}
    },
    '1024': {
        8: {'time': 6033.205, 'flops': 0.36, 'speedup': 1.00},
        64: {'time': 6049.318, 'flops': 0.35, 'speedup': 1.00},
        256: {'time': 6051.757, 'flops': 0.35, 'speedup': 1.00}
    },
    '2048': {
        8: {'time': 51065.609, 'flops': 0.34, 'speedup': 1.00},
        64: {'time': 50995.406, 'flops': 0.34, 'speedup': 1.00},
        256: {'time': 51083.363, 'flops': 0.34, 'speedup': 1.00}
    }
}
# CUDA Kernel1 results
cuda_kernel1_data = {
    '512': {'time': 0.316, 'flops': 849.49},
    '1024': {'time': 2.374, 'flops': 904.75},
    '2048': {'time': 19.190, 'flops': 895.23},
    '4096': {'time': 152.897, 'flops': 898.90}
}
# CUDA Kernel2 results (TILE_WIDTH=4)
cuda_kernel2_data = {
    '512': {'time': 0.827, 'flops': 324.65},
    '1024': {'time': 6.484, 'flops': 331.22},
    '2048': {'time': 53.599, 'flops': 320.52},
    '4096': {'time': 433.242, 'flops': 317.23}
}
# Experiment 2 data: impact of different BLOCK_SIZE values
blocksize_data = {
    '256': {
        4: {'time': 0.116, 'flops': 289.26},
        8: {'time': 0.040, 'flops': 838.19},
        16: {'time': 0.029, 'flops': 1170.29},
        32: {'time': 0.026, 'flops': 1292.94}
    },
    '512': {
        4: {'time': 0.831, 'flops': 323.04},
        8: {'time': 0.265, 'flops': 1014.10},
        16: {'time': 0.189, 'flops': 1423.49},
        32: {'time': 0.178, 'flops': 1506.57}
    },
    '1024': {
        4: {'time': 6.539, 'flops': 328.40},
        8: {'time': 2.022, 'flops': 1061.88},
        16: {'time': 1.397, 'flops': 1536.94},
        32: {'time': 1.364, 'flops': 1574.44}
    },
    '2048': {
        4: {'time': 54.023, 'flops': 318.01},
        8: {'time': 16.080, 'flops': 1068.38},
        16: {'time': 11.454, 'flops': 1499.84},
        32: {'time': 11.019, 'flops': 1559.16}
    }
}
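
# Sanity-check helper (not part of the original measurements): the GFLOPS
# figures above should follow from the standard 2*N^3 FLOP count for an
# N x N matrix multiply. A minimal sketch, assuming the 'time' entries are
# in milliseconds, so any tabulated value can be cross-checked.
def expected_gflops(n, time_ms):
    """Return GFLOPS for an n x n matmul that took time_ms milliseconds."""
    return (2 * n**3) / (time_ms * 1e-3) / 1e9

# Example: expected_gflops(1024, 2.374) is about 904.6, which matches the
# 904.75 recorded in cuda_kernel1_data['1024'].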
def print_experiment1_table():
    """Print the data table for experiment 1."""
    print("=" * 100)
    print("Experiment 1: performance comparison of CPU, CUDA Kernel1, and CUDA Kernel2")
    print("=" * 100)
    matrix_sizes = ['512', '1024', '2048', '4096']
    thread_counts = [8, 64, 256]
    for size in matrix_sizes:
        print(f"\nMatrix size: {size}x{size}")
        print("-" * 100)
        print(f"{'Implementation':<20} {'Threads':<10} {'Time (ms)':<15} {'GFLOPS':<15} {'Speedup':<15}")
        print("-" * 100)
        # CPU rows (no CPU measurements exist for 4096x4096)
        if size in cpu_data:
            for threads in thread_counts:
                data = cpu_data[size][threads]
                print(f"{'CPU (OpenMP)':<20} {threads:<10} {data['time']:<15.3f} {data['flops']:<15.2f} {data['speedup']:<15.2f}")
        # CUDA Kernel1 row; speedup is relative to the CPU 8-thread baseline,
        # shown as '-' when no CPU baseline exists for this size
        if size in cuda_kernel1_data:
            data = cuda_kernel1_data[size]
            speedup = f"{cpu_data[size][8]['time'] / data['time']:.2f}" if size in cpu_data else '-'
            print(f"{'CUDA Kernel1':<20} {'-':<10} {data['time']:<15.3f} {data['flops']:<15.2f} {speedup:<15}")
        # CUDA Kernel2 row
        if size in cuda_kernel2_data:
            data = cuda_kernel2_data[size]
            speedup = f"{cpu_data[size][8]['time'] / data['time']:.2f}" if size in cpu_data else '-'
            print(f"{'CUDA Kernel2':<20} {'-':<10} {data['time']:<15.3f} {data['flops']:<15.2f} {speedup:<15}")
    print("\n" + "=" * 100)
def print_experiment2_table():
    """Print the data table for experiment 2."""
    print("\n" + "=" * 100)
    print("Experiment 2: impact of BLOCK_SIZE on CUDA kernel performance")
    print("=" * 100)
    matrix_sizes = ['256', '512', '1024', '2048']
    block_sizes = [4, 8, 16, 32]
    for size in matrix_sizes:
        print(f"\nMatrix size: {size}x{size}")
        print("-" * 80)
        print(f"{'BLOCK_SIZE':<15} {'Time (ms)':<20} {'GFLOPS':<20} {'Speedup vs 4x4':<20}")
        print("-" * 80)
        baseline_time = blocksize_data[size][4]['time']
        for bs in block_sizes:
            data = blocksize_data[size][bs]
            speedup = baseline_time / data['time']
            label = f'{bs}x{bs}'
            print(f"{label:<15} {data['time']:<20.3f} {data['flops']:<20.2f} {speedup:<20.2f}")
    print("\n" + "=" * 100)
def plot_experiment1():
    """Plot the charts for experiment 1."""
    matrix_sizes = ['512', '1024', '2048', '4096']
    size_numeric = [int(s) for s in matrix_sizes]
    # Gather the series; NaN marks sizes without CPU measurements (4096) so
    # they are skipped in the plots instead of being drawn as height-0 bars
    nan = float('nan')
    cpu_8_threads = [cpu_data[s][8]['time'] if s in cpu_data else nan for s in matrix_sizes]
    cpu_64_threads = [cpu_data[s][64]['time'] if s in cpu_data else nan for s in matrix_sizes]
    cpu_256_threads = [cpu_data[s][256]['time'] if s in cpu_data else nan for s in matrix_sizes]
    kernel1_times = [cuda_kernel1_data[s]['time'] for s in matrix_sizes]
    kernel2_times = [cuda_kernel2_data[s]['time'] for s in matrix_sizes]
    # Create the figure
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    # Plot 1: execution time comparison (log scale)
    ax1 = axes[0, 0]
    x = np.arange(len(matrix_sizes))
    width = 0.15
    ax1.bar(x - 2*width, cpu_8_threads, width, label='CPU (8 threads)', color='#1f77b4')
    ax1.bar(x - width, cpu_64_threads, width, label='CPU (64 threads)', color='#ff7f0e')
    ax1.bar(x, cpu_256_threads, width, label='CPU (256 threads)', color='#2ca02c')
    ax1.bar(x + width, kernel1_times, width, label='CUDA Kernel1', color='#d62728')
    ax1.bar(x + 2*width, kernel2_times, width, label='CUDA Kernel2', color='#9467bd')
    ax1.set_xlabel('Matrix Size')
    ax1.set_ylabel('Time (ms)')
    ax1.set_title('Execution Time Comparison (Log Scale)')
    ax1.set_xticks(x)
    ax1.set_xticklabels([f'{s}x{s}' for s in matrix_sizes])
    ax1.set_yscale('log')
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    # Plot 2: GFLOPS comparison
    ax2 = axes[0, 1]
    cpu_8_flops = [cpu_data[s][8]['flops'] if s in cpu_data else nan for s in matrix_sizes]
    cpu_64_flops = [cpu_data[s][64]['flops'] if s in cpu_data else nan for s in matrix_sizes]
    cpu_256_flops = [cpu_data[s][256]['flops'] if s in cpu_data else nan for s in matrix_sizes]
    kernel1_flops = [cuda_kernel1_data[s]['flops'] for s in matrix_sizes]
    kernel2_flops = [cuda_kernel2_data[s]['flops'] for s in matrix_sizes]
    ax2.bar(x - 2*width, cpu_8_flops, width, label='CPU (8 threads)', color='#1f77b4')
    ax2.bar(x - width, cpu_64_flops, width, label='CPU (64 threads)', color='#ff7f0e')
    ax2.bar(x, cpu_256_flops, width, label='CPU (256 threads)', color='#2ca02c')
    ax2.bar(x + width, kernel1_flops, width, label='CUDA Kernel1', color='#d62728')
    ax2.bar(x + 2*width, kernel2_flops, width, label='CUDA Kernel2', color='#9467bd')
    ax2.set_xlabel('Matrix Size')
    ax2.set_ylabel('GFLOPS')
    ax2.set_title('Performance Comparison (GFLOPS)')
    ax2.set_xticks(x)
    ax2.set_xticklabels([f'{s}x{s}' for s in matrix_sizes])
    ax2.legend()
    ax2.grid(True, alpha=0.3)
    # Plot 3: speedup relative to CPU with 8 threads
    ax3 = axes[1, 0]
    kernel1_speedup = [cpu_data[s][8]['time'] / cuda_kernel1_data[s]['time'] if s in cpu_data else nan
                       for s in matrix_sizes]
    kernel2_speedup = [cpu_data[s][8]['time'] / cuda_kernel2_data[s]['time'] if s in cpu_data else nan
                       for s in matrix_sizes]
    ax3.plot(size_numeric, kernel1_speedup, marker='o', linewidth=2, label='CUDA Kernel1 vs CPU', color='#d62728')
    ax3.plot(size_numeric, kernel2_speedup, marker='s', linewidth=2, label='CUDA Kernel2 vs CPU', color='#9467bd')
    ax3.set_xlabel('Matrix Size')
    ax3.set_ylabel('Speedup')
    ax3.set_title('Speedup over CPU (8 threads)')
    ax3.legend()
    ax3.grid(True, alpha=0.3)
    # Plot 4: Kernel2 vs Kernel1 time ratio (values above 1 mean Kernel2 is slower)
    ax4 = axes[1, 1]
    kernel_time_ratio = [cuda_kernel2_data[s]['time'] / cuda_kernel1_data[s]['time'] for s in matrix_sizes]
    ax4.bar(x, kernel_time_ratio, color='#e377c2', alpha=0.7)
    ax4.axhline(y=1, color='gray', linestyle='--', linewidth=2)
    ax4.set_xticks(x)
    ax4.set_xticklabels([f'{s}x{s}' for s in matrix_sizes])
    ax4.set_xlabel('Matrix Size')
    ax4.set_ylabel('Time Ratio (Kernel2 / Kernel1)')
    ax4.set_title('Kernel2 vs Kernel1 Time Ratio')
    ax4.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig('/home/yly/dev/hpc-lab-code/lab4/experiment_data/experiment1_analysis.png', dpi=300, bbox_inches='tight')
    print("\nChart saved to: experiment_data/experiment1_analysis.png")
def plot_experiment2():
    """Plot the charts for experiment 2."""
    matrix_sizes = ['256', '512', '1024', '2048']
    block_sizes = [4, 8, 16, 32]
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
    markers = ['o', 's', '^', 'd']
    # Plot 1: effect of BLOCK_SIZE on execution time for each matrix size
    ax1 = axes[0, 0]
    for i, size in enumerate(matrix_sizes):
        times = [blocksize_data[size][bs]['time'] for bs in block_sizes]
        ax1.plot(block_sizes, times, marker=markers[i], linewidth=2,
                 label=f'{size}x{size}', color=colors[i])
    ax1.set_xlabel('BLOCK_SIZE')
    ax1.set_ylabel('Time (ms)')
    ax1.set_title('Execution Time vs BLOCK_SIZE')
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    # Plot 2: effect of BLOCK_SIZE on GFLOPS for each matrix size
    ax2 = axes[0, 1]
    for i, size in enumerate(matrix_sizes):
        flops = [blocksize_data[size][bs]['flops'] for bs in block_sizes]
        ax2.plot(block_sizes, flops, marker=markers[i], linewidth=2,
                 label=f'{size}x{size}', color=colors[i])
    ax2.set_xlabel('BLOCK_SIZE')
    ax2.set_ylabel('GFLOPS')
    ax2.set_title('Performance vs BLOCK_SIZE')
    ax2.legend()
    ax2.grid(True, alpha=0.3)
    # Plot 3: speedup relative to the 4x4 baseline
    ax3 = axes[1, 0]
    for i, size in enumerate(matrix_sizes):
        baseline = blocksize_data[size][4]['time']
        speedups = [baseline / blocksize_data[size][bs]['time'] for bs in block_sizes]
        ax3.plot(block_sizes, speedups, marker=markers[i], linewidth=2,
                 label=f'{size}x{size}', color=colors[i])
    ax3.set_xlabel('BLOCK_SIZE')
    ax3.set_ylabel('Speedup over 4x4')
    ax3.set_title('Performance Improvement Relative to 4x4')
    ax3.legend()
    ax3.grid(True, alpha=0.3)
    # Plot 4: overall gain from 4x4 to 32x32 (categorical x positions keep the bars readable)
    ax4 = axes[1, 1]
    xpos = np.arange(len(matrix_sizes))
    speedup_4_to_32 = [blocksize_data[s][4]['time'] / blocksize_data[s][32]['time'] for s in matrix_sizes]
    ax4.bar(xpos, speedup_4_to_32, color='#9467bd', alpha=0.7)
    ax4.set_xticks(xpos)
    ax4.set_xticklabels([f'{s}x{s}' for s in matrix_sizes])
    ax4.set_xlabel('Matrix Size')
    ax4.set_ylabel('Speedup (32x32 vs 4x4)')
    ax4.set_title('Performance Gain: 32x32 vs 4x4')
    ax4.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.savefig('/home/yly/dev/hpc-lab-code/lab4/experiment_data/experiment2_analysis.png', dpi=300, bbox_inches='tight')
    print("Chart saved to: experiment_data/experiment2_analysis.png")
def analyze_results():
    """Analyze the experimental results."""
    print("\n" + "=" * 100)
    print("Analysis of Experimental Results")
    print("=" * 100)
    print("\n[Experiment 1 Analysis]")
    print("-" * 100)
    print("\n1. CPU performance:")
    print("   - At the small matrix size (256x256), adding threads yields a modest gain (up to 1.28x speedup)")
    print("   - At medium and large sizes (512x512 and above), adding threads yields almost no gain")
    print("   - Reason: small matrices fit in the CPU cache, so multithreading scales well; large matrices are limited by memory bandwidth")
    print("   - CPU performance stays between 0.34 and 0.44 GFLOPS, far below the GPU")
    print("\n2. CUDA Kernel1 performance:")
    print("   - Performance is stable at 850-905 GFLOPS and varies little with matrix size")
    print("   - Roughly a 2000-3000x speedup over the CPU (8 threads)")
    print("   - Strength: simple thread mapping with well-coalesced memory accesses")
    print("   - Weakness: each thread repeatedly reads global memory, with no data reuse")
    print("\n3. CUDA Kernel2 performance:")
    print("   - Performance is stable at 317-331 GFLOPS")
    print("   - About 2.7-2.8x slower than Kernel1")
    print("   - Why:")
    print("      a) TILE_WIDTH=4 is too small: the shared-memory overhead outweighs the benefit")
    print("      b) Frequent __syncthreads() synchronization overhead")
    print("      c) Small tiles give a low data-reuse rate")
    print("   - Lesson: shared-memory optimization needs a sensible tile size; it does not always pay off")
    print("\n4. Overall conclusions:")
    print("   - The GPU has a huge performance advantage over the CPU (2000-3000x)")
    print("   - The simple Kernel1 actually beats the poorly tuned Kernel2")
    print("   - Optimization must account for the hardware; blind optimization can backfire")
    print("\n" + "-" * 100)
    print("\n[Experiment 2 Analysis]")
    print("-" * 100)
    print("\n1. How BLOCK_SIZE affects performance:")
    print("   - 4x4: worst performance (289-328 GFLOPS)")
    print("   - 8x8: roughly a 3x improvement (838-1068 GFLOPS)")
    print("   - 16x16: further improvement to 1423-1537 GFLOPS")
    print("   - 32x32: best performance, reaching 1506-1574 GFLOPS")
    print("\n2. Why performance improves:")
    print("   a) Better shared-memory utilization:")
    print("      - Larger tiles mean more data reuse")
    print("      - Fewer global-memory accesses")
    print("   b) More thread-level parallelism:")
    print("      - Larger blocks contain more threads and hide latency better")
    print("   c) Overlap of computation and memory access:")
    print("      - Large tiles balance compute time against memory-access time")
    print("\n3. Performance saturation:")
    print("   - The gain from 16x16 to 32x32 is small")
    print("   - Reasons:")
    print("      a) Shared-memory capacity limits (each SM has limited shared memory)")
    print("      b) Increased register pressure")
    print("      c) Lower thread-block scheduling efficiency")
    print("\n4. Choosing the optimal BLOCK_SIZE:")
    print("   - On the current GPU architecture, 32x32 is the best choice")
    print("   - Other GPU architectures may have different optima")
    print("   - Tuning should account for the specific hardware and problem size")
    print("\n5. Comparison with Kernel1:")
    print("   - Kernel1 (no shared memory): ~900 GFLOPS")
    print("   - Kernel2 (32x32 shared-memory tiles): ~1574 GFLOPS")
    print("   - A well-tuned shared-memory kernel gives roughly a 1.7x improvement")
    print("\n" + "=" * 100)
if __name__ == "__main__":
    print("\nStarting analysis of experiment data...\n")
    # Print the data tables
    print_experiment1_table()
    print_experiment2_table()
    # Generate the charts
    print("\nGenerating charts...")
    plot_experiment1()
    plot_experiment2()
    # Analyze the results
    analyze_results()
    print("\nAnalysis complete!")