#!/usr/bin/env python3
"""
Analysis script for the matrix multiplication performance experiments.

Compares the performance of the CPU (OpenMP) implementation, CUDA Kernel1,
and CUDA Kernel2, and examines how different BLOCK_SIZE values affect
performance.
"""

import matplotlib
import matplotlib.pyplot as plt
import numpy as np

# Font configuration: SimHei first (for CJK glyphs), DejaVu Sans as fallback
matplotlib.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
matplotlib.rcParams['axes.unicode_minus'] = False
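
# Added sketch (not in the original script): the plot functions below save
# figures into a hard-coded directory; creating it up front avoids a
# FileNotFoundError from savefig() on a fresh checkout. The path is copied
# from the savefig() calls below.
import os
os.makedirs('/home/yly/dev/hpc-lab-code/lab4/experiment_data', exist_ok=True)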

# Experiment 1 data.
# CPU (OpenMP) results for different thread counts
# (time in ms, throughput in GFLOPS, measured speedup).
cpu_data = {
    '256': {
        8: {'time': 86.012, 'flops': 0.39, 'speedup': 1.14},
        64: {'time': 78.420, 'flops': 0.43, 'speedup': 1.25},
        256: {'time': 76.496, 'flops': 0.44, 'speedup': 1.28}
    },
    '512': {
        8: {'time': 747.483, 'flops': 0.36, 'speedup': 1.00},
        64: {'time': 743.606, 'flops': 0.36, 'speedup': 1.01},
        256: {'time': 748.649, 'flops': 0.36, 'speedup': 1.00}
    },
    '1024': {
        8: {'time': 6033.205, 'flops': 0.36, 'speedup': 1.00},
        64: {'time': 6049.318, 'flops': 0.35, 'speedup': 1.00},
        256: {'time': 6051.757, 'flops': 0.35, 'speedup': 1.00}
    },
    '2048': {
        8: {'time': 51065.609, 'flops': 0.34, 'speedup': 1.00},
        64: {'time': 50995.406, 'flops': 0.34, 'speedup': 1.00},
        256: {'time': 51083.363, 'flops': 0.34, 'speedup': 1.00}
    }
}

# CUDA Kernel1 data (no shared memory; time in ms)
cuda_kernel1_data = {
    '512': {'time': 0.316, 'flops': 849.49},
    '1024': {'time': 2.374, 'flops': 904.75},
    '2048': {'time': 19.190, 'flops': 895.23},
    '4096': {'time': 152.897, 'flops': 898.90}
}

# CUDA Kernel2 data (shared-memory tiling, TILE_WIDTH=4; time in ms)
cuda_kernel2_data = {
    '512': {'time': 0.827, 'flops': 324.65},
    '1024': {'time': 6.484, 'flops': 331.22},
    '2048': {'time': 53.599, 'flops': 320.52},
    '4096': {'time': 433.242, 'flops': 317.23}
}

# Experiment 2 data: effect of BLOCK_SIZE (time in ms)
blocksize_data = {
    '256': {
        4: {'time': 0.116, 'flops': 289.26},
        8: {'time': 0.040, 'flops': 838.19},
        16: {'time': 0.029, 'flops': 1170.29},
        32: {'time': 0.026, 'flops': 1292.94}
    },
    '512': {
        4: {'time': 0.831, 'flops': 323.04},
        8: {'time': 0.265, 'flops': 1014.10},
        16: {'time': 0.189, 'flops': 1423.49},
        32: {'time': 0.178, 'flops': 1506.57}
    },
    '1024': {
        4: {'time': 6.539, 'flops': 328.40},
        8: {'time': 2.022, 'flops': 1061.88},
        16: {'time': 1.397, 'flops': 1536.94},
        32: {'time': 1.364, 'flops': 1574.44}
    },
    '2048': {
        4: {'time': 54.023, 'flops': 318.01},
        8: {'time': 16.080, 'flops': 1068.38},
        16: {'time': 11.454, 'flops': 1499.84},
        32: {'time': 11.019, 'flops': 1559.16}
    }
}
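
# Cross-check helper (an added sketch, not part of the original measurements):
# assuming the standard 2*N^3 flop count for an N x N matmul, the GFLOPS
# columns above can be recomputed from the times, e.g. gflops(1024, 2.374)
# ~= 904.7, matching cuda_kernel1_data['1024'].
def gflops(n, time_ms):
    """Achieved GFLOPS for an n x n matmul completed in time_ms milliseconds."""
    return 2 * n ** 3 / (time_ms * 1e6)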

def print_experiment1_table():
    """Print the data table for experiment 1."""
    print("=" * 100)
    print("Experiment 1: performance comparison of CPU, CUDA Kernel1, and CUDA Kernel2")
    print("=" * 100)

    matrix_sizes = ['512', '1024', '2048', '4096']
    thread_counts = [8, 64, 256]

    for size in matrix_sizes:
        print(f"\nMatrix size: {size}x{size}")
        print("-" * 100)
        print(f"{'Implementation':<20} {'Threads':<10} {'Time (ms)':<15} {'GFLOPS':<15} {'Speedup':<15}")
        print("-" * 100)

        # CPU rows
        if size in cpu_data:
            for threads in thread_counts:
                data = cpu_data[size][threads]
                print(f"{'CPU (OpenMP)':<20} {threads:<10} {data['time']:<15.3f} {data['flops']:<15.2f} {data['speedup']:<15.2f}")

        # CUDA Kernel1 row; speedup is relative to the 8-thread CPU run,
        # printed as '-' when no CPU measurement exists for this size
        if size in cuda_kernel1_data:
            data = cuda_kernel1_data[size]
            speedup = f"{cpu_data[size][8]['time'] / data['time']:.2f}" if size in cpu_data else '-'
            print(f"{'CUDA Kernel1':<20} {'-':<10} {data['time']:<15.3f} {data['flops']:<15.2f} {speedup:<15}")

        # CUDA Kernel2 row
        if size in cuda_kernel2_data:
            data = cuda_kernel2_data[size]
            speedup = f"{cpu_data[size][8]['time'] / data['time']:.2f}" if size in cpu_data else '-'
            print(f"{'CUDA Kernel2':<20} {'-':<10} {data['time']:<15.3f} {data['flops']:<15.2f} {speedup:<15}")

    print("\n" + "=" * 100)

def print_experiment2_table():
    """Print the data table for experiment 2."""
    print("\n" + "=" * 100)
    print("Experiment 2: effect of BLOCK_SIZE on CUDA kernel performance")
    print("=" * 100)

    matrix_sizes = ['256', '512', '1024', '2048']
    block_sizes = [4, 8, 16, 32]

    for size in matrix_sizes:
        print(f"\nMatrix size: {size}x{size}")
        print("-" * 80)
        print(f"{'BLOCK_SIZE':<15} {'Time (ms)':<20} {'GFLOPS':<20} {'Speedup vs 4x4':<20}")
        print("-" * 80)

        baseline_time = blocksize_data[size][4]['time']
        for bs in block_sizes:
            data = blocksize_data[size][bs]
            speedup = baseline_time / data['time']
            # Build the "NxN" label first so the whole column is padded to 15
            # characters (f"{bs}x{bs:<10}" would only pad the second number)
            print(f"{f'{bs}x{bs}':<15} {data['time']:<20.3f} {data['flops']:<20.2f} {speedup:<20.2f}")

    print("\n" + "=" * 100)

def plot_experiment1():
    """Plot the charts for experiment 1."""
    matrix_sizes = ['512', '1024', '2048', '4096']
    size_numeric = [int(s) for s in matrix_sizes]

    # Gather the series (0 for missing CPU measurements, e.g. 4096x4096)
    cpu_8_threads = [cpu_data[s][8]['time'] if s in cpu_data else 0 for s in matrix_sizes]
    cpu_64_threads = [cpu_data[s][64]['time'] if s in cpu_data else 0 for s in matrix_sizes]
    cpu_256_threads = [cpu_data[s][256]['time'] if s in cpu_data else 0 for s in matrix_sizes]
    kernel1_times = [cuda_kernel1_data[s]['time'] for s in matrix_sizes]
    kernel2_times = [cuda_kernel2_data[s]['time'] for s in matrix_sizes]

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Chart 1: execution time comparison (log scale), all five implementations
    ax1 = axes[0, 0]
    x = np.arange(len(matrix_sizes))
    width = 0.15

    ax1.bar(x - 2*width, cpu_8_threads, width, label='CPU (8 threads)', color='#1f77b4')
    ax1.bar(x - width, cpu_64_threads, width, label='CPU (64 threads)', color='#ff7f0e')
    ax1.bar(x, cpu_256_threads, width, label='CPU (256 threads)', color='#2ca02c')
    ax1.bar(x + width, kernel1_times, width, label='CUDA Kernel1', color='#d62728')
    ax1.bar(x + 2*width, kernel2_times, width, label='CUDA Kernel2', color='#9467bd')

    ax1.set_xlabel('Matrix Size')
    ax1.set_ylabel('Time (ms)')
    ax1.set_title('Execution Time Comparison (Log Scale)')
    ax1.set_xticks(x)
    ax1.set_xticklabels([f'{s}x{s}' for s in matrix_sizes])
    ax1.set_yscale('log')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Chart 2: GFLOPS comparison
    ax2 = axes[0, 1]
    cpu_8_flops = [cpu_data[s][8]['flops'] if s in cpu_data else 0 for s in matrix_sizes]
    cpu_64_flops = [cpu_data[s][64]['flops'] if s in cpu_data else 0 for s in matrix_sizes]
    cpu_256_flops = [cpu_data[s][256]['flops'] if s in cpu_data else 0 for s in matrix_sizes]
    kernel1_flops = [cuda_kernel1_data[s]['flops'] for s in matrix_sizes]
    kernel2_flops = [cuda_kernel2_data[s]['flops'] for s in matrix_sizes]

    ax2.bar(x - 2*width, cpu_8_flops, width, label='CPU (8 threads)', color='#1f77b4')
    ax2.bar(x - width, cpu_64_flops, width, label='CPU (64 threads)', color='#ff7f0e')
    ax2.bar(x, cpu_256_flops, width, label='CPU (256 threads)', color='#2ca02c')
    ax2.bar(x + width, kernel1_flops, width, label='CUDA Kernel1', color='#d62728')
    ax2.bar(x + 2*width, kernel2_flops, width, label='CUDA Kernel2', color='#9467bd')

    ax2.set_xlabel('Matrix Size')
    ax2.set_ylabel('GFLOPS')
    ax2.set_title('Performance Comparison (GFLOPS)')
    ax2.set_xticks(x)
    ax2.set_xticklabels([f'{s}x{s}' for s in matrix_sizes])
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # Chart 3: speedup over the 8-thread CPU run
    ax3 = axes[1, 0]
    kernel1_speedup = [cpu_data[s][8]['time'] / cuda_kernel1_data[s]['time'] if s in cpu_data else 0
                       for s in matrix_sizes]
    kernel2_speedup = [cpu_data[s][8]['time'] / cuda_kernel2_data[s]['time'] if s in cpu_data else 0
                       for s in matrix_sizes]

    ax3.plot(size_numeric, kernel1_speedup, marker='o', linewidth=2, label='CUDA Kernel1 vs CPU', color='#d62728')
    ax3.plot(size_numeric, kernel2_speedup, marker='s', linewidth=2, label='CUDA Kernel2 vs CPU', color='#9467bd')

    ax3.set_xlabel('Matrix Size')
    ax3.set_ylabel('Speedup')
    ax3.set_title('Speedup over CPU (8 threads)')
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # Chart 4: Kernel2/Kernel1 time ratio (>1 means Kernel2 is slower).
    # Categorical x labels keep the bars visible; at numeric positions
    # (512..4096) the default bar width of 0.8 would render as slivers.
    ax4 = axes[1, 1]
    kernel_kernel_speedup = [cuda_kernel2_data[s]['time'] / cuda_kernel1_data[s]['time'] for s in matrix_sizes]

    ax4.bar([f'{s}x{s}' for s in matrix_sizes], kernel_kernel_speedup, color='#e377c2', alpha=0.7)
    ax4.axhline(y=1, color='gray', linestyle='--', linewidth=2)
    ax4.set_xlabel('Matrix Size')
    ax4.set_ylabel('Time Ratio (Kernel2 / Kernel1)')
    ax4.set_title('Kernel2 vs Kernel1 Performance Ratio')
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('/home/yly/dev/hpc-lab-code/lab4/experiment_data/experiment1_analysis.png', dpi=300, bbox_inches='tight')
    print("\nFigure saved to: experiment_data/experiment1_analysis.png")

def plot_experiment2():
    """Plot the charts for experiment 2."""
    matrix_sizes = ['256', '512', '1024', '2048']
    block_sizes = [4, 8, 16, 32]

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
    markers = ['o', 's', '^', 'd']

    # Chart 1: effect of BLOCK_SIZE on execution time, per matrix size
    ax1 = axes[0, 0]
    for i, size in enumerate(matrix_sizes):
        times = [blocksize_data[size][bs]['time'] for bs in block_sizes]
        ax1.plot(block_sizes, times, marker=markers[i], linewidth=2,
                 label=f'{size}x{size}', color=colors[i])

    ax1.set_xlabel('BLOCK_SIZE')
    ax1.set_ylabel('Time (ms)')
    ax1.set_title('Execution Time vs BLOCK_SIZE')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Chart 2: effect of BLOCK_SIZE on GFLOPS, per matrix size
    ax2 = axes[0, 1]
    for i, size in enumerate(matrix_sizes):
        flops = [blocksize_data[size][bs]['flops'] for bs in block_sizes]
        ax2.plot(block_sizes, flops, marker=markers[i], linewidth=2,
                 label=f'{size}x{size}', color=colors[i])

    ax2.set_xlabel('BLOCK_SIZE')
    ax2.set_ylabel('GFLOPS')
    ax2.set_title('Performance vs BLOCK_SIZE')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # Chart 3: speedup relative to the 4x4 baseline
    ax3 = axes[1, 0]
    for i, size in enumerate(matrix_sizes):
        baseline = blocksize_data[size][4]['time']
        speedups = [baseline / blocksize_data[size][bs]['time'] for bs in block_sizes]
        ax3.plot(block_sizes, speedups, marker=markers[i], linewidth=2,
                 label=f'{size}x{size}', color=colors[i])

    ax3.set_xlabel('BLOCK_SIZE')
    ax3.set_ylabel('Speedup over 4x4')
    ax3.set_title('Performance Improvement Relative to 4x4')
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # Chart 4: overall gain from 4x4 to 32x32. Categorical x labels keep the
    # bars visible (at numeric positions 256..2048 the default-width bars
    # would be nearly invisible).
    ax4 = axes[1, 1]
    speedup_4_to_32 = [blocksize_data[s][4]['time'] / blocksize_data[s][32]['time'] for s in matrix_sizes]

    ax4.bar([f'{s}x{s}' for s in matrix_sizes], speedup_4_to_32, color='#9467bd', alpha=0.7)
    ax4.set_xlabel('Matrix Size')
    ax4.set_ylabel('Speedup (4x4 time / 32x32 time)')
    ax4.set_title('Performance Gain: 32x32 vs 4x4')
    ax4.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.savefig('/home/yly/dev/hpc-lab-code/lab4/experiment_data/experiment2_analysis.png', dpi=300, bbox_inches='tight')
    print("Figure saved to: experiment_data/experiment2_analysis.png")

def analyze_results():
    """Summarize the experimental findings."""
    print("\n" + "=" * 100)
    print("Analysis of Experimental Results")
    print("=" * 100)

    print("\n[Experiment 1 Analysis]")
    print("-" * 100)

    print("\n1. CPU performance:")
    print("   - For the small 256x256 matrix, adding threads helps somewhat (up to a 1.28x speedup)")
    print("   - For 512x512 and larger, adding threads yields essentially no improvement")
    print("   - Reason: small matrices fit in the CPU cache, so threads scale well; large matrices are memory-bandwidth bound")
    print("   - CPU throughput stays within 0.34-0.44 GFLOPS, far below the GPU")

    print("\n2. CUDA Kernel1 performance:")
    print("   - Throughput is stable at 850-905 GFLOPS, largely independent of matrix size")
    print("   - Roughly a 2000-3000x speedup over the 8-thread CPU run")
    print("   - Strengths: simple thread mapping and well-coalesced memory accesses")
    print("   - Weakness: every thread re-reads global memory; there is no data reuse")

    print("\n3. CUDA Kernel2 performance:")
    print("   - Throughput is stable at 317-331 GFLOPS")
    print("   - Roughly 2.6-2.8x slower than Kernel1")
    print("   - Likely causes:")
    print("     a) TILE_WIDTH=4 is too small: the shared-memory overhead outweighs the benefit")
    print("     b) Frequent __syncthreads() synchronization overhead")
    print("     c) Tiny tiles give a very low data-reuse rate")
    print("   - Lesson: shared-memory tiling needs a sensible tile size; it is not always a win")

    print("\n4. Overall conclusions:")
    print("   - The GPU holds a huge performance advantage over the CPU (2000-3000x)")
    print("   - The simple Kernel1 actually beats the poorly configured Kernel2")
    print("   - Optimizations must respect the hardware; blind tuning can backfire")

    print("\n" + "-" * 100)
    print("\n[Experiment 2 Analysis]")
    print("-" * 100)

    print("\n1. How BLOCK_SIZE affects performance:")
    print("   - 4x4: worst performance (289-328 GFLOPS)")
    print("   - 8x8: roughly a 3x improvement (838-1068 GFLOPS)")
    print("   - 16x16: further improvement to 1423-1537 GFLOPS")
    print("   - 32x32: best performance, reaching 1506-1574 GFLOPS")

    print("\n2. Why performance improves:")
    print("   a) Better shared-memory utilization:")
    print("      - Larger tiles mean more data reuse")
    print("      - Fewer global-memory accesses")
    print("   b) More thread-level parallelism:")
    print("      - Larger blocks hold more threads, hiding latency better")
    print("   c) Better overlap of compute and memory access:")
    print("      - Large tiles balance compute time against memory-access time")

    print("\n3. Performance saturation:")
    print("   - The gain from 16x16 to 32x32 is noticeably smaller")
    print("   - Likely reasons:")
    print("     a) Shared-memory capacity limits (each SM has a fixed budget)")
    print("     b) Increased register pressure")
    print("     c) Reduced thread-block scheduling efficiency")

    print("\n4. Choosing the best BLOCK_SIZE:")
    print("   - On the GPU used here, 32x32 is the best choice")
    print("   - Other GPU architectures may have different optima")
    print("   - Tuning should account for the specific hardware and problem size")

    print("\n5. Comparison with Kernel1:")
    print("   - Kernel1 (no shared memory): ~900 GFLOPS")
    print("   - Kernel2 (32x32 shared-memory tiles): ~1574 GFLOPS")
    print("   - A well-configured shared-memory kernel gains about 1.7x")

    print("\n" + "=" * 100)
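
# Optional cross-check (an added sketch, not called by the main flow): derive
# the headline ratios quoted in analyze_results() directly from the data
# dictionaries instead of trusting the prose; run it manually if desired.
def print_derived_ratios():
    """Recompute the speedup ranges cited in analyze_results() from the raw data."""
    for size in ['512', '1024', '2048']:
        cpu = cpu_data[size][8]['time']
        k1 = cuda_kernel1_data[size]['time']
        k2 = cuda_kernel2_data[size]['time']
        print(f"{size}x{size}: Kernel1 vs CPU {cpu / k1:.0f}x, "
              f"Kernel2 slowdown vs Kernel1 {k2 / k1:.2f}x")
    for size in ['256', '512', '1024', '2048']:
        gain = blocksize_data[size][4]['time'] / blocksize_data[size][32]['time']
        print(f"{size}x{size}: 32x32 vs 4x4 gain {gain:.2f}x")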

if __name__ == "__main__":
    print("\nAnalyzing experiment data...\n")

    # Print the data tables
    print_experiment1_table()
    print_experiment2_table()

    # Generate the figures
    print("\nGenerating figures...")
    plot_experiment1()
    plot_experiment2()

    # Summarize the findings
    analyze_results()

    print("\nAnalysis complete!")