#!/usr/bin/env python3
"""
矩阵乘法性能实验数据分析脚本
分析CPU、CUDA Kernel1、CUDA Kernel2的性能对比
以及不同BLOCK_SIZE对性能的影响
"""

import matplotlib.pyplot as plt
import numpy as np
import matplotlib
from matplotlib import rcParams

# 设置中文字体支持
matplotlib.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
matplotlib.rcParams['axes.unicode_minus'] = False

# 实验一数据
# CPU (OpenMP) 不同线程数的数据
cpu_data = {
    '256': {
        8: {'time': 86.012, 'flops': 0.39, 'speedup': 1.14},
        64: {'time': 78.420, 'flops': 0.43, 'speedup': 1.25},
        256: {'time': 76.496, 'flops': 0.44, 'speedup': 1.28}
    },
    '512': {
        8: {'time': 747.483, 'flops': 0.36, 'speedup': 1.00},
        64: {'time': 743.606, 'flops': 0.36, 'speedup': 1.01},
        256: {'time': 748.649, 'flops': 0.36, 'speedup': 1.00}
    },
    '1024': {
        8: {'time': 6033.205, 'flops': 0.36, 'speedup': 1.00},
        64: {'time': 6049.318, 'flops': 0.35, 'speedup': 1.00},
        256: {'time': 6051.757, 'flops': 0.35, 'speedup': 1.00}
    },
    '2048': {
        8: {'time': 51065.609, 'flops': 0.34, 'speedup': 1.00},
        64: {'time': 50995.406, 'flops': 0.34, 'speedup': 1.00},
        256: {'time': 51083.363, 'flops': 0.34, 'speedup': 1.00}
    }
}

# CUDA Kernel1 数据
cuda_kernel1_data = {
    '512': {'time': 0.316, 'flops': 849.49},
    '1024': {'time': 2.374, 'flops': 904.75},
    '2048': {'time': 19.190, 'flops': 895.23},
    '4096': {'time': 152.897, 'flops': 898.90}
}

# CUDA Kernel2 数据 (TILE_WIDTH=4)
cuda_kernel2_data = {
    '512': {'time': 0.827, 'flops': 324.65},
    '1024': {'time': 6.484, 'flops': 331.22},
    '2048': {'time': 53.599, 'flops': 320.52},
    '4096': {'time': 433.242, 'flops': 317.23}
}

# 实验二数据：不同BLOCK_SIZE的影响
blocksize_data = {
    '256': {
        4: {'time': 0.116, 'flops': 289.26},
        8: {'time': 0.040, 'flops': 838.19},
        16: {'time': 0.029, 'flops': 1170.29},
        32: {'time': 0.026, 'flops': 1292.94}
    },
    '512': {
        4: {'time': 0.831, 'flops': 323.04},
        8: {'time': 0.265, 'flops': 1014.10},
        16: {'time': 0.189, 'flops': 1423.49},
        32: {'time': 0.178, 'flops': 1506.57}
    },
    '1024': {
        4: {'time': 6.539, 'flops': 328.40},
        8: {'time': 2.022, 'flops': 1061.88},
        16: {'time': 1.397, 'flops': 1536.94},
        32: {'time': 1.364, 'flops': 1574.44}
    },
    '2048': {
        4: {'time': 54.023, 'flops': 318.01},
        8: {'time': 16.080, 'flops': 1068.38},
        16: {'time': 11.454, 'flops': 1499.84},
        32: {'time': 11.019, 'flops': 1559.16}
    }
}

def print_experiment1_table():
    """打印实验一的数据表格"""
    print("=" * 100)
    print("实验一：CPU、CUDA Kernel1、CUDA Kernel2 性能对比")
    print("=" * 100)
    
    matrix_sizes = ['512', '1024', '2048', '4096']
    thread_counts = [8, 64, 256]
    
    for size in matrix_sizes:
        print(f"\n矩阵规模: {size}x{size}")
        print("-" * 100)
        print(f"{'实现方式':<20} {'线程数':<10} {'时间(ms)':<15} {'GFLOPS':<15} {'加速比':<15}")
        print("-" * 100)
        
        # CPU数据
        if size in cpu_data:
            for threads in thread_counts:
                data = cpu_data[size][threads]
                print(f"{'CPU (OpenMP)':<20} {threads:<10} {data['time']:<15.3f} {data['flops']:<15.2f} {data['speedup']:<15.2f}")
        
        # CUDA Kernel1数据
        if size in cuda_kernel1_data:
            data = cuda_kernel1_data[size]
            # 计算相对于CPU(8线程)的加速比
            cpu_time = cpu_data[size][8]['time'] if size in cpu_data else data['time']
            speedup = cpu_time / data['time']
            print(f"{'CUDA Kernel1':<20} {'-':<10} {data['time']:<15.3f} {data['flops']:<15.2f} {speedup:<15.2f}")
        
        # CUDA Kernel2数据
        if size in cuda_kernel2_data:
            data = cuda_kernel2_data[size]
            cpu_time = cpu_data[size][8]['time'] if size in cpu_data else data['time']
            speedup = cpu_time / data['time']
            print(f"{'CUDA Kernel2':<20} {'-':<10} {data['time']:<15.3f} {data['flops']:<15.2f} {speedup:<15.2f}")
    
    print("\n" + "=" * 100)

def print_experiment2_table():
    """打印实验二的数据表格"""
    print("\n" + "=" * 100)
    print("实验二：不同BLOCK_SIZE对CUDA程序性能的影响")
    print("=" * 100)
    
    matrix_sizes = ['256', '512', '1024', '2048']
    block_sizes = [4, 8, 16, 32]
    
    for size in matrix_sizes:
        print(f"\n矩阵规模: {size}x{size}")
        print("-" * 80)
        print(f"{'BLOCK_SIZE':<15} {'时间(ms)':<20} {'GFLOPS':<20} {'相对4x4加速比':<20}")
        print("-" * 80)
        
        baseline_time = blocksize_data[size][4]['time']
        for bs in block_sizes:
            data = blocksize_data[size][bs]
            speedup = baseline_time / data['time']
            print(f"{bs}x{bs:<10} {data['time']:<20.3f} {data['flops']:<20.2f} {speedup:<20.2f}")
    
    print("\n" + "=" * 100)

def plot_experiment1():
    """绘制实验一的图表"""
    matrix_sizes = ['512', '1024', '2048', '4096']
    size_numeric = [int(s) for s in matrix_sizes]
    
    # 准备数据
    cpu_8_threads = [cpu_data[s][8]['time'] if s in cpu_data else 0 for s in matrix_sizes]
    cpu_64_threads = [cpu_data[s][64]['time'] if s in cpu_data else 0 for s in matrix_sizes]
    cpu_256_threads = [cpu_data[s][256]['time'] if s in cpu_data else 0 for s in matrix_sizes]
    kernel1_times = [cuda_kernel1_data[s]['time'] for s in matrix_sizes]
    kernel2_times = [cuda_kernel2_data[s]['time'] for s in matrix_sizes]
    
    # 创建图表
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    
    # 图1：执行时间对比（对数坐标）
    ax1 = axes[0, 0]
    x = np.arange(len(matrix_sizes))
    width = 0.15
    
    ax1.bar(x - 1.5*width, cpu_8_threads, width, label='CPU (8 threads)', color='#1f77b4')
    ax1.bar(x - 0.5*width, cpu_64_threads, width, label='CPU (64 threads)', color='#ff7f0e')
    ax1.bar(x + 0.5*width, cpu_256_threads, width, label='CPU (256 threads)', color='#2ca02c')
    ax1.bar(x + 1.5*width, kernel1_times, width, label='CUDA Kernel1', color='#d62728')
    
    ax1.set_xlabel('Matrix Size')
    ax1.set_ylabel('Time (ms)')
    ax1.set_title('Execution Time Comparison (Log Scale)')
    ax1.set_xticks(x)
    ax1.set_xticklabels([f'{s}x{s}' for s in matrix_sizes])
    ax1.set_yscale('log')
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    
    # 图2：GFLOPS对比
    ax2 = axes[0, 1]
    cpu_8_flops = [cpu_data[s][8]['flops'] if s in cpu_data else 0 for s in matrix_sizes]
    cpu_64_flops = [cpu_data[s][64]['flops'] if s in cpu_data else 0 for s in matrix_sizes]
    cpu_256_flops = [cpu_data[s][256]['flops'] if s in cpu_data else 0 for s in matrix_sizes]
    kernel1_flops = [cuda_kernel1_data[s]['flops'] for s in matrix_sizes]
    kernel2_flops = [cuda_kernel2_data[s]['flops'] for s in matrix_sizes]
    
    ax2.bar(x - 2*width, cpu_8_flops, width, label='CPU (8 threads)', color='#1f77b4')
    ax2.bar(x - width, cpu_64_flops, width, label='CPU (64 threads)', color='#ff7f0e')
    ax2.bar(x, cpu_256_flops, width, label='CPU (256 threads)', color='#2ca02c')
    ax2.bar(x + width, kernel1_flops, width, label='CUDA Kernel1', color='#d62728')
    ax2.bar(x + 2*width, kernel2_flops, width, label='CUDA Kernel2', color='#9467bd')
    
    ax2.set_xlabel('Matrix Size')
    ax2.set_ylabel('GFLOPS')
    ax2.set_title('Performance Comparison (GFLOPS)')
    ax2.set_xticks(x)
    ax2.set_xticklabels([f'{s}x{s}' for s in matrix_sizes])
    ax2.legend()
    ax2.grid(True, alpha=0.3)
    
    # 图3：加速比（相对于CPU 8线程）
    ax3 = axes[1, 0]
    kernel1_speedup = [cpu_data[s][8]['time'] / cuda_kernel1_data[s]['time'] if s in cpu_data else 0 
                       for s in matrix_sizes]
    kernel2_speedup = [cpu_data[s][8]['time'] / cuda_kernel2_data[s]['time'] if s in cpu_data else 0 
                       for s in matrix_sizes]
    
    ax3.plot(size_numeric, kernel1_speedup, marker='o', linewidth=2, label='CUDA Kernel1 vs CPU', color='#d62728')
    ax3.plot(size_numeric, kernel2_speedup, marker='s', linewidth=2, label='CUDA Kernel2 vs CPU', color='#9467bd')
    
    ax3.set_xlabel('Matrix Size')
    ax3.set_ylabel('Speedup')
    ax3.set_title('Speedup over CPU (8 threads)')
    ax3.legend()
    ax3.grid(True, alpha=0.3)
    
    # 图4：CUDA Kernel1 vs Kernel2 性能对比
    ax4 = axes[1, 1]
    kernel_kernel_speedup = [cuda_kernel2_data[s]['time'] / cuda_kernel1_data[s]['time'] for s in matrix_sizes]
    
    ax4.bar(size_numeric, kernel_kernel_speedup, color='#e377c2', alpha=0.7)
    ax4.axhline(y=1, color='gray', linestyle='--', linewidth=2)
    ax4.set_xlabel('Matrix Size')
    ax4.set_ylabel('Speedup (Kernel2/Kernel1)')
    ax4.set_title('Kernel2 vs Kernel1 Performance Ratio')
    ax4.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.savefig('/home/yly/dev/hpc-lab-code/lab4/experiment_data/experiment1_analysis.png', dpi=300, bbox_inches='tight')
    print("\n图表已保存至: experiment_data/experiment1_analysis.png")

def plot_experiment2():
    """绘制实验二的图表"""
    matrix_sizes = ['256', '512', '1024', '2048']
    block_sizes = [4, 8, 16, 32]
    
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
    markers = ['o', 's', '^', 'd']
    
    # 图1：不同矩阵规模下，BLOCK_SIZE对执行时间的影响
    ax1 = axes[0, 0]
    for i, size in enumerate(matrix_sizes):
        times = [blocksize_data[size][bs]['time'] for bs in block_sizes]
        ax1.plot(block_sizes, times, marker=markers[i], linewidth=2, 
                label=f'{size}x{size}', color=colors[i])
    
    ax1.set_xlabel('BLOCK_SIZE')
    ax1.set_ylabel('Time (ms)')
    ax1.set_title('Execution Time vs BLOCK_SIZE')
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    
    # 图2：不同矩阵规模下，BLOCK_SIZE对GFLOPS的影响
    ax2 = axes[0, 1]
    for i, size in enumerate(matrix_sizes):
        flops = [blocksize_data[size][bs]['flops'] for bs in block_sizes]
        ax2.plot(block_sizes, flops, marker=markers[i], linewidth=2, 
                label=f'{size}x{size}', color=colors[i])
    
    ax2.set_xlabel('BLOCK_SIZE')
    ax2.set_ylabel('GFLOPS')
    ax2.set_title('Performance vs BLOCK_SIZE')
    ax2.legend()
    ax2.grid(True, alpha=0.3)
    
    # 图3：相对于4x4的加速比
    ax3 = axes[1, 0]
    for i, size in enumerate(matrix_sizes):
        baseline = blocksize_data[size][4]['time']
        speedups = [baseline / blocksize_data[size][bs]['time'] for bs in block_sizes]
        ax3.plot(block_sizes, speedups, marker=markers[i], linewidth=2, 
                label=f'{size}x{size}', color=colors[i])
    
    ax3.set_xlabel('BLOCK_SIZE')
    ax3.set_ylabel('Speedup over 4x4')
    ax3.set_title('Performance Improvement Relative to 4x4')
    ax3.legend()
    ax3.grid(True, alpha=0.3)
    
    # 图4：性能提升趋势（从4x4到32x32）
    ax4 = axes[1, 1]
    size_numeric = [int(s) for s in matrix_sizes]
    speedup_4_to_32 = [blocksize_data[s][4]['time'] / blocksize_data[s][32]['time'] for s in matrix_sizes]
    
    ax4.bar(size_numeric, speedup_4_to_32, color='#9467bd', alpha=0.7)
    ax4.set_xlabel('Matrix Size')
    ax4.set_ylabel('Speedup (32x32 / 4x4)')
    ax4.set_title('Performance Gain: 32x32 vs 4x4')
    ax4.grid(True, alpha=0.3, axis='y')
    
    plt.tight_layout()
    plt.savefig('/home/yly/dev/hpc-lab-code/lab4/experiment_data/experiment2_analysis.png', dpi=300, bbox_inches='tight')
    print("图表已保存至: experiment_data/experiment2_analysis.png")

def analyze_results():
    """分析实验结果"""
    print("\n" + "=" * 100)
    print("实验结果分析")
    print("=" * 100)
    
    print("\n【实验一分析】")
    print("-" * 100)
    
    print("\n1. CPU性能分析:")
    print("   - 在小矩阵规模(256x256)下，增加线程数能带来一定性能提升(最高1.28倍加速比)")
    print("   - 在中大矩阵规模(512x512及以上)下，增加线程数几乎无性能提升")
    print("   - 原因：小矩阵数据可以放入CPU缓存，多线程扩展性好；大矩阵受内存带宽限制")
    print("   - CPU性能始终在0.34-0.44 GFLOPS之间，远低于GPU")
    
    print("\n2. CUDA Kernel1性能分析:")
    print("   - 性能稳定在850-905 GFLOPS之间，不随矩阵规模明显变化")
    print("   - 相比CPU(8线程)实现了约2000-3000倍的加速比")
    print("   - 优势：简单的线程映射，良好的内存合并访问")
    print("   - 劣势：每个线程需要重复访问全局内存，没有数据重用")
    
    print("\n3. CUDA Kernel2性能分析:")
    print("   - 性能稳定在317-331 GFLOPS之间")
    print("   - 相比Kernel1性能下降了约2.7-2.8倍")
    print("   - 原因分析：")
    print("     a) TILE_WIDTH=4太小，共享内存开销大于收益")
    print("     b) 频繁的__syncthreads()同步开销")
    print("     c) 小tile导致数据重用率低")
    print("   - 教训：共享内存优化需要合理的tile size，并非所有情况下都有效")
    
    print("\n4. 总体结论:")
    print("   - GPU相比CPU有巨大的性能优势(2000-3000倍)")
    print("   - 简单的Kernel1反而优于设计不当的Kernel2")
    print("   - 优化需要考虑硬件特性，盲目优化可能适得其反")
    
    print("\n" + "-" * 100)
    print("\n【实验二分析】")
    print("-" * 100)
    
    print("\n1. BLOCK_SIZE对性能的影响规律:")
    print("   - 4x4: 性能最差(289-328 GFLOPS)")
    print("   - 8x8: 性能提升3倍左右(838-1068 GFLOPS)")
    print("   - 16x16: 性能进一步提升到1423-1537 GFLOPS")
    print("   - 32x32: 性能最优，达到1506-1574 GFLOPS")
    
    print("\n2. 性能提升原因分析:")
    print("   a) 共享内存利用率提升:")
    print("      - 更大的tile意味着更多的数据重用")
    print("      - 减少了全局内存访问次数")
    print("   b) 线程级并行提升:")
    print("      - 更大的block包含更多线程，更好的隐藏延迟")
    print("   c) 计算与内存访问重叠:")
    print("      - 大tile使得计算时间与内存访问时间更平衡")
    
    print("\n3. 性能饱和现象:")
    print("   - 从16x16到32x32，性能提升幅度减小")
    print("   - 原因：")
    print("     a) 共享内存容量限制(每个SM的共享内存有限)")
    print("     b) 寄存器压力增加")
    print("     c) 线程块调度效率下降")
    
    print("\n4. 最优BLOCK_SIZE选择:")
    print("   - 对于当前GPU架构，32x32是最优选择")
    print("   - 不同GPU架构可能有不同的最优值")
    print("   - 需要根据具体硬件和问题规模进行调优")
    
    print("\n5. 与Kernel1对比:")
    print("   - Kernel1(无共享内存): ~900 GFLOPS")
    print("   - Kernel2(32x32共享内存): ~1574 GFLOPS")
    print("   - 正确的共享内存优化可以带来约1.7倍性能提升")
    
    print("\n" + "=" * 100)

if __name__ == "__main__":
    print("\n开始分析实验数据...\n")
    
    # 打印数据表格
    print_experiment1_table()
    print_experiment2_table()
    
    # 绘制图表
    print("\n正在生成图表...")
    plot_experiment1()
    plot_experiment2()
    
    # 分析结果
    analyze_results()
    
    print("\n分析完成！")