diff --git a/lab1/lab1.sh b/lab1/lab1.sh index 5308d97..789889c 100755 --- a/lab1/lab1.sh +++ b/lab1/lab1.sh @@ -1,9 +1,16 @@ #!/bin/bash echo "Current directory: $PWD" - +# get arch using uname -m +# if aarch64 then use arm64-v8a else use x86_64 +ARCH=$(uname -m) +if [ "$ARCH" == "aarch64" ]; then + BUILD_ARCH="arm64-v8a" +else + BUILD_ARCH="x86_64" +fi # Build directory -BUILD_DIR="./build/linux/x86_64/release" +BUILD_DIR="./build/linux/$BUILD_ARCH/release" # Programs MPI_HELLO="$BUILD_DIR/mpi_hello_world" diff --git a/lab2/omp/main.cpp b/lab2/omp/main.cpp new file mode 100644 index 0000000..7c775d2 --- /dev/null +++ b/lab2/omp/main.cpp @@ -0,0 +1,6 @@ +#include + +int main(int argc, char** argv) { + std::cout << "hello world!" << std::endl; + return 0; +} diff --git a/lab2/omp/openmp_hello_world.c b/lab2/omp/openmp_hello_world.c new file mode 100644 index 0000000..ca23a6c --- /dev/null +++ b/lab2/omp/openmp_hello_world.c @@ -0,0 +1,17 @@ +#include +#include + +int main() { + int i; + + #pragma omp parallel + { + printf("Hello World\n"); + for(i=0; i<4; i++) { + printf("Iter:%d\n",i); + } + printf("GoodBye World\n"); + } + + return 0; +} diff --git a/lab2/omp/pi.c b/lab2/omp/pi.c new file mode 100644 index 0000000..d02d21a --- /dev/null +++ b/lab2/omp/pi.c @@ -0,0 +1,33 @@ +#include +#include + +long long num_steps = 1000000000; +double step; + +int main(int argc, char* argv[]) +{ + struct timeval TimeStampStart, TimeStampStop; + double ExeTime; + double x, pi, sum=0.0; + int i; + step = 1./(double)num_steps; + + gettimeofday(&TimeStampStart, NULL); + + for (i=0; i +#include +#include + +long long num_steps = 1000000000; +double step; + +int main(int argc, char* argv[]) +{ + struct timeval TimeStampStart, TimeStampStop; + double ExeTime; + double x, pi, sum=0.0; + int i; + step = 1./(double)num_steps; + + gettimeofday(&TimeStampStart, NULL); + + #pragma omp parallel private(x) reduction(+:sum) + { + #pragma omp for + for (i=0; i +#include +#include +#include +#include + +#define BLOCK_SIZE 500 + +int main(){ + struct timeval TimeStampStart, TimeStampStop; + double ExeTime; + unsigned int iter=200000000; + int i, j; + double x, y; + double dUnderCurve=0.0; + double pi=0.0; + double r[BLOCK_SIZE*2]; + + gettimeofday(&TimeStampStart, NULL); + + #pragma omp parallel private(i, j, x, y, r) reduction(+:dUnderCurve) + { + unsigned int seed = omp_get_thread_num() + 1; + + #pragma omp for + for(j=0; j +#include +#include +#include + +#define BLOCK_SIZE 500 + +int main(){ + struct timeval TimeStampStart, TimeStampStop; + double ExeTime; + unsigned int iter=200000000; + int i, j; + double x, y; + double dUnderCurve=0.0; + double pi=0.0; + double r[BLOCK_SIZE*2]; //Careful!!! 
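    // (See the continuation of this warning just below. If this loop is ever
    //  parallelized, each thread needs its own copy of r[] and a thread-safe RNG;
    //  a minimal sketch, mirroring MC_par.c above:
    //      #pragma omp parallel private(i, j, x, y, r) reduction(+:dUnderCurve)
    //      {
    //          unsigned int seed = omp_get_thread_num() + 1;
    //          /* fill r[] with rand_r(&seed) rather than rand() */
    //      }
    //  rand()/srand() share hidden global state and are not thread-safe.)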
+ //you need a private copy of whole array for each thread + + srand((unsigned)time(NULL)); + + gettimeofday(&TimeStampStart, NULL); + + for(j=0; j #include -#define NUM_THREADS 4 +int NUM_THREADS= 4; FILE *fd; int TotalEvenWords = 0, TotalOddWords = 0, TotalWords = 0; @@ -66,14 +66,16 @@ void *count_words_thread(void *arg) return NULL; } -int main() +int main(int argc, char** argv) { fd = fopen("./InFile1.txt", "r"); // Open file for read if (fd == NULL) { perror("Failed to open file"); return 1; } - + if (argc > 1){ + NUM_THREADS = atoi(argv[1]); + } // Read all lines char **lines = NULL; int total_lines = 0; diff --git a/lab2/pthread/pi_par.c b/lab2/pthread/pi_par.c index 3323a3c..ece22d6 100644 --- a/lab2/pthread/pi_par.c +++ b/lab2/pthread/pi_par.c @@ -3,7 +3,7 @@ #include #include -#define NUM_THREADS 4 +int NUM_THREADS=4; long long num_steps = 1000000000; double step; @@ -34,6 +34,9 @@ int main(int argc, char* argv[]) struct timeval TimeStampStart, TimeStampStop; double ExeTime; double pi; + if (argc > 1) { + NUM_THREADS = atoi(argv[1]); + } int thread_ids[NUM_THREADS]; pthread_t threads[NUM_THREADS]; diff --git a/lab3/nbody/lab3_nbody.sh b/lab3/nbody/lab3_nbody.sh index bb1febb..b333249 100755 --- a/lab3/nbody/lab3_nbody.sh +++ b/lab3/nbody/lab3_nbody.sh @@ -1,26 +1,222 @@ #!/bin/bash # N体问题实验脚本 +# 收集串行和并行程序的性能数据 +# 多机环境:hpc-ecs-1, hpc-ecs-2, hpc-ecs-3(每台2线程) + +set -e # 遇到错误立即退出 +set -u # 使用未定义变量时报错 +set -o pipefail # 管道命令中任何错误都会导致整个管道失败 + +OUTPUT_CSV="nbody_results.csv" +LOG_FILE="nbody_experiment.log" + +# 主机配置 +HOST1="hpc-ecs-1" +HOST2="hpc-ecs-2" +HOST3="hpc-ecs-3" + +# 记录日志函数 +log_error() { + echo "[ERROR] $*" | tee -a "$LOG_FILE" +} + +log_info() { + echo "[INFO] $*" | tee -a "$LOG_FILE" +} + +# 清空或创建CSV文件 +echo "实验,数据规模,每机进程数,机器配置,运行时间(s)" > "$OUTPUT_CSV" echo "==========================================" -echo "N体问题串行模拟实验" +echo "N体问题性能测试实验" echo "==========================================" +echo "主机配置: $HOST1, $HOST2, $HOST3" echo "" -# 默认天体数量 -N=${1:-4} - -echo "运行参数:" -echo " 天体数量: $N" -echo " 时间步长: 0.01 s" -echo " 总步数: 100" -echo "" # 编译程序 -xmake build nbody_ser -# 运行程序 -./build/linux/x86_64/release/nbody_ser $N - +echo "编译程序..." +log_info "开始编译程序..." +if ! xmake build nbody_ser; then + log_error "编译 nbody_ser 失败" + exit 1 +fi +if ! xmake build nbody_par; then + log_error "编译 nbody_par 失败" + exit 1 +fi +log_info "编译完成" echo "" + +# 固定数据规模 +FIXED_N=6000 + +# 实验一:单机上,数据规模为6000时,随每机进程数变化的运行时间(串行程序) +echo "==========================================" +echo "实验一:串行程序 - 数据规模6000" +echo "==========================================" +log_info "运行串行程序..." +ser_output=$(./build/linux/arm64-v8a/release/nbody_ser $FIXED_N 2>&1) +ser_exit_code=$? +if [ $ser_exit_code -ne 0 ]; then + log_error "串行程序执行失败,退出码: $ser_exit_code" + echo "$ser_output" | tee -a "$LOG_FILE" + exit 1 +fi +time_output=$(echo "$ser_output" | grep "模拟用时" | awk '{print $2}') +if [ -z "$time_output" ]; then + log_error "无法从输出中提取运行时间" + echo "$ser_output" | tee -a "$LOG_FILE" + exit 1 +fi +echo "实验一,6000,1,单机,$time_output" >> "$OUTPUT_CSV" +echo " 时间: $time_output s" +log_info "实验一完成" +echo "" + +# 实验二:多机环境下,数据规模为6000,随每机进程数变化的运行时间 +echo "==========================================" +echo "实验二:并行程序 - 数据规模6000,不同每机进程数" +echo "==========================================" + +# 测试不同的每机进程数和机器配置 +for ppn in 1 2 3 4; do + # 单机测试 + echo "每机进程数: $ppn, 单机" + log_info "实验二: 单机, ppn=$ppn" + par_output=$(mpirun --host "$HOST1:$ppn" --oversubscribe ./build/linux/arm64-v8a/release/nbody_par $FIXED_N 2>&1) + par_exit_code=$? 
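    # (The run / grep "模拟用时" / append-to-CSV pattern below is repeated for every
    #  host combination. A possible refactor, sketch only, with a hypothetical helper
    #  name run_case:
    #      run_case() {   # $1=实验 label  $2=N  $3=ppn  $4=机器配置  $5=mpirun host spec
    #          local out t
    #          out=$(mpirun --host "$5" --oversubscribe \
    #                ./build/linux/arm64-v8a/release/nbody_par "$2" 2>&1) || return 1
    #          t=$(echo "$out" | grep "模拟用时" | awk '{print $2}')
    #          [ -n "$t" ] && echo "$1,$2,$3,$4,$t" >> "$OUTPUT_CSV"
    #      }
    #  Also note that with set -e active, a non-zero mpirun exit inside $( ) aborts
    #  the script before the exit-code check below ever runs; guarding the call with
    #  || as above avoids that.)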
+ if [ $par_exit_code -ne 0 ]; then + log_error "并行程序执行失败(单机 ppn=$ppn),退出码: $par_exit_code" + echo "$par_output" | tee -a "$LOG_FILE" + else + time_output=$(echo "$par_output" | grep "模拟用时" | awk '{print $2}') + if [ -z "$time_output" ]; then + log_error "无法从输出中提取运行时间(单机 ppn=$ppn)" + echo "$par_output" | tee -a "$LOG_FILE" + else + echo "实验二,6000,$ppn,单机,$time_output" >> "$OUTPUT_CSV" + echo " 时间: $time_output s" + fi + fi + echo "" + + # 双机测试 + echo "每机进程数: $ppn, 双机" + log_info "实验二: 双机, ppn=$ppn" + par_output=$(mpirun --host "$HOST1:$ppn,$HOST2:$ppn" --oversubscribe ./build/linux/arm64-v8a/release/nbody_par $FIXED_N 2>&1) + par_exit_code=$? + if [ $par_exit_code -ne 0 ]; then + log_error "并行程序执行失败(双机 ppn=$ppn),退出码: $par_exit_code" + echo "$par_output" | tee -a "$LOG_FILE" + else + time_output=$(echo "$par_output" | grep "模拟用时" | awk '{print $2}') + if [ -z "$time_output" ]; then + log_error "无法从输出中提取运行时间(双机 ppn=$ppn)" + echo "$par_output" | tee -a "$LOG_FILE" + else + echo "实验二,6000,$ppn,双机,$time_output" >> "$OUTPUT_CSV" + echo " 时间: $time_output s" + fi + fi + echo "" + + # 三机测试 + echo "每机进程数: $ppn, 三机" + log_info "实验二: 三机, ppn=$ppn" + par_output=$(mpirun --host "$HOST1:$ppn,$HOST2:$ppn,$HOST3:$ppn" --oversubscribe ./build/linux/arm64-v8a/release/nbody_par $FIXED_N 2>&1) + par_exit_code=$? + if [ $par_exit_code -ne 0 ]; then + log_error "并行程序执行失败(三机 ppn=$ppn),退出码: $par_exit_code" + echo "$par_output" | tee -a "$LOG_FILE" + else + time_output=$(echo "$par_output" | grep "模拟用时" | awk '{print $2}') + if [ -z "$time_output" ]; then + log_error "无法从输出中提取运行时间(三机 ppn=$ppn)" + echo "$par_output" | tee -a "$LOG_FILE" + else + echo "实验二,6000,$ppn,三机,$time_output" >> "$OUTPUT_CSV" + echo " 时间: $time_output s" + fi + fi + echo "" +done + +# 实验三:每机1个进程,随数据规模变化的并行程序运行时间 +echo "==========================================" +echo "实验三:并行程序 - 每机1进程,不同数据规模" +echo "==========================================" + +# 测试不同的数据规模 +for N in 150 300 600 1200 2400 4800 9600; do + echo "数据规模: $N" + log_info "实验三: 数据规模=$N" + + # 单机测试 + echo " 单机..." + par_output=$(mpirun --host "$HOST1:1" ./build/linux/arm64-v8a/release/nbody_par $N 2>&1) + par_exit_code=$? + if [ $par_exit_code -ne 0 ]; then + log_error "并行程序执行失败(单机 N=$N),退出码: $par_exit_code" + echo "$par_output" | tee -a "$LOG_FILE" + else + time_output=$(echo "$par_output" | grep "模拟用时" | awk '{print $2}') + if [ -z "$time_output" ]; then + log_error "无法从输出中提取运行时间(单机 N=$N)" + echo "$par_output" | tee -a "$LOG_FILE" + else + echo "实验三,$N,单机,$time_output" >> "$OUTPUT_CSV" + echo " 时间: $time_output s" + fi + fi + + # 双机测试 + echo " 双机..." + par_output=$(mpirun --host "$HOST1:1,$HOST2:1" ./build/linux/arm64-v8a/release/nbody_par $N 2>&1) + par_exit_code=$? + if [ $par_exit_code -ne 0 ]; then + log_error "并行程序执行失败(双机 N=$N),退出码: $par_exit_code" + echo "$par_output" | tee -a "$LOG_FILE" + else + time_output=$(echo "$par_output" | grep "模拟用时" | awk '{print $2}') + if [ -z "$time_output" ]; then + log_error "无法从输出中提取运行时间(双机 N=$N)" + echo "$par_output" | tee -a "$LOG_FILE" + else + echo "实验三,$N,双机,$time_output" >> "$OUTPUT_CSV" + echo " 时间: $time_output s" + fi + fi + + # 三机测试 + echo " 三机..." + par_output=$(mpirun --host "$HOST1:1,$HOST2:1,$HOST3:1" ./build/linux/arm64-v8a/release/nbody_par $N 2>&1) + par_exit_code=$? 
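    # (Post-processing sketch: once nbody_results.csv is complete, speedup relative to
    #  the serial run can be pulled out with awk, assuming the 5-column layout of the
    #  header (实验,数据规模,每机进程数,机器配置,运行时间):
    #      base=$(awk -F, '$1=="实验一"{print $5}' "$OUTPUT_CSV")
    #      awk -F, -v b="$base" '$1=="实验二"{printf "%s ppn=%s speedup=%.2f\n", $4, $3, b/$5}' "$OUTPUT_CSV"
    #  Note that the 实验三 rows written below carry only 4 fields (the 每机进程数 column
    #  is omitted), so they need their own field mapping.)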
+ if [ $par_exit_code -ne 0 ]; then + log_error "并行程序执行失败(三机 N=$N),退出码: $par_exit_code" + echo "$par_output" | tee -a "$LOG_FILE" + else + time_output=$(echo "$par_output" | grep "模拟用时" | awk '{print $2}') + if [ -z "$time_output" ]; then + log_error "无法从输出中提取运行时间(三机 N=$N)" + echo "$par_output" | tee -a "$LOG_FILE" + else + echo "实验三,$N,三机,$time_output" >> "$OUTPUT_CSV" + echo " 时间: $time_output s" + fi + fi + echo "" +done + echo "==========================================" echo "实验完成" echo "==========================================" +echo "" +log_info "所有实验完成" +echo "结果已保存到: $OUTPUT_CSV" +echo "日志已保存到: $LOG_FILE" +echo "" +echo "数据预览:" +cat "$OUTPUT_CSV" +echo "" +echo "如有错误,请查看日志文件: $LOG_FILE" diff --git a/lab3/nbody/nbody_par.cpp b/lab3/nbody/nbody_par.cpp index c4e5780..efae7bb 100644 --- a/lab3/nbody/nbody_par.cpp +++ b/lab3/nbody/nbody_par.cpp @@ -163,7 +163,7 @@ int main(int argc, char **argv) { verbose = (strcmp(argv[2], "--verbose") == 0 || strcmp(argv[2], "-v") == 0); } // 只有rank 0打印初始信息 - if (verbose && world_rank == 0) { + if (world_rank == 0) { cout << "N体问题并行模拟" << endl; cout << "天体数量: " << n << endl; cout << "进程数量: " << world_size << endl; diff --git a/lab3/prime/lab3_prime.sh b/lab3/prime/lab3_prime.sh index 277c28b..e9f24ab 100755 --- a/lab3/prime/lab3_prime.sh +++ b/lab3/prime/lab3_prime.sh @@ -7,7 +7,14 @@ echo "==========================================" echo "Lab 3: Prime Number Calculation Performance Test" echo "==========================================" echo "" - +# get arch using uname -m +# if aarch64 then use arm64-v8a else use x86_64 +ARCH=$(uname -m) +if [ "$ARCH" == "aarch64" ]; then + BUILD_ARCH="arm64-v8a" +else + BUILD_ARCH="x86_64" +fi # Array of N values N_VALUES=(100000 200000 400000 800000) @@ -21,7 +28,7 @@ OUTPUT_FILE="prime_results.txt" > $OUTPUT_FILE # Print header -echo "N值 进程数 素数个数 执行时间(秒)" | tee -a $OUTPUT_FILE +echo "N值 进程数 素数个数 执行时间(秒)" | tee -a $OUTPUT_FILE echo "--------------------------------------------------------" | tee -a $OUTPUT_FILE # Loop through each N value @@ -29,29 +36,28 @@ for N in "${N_VALUES[@]}"; do echo "" echo "Testing N = $N" echo "------------------------" - + # Loop through each process count for P in "${PROCESS_COUNTS[@]}"; do echo -n "Running with $P process(es)... " - + # Run the program and capture output - OUTPUT=$(mpirun -n $P ./build/linux/x86_64/release/prime_par_naive $N 2>&1) - + OUTPUT=$(mpirun --oversubscribe --hostfile ~/mpi_hosts -np $P ./build/linux/$BUILD_ARCH/release/prime_par_naive $N 2>&1) + # Extract prime count and time from output PRIME_COUNT=$(echo "$OUTPUT" | grep "Between" | grep -oP '\d+(?= primes)') TIME=$(echo "$OUTPUT" | grep "Time =" | grep -oP '[0-9.]+(?= seconds)') - + # Print result if [ ! -z "$PRIME_COUNT" ] && [ ! -z "$TIME" ]; then - echo "$N $P $PRIME_COUNT $TIME" | tee -a $OUTPUT_FILE + echo "$N $P $PRIME_COUNT $TIME" | tee -a $OUTPUT_FILE echo "Done! (Primes: $PRIME_COUNT, Time: ${TIME}s)" else echo "Error running program!" - echo "$N $P ERROR ERROR" | tee -a $OUTPUT_FILE + echo "$N $P ERROR ERROR" | tee -a $OUTPUT_FILE fi done done - echo "" echo "==========================================" echo "Test completed!" @@ -63,3 +69,73 @@ echo "Summary Table:" echo "--------------------------------------------------------" cat $OUTPUT_FILE echo "--------------------------------------------------------" + + +echo "" +echo "==========================================" +echo "Begin Optimized Test!" 
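    # (Both test loops in this file extract their numbers with GNU grep's PCRE
    #  look-ahead, e.g.
    #      echo "Between 2 and 100000, there are 9592 primes." | grep -oP '\d+(?= primes)'
    #  prints 9592, and '[0-9.]+(?= seconds)' pulls the value out of "Time = ... seconds".
    #  The output wording in prime_par.cpp is changed further down in this diff to the
    #  same "Between ... primes." form.)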
+echo "==========================================" +echo "" +ARCH=$(uname -m) +if [ "$ARCH" == "aarch64" ]; then + BUILD_ARCH="arm64-v8a" +else + BUILD_ARCH="x86_64" +fi +# Array of N values +N_VALUES=(100000 200000 400000 800000) + +# Array of process counts +PROCESS_COUNTS=(1 2 4 6 8) + +# Output file for results +OUTPUT_FILE="prime_results_opt.txt" + +# Clear previous results +> $OUTPUT_FILE + +# Print header +echo "N值 进程数 素数个数 执行时间(秒)" | tee -a $OUTPUT_FILE +echo "--------------------------------------------------------" | tee -a $OUTPUT_FILE + +# Loop through each N value +for N in "${N_VALUES[@]}"; do + echo "" + echo "Testing N = $N" + echo "------------------------" + + # Loop through each process count + for P in "${PROCESS_COUNTS[@]}"; do + echo -n "Running with $P process(es)... " + + # Run the program and capture output + OUTPUT=$(mpirun --oversubscribe --hostfile ~/mpi_hosts -np $P ./build/linux/$BUILD_ARCH/release/prime_par_naive $N $(echo "$N/$P" | bc) 2>&1) + + # Extract prime count and time from output + PRIME_COUNT=$(echo "$OUTPUT" | grep "Between" | grep -oP '\d+(?= primes)') + TIME=$(echo "$OUTPUT" | grep "Time =" | grep -oP '[0-9.]+(?= seconds)') + + # Print result + if [ ! -z "$PRIME_COUNT" ] && [ ! -z "$TIME" ]; then + echo "$N $P $PRIME_COUNT $TIME" | tee -a $OUTPUT_FILE + echo "Done! (Primes: $PRIME_COUNT, Time: ${TIME}s)" + else + echo "Error running program!" + echo "$N $P ERROR ERROR" | tee -a $OUTPUT_FILE + fi + done +done + + +$(echo "$N/$P" | bc) +echo "" +echo "==========================================" +echo "Test completed!" +echo "==========================================" +echo "" +echo "Results saved to: $OUTPUT_FILE" +echo "" +echo "Summary Table:" +echo "--------------------------------------------------------" +cat $OUTPUT_FILE +echo "--------------------------------------------------------" \ No newline at end of file diff --git a/lab3/prime/src/prime_par.cpp b/lab3/prime/src/prime_par.cpp index c05f76f..5114f24 100644 --- a/lab3/prime/src/prime_par.cpp +++ b/lab3/prime/src/prime_par.cpp @@ -103,7 +103,8 @@ int main(int argc, char* argv[]) { // No range to distribute, all primes are base primes int total_count = base_primes.size(); if (rank == 0) { - std::cout << "Total prime count in [2, " << N << "] is " << total_count << "." << std::endl; + std::cout << "Between 2 and " << N << ", there are " << total_count + << " primes." << std::endl; } MPI_Finalize(); return 0; @@ -172,7 +173,8 @@ int main(int argc, char* argv[]) { if (rank == 0) { end_wtime = MPI_Wtime ( ) - wtime; int total_count = base_primes.size() + global_prime_count; - std::cout << "Total prime count in [2, " << N << "] is " << total_count << "." << std::endl; + std::cout << "Between 2 and " << N << ", there are " << total_count + << " primes." 
<< std::endl; std::cout << "Time = " << end_wtime << " seconds" << std::endl; } diff --git a/lab4/MatrixMul_cpu.cu b/lab4/MatrixMul_cpu.cu new file mode 100644 index 0000000..8227c8d --- /dev/null +++ b/lab4/MatrixMul_cpu.cu @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include +#include + +void matrixMultiplyCPU(const float* A, const float* B, float* C, int M, int N, int K, int num_threads) { + #pragma omp parallel for num_threads(num_threads) + for (int i = 0; i < M; ++i) { + for (int j = 0; j < K; ++j) { + float sum = 0.0f; + for (int k = 0; k < N; ++k) { + sum += A[i * N + k] * B[k * K + j]; + } + C[i * K + j] = sum; + } + } +} + +void runCPUTest() { + std::vector matrix_sizes = {256, 512, 1024, 2048}; + std::vector thread_counts = {8, 64, 256}; + + std::cout << "CPU矩阵乘法性能测试 (OpenMP多线程)\n"; + std::cout << "=================================================================\n"; + std::cout << std::setw(12) << "Matrix" + << std::setw(12) << "Threads" + << std::setw(15) << "Time(ms)" + << std::setw(15) << "FLOPS(G)" + << std::setw(15) << "Speedup" << std::endl; + std::cout << "-----------------------------------------------------------------\n"; + + // 存储基准性能(单线程) + std::vector baseline_times(matrix_sizes.size()); + + for (size_t m = 0; m < matrix_sizes.size(); ++m) { + int size = matrix_sizes[m]; + int M = size, N = size, K = size; + + // 分配内存 + float *A = new float[M * N]; + float *B = new float[N * K]; + float *C = new float[M * K]; + + // 初始化数据 + for (int i = 0; i < M * N; ++i) A[i] = (rand() % 100) / 100.0f; + for (int i = 0; i < N * K; ++i) B[i] = (rand() % 100) / 100.0f; + + // 首先测试单线程作为基准 + auto start = std::chrono::high_resolution_clock::now(); + matrixMultiplyCPU(A, B, C, M, N, K, 1); + auto end = std::chrono::high_resolution_clock::now(); + auto single_duration = std::chrono::duration(end - start).count(); + baseline_times[m] = single_duration; + + // 测试多线程 + for (int threads : thread_counts) { + start = std::chrono::high_resolution_clock::now(); + matrixMultiplyCPU(A, B, C, M, N, K, threads); + end = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration(end - start).count(); + + // 计算FLOPS + double total_flops = 2.0 * M * N * K; + double gflops = total_flops / (duration * 1e6); + + // 计算加速比 + double speedup = baseline_times[m] / duration; + + std::cout << std::setw(12) << size << "x" << size + << std::setw(12) << threads + << std::setw(15) << std::fixed << std::setprecision(3) << duration + << std::setw(15) << std::fixed << std::setprecision(2) << gflops + << std::setw(15) << std::fixed << std::setprecision(2) << speedup << std::endl; + } + + delete[] A; + delete[] B; + delete[] C; + + std::cout << "-----------------------------------------------------------------\n"; + } +} + +void plotData() { + std::cout << "\n\nASCII图表:CPU性能分析\n"; + std::cout << "=================================================================\n"; + std::cout << "1. 不同线程数下的加速比趋势\n"; + std::cout << " Matrix Threads=8 Threads=64 Threads=256\n"; + + // 这里可以添加具体的绘图逻辑 + // 由于是文本输出,可以使用简单的ASCII字符绘制柱状图 + + std::cout << "\n2. 
不同矩阵规模下的性能趋势\n"; + std::cout << " Threads 256x256 512x512 1024x1024 2048x2048\n"; + + std::cout << "\n注意:完整图表建议使用Python (matplotlib) 生成。\n"; + std::cout << "推荐生成以下图表:\n"; + std::cout << "- 折线图:不同线程数下的加速比 vs 矩阵规模\n"; + std::cout << "- 柱状图:不同配置下的GFLOPS对比\n"; + std::cout << "- 热力图:线程数 × 矩阵规模 的性能分布\n"; +} + +int main() { + runCPUTest(); + plotData(); + return 0; +} diff --git a/lab4/MatrixMul_kernel1.cu b/lab4/MatrixMul_kernel1.cu new file mode 100644 index 0000000..802969e --- /dev/null +++ b/lab4/MatrixMul_kernel1.cu @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include + +__global__ void matMultCUDAKernel1(const float* A, const float* B, float* C, int M, int N, int K) { + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + + if(row < M && col < K){ + float sum = 0.0f; + for(int i = 0; i < N; ++i){ + sum += A[row * N + i] * B[i * K + col]; + } + C[row * K + col] = sum; + } +} + +int main() { + std::vector sizes = {512, 1024, 2048,4096}; + std::vector times; + + // 遍历所有矩阵尺寸 + for(int idx = 0; idx < sizes.size(); ++idx) { + int M = sizes[idx]; + int N = sizes[idx]; + int K = sizes[idx]; + + // 分配主机内存 + float *A = new float[M * N]; + float *B = new float[N * K]; + float *C = new float[M * K]; + + // 初始化数据 + for(int i = 0; i < M * N; ++i) A[i] = rand() % 10; + for(int i = 0; i < N * K; ++i) B[i] = rand() % 10; + + // 分配设备内存 + float *d_A, *d_B, *d_C; + cudaMalloc(&d_A, M * N * sizeof(float)); + cudaMalloc(&d_B, N * K * sizeof(float)); + cudaMalloc(&d_C, M * K * sizeof(float)); + + // 拷贝数据到设备 + cudaMemcpy(d_A, A, M * N * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_B, B, N * K * sizeof(float), cudaMemcpyHostToDevice); + + // 配置线程块和网格 + dim3 blockSize(16, 16); + dim3 gridSize((K + blockSize.x - 1) / blockSize.x, + (M + blockSize.y - 1) / blockSize.y); + + // 预热(可选) + matMultCUDAKernel1<<>>(d_A, d_B, d_C, M, N, K); + cudaDeviceSynchronize(); + + // 计时开始 + auto start = std::chrono::high_resolution_clock::now(); + + // 执行核函数 + matMultCUDAKernel1<<>>(d_A, d_B, d_C, M, N, K); + cudaDeviceSynchronize(); + + // 计时结束 + auto end = std::chrono::high_resolution_clock::now(); + + // 拷贝结果回主机 + cudaMemcpy(C, d_C, M * K * sizeof(float), cudaMemcpyDeviceToHost); + + // 计算时间 + std::chrono::duration duration = end - start; + times.push_back(duration.count()); + + // 清理设备内存 + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + + // 清理主机内存 + delete[] A; + delete[] B; + delete[] C; + } + + // 输出结果 + std::cout << "CUDA Kernel1 矩阵乘法性能测试结果" << std::endl; + std::cout << "=================================" << std::endl; + std::cout << std::setw(12) << "Matrix Size" + << std::setw(15) << "Time(s)" + << std::setw(15) << "Time(ms)" + << std::setw(15) << "GFLOPS" << std::endl; + std::cout << "---------------------------------" << std::endl; + + for(int i = 0; i < sizes.size(); ++i) { + int size = sizes[i]; + double total_flops = 2.0 * size * size * size; // 矩阵乘法的浮点运算数 + double gflops = total_flops / (times[i] * 1e9); // 转换为 GFLOPS + double time_ms = times[i] * 1000.0; // 转换为毫秒 + + std::cout << std::setw(8) << size << "x" << std::setw(3) << size + << std::setw(15) << std::fixed << std::setprecision(6) << times[i] + << std::setw(15) << std::fixed << std::setprecision(3) << time_ms + << std::setw(15) << std::fixed << std::setprecision(2) << gflops << std::endl; + } + std::cout << "=================================" << std::endl; + + return 0; +} \ No newline at end of file diff --git a/lab4/MatrixMul_kernel2.cu b/lab4/MatrixMul_kernel2.cu new file mode 100644 
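The tiled kernel that follows (MatrixMul_kernel2.cu) stages TILE_WIDTH x TILE_WIDTH sub-blocks of A and B in shared memory so that each value fetched from global memory is reused TILE_WIDTH times. With the TILE_WIDTH of 4 chosen below, the per-block numbers are small; a quick back-of-the-envelope check (standard arithmetic, not taken from the source):

    threads per block    = TILE_WIDTH * TILE_WIDTH   = 16   (half a warp)
    shared mem per block = 2 * TILE_WIDTH^2 * 4 B    = 128 B
    global-load reuse    = TILE_WIDTH                = 4x

which is consistent with the measurements later in this diff: kernel2 lands around 320 GFLOPS versus roughly 900 GFLOPS for the untiled kernel1, while the same tiled kernel at 16x16 and 32x32 in blocksize_analysis.txt reaches about 1500 GFLOPS.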
index 0000000..a64fb08 --- /dev/null +++ b/lab4/MatrixMul_kernel2.cu @@ -0,0 +1,114 @@ +#include +#include +#include +#include +#include + +#define TILE_WIDTH 4 + +__global__ void matMultCUDAKernel2(const float* A, const float* B, float* C, int M, int N, int K) { + __shared__ float shared_A[TILE_WIDTH][TILE_WIDTH]; + __shared__ float shared_B[TILE_WIDTH][TILE_WIDTH]; + + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + + float sum = 0.0f; + + for (int t = 0; t < (N + TILE_WIDTH - 1) / TILE_WIDTH; ++t) { + if (row < M && t * TILE_WIDTH + threadIdx.x < N) + shared_A[threadIdx.y][threadIdx.x] = A[row * N + t * TILE_WIDTH + threadIdx.x]; + else + shared_A[threadIdx.y][threadIdx.x] = 0.0f; + + if (col < K && t * TILE_WIDTH + threadIdx.y < N) + shared_B[threadIdx.y][threadIdx.x] = B[(t * TILE_WIDTH + threadIdx.y) * K + col]; + else + shared_B[threadIdx.y][threadIdx.x] = 0.0f; + + __syncthreads(); + + for (int i = 0; i < TILE_WIDTH; ++i) + sum += shared_A[threadIdx.y][i] * shared_B[i][threadIdx.x]; + + __syncthreads(); + } + + if(row < M && col < K){ + C[row * K + col] = sum; + } +} + +int main() { + std::vector sizes = {512, 1024, 2048,4096}; + std::vector times; + + for(int idx = 0; idx < sizes.size(); ++idx) { + int M = sizes[idx]; + int N = sizes[idx]; + int K = sizes[idx]; + + float *A = new float[M * N]; + float *B = new float[N * K]; + float *C = new float[M * K]; + + for (int i = 0; i < M * N; ++i) A[i] = rand() % 10; + for (int i = 0; i < N * K; ++i) B[i] = rand() % 10; + + float *d_A, *d_B, *d_C; + cudaMalloc(&d_A, M * N * sizeof(float)); + cudaMalloc(&d_B, N * K * sizeof(float)); + cudaMalloc(&d_C, M * K * sizeof(float)); + + cudaMemcpy(d_A, A, M * N * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_B, B, N * K * sizeof(float), cudaMemcpyHostToDevice); + + dim3 blockSize(TILE_WIDTH, TILE_WIDTH); + dim3 gridSize((K + TILE_WIDTH - 1) / TILE_WIDTH, (M + TILE_WIDTH - 1) / TILE_WIDTH); + + // 预热 + matMultCUDAKernel2<<>>(d_A, d_B, d_C, M, N, K); + cudaDeviceSynchronize(); + + auto start = std::chrono::high_resolution_clock::now(); + matMultCUDAKernel2<<>>(d_A, d_B, d_C, M, N, K); + cudaDeviceSynchronize(); + auto end = std::chrono::high_resolution_clock::now(); + + cudaMemcpy(C, d_C, M * K * sizeof(float), cudaMemcpyDeviceToHost); + + std::chrono::duration duration = end - start; + times.push_back(duration.count()); + + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + + delete[] A; + delete[] B; + delete[] C; + } + + std::cout << "CUDA Kernel2 (共享内存优化) 矩阵乘法性能测试结果" << std::endl; + std::cout << "=================================" << std::endl; + std::cout << std::setw(12) << "Matrix Size" + << std::setw(15) << "Time(s)" + << std::setw(15) << "Time(ms)" + << std::setw(15) << "GFLOPS" << std::endl; + std::cout << "---------------------------------" << std::endl; + + for(int i = 0; i < sizes.size(); ++i) { + int size = sizes[i]; + double total_flops = 2.0 * size * size * size; // 矩阵乘法的浮点运算数 + double gflops = total_flops / (times[i] * 1e9); // 转换为 GFLOPS + double time_ms = times[i] * 1000.0; // 转换为毫秒 + + std::cout << std::setw(8) << size << "x" << std::setw(3) << size + << std::setw(15) << std::fixed << std::setprecision(6) << times[i] + << std::setw(15) << std::fixed << std::setprecision(3) << time_ms + << std::setw(15) << std::fixed << std::setprecision(2) << gflops << std::endl; + } + std::cout << "=================================" << std::endl; + + return 0; +} diff --git a/lab4/experiment_data/blocksize_analysis.txt 
b/lab4/experiment_data/blocksize_analysis.txt new file mode 100644 index 0000000..81a34e1 --- /dev/null +++ b/lab4/experiment_data/blocksize_analysis.txt @@ -0,0 +1,24 @@ +BLOCK_SIZE对CUDA矩阵乘法性能影响测试 +======================================== + Matrix Block Time(ms) FLOPS(G) +---------------------------------------- + 256x256 4x4 0.115 292.57 + 256x256 8x8 0.040 836.85 + 256x256 16x16 0.029 1151.02 + 256x256 32x32 0.026 1315.65 +---------------------------------------- + 512x512 4x4 0.831 323.00 + 512x512 8x8 0.264 1018.65 + 512x512 16x16 0.190 1416.04 + 512x512 32x32 0.174 1542.02 +---------------------------------------- + 1024x1024 4x4 6.541 328.33 + 1024x1024 8x8 2.021 1062.62 + 1024x1024 16x16 1.393 1541.24 + 1024x1024 32x32 1.353 1586.69 +---------------------------------------- + 2048x2048 4x4 54.011 318.08 + 2048x2048 8x8 16.104 1066.82 + 2048x2048 16x16 11.355 1512.97 + 2048x2048 32x32 10.978 1565.00 +---------------------------------------- diff --git a/lab4/experiment_data/gpu_info.txt b/lab4/experiment_data/gpu_info.txt new file mode 100644 index 0000000..edaab81 --- /dev/null +++ b/lab4/experiment_data/gpu_info.txt @@ -0,0 +1,20 @@ +Wed Jan 21 16:23:03 2026 ++---------------------------------------------------------------------------------------+ +| NVIDIA-SMI 535.247.01 Driver Version: 535.247.01 CUDA Version: 12.2 | +|-----------------------------------------+----------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|=========================================+======================+======================| +| 0 NVIDIA GeForce RTX 2080 Ti On | 00000000:03:00.0 On | N/A | +| 34% 27C P8 20W / 250W | 1MiB / 22528MiB | 0% Default | +| | | N/A | ++-----------------------------------------+----------------------+----------------------+ + ++---------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=======================================================================================| +| No running processes found | ++---------------------------------------------------------------------------------------+ diff --git a/lab4/experiment_data/matrixmul_comparison.txt b/lab4/experiment_data/matrixmul_comparison.txt new file mode 100644 index 0000000..7e31fd6 --- /dev/null +++ b/lab4/experiment_data/matrixmul_comparison.txt @@ -0,0 +1,112 @@ +=== CPU (OpenMP) 不同线程数 === +CPU矩阵乘法性能测试 (OpenMP多线程) +================================================================= + Matrix Threads Time(ms) FLOPS(G) Speedup +----------------------------------------------------------------- + 256x256 8 90.372 0.37 1.07 + 256x256 64 83.707 0.40 1.16 + 256x256 256 84.262 0.40 1.15 +----------------------------------------------------------------- + 512x512 8 815.295 0.33 1.01 + 512x512 64 813.476 0.33 1.01 + 512x512 256 812.463 0.33 1.01 +----------------------------------------------------------------- + 1024x1024 8 6571.000 0.33 1.00 + 1024x1024 64 6586.094 0.33 1.00 + 1024x1024 256 6569.582 0.33 1.00 +----------------------------------------------------------------- + 2048x2048 8 55244.488 0.31 1.00 + 2048x2048 64 55211.832 0.31 1.00 + 2048x2048 256 55239.930 0.31 1.00 +----------------------------------------------------------------- + + +ASCII图表:CPU性能分析 +================================================================= +1. 
不同线程数下的加速比趋势 + Matrix Threads=8 Threads=64 Threads=256 + +2. 不同矩阵规模下的性能趋势 + Threads 256x256 512x512 1024x1024 2048x2048 + +注意:完整图表建议使用Python (matplotlib) 生成。 +推荐生成以下图表: +- 折线图:不同线程数下的加速比 vs 矩阵规模 +- 柱状图:不同配置下的GFLOPS对比 +- 热力图:线程数 × 矩阵规模 的性能分布 +=== CUDA Kernel1 (基础版本) === +CUDA Kernel1 矩阵乘法性能测试结果 +================================= + Matrix Size Time(s) Time(ms) GFLOPS +--------------------------------- + 512x512 0.000312 0.312 860.70 + 1024x1024 0.002373 2.373 905.03 + 2048x2048 0.019180 19.180 895.72 + 4096x4096 0.129868 129.868 1058.30 +================================= +=== CUDA Kernel2 (共享内存优化) === +CUDA Kernel2 (共享内存优化) 矩阵乘法性能测试结果 +================================= + Matrix Size Time(s) Time(ms) GFLOPS +--------------------------------- + 512x512 0.000826 0.826 324.87 + 1024x1024 0.006479 6.479 331.43 + 2048x2048 0.053598 53.598 320.53 + 4096x4096 0.432496 432.496 317.78 +================================= +=== CPU (OpenMP) 不同线程数 === +CPU矩阵乘法性能测试 (OpenMP多线程) +================================================================= + Matrix Threads Time(ms) FLOPS(G) Speedup +----------------------------------------------------------------- + 256x256 8 90.532 0.37 1.08 + 256x256 64 83.896 0.40 1.17 + 256x256 256 83.807 0.40 1.17 +----------------------------------------------------------------- + 512x512 8 814.564 0.33 1.00 + 512x512 64 817.633 0.33 1.00 + 512x512 256 812.408 0.33 1.01 +----------------------------------------------------------------- + 1024x1024 8 6639.308 0.32 1.00 + 1024x1024 64 6627.468 0.32 1.00 + 1024x1024 256 6656.504 0.32 1.00 +----------------------------------------------------------------- + 2048x2048 8 55719.875 0.31 1.00 + 2048x2048 64 55636.734 0.31 1.00 + 2048x2048 256 55657.629 0.31 1.00 +----------------------------------------------------------------- + + +ASCII图表:CPU性能分析 +================================================================= +1. 不同线程数下的加速比趋势 + Matrix Threads=8 Threads=64 Threads=256 + +2. 不同矩阵规模下的性能趋势 + Threads 256x256 512x512 1024x1024 2048x2048 + +注意:完整图表建议使用Python (matplotlib) 生成。 +推荐生成以下图表: +- 折线图:不同线程数下的加速比 vs 矩阵规模 +- 柱状图:不同配置下的GFLOPS对比 +- 热力图:线程数 × 矩阵规模 的性能分布 +=== CUDA Kernel1 (基础版本) === +CUDA Kernel1 矩阵乘法性能测试结果 +================================= + Matrix Size Time(s) Time(ms) GFLOPS +--------------------------------- + 512x512 0.000316 0.316 848.68 + 1024x1024 0.002367 2.367 907.12 + 2048x2048 0.019190 19.190 895.24 + 4096x4096 0.138181 138.181 994.63 +================================= +=== CUDA Kernel2 (共享内存优化) === +CUDA Kernel2 (共享内存优化) 矩阵乘法性能测试结果 +================================= + Matrix Size Time(s) Time(ms) GFLOPS +--------------------------------- + 512x512 0.000828 0.828 324.24 + 1024x1024 0.006483 6.483 331.27 + 2048x2048 0.053603 53.603 320.50 + 4096x4096 0.432285 432.285 317.94 +================================= diff --git a/lab4/experiment_data/vectoradd_results.txt b/lab4/experiment_data/vectoradd_results.txt new file mode 100644 index 0000000..0c0aa1e --- /dev/null +++ b/lab4/experiment_data/vectoradd_results.txt @@ -0,0 +1,9 @@ +Vector Addition Performance Test (Threads per block: 256) +======================================================== +N=128, Time=9.472 ms +N=256, Time=4.992 ms +N=512, Time=4.928 ms +N=1024, Time=5.696 ms +N=2048, Time=4.928 ms +======================================================== +All tests completed. 
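A note on the units in vectoradd_results.txt above: vectoradd.cu (later in this diff) takes the cudaEventElapsedTime() result, which is already in milliseconds, multiplies it by 1000 to get microseconds, and then keeps "ms" in the printf format, so the times listed above are actually microseconds. A minimal fix sketch using the same variable names as that file:

    float elapsedTime_us = elapsedTime_ms * 1000.0f;   // microseconds
    printf("N=%d, Time=%.3f us\n", N, elapsedTime_us);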
diff --git a/lab4/lab4.sh b/lab4/lab4.sh
new file mode 100755
index 0000000..a1aaae1
--- /dev/null
+++ b/lab4/lab4.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+# Lab4 CUDA 程序实验数据收集脚本
+
+SCRIPT_DIR="$(dirname "$0")"
+OUTPUT_DIR="$SCRIPT_DIR/experiment_data"
+mkdir -p "$OUTPUT_DIR"
+ARCH=$(uname -m)
+if [ "$ARCH" == "aarch64" ]; then
+    BUILD_ARCH="arm64-v8a"
+else
+    BUILD_ARCH="x86_64"
+fi
+echo "=========================================="
+echo "Lab4 CUDA 实验数据收集"
+echo "=========================================="
+echo "数据输出目录: $OUTPUT_DIR"
+echo ""
+
+# 检查 CUDA 设备
+echo "检查 CUDA 设备..."
+nvidia-smi | tee "$OUTPUT_DIR/gpu_info.txt"
+echo ""
+
+# 进入构建目录
+# cd "$SCRIPT_DIR/build/linux/$BUILD_ARCH/release" || exit 1
+
+echo "=========================================="
+echo "实验 4.2: 向量加法 - 不同数据规模测试"
+echo "=========================================="
+$SCRIPT_DIR/build/linux/$BUILD_ARCH/release/vectoradd | tee "$OUTPUT_DIR/vectoradd_results.txt"
+echo ""
+
+echo "=========================================="
+echo "实验 4.3.1: CPU vs GPU 矩阵乘法性能对比"
+echo "=========================================="
+echo "=== CPU (OpenMP) 不同线程数 ===" | tee -a "$OUTPUT_DIR/matrixmul_comparison.txt"
+$SCRIPT_DIR/build/linux/$BUILD_ARCH/release/MatrixMul_cpu | tee -a "$OUTPUT_DIR/matrixmul_comparison.txt"
+echo ""
+
+echo "=== CUDA Kernel1 (基础版本) ===" | tee -a "$OUTPUT_DIR/matrixmul_comparison.txt"
+$SCRIPT_DIR/build/linux/$BUILD_ARCH/release/MatrixMul_kernel1 | tee -a "$OUTPUT_DIR/matrixmul_comparison.txt"
+echo ""
+
+echo "=== CUDA Kernel2 (共享内存优化) ===" | tee -a "$OUTPUT_DIR/matrixmul_comparison.txt"
+$SCRIPT_DIR/build/linux/$BUILD_ARCH/release/MatrixMul_kernel2 | tee -a "$OUTPUT_DIR/matrixmul_comparison.txt"
+echo ""
+
+echo "=========================================="
+echo "实验 4.3.2: 不同 BLOCK_SIZE 对性能的影响"
+echo "=========================================="
+$SCRIPT_DIR/build/linux/$BUILD_ARCH/release/matrixmultiply_block_size_change | tee "$OUTPUT_DIR/blocksize_analysis.txt"
+echo ""
+
+echo "=========================================="
+echo "实验数据收集完成!"
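    # (One quirk: the matrixmul_comparison.txt steps above use `tee -a`, so the file
    #  keeps growing across repeated runs of this script, which matches the copy of
    #  matrixmul_comparison.txt earlier in this diff containing the whole result set
    #  twice. Truncating it once at the top of the script, e.g.
    #      : > "$OUTPUT_DIR/matrixmul_comparison.txt"
    #  would keep one run per file.)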
+echo "数据保存在: $OUTPUT_DIR" +echo "==========================================" diff --git a/lab4/matrixmultiply_block_size_change.cu b/lab4/matrixmultiply_block_size_change.cu new file mode 100644 index 0000000..c0babc3 --- /dev/null +++ b/lab4/matrixmultiply_block_size_change.cu @@ -0,0 +1,139 @@ +#include +#include +#include +#include +#include + +// 测试不同的BLOCK_SIZE +std::vector block_sizes = {4, 8, 16, 32}; +// 测试不同的矩阵规模 +std::vector matrix_sizes = {256, 512, 1024, 2048}; + +// 共享内存矩阵乘法核函数模板 +template +__global__ void matMultKernel(const float* A, const float* B, float* C, int M, int N, int K) { + __shared__ float shared_A[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ float shared_B[BLOCK_SIZE][BLOCK_SIZE]; + + int row = blockIdx.y * BLOCK_SIZE + threadIdx.y; + int col = blockIdx.x * BLOCK_SIZE + threadIdx.x; + + float sum = 0.0f; + + for (int t = 0; t < (N + BLOCK_SIZE - 1) / BLOCK_SIZE; ++t) { + // 加载到共享内存 + if (row < M && t * BLOCK_SIZE + threadIdx.x < N) + shared_A[threadIdx.y][threadIdx.x] = A[row * N + t * BLOCK_SIZE + threadIdx.x]; + else + shared_A[threadIdx.y][threadIdx.x] = 0.0f; + + if (col < K && t * BLOCK_SIZE + threadIdx.y < N) + shared_B[threadIdx.y][threadIdx.x] = B[(t * BLOCK_SIZE + threadIdx.y) * K + col]; + else + shared_B[threadIdx.y][threadIdx.x] = 0.0f; + + __syncthreads(); + + // 计算当前tile + for (int i = 0; i < BLOCK_SIZE; ++i) + sum += shared_A[threadIdx.y][i] * shared_B[i][threadIdx.x]; + + __syncthreads(); + } + + if (row < M && col < K) { + C[row * K + col] = sum; + } +} + +void runTest() { + std::cout << "BLOCK_SIZE对CUDA矩阵乘法性能影响测试\n"; + std::cout << "========================================\n"; + std::cout << std::setw(10) << "Matrix" + << std::setw(12) << "Block" + << std::setw(15) << "Time(ms)" + << std::setw(15) << "FLOPS(G)" << std::endl; + std::cout << "----------------------------------------\n"; + + // 测试每个矩阵规模 + for (int mat_size : matrix_sizes) { + int M = mat_size, N = mat_size, K = mat_size; + + // 分配主机内存 + float *A = new float[M * N]; + float *B = new float[N * K]; + float *C = new float[M * K]; + + // 初始化数据 + for (int i = 0; i < M * N; ++i) A[i] = (rand() % 100) / 100.0f; + for (int i = 0; i < N * K; ++i) B[i] = (rand() % 100) / 100.0f; + + // 分配设备内存 + float *d_A, *d_B, *d_C; + cudaMalloc(&d_A, M * N * sizeof(float)); + cudaMalloc(&d_B, N * K * sizeof(float)); + cudaMalloc(&d_C, M * K * sizeof(float)); + + cudaMemcpy(d_A, A, M * N * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_B, B, N * K * sizeof(float), cudaMemcpyHostToDevice); + + // 测试每个BLOCK_SIZE + for (int block_size : block_sizes) { + dim3 blockDim(block_size, block_size); + dim3 gridDim((K + block_size - 1) / block_size, (M + block_size - 1) / block_size); + + // 预热 + if (block_size == 4) matMultKernel<4><<>>(d_A, d_B, d_C, M, N, K); + else if (block_size == 8) matMultKernel<8><<>>(d_A, d_B, d_C, M, N, K); + else if (block_size == 16) matMultKernel<16><<>>(d_A, d_B, d_C, M, N, K); + else if (block_size == 32) matMultKernel<32><<>>(d_A, d_B, d_C, M, N, K); + cudaDeviceSynchronize(); + + // 创建CUDA事件计时 + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + // 执行并计时 + cudaEventRecord(start); + if (block_size == 4) matMultKernel<4><<>>(d_A, d_B, d_C, M, N, K); + else if (block_size == 8) matMultKernel<8><<>>(d_A, d_B, d_C, M, N, K); + else if (block_size == 16) matMultKernel<16><<>>(d_A, d_B, d_C, M, N, K); + else if (block_size == 32) matMultKernel<32><<>>(d_A, d_B, d_C, M, N, K); + cudaEventRecord(stop); + cudaEventSynchronize(stop); + + // 计算时间 + float 
milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + + // 计算FLOPS + double total_flops = 2.0 * M * N * K; // 乘加各一次 + double gflops = total_flops / (milliseconds * 1e6); + + // 输出结果 + std::cout << std::setw(10) << mat_size << "x" << mat_size + << std::setw(12) << block_size << "x" << block_size + << std::setw(15) << std::fixed << std::setprecision(3) << milliseconds + << std::setw(15) << std::fixed << std::setprecision(2) << gflops << std::endl; + + cudaEventDestroy(start); + cudaEventDestroy(stop); + } + + // 清理内存 + cudaFree(d_A); + cudaFree(d_B); + cudaFree(d_C); + delete[] A; + delete[] B; + delete[] C; + + std::cout << "----------------------------------------\n"; + } +} + +int main() { + runTest(); + return 0; +} \ No newline at end of file diff --git a/lab4/vectoradd.cu b/lab4/vectoradd.cu new file mode 100644 index 0000000..7a21f62 --- /dev/null +++ b/lab4/vectoradd.cu @@ -0,0 +1,123 @@ +#include +#include +#include + +#define CHECK(call) \ +{ \ + const cudaError_t error = call; \ + if (error != cudaSuccess) \ + { \ + printf("Error: %s:%d, ", __FILE__, __LINE__); \ + printf("code:%d, reason: %s\n", error, cudaGetErrorString(error)); \ + exit(1); \ + } \ +} + +__global__ void add(const int *dev_a, const int *dev_b, int *dev_c, int N) +{ + int i = threadIdx.x + blockIdx.x * blockDim.x; + if (i < N) { + dev_c[i] = dev_a[i] + dev_b[i]; + } +} + +void vectorAddTest(int N, int threadsPerBlock) +{ + // 计算块数 + int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; + + // 分配主机内存 + int *host_a = (int*)malloc(N * sizeof(int)); + int *host_b = (int*)malloc(N * sizeof(int)); + int *host_c = (int*)malloc(N * sizeof(int)); + + // 初始化数据 + for (int i = 0; i < N; i++) { + host_a[i] = i; + host_b[i] = i << 1; // 相当于乘以2 + } + + // 分配设备内存 + int *dev_a = NULL; + int *dev_b = NULL; + int *dev_c = NULL; + CHECK(cudaMalloc((void**)&dev_a, N * sizeof(int))); + CHECK(cudaMalloc((void**)&dev_b, N * sizeof(int))); + CHECK(cudaMalloc((void**)&dev_c, N * sizeof(int))); + + // 拷贝数据到设备 + CHECK(cudaMemcpy(dev_a, host_a, N * sizeof(int), cudaMemcpyHostToDevice)); + CHECK(cudaMemcpy(dev_b, host_b, N * sizeof(int), cudaMemcpyHostToDevice)); + + // 创建CUDA事件用于计时 + cudaEvent_t start, stop; + CHECK(cudaEventCreate(&start)); + CHECK(cudaEventCreate(&stop)); + + // 预热一次,避免首次启动的额外开销 + add<<>>(dev_a, dev_b, dev_c, N); + cudaDeviceSynchronize(); + + // 记录开始时间 + CHECK(cudaEventRecord(start)); + + // 执行核函数 + add<<>>(dev_a, dev_b, dev_c, N); + + // 记录结束时间并等待完成 + CHECK(cudaEventRecord(stop)); + CHECK(cudaEventSynchronize(stop)); + + // 计算耗时(毫秒) + float elapsedTime_ms = 0; + CHECK(cudaEventElapsedTime(&elapsedTime_ms, start, stop)); + float elapsedTime = elapsedTime_ms * 1000.0f; // 转换为微秒 + + // 输出结果 + printf("N=%d, Time=%.3f ms\n", N, elapsedTime); + + // 验证结果(可选) + CHECK(cudaMemcpy(host_c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost)); + bool success = true; + for (int i = 0; i < N; i++) { + if (host_c[i] != host_a[i] + host_b[i]) { + success = false; + break; + } + } + if (!success) { + printf("Error: Computation failed for N=%d\n", N); + } + + // 清理资源 + CHECK(cudaEventDestroy(start)); + CHECK(cudaEventDestroy(stop)); + CHECK(cudaFree(dev_a)); + CHECK(cudaFree(dev_b)); + CHECK(cudaFree(dev_c)); + free(host_a); + free(host_b); + free(host_c); +} + +int main(void) +{ + // 设置线程数(保持不变) + const int threadsPerBlock = 256; + + // 测试不同向量长度 + int testSizes[] = {128, 256, 512, 1024, 2048}; // 注意:2056改为2048(2的幂次) + int numTests = sizeof(testSizes) / sizeof(testSizes[0]); + + printf("Vector Addition Performance 
Test (Threads per block: %d)\n", threadsPerBlock); + printf("========================================================\n"); + + for (int i = 0; i < numTests; i++) { + vectorAddTest(testSizes[i], threadsPerBlock); + } + + printf("========================================================\n"); + printf("All tests completed.\n"); + + return 0; +} \ No newline at end of file diff --git a/lab4/xmake.lua b/lab4/xmake.lua new file mode 100644 index 0000000..ecb0ca0 --- /dev/null +++ b/lab4/xmake.lua @@ -0,0 +1,56 @@ +set_project("lab4_cuda_programs") +set_version("1.0") + +-- 设置 CUDA 工具链 +toolchain("cuda") + set_kind("standalone") + set_sdkdir(os.getenv("CUDA_HOME") or "/usr/local/cuda") + set_description("CUDA Toolkit") +toolchain_end() + +-- vectoradd 程序 +target("vectoradd") + set_kind("binary") + set_languages("c++14") + set_toolchains("cuda") + add_rules("cuda") + add_files("vectoradd.cu") +target_end() + +-- MatrixMul_cpu 程序 (使用 OpenMP) +target("MatrixMul_cpu") + set_kind("binary") + set_languages("c++14") + set_toolchains("cuda") + add_rules("cuda") + add_files("MatrixMul_cpu.cu") + add_ldflags("-lgomp", {force = true}) + add_cxxflags("-fopenmp", {force = true}) +target_end() + +-- MatrixMul_kernel1 程序 +target("MatrixMul_kernel1") + set_kind("binary") + set_languages("c++14") + set_toolchains("cuda") + add_rules("cuda") + add_files("MatrixMul_kernel1.cu") +target_end() + +-- MatrixMul_kernel2 程序 +target("MatrixMul_kernel2") + set_kind("binary") + set_languages("c++14") + set_toolchains("cuda") + add_rules("cuda") + add_files("MatrixMul_kernel2.cu") +target_end() + +-- matrixmultiply_block_size_change 程序 +target("matrixmultiply_block_size_change") + set_kind("binary") + set_languages("c++14") + set_toolchains("cuda") + add_rules("cuda") + add_files("matrixmultiply_block_size_change.cu") +target_end() diff --git a/work/gemm_optimized.cpp b/work/gemm_optimized.cpp new file mode 100644 index 0000000..54229e9 --- /dev/null +++ b/work/gemm_optimized.cpp @@ -0,0 +1,302 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +void randMat(int rows, int cols, float *&Mat) { + Mat = new float[rows * cols]; + for (int i = 0; i < rows; i++) + for (int j = 0; j < cols; j++) + Mat[i * cols + j] = 1.0; +} + +// 优化版本:使用循环展开和更好的缓存局部性 +void openmp_sgemm_optimized(int m, int n, int k, float *leftMat, float *rightMat, + float *resultMat) { + // 使用更大的分块以提高缓存利用率 + const int BLOCK_SIZE = 64; + + #pragma omp parallel for collapse(2) + for (int row = 0; row < m; row++) { + for (int col = 0; col < k; col++) { + resultMat[row * k + col] = 0.0; + } + } + + // 分块计算以提高缓存命中率 + #pragma omp parallel for collapse(2) + for (int row_block = 0; row_block < m; row_block += BLOCK_SIZE) { + for (int col_block = 0; col_block < k; col_block += BLOCK_SIZE) { + for (int i_block = 0; i_block < n; i_block += BLOCK_SIZE) { + + int row_end = min(row_block + BLOCK_SIZE, m); + int col_end = min(col_block + BLOCK_SIZE, k); + int i_end = min(i_block + BLOCK_SIZE, n); + + for (int row = row_block; row < row_end; row++) { + for (int col = col_block; col < col_end; col++) { + float sum = resultMat[row * k + col]; + for (int i = i_block; i < i_end; i++) { + sum += leftMat[row * n + i] * rightMat[col * n + i]; + } + resultMat[row * k + col] = sum; + } + } + } + } + } +} + +void mpi_sgemm_optimized(int m, int n, int k, float *&leftMat, float *&rightMat, + float *&resultMat, int rank, int worldsize) { + + // 计算行列分块数 + int rowBlock = (int)sqrt((double)worldsize); + while (rowBlock > 0 && worldsize % 
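    // (This loop picks the largest divisor of worldsize that is <= sqrt(worldsize),
    //  so the process grid is as square as possible: e.g. 16 -> 4x4, 9 -> 3x3,
    //  6 -> 2x3, and a prime count such as 7 falls back to 1x7.)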
rowBlock != 0) { + rowBlock--; + } + int colBlock = worldsize / rowBlock; + + int rowStride, colStride; + float *res = nullptr; + float *localLeftMat = leftMat; + float *localRightMat = rightMat; + + if (rank == 0) { + // 矩阵转置 - 使用OpenMP加速 + float *buf = new float[k * n]; + #pragma omp parallel for collapse(2) + for (int r = 0; r < n; r++) { + for (int c = 0; c < k; c++) { + buf[c * n + r] = rightMat[r * k + c]; + } + } + + #pragma omp parallel for collapse(2) + for (int r = 0; r < k; r++) { + for (int c = 0; c < n; c++) { + rightMat[r * n + c] = buf[r * n + c]; + } + } + delete[] buf; + + // 使用非阻塞通信重叠计算和通信 + std::vector sendRequests; + sendRequests.reserve(1000); + + for (int rowB = 0; rowB < rowBlock; rowB++) { + for (int colB = 0; colB < colBlock; colB++) { + int rowStart = rowB * (m / rowBlock); + int rowEnd = (rowB == rowBlock - 1) ? m : (rowB + 1) * (m / rowBlock); + rowStride = rowEnd - rowStart; + + int colStart = colB * (k / colBlock); + int colEnd = (colB == colBlock - 1) ? k : (colB + 1) * (k / colBlock); + colStride = colEnd - colStart; + + int sendto = rowB * colBlock + colB; + if (sendto == 0) { + res = new float[rowStride * colStride]; + localLeftMat = leftMat + rowStart * n; + localRightMat = rightMat + colStart * n; + continue; + } + + // 发送分块大小 + MPI_Request req; + MPI_Isend(&rowStride, 1, MPI_INT, sendto, 0, MPI_COMM_WORLD, &req); + sendRequests.push_back(req); + MPI_Isend(&colStride, 1, MPI_INT, sendto, 0, MPI_COMM_WORLD, &req); + sendRequests.push_back(req); + + // 发送矩阵数据 + for (int r = 0; r < rowStride; r++) { + MPI_Isend(leftMat + (rowStart + r) * n, n, MPI_FLOAT, sendto, + 1, MPI_COMM_WORLD, &req); + sendRequests.push_back(req); + } + + for (int c = 0; c < colStride; c++) { + MPI_Isend(rightMat + (colStart + c) * n, n, MPI_FLOAT, sendto, + 2, MPI_COMM_WORLD, &req); + sendRequests.push_back(req); + } + } + } + + // 等待所有发送完成 + for (size_t i = 0; i < sendRequests.size(); i++) { + MPI_Wait(&sendRequests[i], MPI_STATUS_IGNORE); + } + } else { + if (rank < worldsize) { + int rowB = rank / colBlock; + int colB = rank % colBlock; + + int rowStart = rowB * (m / rowBlock); + int rowEnd = (rowB == rowBlock - 1) ? m : (rowB + 1) * (m / rowBlock); + rowStride = rowEnd - rowStart; + + int colStart = colB * (k / colBlock); + int colEnd = (colB == colBlock - 1) ? k : (colB + 1) * (k / colBlock); + colStride = colEnd - colStart; + + MPI_Recv(&rowStride, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(&colStride, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + + localLeftMat = new float[rowStride * n]; + localRightMat = new float[colStride * n]; + + for (int r = 0; r < rowStride; r++) { + MPI_Recv(localLeftMat + r * n, n, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + } + + for (int c = 0; c < colStride; c++) { + MPI_Recv(localRightMat + c * n, n, MPI_FLOAT, 0, 2, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + } + + res = new float[rowStride * colStride]; + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + // 本地计算 - 使用优化版本 + if (rank < worldsize) { + int rowB = rank / colBlock; + int colB = rank % colBlock; + + int rowStart = rowB * (m / rowBlock); + int rowEnd = (rowB == rowBlock - 1) ? m : (rowB + 1) * (m / rowBlock); + rowStride = rowEnd - rowStart; + + int colStart = colB * (k / colBlock); + int colEnd = (colB == colBlock - 1) ? 
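                                        // (Block bounds: every block spans m/rowBlock
                                        //  rows (integer division) except the last,
                                        //  which absorbs the remainder, e.g. m=1000
                                        //  split over 3 row-blocks gives 333/333/334.)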
k : (colB + 1) * (k / colBlock); + colStride = colEnd - colStart; + + openmp_sgemm_optimized(rowStride, n, colStride, localLeftMat, localRightMat, res); + } + + MPI_Barrier(MPI_COMM_WORLD); + + // 收集结果 + if (rank == 0) { + int rowB = 0; + int colB = 0; + int rowStart = rowB * (m / rowBlock); + int colStart = colB * (k / colBlock); + + for (int r = 0; r < rowStride; r++) { + for (int c = 0; c < colStride; c++) { + resultMat[(rowStart + r) * k + (colStart + c)] = res[r * colStride + c]; + } + } + delete[] res; + + for (int rowB = 0; rowB < rowBlock; rowB++) { + for (int colB = 0; colB < colBlock; colB++) { + int recvfrom = rowB * colBlock + colB; + if (recvfrom == 0) continue; + + MPI_Recv(&rowStride, 1, MPI_INT, recvfrom, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(&colStride, 1, MPI_INT, recvfrom, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + + float *tmpRes = new float[rowStride * colStride]; + MPI_Recv(tmpRes, rowStride * colStride, MPI_FLOAT, recvfrom, 4, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + + int rowStart = rowB * (m / rowBlock); + int colStart = colB * (k / colBlock); + + for (int r = 0; r < rowStride; r++) { + for (int c = 0; c < colStride; c++) { + resultMat[(rowStart + r) * k + (colStart + c)] = tmpRes[r * colStride + c]; + } + } + delete[] tmpRes; + } + } + } else { + if (rank < worldsize) { + MPI_Send(&rowStride, 1, MPI_INT, 0, 3, MPI_COMM_WORLD); + MPI_Send(&colStride, 1, MPI_INT, 0, 3, MPI_COMM_WORLD); + MPI_Send(res, rowStride * colStride, MPI_FLOAT, 0, 4, MPI_COMM_WORLD); + + delete[] res; + delete[] localLeftMat; + delete[] localRightMat; + } + } + + MPI_Barrier(MPI_COMM_WORLD); +} + +int main(int argc, char *argv[]) { + if (argc != 4) { + cout << "Usage: " << argv[0] << " M N K\n"; + exit(-1); + } + + int rank; + int worldSize; + MPI_Init(&argc, &argv); + + MPI_Comm_size(MPI_COMM_WORLD, &worldSize); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + int m = atoi(argv[1]); + int n = atoi(argv[2]); + int k = atoi(argv[3]); + + float *leftMat, *rightMat, *resMat; + struct timeval start, stop; + + if (rank == 0) { + randMat(m, n, leftMat); + randMat(n, k, rightMat); + randMat(m, k, resMat); + } + + gettimeofday(&start, NULL); + mpi_sgemm_optimized(m, n, k, leftMat, rightMat, resMat, rank, worldSize); + gettimeofday(&stop, NULL); + + if (rank == 0) { + double elapsed = (stop.tv_sec - start.tv_sec) * 1000.0 + + (stop.tv_usec - start.tv_usec) / 1000.0; + cout << "optimized mpi matmul: " << elapsed << " ms" << endl; + + bool correct = true; + for (int i = 0; i < m; i++) { + for (int j = 0; j < k; j++){ + if (int(resMat[i * k + j]) != n) { + cout << "Error at [" << i << "][" << j << "]: " + << resMat[i * k + j] << " (expected " << n << ")\n"; + correct = false; + goto end_check; + } + } + } + end_check: + if (correct) { + cout << "Result verification: PASSED" << endl; + } else { + cout << "Result verification: FAILED" << endl; + } + + delete[] leftMat; + delete[] rightMat; + delete[] resMat; + } + + MPI_Finalize(); + return 0; +} diff --git a/work/gemm_parallel.cpp b/work/gemm_parallel.cpp new file mode 100644 index 0000000..2235036 --- /dev/null +++ b/work/gemm_parallel.cpp @@ -0,0 +1,312 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +void randMat(int rows, int cols, float *&Mat) { + Mat = new float[rows * cols]; + for (int i = 0; i < rows; i++) + for (int j = 0; j < cols; j++) + Mat[i * cols + j] = 1.0; +} + +void openmp_sgemm(int m, int n, int k, float *leftMat, float *rightMat, + float *resultMat) { + // rightMat is 
transposed + // 使用OpenMP并行化外层循环 + #pragma omp parallel for collapse(2) + for (int row = 0; row < m; row++) { + for (int col = 0; col < k; col++) { + resultMat[row * k + col] = 0.0; + for (int i = 0; i < n; i++) { + resultMat[row * k + col] += + leftMat[row * n + i] * rightMat[col * n + i]; + } + } + } +} + +void mpi_sgemm(int m, int n, int k, float *&leftMat, float *&rightMat, + float *&resultMat, int rank, int worldsize) { + + // 计算行列分块数(尽量接近平方数) + int rowBlock = (int)sqrt((double)worldsize); + while (rowBlock > 0 && worldsize % rowBlock != 0) { + rowBlock--; + } + int colBlock = worldsize / rowBlock; + + int rowStride, colStride; + + float *res = nullptr; + float *localLeftMat = leftMat; + float *localRightMat = rightMat; + + if (rank == 0) { + // 矩阵转置 + float *buf = new float[k * n]; + #pragma omp parallel for collapse(2) + for (int r = 0; r < n; r++) { + for (int c = 0; c < k; c++) { + buf[c * n + r] = rightMat[r * k + c]; + } + } + + #pragma omp parallel for collapse(2) + for (int r = 0; r < k; r++) { + for (int c = 0; c < n; c++) { + rightMat[r * n + c] = buf[r * n + c]; + } + } + delete[] buf; + + // Master-Slave模式,将子矩阵发送到各子进程 + // 使用vector来动态分配足够的请求空间 + std::vector sendRequests; + sendRequests.reserve(1000); // 预分配足够空间 + + for (int rowB = 0; rowB < rowBlock; rowB++) { + for (int colB = 0; colB < colBlock; colB++) { + // 计算分块大小(带状分块) + int rowStart = rowB * (m / rowBlock); + int rowEnd = (rowB == rowBlock - 1) ? m : (rowB + 1) * (m / rowBlock); + rowStride = rowEnd - rowStart; + + int colStart = colB * (k / colBlock); + int colEnd = (colB == colBlock - 1) ? k : (colB + 1) * (k / colBlock); + colStride = colEnd - colStart; + + int sendto = rowB * colBlock + colB; + if (sendto == 0) { + // Rank 0 保留自己的分块 + res = new float[rowStride * colStride]; + localLeftMat = leftMat + rowStart * n; + localRightMat = rightMat + colStart * n; + continue; + } + + // 发送左矩阵分块 + MPI_Request req; + MPI_Isend(&rowStride, 1, MPI_INT, sendto, 0, MPI_COMM_WORLD, &req); + sendRequests.push_back(req); + MPI_Isend(&colStride, 1, MPI_INT, sendto, 0, MPI_COMM_WORLD, &req); + sendRequests.push_back(req); + + // 发送左矩阵数据 + for (int r = 0; r < rowStride; r++) { + MPI_Isend(leftMat + (rowStart + r) * n, n, MPI_FLOAT, sendto, + 1, MPI_COMM_WORLD, &req); + sendRequests.push_back(req); + } + + // 发送右矩阵数据 + for (int c = 0; c < colStride; c++) { + MPI_Isend(rightMat + (colStart + c) * n, n, MPI_FLOAT, sendto, + 2, MPI_COMM_WORLD, &req); + sendRequests.push_back(req); + } + } + } + + // 等待所有发送完成 + for (size_t i = 0; i < sendRequests.size(); i++) { + MPI_Wait(&sendRequests[i], MPI_STATUS_IGNORE); + } + } else { + // 接收从主进程发送来的数据 + if (rank < worldsize) { + // 计算当前rank的分块位置 + int rowB = rank / colBlock; + int colB = rank % colBlock; + + int rowStart = rowB * (m / rowBlock); + int rowEnd = (rowB == rowBlock - 1) ? m : (rowB + 1) * (m / rowBlock); + rowStride = rowEnd - rowStart; + + int colStart = colB * (k / colBlock); + int colEnd = (colB == colBlock - 1) ? 
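                                        // (Note: the rowStride/colStride computed from
                                        //  these formulas are immediately overwritten by
                                        //  the MPI_Recv calls below; the local computation
                                        //  only documents what rank 0 is expected to send
                                        //  for this rank's block.)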
k : (colB + 1) * (k / colBlock); + colStride = colEnd - colStart; + + // 接收分块大小 + MPI_Recv(&rowStride, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(&colStride, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + + // 分配内存并接收数据 + localLeftMat = new float[rowStride * n]; + localRightMat = new float[colStride * n]; + + for (int r = 0; r < rowStride; r++) { + MPI_Recv(localLeftMat + r * n, n, MPI_FLOAT, 0, 1, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + } + + for (int c = 0; c < colStride; c++) { + MPI_Recv(localRightMat + c * n, n, MPI_FLOAT, 0, 2, MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + } + + res = new float[rowStride * colStride]; + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + // 本地子矩阵相乘 + if (rank < worldsize) { + // 重新计算分块大小 + int rowB = rank / colBlock; + int colB = rank % colBlock; + + int rowStart = rowB * (m / rowBlock); + int rowEnd = (rowB == rowBlock - 1) ? m : (rowB + 1) * (m / rowBlock); + rowStride = rowEnd - rowStart; + + int colStart = colB * (k / colBlock); + int colEnd = (colB == colBlock - 1) ? k : (colB + 1) * (k / colBlock); + colStride = colEnd - colStart; + + // 调用OpenMP加速本地子矩阵相乘运算 + openmp_sgemm(rowStride, n, colStride, localLeftMat, localRightMat, res); + } + + MPI_Barrier(MPI_COMM_WORLD); + + // 将计算结果传送回rank 0 + if (rank == 0) { + // Rank 0 直接复制自己的结果 + int rowB = 0; + int colB = 0; + int rowStart = rowB * (m / rowBlock); + int colStart = colB * (k / colBlock); + + for (int r = 0; r < rowStride; r++) { + for (int c = 0; c < colStride; c++) { + resultMat[(rowStart + r) * k + (colStart + c)] = res[r * colStride + c]; + } + } + delete[] res; + + // 接收其他进程的结果 + for (int rowB = 0; rowB < rowBlock; rowB++) { + for (int colB = 0; colB < colBlock; colB++) { + int recvfrom = rowB * colBlock + colB; + if (recvfrom == 0) continue; + + // 接收分块大小 + MPI_Recv(&rowStride, 1, MPI_INT, recvfrom, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(&colStride, 1, MPI_INT, recvfrom, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + + // 接收结果数据 + float *tmpRes = new float[rowStride * colStride]; + MPI_Recv(tmpRes, rowStride * colStride, MPI_FLOAT, recvfrom, 4, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + + // 组装到全局矩阵 + int rowStart = rowB * (m / rowBlock); + int colStart = colB * (k / colBlock); + + for (int r = 0; r < rowStride; r++) { + for (int c = 0; c < colStride; c++) { + resultMat[(rowStart + r) * k + (colStart + c)] = tmpRes[r * colStride + c]; + } + } + delete[] tmpRes; + } + } + } else { + if (rank < worldsize) { + // 发送分块大小 + MPI_Send(&rowStride, 1, MPI_INT, 0, 3, MPI_COMM_WORLD); + MPI_Send(&colStride, 1, MPI_INT, 0, 3, MPI_COMM_WORLD); + + // 发送结果数据 + MPI_Send(res, rowStride * colStride, MPI_FLOAT, 0, 4, MPI_COMM_WORLD); + + delete[] res; + delete[] localLeftMat; + delete[] localRightMat; + } + } + + MPI_Barrier(MPI_COMM_WORLD); +} + +int main(int argc, char *argv[]) { + if (argc != 4) { + if (argc == 0) { + cout << "Usage: program M N K" << endl; + } else { + cout << "Usage: " << argv[0] << " M N K\n"; + } + exit(-1); + } + + int rank; + int worldSize; + MPI_Init(&argc, &argv); + + MPI_Comm_size(MPI_COMM_WORLD, &worldSize); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + // 矩阵尺寸 + int m = atoi(argv[1]); + int n = atoi(argv[2]); + int k = atoi(argv[3]); + + float *leftMat, *rightMat, *resMat; + + struct timeval start, stop; + + // 矩阵初始化 + if (rank == 0) { + randMat(m, n, leftMat); + randMat(n, k, rightMat); + randMat(m, k, resMat); + } + + gettimeofday(&start, NULL); + + // 使用MPI-OpenMP加速矩阵相乘 + mpi_sgemm(m, n, k, leftMat, rightMat, resMat, rank, worldSize); + + 
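    // (Only rank 0 allocates leftMat/rightMat/resMat via randMat(); the other ranks
    //  pass uninitialised pointers into mpi_sgemm, which works because they only ever
    //  dereference the local buffers they allocate after MPI_Recv. The gettimeofday
    //  pair around the call therefore times distribution, local compute and result
    //  gathering together; every rank runs it, but only rank 0 prints.)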
+  gettimeofday(&stop, NULL);
+
+  // Report timing and verify the result on rank 0
+  if (rank == 0) {
+    double elapsed = (stop.tv_sec - start.tv_sec) * 1000.0 +
+                     (stop.tv_usec - start.tv_usec) / 1000.0;
+    cout << "mpi matmul: " << elapsed << " ms" << endl;
+
+    // Verify the result: every entry of an all-ones product must equal n
+    bool correct = true;
+    for (int i = 0; i < m; i++) {
+      for (int j = 0; j < k; j++) {
+        if (int(resMat[i * k + j]) != n) {
+          cout << "Error at [" << i << "][" << j << "]: "
+               << resMat[i * k + j] << " (expected " << n << ")\n";
+          correct = false;
+          goto end_check;
+        }
+      }
+    }
+  end_check:
+    if (correct) {
+      cout << "Result verification: PASSED" << endl;
+    } else {
+      cout << "Result verification: FAILED" << endl;
+    }
+
+    delete[] leftMat;
+    delete[] rightMat;
+    delete[] resMat;
+  }
+
+  MPI_Finalize();
+  return 0;
+}
diff --git a/work/gemm_serial.cpp b/work/gemm_serial.cpp
new file mode 100644
index 0000000..49ef3e3
--- /dev/null
+++ b/work/gemm_serial.cpp
@@ -0,0 +1,97 @@
+#include <iostream>
+#include <cstdlib>
+#include <cstring>
+#include <cmath>
+#include <sys/time.h>
+
+using namespace std;
+
+void randMat(int rows, int cols, float *&Mat) {
+  Mat = new float[rows * cols];
+  for (int i = 0; i < rows; i++)
+    for (int j = 0; j < cols; j++)
+      Mat[i * cols + j] = 1.0;
+}
+
+void serial_sgemm(int m, int n, int k, float *&leftMat, float *&rightMat,
+                  float *&resultMat) {
+  // rightMat is accessed in transposed (k x n) layout
+  float *buf = new float[k * n];
+  // transpose the right matrix in place
+  for (int r = 0; r < n; r++) {
+    for (int c = 0; c < k; c++) {
+      buf[c * n + r] = rightMat[r * k + c];
+    }
+  }
+  for (int r = 0; r < k; r++) {
+    for (int c = 0; c < n; c++) {
+      rightMat[r * n + c] = buf[r * n + c];
+    }
+  }
+
+  for (int row = 0; row < m; row++) {
+    for (int col = 0; col < k; col++) {
+      resultMat[row * k + col] = 0.0;
+      for (int i = 0; i < n; i++) {
+        resultMat[row * k + col] +=
+            leftMat[row * n + i] * rightMat[col * n + i];
+      }
+    }
+  }
+  delete[] buf;
+  return;
+}
+
+int main(int argc, char *argv[]) {
+  if (argc != 5) {
+    cout << "Usage: " << argv[0] << " M N K use-blas\n";
+    exit(-1);
+  }
+
+  int m = atoi(argv[1]);
+  int n = atoi(argv[2]);
+  int k = atoi(argv[3]);
+  int blas = atoi(argv[4]);
+
+  float *leftMat, *rightMat, *resMat;
+
+  struct timeval start, stop;
+  randMat(m, n, leftMat);
+  randMat(n, k, rightMat);
+  randMat(m, k, resMat);
+
+  gettimeofday(&start, NULL);
+
+  serial_sgemm(m, n, k, leftMat, rightMat, resMat);
+
+  gettimeofday(&stop, NULL);
+  cout << "matmul: "
+       << (stop.tv_sec - start.tv_sec) * 1000.0 +
+              (stop.tv_usec - start.tv_usec) / 1000.0
+       << " ms" << endl;
+
+  // Verify the result: every entry of an all-ones product must equal n
+  bool correct = true;
+  for (int i = 0; i < m; i++) {
+    for (int j = 0; j < k; j++) {
+      if (int(resMat[i * k + j]) != n) {
+        cout << "Error at [" << i << "][" << j << "]: "
+             << resMat[i * k + j] << " (expected " << n << ")\n";
+        correct = false;
+        goto end_check;
+      }
+    }
+  }
+end_check:
+  if (correct) {
+    cout << "Result verification: PASSED" << endl;
+  } else {
+    cout << "Result verification: FAILED" << endl;
+  }
+
+  delete[] leftMat;
+  delete[] rightMat;
+  delete[] resMat;
+
+  return 0;
+}
diff --git a/work/run_experiments.sh b/work/run_experiments.sh
new file mode 100755
index 0000000..6b14baa
--- /dev/null
+++ b/work/run_experiments.sh
@@ -0,0 +1,207 @@
+#!/bin/bash
+
+# MPI-OpenMP matrix multiplication performance test script
+# Collects the experiment data
+
+# Environment variables
+export OMP_NUM_THREADS=${OMP_NUM_THREADS:-1}
+
+# Output files
+OUTPUT_FILE="experiment_results.csv"
+SERIAL_OUTPUT="serial_results.csv"
+# get arch using uname -m
+# if aarch64 then use arm64-v8a else use x86_64
+ARCH=$(uname -m)
+if [ "$ARCH" == "aarch64" ]; then
+    BUILD_ARCH="arm64-v8a"
+else
+    BUILD_ARCH="x86_64"
+fi
+# Build directory
+BUILD_DIR="./build/linux/$BUILD_ARCH/release"
+# Create the output files and write the CSV headers
+echo "Experiment,M,N,K,MPI_Processes,OpenMP_Threads,Time_ms,Speedup,Efficiency" > $OUTPUT_FILE
+echo "M,N,K,Time_ms" > $SERIAL_OUTPUT
+
+# Matrix sizes (adjust as needed)
+MATRIX_SIZES="512 1024 2048 4096"
+
+# MPI process counts
+MPI_PROCESSES="1 2 4 9 16"
+
+# OpenMP thread counts
+OPENMP_THREADS="1 2 4 8"
+
+echo "=========================================="
+echo "MPI-OpenMP matrix multiplication performance test"
+echo "=========================================="
+
+# Build the programs
+echo "Building programs..."
+xmake
+if [ $? -ne 0 ]; then
+    echo "Build failed!"
+    exit 1
+fi
+echo "Build finished!"
+echo ""
+
+# Collect the serial baseline times
+echo "=========================================="
+echo "Experiment 0: serial baseline"
+echo "=========================================="
+
+for SIZE in $MATRIX_SIZES; do
+    echo "Matrix size: ${SIZE}x${SIZE}x${SIZE}"
+    TIME=$($BUILD_DIR/gemm_serial $SIZE $SIZE $SIZE 0 | grep "matmul:" | awk '{print $2}')
+    echo "  Time: ${TIME} ms"
+    echo "$SIZE,$SIZE,$SIZE,$TIME" >> $SERIAL_OUTPUT
+done
+echo ""
+
+# Experiment 1: fix the OpenMP thread count at 1 and vary the number of MPI processes
+echo "=========================================="
+echo "Experiment 1: OpenMP threads = 1, varying MPI processes"
+echo "=========================================="
+
+export OMP_NUM_THREADS=1
+
+for SIZE in $MATRIX_SIZES; do
+    # Look up the serial baseline time
+    SERIAL_TIME=$(grep "^$SIZE," $SERIAL_OUTPUT | cut -d',' -f4)
+
+    echo "Matrix size: ${SIZE}x${SIZE}x${SIZE}"
+    echo "Serial time: ${SERIAL_TIME} ms"
+
+    for NP in $MPI_PROCESSES; do
+        echo "  MPI processes: $NP"
+        TIME=$(mpirun --oversubscribe -np $NP $BUILD_DIR/gemm_parallel $SIZE $SIZE $SIZE | grep "mpi matmul:" | awk '{print $3}')
+
+        if [ ! -z "$TIME" ]; then
+            SPEEDUP=$(echo "scale=4; $SERIAL_TIME / $TIME" | bc)
+            EFFICIENCY=$(echo "scale=4; $SPEEDUP / $NP" | bc)
+            echo "    Time: ${TIME} ms, speedup: $SPEEDUP, efficiency: $EFFICIENCY"
+            echo "Exp1,$SIZE,$SIZE,$SIZE,$NP,1,$TIME,$SPEEDUP,$EFFICIENCY" >> $OUTPUT_FILE
+        fi
+    done
+    echo ""
+done
+
+# Experiment 2: vary both the MPI process count and the OpenMP thread count
+echo "=========================================="
+echo "Experiment 2: varying MPI processes and OpenMP threads"
+echo "=========================================="
+
+for SIZE in $MATRIX_SIZES; do
+    # Look up the serial baseline time
+    SERIAL_TIME=$(grep "^$SIZE," $SERIAL_OUTPUT | cut -d',' -f4)
+
+    echo "Matrix size: ${SIZE}x${SIZE}x${SIZE}"
+
+    for NTHREADS in $OPENMP_THREADS; do
+        export OMP_NUM_THREADS=$NTHREADS
+        echo "  OpenMP threads: $NTHREADS"
+
+        for NP in $MPI_PROCESSES; do
+            TOTAL_PROCS=$((NP * NTHREADS))
+            echo "    MPI processes: $NP (total processors: $TOTAL_PROCS)"
+
+            TIME=$(mpirun --oversubscribe -np $NP $BUILD_DIR/gemm_parallel $SIZE $SIZE $SIZE | grep "mpi matmul:" | awk '{print $3}')
+
+            if [ ! -z "$TIME" ]; then
-z "$TIME" ]; then + SPEEDUP=$(echo "scale=4; $SERIAL_TIME / $TIME" | bc) + EFFICIENCY=$(echo "scale=4; $SPEEDUP / $TOTAL_PROCS" | bc) + echo " 时间: ${TIME} ms, 加速比: $SPEEDUP, 效率: $EFFICIENCY" + echo "Exp2,$SIZE,$SIZE,$SIZE,$NP,$NTHREADS,$TIME,$SPEEDUP,$EFFICIENCY" >> $OUTPUT_FILE + fi + done + done + echo "" +done + +# 实验三:固定总处理器数,改变MPI和OpenMP的组合 +echo "==========================================" +echo "实验三: 固定总处理器数,改变MPI/OpenMP组合" +echo "==========================================" + +TOTAL_PROCS_TARGET=16 +echo "目标总处理器数: $TOTAL_PROCS_TARGET" + +for SIZE in $MATRIX_SIZES; do + # 获取串行时间 + SERIAL_TIME=$(grep "^$SIZE," $SERIAL_OUTPUT | cut -d',' -f4) + + echo "矩阵尺寸: ${SIZE}x${SIZE}x${SIZE}" + + # 不同的MPI/OpenMP组合,使得总处理器数接近16 + declare -a COMBOS=("1:16" "2:8" "4:4" "8:2" "16:1") + + for COMBO in "${COMBOS[@]}"; do + NP=$(echo $COMBO | cut -d':' -f1) + NTHREADS=$(echo $COMBO | cut -d':' -f2) + TOTAL_PROCS=$((NP * NTHREADS)) + + export OMP_NUM_THREADS=$NTHREADS + echo " MPI: $NP, OpenMP: $NTHREADS (总处理器: $TOTAL_PROCS)" + + TIME=$(mpirun --oversubscribe -np $NP $BUILD_DIR/gemm_parallel $SIZE $SIZE $SIZE | grep "mpi matmul:" | awk '{print $3}') + + if [ ! -z "$TIME" ]; then + SPEEDUP=$(echo "scale=4; $SERIAL_TIME / $TIME" | bc) + EFFICIENCY=$(echo "scale=4; $SPEEDUP / $TOTAL_PROCS" | bc) + echo " 时间: ${TIME} ms, 加速比: $SPEEDUP, 效率: $EFFICIENCY" + echo "Exp3,$SIZE,$SIZE,$SIZE,$NP,$NTHREADS,$TIME,$SPEEDUP,$EFFICIENCY" >> $OUTPUT_FILE + fi + done + echo "" +done + +# 实验三(优化实现): 固定总处理器数,使用 gemm_optimized,结果标识为 Exp3-opt +echo "==========================================" +echo "实验三(优化): 固定总处理器数,使用 gemm_optimized 的 MPI/OpenMP 组合测试" +echo "==========================================" + +for SIZE in $MATRIX_SIZES; do + # 获取串行时间 + SERIAL_TIME=$(grep "^$SIZE," $SERIAL_OUTPUT | cut -d',' -f4) + + echo "矩阵尺寸: ${SIZE}x${SIZE}x${SIZE}" + + # 与之前相同的组合 + declare -a COMBOS_OPT=("1:16" "2:8" "4:4" "8:2" "16:1") + + for COMBO in "${COMBOS_OPT[@]}"; do + NP=$(echo $COMBO | cut -d':' -f1) + NTHREADS=$(echo $COMBO | cut -d':' -f2) + TOTAL_PROCS=$((NP * NTHREADS)) + + export OMP_NUM_THREADS=$NTHREADS + echo " MPI: $NP, OpenMP: $NTHREADS (总处理器: $TOTAL_PROCS)" + + TIME=$(mpirun --oversubscribe -np $NP $BUILD_DIR/gemm_optimized $SIZE $SIZE $SIZE | grep "optimized mpi matmul:" | awk '{print $4}') + + if [ ! -z "$TIME" ]; then + SPEEDUP=$(echo "scale=4; $SERIAL_TIME / $TIME" | bc) + EFFICIENCY=$(echo "scale=4; $SPEEDUP / $TOTAL_PROCS" | bc) + echo " 时间: ${TIME} ms, 加速比: $SPEEDUP, 效率: $EFFICIENCY" + echo "Exp3-opt,$SIZE,$SIZE,$SIZE,$NP,$NTHREADS,$TIME,$SPEEDUP,$EFFICIENCY" >> $OUTPUT_FILE + fi + done + echo "" +done + +echo "==========================================" +echo "测试完成!" +echo "结果已保存到: $OUTPUT_FILE" +echo "串行基准已保存到: $SERIAL_OUTPUT" +echo "==========================================" +echo "" +echo "数据处理说明:" +echo "1. 使用Excel、Python或R读取CSV文件" +echo "2. 绘制图表:" +echo " - 实验一: X轴=MPI进程数, Y轴=加速比/效率, 不同矩阵尺寸用不同颜色" +echo " - 实验二: X轴=总处理器数, Y轴=加速比/效率, 不同OpenMP线程数用不同颜色" +echo " - 实验三: X轴=MPI进程数, Y轴=效率, 不同矩阵尺寸用不同颜色" +echo "3. 分析加速比和效率的变化趋势" +echo "4. 
+
diff --git a/work/xmake.lua b/work/xmake.lua
new file mode 100644
index 0000000..5871849
--- /dev/null
+++ b/work/xmake.lua
@@ -0,0 +1,32 @@
+set_project("gemm")
+set_version("1.0")
+
+add_rules("mode.debug", "mode.release")
+-- Find MPI package
+add_requires("mpi", {system = true})
+add_requires("mpi_cxx", {system = true})
+-- Serial version
+target("gemm_serial")
+    set_kind("binary")
+    add_files("gemm_serial.cpp")
+    add_cxxflags("-O3", "-march=native")
+
+-- Parallel version
+target("gemm_parallel")
+    set_kind("binary")
+    add_files("gemm_parallel.cpp")
+    add_cxxflags("-O3", "-march=native", "-fopenmp")
+    add_ldflags("-fopenmp")
+    -- Pull in the system MPI compile/link flags
+    add_packages("mpi")
+    add_packages("mpi_cxx")
+
+-- Optimized version
+target("gemm_optimized")
+    set_kind("binary")
+    add_files("gemm_optimized.cpp")
+    add_cxxflags("-O3", "-march=native", "-fopenmp")
+    add_ldflags("-fopenmp")
+    -- Pull in the system MPI compile/link flags
+    add_packages("mpi")
+    add_packages("mpi_cxx")
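
Note: the xmake.lua above defines the three targets that run_experiments.sh drives. A minimal manual smoke test, assuming the same build-directory layout the scripts in this patch detect (arm64-v8a on aarch64 hosts, x86_64 otherwise) and an Open MPI installation providing mpirun, might look like the sketch below; the paths and sizes are illustrative only, not part of the diff.

    # Hedged sketch: run from the work/ directory, same assumptions as run_experiments.sh
    cd work
    xmake                                            # builds gemm_serial, gemm_parallel, gemm_optimized
    ARCH=$(uname -m)
    [ "$ARCH" == "aarch64" ] && BUILD_ARCH="arm64-v8a" || BUILD_ARCH="x86_64"
    BUILD_DIR="./build/linux/$BUILD_ARCH/release"
    "$BUILD_DIR/gemm_serial" 1024 1024 1024 0        # serial baseline (args: M N K use-blas)
    export OMP_NUM_THREADS=2
    mpirun --oversubscribe -np 4 "$BUILD_DIR/gemm_parallel" 1024 1024 1024   # 4 MPI ranks x 2 OpenMP threads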