hpc-lab-code/lab3/nbody/lab3_nbody.sh
2026-01-21 18:02:30 +08:00

223 lines
7.8 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# N-body experiment driver.
# Collects performance data for the serial and the parallel program.
# Multi-machine environment: hpc-ecs-1, hpc-ecs-2, hpc-ecs-3 (2 threads each).

# Strict mode:
#   -e           exit as soon as an unhandled command fails
#   -u           treat expansion of an unset variable as an error
#   -o pipefail  a pipeline fails if any of its stages fails
set -euo pipefail

# Host configuration.
HOST1="hpc-ecs-1"
HOST2="hpc-ecs-2"
HOST3="hpc-ecs-3"

# Output artifacts, created relative to the current working directory.
LOG_FILE="nbody_experiment.log"
OUTPUT_CSV="nbody_results.csv"
# Logging helpers.
# log_error: join all arguments into one "[ERROR] ..." line, print it to
# stdout and append the same line to $LOG_FILE.
log_error() {
  tee -a "$LOG_FILE" <<<"[ERROR] $*"
}
# log_info: join all arguments into one "[INFO] ..." line, print it to
# stdout and append the same line to $LOG_FILE.
log_info() {
  tee -a "$LOG_FILE" <<<"[INFO] $*"
}
# Create (or truncate) the CSV file and write its header row.
printf '%s\n' "实验,数据规模,每机进程数,机器配置,运行时间(s)" > "$OUTPUT_CSV"

# Banner.
printf '%s\n' \
  "==========================================" \
  "N体问题性能测试实验" \
  "==========================================" \
  "主机配置: $HOST1, $HOST2, $HOST3" \
  ""

# Build both programs; stop at the first target that fails to build.
printf '%s\n' "编译程序..."
log_info "开始编译程序..."
for build_target in nbody_ser nbody_par; do
  if xmake build "$build_target"; then
    continue
  fi
  log_error "编译 $build_target 失败"
  exit 1
done
log_info "编译完成"
printf '\n'
# Fixed problem size, used by experiment 1 and experiment 2.
FIXED_N=6000

# Experiment 1: serial program on a single machine with N = FIXED_N.
echo "=========================================="
echo "实验一:串行程序 - 数据规模6000"
echo "=========================================="
log_info "运行串行程序..."

# Capture the exit status via `|| ...`: with `set -e` active, a bare failing
# `var=$(cmd)` would terminate the script before the failure could be logged.
ser_exit_code=0
ser_output=$(./build/linux/arm64-v8a/release/nbody_ser "$FIXED_N" 2>&1) || ser_exit_code=$?
if (( ser_exit_code != 0 )); then
  log_error "串行程序执行失败,退出码: $ser_exit_code"
  printf '%s\n' "$ser_output" | tee -a "$LOG_FILE"
  exit 1
fi

# Take the second whitespace-separated field of the line containing the
# timing marker. A single awk is used instead of `grep | awk` because a grep
# miss under `pipefail` + `set -e` would abort the script right here and the
# diagnostic branch below would never run.
time_output=$(awk '/模拟用时/ {print $2}' <<<"$ser_output")
if [[ -z "$time_output" ]]; then
  log_error "无法从输出中提取运行时间"
  printf '%s\n' "$ser_output" | tee -a "$LOG_FILE"
  exit 1
fi

# CSV columns: experiment, problem size, processes per machine,
# machine configuration, run time.
echo "实验一,$FIXED_N,1,单机,$time_output" >> "$OUTPUT_CSV"
echo " 时间: $time_output s"
log_info "实验一完成"
echo ""
# Experiment 2: parallel program with N = FIXED_N, varying the number of
# processes per machine (ppn) on one, two and three machines.
echo "=========================================="
echo "实验二:并行程序 - 数据规模6000不同每机进程数"
echo "=========================================="

# Machine configurations in run order; configuration i uses the first i+1
# entries of exp2_hosts.
exp2_labels=("单机" "双机" "三机")
exp2_hosts=("$HOST1" "$HOST2" "$HOST3")

for ppn in 1 2 3 4; do
  host_list=""
  for idx in 0 1 2; do
    label=${exp2_labels[idx]}
    # Grow the --host list cumulatively: h1:ppn -> h1:ppn,h2:ppn -> ...
    host_list+="${host_list:+,}${exp2_hosts[idx]}:$ppn"

    echo "每机进程数: $ppn, ${label}"
    log_info "实验二: ${label}, ppn=$ppn"

    # Record the exit status with `|| ...` so that one failing run is logged
    # and skipped; under `set -e` a bare failing `var=$(cmd)` would end the
    # whole sweep. --oversubscribe is kept from the original command line
    # (ppn goes up to 4 while the header notes 2 threads per machine).
    par_exit_code=0
    par_output=$(mpirun --host "$host_list" --oversubscribe \
      ./build/linux/arm64-v8a/release/nbody_par "$FIXED_N" 2>&1) || par_exit_code=$?

    if (( par_exit_code != 0 )); then
      log_error "并行程序执行失败(${label} ppn=$ppn),退出码: $par_exit_code"
      printf '%s\n' "$par_output" | tee -a "$LOG_FILE"
    else
      # Single awk instead of `grep | awk`: a grep miss under `pipefail`
      # would abort the script before the diagnostic below is reached.
      time_output=$(awk '/模拟用时/ {print $2}' <<<"$par_output")
      if [[ -z "$time_output" ]]; then
        log_error "无法从输出中提取运行时间(${label} ppn=$ppn)"
        printf '%s\n' "$par_output" | tee -a "$LOG_FILE"
      else
        echo "实验二,$FIXED_N,$ppn,${label},$time_output" >> "$OUTPUT_CSV"
        echo " 时间: $time_output s"
      fi
    fi
    echo ""
  done
done
# Experiment 3: parallel program with one process per machine, varying the
# problem size N on one, two and three machines.
echo "=========================================="
echo "实验三:并行程序 - 每机1进程不同数据规模"
echo "=========================================="

# Machine configurations in run order; configuration i uses the first i+1
# entries of exp3_hosts.
exp3_labels=("单机" "双机" "三机")
exp3_hosts=("$HOST1" "$HOST2" "$HOST3")

for N in 150 300 600 1200 2400 4800 9600; do
  echo "数据规模: $N"
  log_info "实验三: 数据规模=$N"

  host_list=""
  for idx in 0 1 2; do
    label=${exp3_labels[idx]}
    # Grow the --host list cumulatively: h1:1 -> h1:1,h2:1 -> h1:1,h2:1,h3:1
    host_list+="${host_list:+,}${exp3_hosts[idx]}:1"

    echo " ${label}..."

    # Record the exit status with `|| ...` so that one failing run is logged
    # and skipped; under `set -e` a bare failing `var=$(cmd)` would end the
    # whole sweep.
    par_exit_code=0
    par_output=$(mpirun --host "$host_list" \
      ./build/linux/arm64-v8a/release/nbody_par "$N" 2>&1) || par_exit_code=$?

    if (( par_exit_code != 0 )); then
      log_error "并行程序执行失败(${label} N=$N),退出码: $par_exit_code"
      printf '%s\n' "$par_output" | tee -a "$LOG_FILE"
    else
      # Single awk instead of `grep | awk`: a grep miss under `pipefail`
      # would abort the script before the diagnostic below is reached.
      time_output=$(awk '/模拟用时/ {print $2}' <<<"$par_output")
      if [[ -z "$time_output" ]]; then
        log_error "无法从输出中提取运行时间(${label} N=$N)"
        printf '%s\n' "$par_output" | tee -a "$LOG_FILE"
      else
        # The third field (processes per machine) is always 1 here; it must
        # be written so the row matches the 5-column CSV header.
        echo "实验三,$N,1,${label},$time_output" >> "$OUTPUT_CSV"
        echo " 时间: $time_output s"
      fi
    fi
  done
  echo ""
done
# Closing report: completion banner, artifact locations, and a dump of the
# collected CSV rows.
printf '%s\n' \
  "==========================================" \
  "实验完成" \
  "==========================================" \
  ""
log_info "所有实验完成"
printf '%s\n' \
  "结果已保存到: $OUTPUT_CSV" \
  "日志已保存到: $LOG_FILE" \
  "" \
  "数据预览:"
cat "$OUTPUT_CSV"
printf '\n%s\n' "如有错误,请查看日志文件: $LOG_FILE"