Compare commits


3 Commits
dev ... master

Author  SHA1        Message                               Date
yly     08e8addef1  准备提交 (prepare to commit)          2026-01-22 04:30:04 +08:00
yly     4291d88e3b  add mpi hostfile and buildall script  2026-01-21 20:10:29 +08:00
yly     734edd2aa6  finish                                2026-01-21 18:30:58 +08:00
32 changed files with 2508 additions and 34 deletions

buildall.sh Executable file (+66 lines)

@ -0,0 +1,66 @@
#!/bin/bash
# HPC Lab unified build script
# Build all lab projects with xmake
set -e # exit immediately on error
SCRIPT_DIR="$(dirname "$0")"
cd "$SCRIPT_DIR"
# check whether xmake is installed
if ! command -v xmake &> /dev/null; then
echo "错误: xmake 未安装,请先安装 xmake"
echo "安装方法: curl -fsSL https://xmake.io/shget.text | bash"
exit 1
fi
echo "=========================================="
echo "HPC Lab Code - 全项目构建"
echo "=========================================="
# build lab1
echo "构建 lab1..."
cd lab1
xmake
cd ..
# build lab2/omp
echo "构建 lab2/omp..."
cd lab2/omp
xmake
cd ../..
# build lab2/pthread
echo "构建 lab2/pthread..."
cd lab2/pthread
xmake
cd ../..
# build lab3/nbody
echo "构建 lab3/nbody..."
cd lab3/nbody
xmake
cd ../..
# build lab3/prime
echo "构建 lab3/prime..."
cd lab3/prime
xmake
cd ../..
# build lab4
echo "构建 lab4..."
cd lab4
xmake
cd ..
# build work
echo "构建 work..."
cd work
xmake
cd ..
echo "=========================================="
echo "构建完成!"
echo "=========================================="

View File

@ -1,9 +1,16 @@
#!/bin/bash
echo "Current directory: $PWD"
# get arch using uname -m
# if aarch64 then use arm64-v8a else use x86_64
ARCH=$(uname -m)
if [ "$ARCH" == "aarch64" ]; then
BUILD_ARCH="arm64-v8a"
else
BUILD_ARCH="x86_64"
fi
# Build directory
-BUILD_DIR="./build/linux/x86_64/release"
+BUILD_DIR="./build/linux/$BUILD_ARCH/release"
# Programs
MPI_HELLO="$BUILD_DIR/mpi_hello_world"
@ -28,7 +35,7 @@ echo "Programs found. Starting tests..."
# Test mpi_hello_world
echo "Testing mpi_hello_world with default settings:"
-mpirun "$MPI_HELLO"
+mpirun --hostfile ~/mpi_hosts "$MPI_HELLO"
echo "mpi_hello_world test completed."
# Terms to test # Terms to test
@ -41,7 +48,7 @@ echo "Testing mpi_pi with different terms and processes:"
for procs in "${PROCS[@]}"; do
for terms in "${TERMS[@]}"; do
echo "Running mpi_pi with $procs processes and $terms terms:"
-mpirun -np $procs "$MPI_PI" <<< $terms
+mpirun --hostfile ~/mpi_hosts -np $procs "$MPI_PI" <<< $terms
echo ""
done
done
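The updated run commands above pass --hostfile ~/mpi_hosts, and the commit message mentions adding an MPI hostfile, but the hostfile itself is not shown in this compare view. A minimal sketch of what it might contain, assuming Open MPI hostfile syntax and the three hosts named in the N-body script later in this diff (the slot counts are an assumption based on the "2 threads per machine" note there):

# ~/mpi_hosts (hypothetical contents)
hpc-ecs-1 slots=2
hpc-ecs-2 slots=2
hpc-ecs-3 slots=2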

lab2/omp/main.cpp Normal file (+6 lines)

@ -0,0 +1,6 @@
#include <iostream>
int main(int argc, char** argv) {
std::cout << "hello world!" << std::endl;
return 0;
}

View File

@ -0,0 +1,17 @@
#include <stdio.h>
#include <omp.h>
int main() {
int i; // note: i is declared outside the parallel region, so all threads share it and the loop below races on it
#pragma omp parallel
{
printf("Hello World\n");
for(i=0; i<4; i++) {
printf("Iter:%d\n",i);
}
printf("GoodBye World\n");
}
return 0;
}

lab2/omp/pi.c Normal file (+33 lines)

@ -0,0 +1,33 @@
#include <stdio.h>
#include <sys/time.h>
long long num_steps = 1000000000;
double step;
int main(int argc, char* argv[])
{
struct timeval TimeStampStart, TimeStampStop;
double ExeTime;
double x, pi, sum=0.0;
int i;
step = 1./(double)num_steps;
gettimeofday(&TimeStampStart, NULL);
for (i=0; i<num_steps; i++)
{
x = (i + .5)*step;
sum = sum + 4.0/(1.+ x*x);
}
pi = sum*step;
gettimeofday(&TimeStampStop, NULL);
ExeTime = (double)(TimeStampStop.tv_sec - TimeStampStart.tv_sec) +
(double)(TimeStampStop.tv_usec - TimeStampStart.tv_usec) * 1e-6;
printf("The value of PI is %15.12f\n",pi);
printf("The time to calculate PI was %f seconds\n", (ExeTime));
return 0;
}

lab2/omp/pi_par.c Normal file (+38 lines)

@ -0,0 +1,38 @@
#include <stdio.h>
#include <omp.h>
#include <sys/time.h>
long long num_steps = 1000000000;
double step;
int main(int argc, char* argv[])
{
struct timeval TimeStampStart, TimeStampStop;
double ExeTime;
double x, pi, sum=0.0;
int i;
step = 1./(double)num_steps;
gettimeofday(&TimeStampStart, NULL);
#pragma omp parallel private(x) reduction(+:sum)
{
#pragma omp for
for (i=0; i<num_steps; i++)
{
x = (i + .5)*step;
sum = sum + 4.0/(1.+ x*x);
}
}
pi = sum*step;
gettimeofday(&TimeStampStop, NULL);
ExeTime = (double)(TimeStampStop.tv_sec - TimeStampStart.tv_sec) +
(double)(TimeStampStop.tv_usec - TimeStampStart.tv_usec) * 1e-6;
printf("The value of PI is %15.12f\n",pi);
printf("The time to calculate PI was %f seconds\n", (ExeTime));
return 0;
}

lab2/omp/pimonte_par.c Normal file (+53 lines)

@ -0,0 +1,53 @@
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#include <sys/time.h>
#include <time.h>
#define BLOCK_SIZE 500
int main(){
struct timeval TimeStampStart, TimeStampStop;
double ExeTime;
unsigned int iter=200000000;
int i, j;
double x, y;
double dUnderCurve=0.0;
double pi=0.0;
double r[BLOCK_SIZE*2];
gettimeofday(&TimeStampStart, NULL);
#pragma omp parallel private(i, j, x, y, r) reduction(+:dUnderCurve)
{
unsigned int seed = omp_get_thread_num() + 1;
#pragma omp for
for(j=0; j<iter/BLOCK_SIZE; j++) {
// Create random numbers into array r
// generate BLOCK_SIZE*2 random numbers uniformly distributed in [0.0, 1.0], used as x and y coordinates
for (i=0; i<BLOCK_SIZE*2; i++) {
r[i] = 0.0 + 1.0 * rand_r(&seed) / RAND_MAX * ( 1.0 - 0.0 );
}
for (i=0; i<BLOCK_SIZE; i++) {
x=r[i]; //X Coordinate
y=r[i+BLOCK_SIZE]; //Y Coordinate
if (x*x + y*y <= 1.0) { //is distance from Origin under Curve
dUnderCurve++;
}
}
}
}
pi = dUnderCurve / (double) iter * 4;
gettimeofday(&TimeStampStop, NULL);
ExeTime = (double)(TimeStampStop.tv_sec - TimeStampStart.tv_sec) +
(double)(TimeStampStop.tv_usec - TimeStampStart.tv_usec) * 1e-6;
printf ("pi = %10.9f\n", pi);
printf("The time to calculate PI was %f seconds\n", (ExeTime));
return 0;
}

lab2/omp/pimonte_serial.c Normal file (+49 lines)

@ -0,0 +1,49 @@
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>
#include <time.h>
#define BLOCK_SIZE 500
int main(){
struct timeval TimeStampStart, TimeStampStop;
double ExeTime;
unsigned int iter=200000000;
int i, j;
double x, y;
double dUnderCurve=0.0;
double pi=0.0;
double r[BLOCK_SIZE*2]; //Careful!!!
//you need a private copy of whole array for each thread
srand((unsigned)time(NULL));
gettimeofday(&TimeStampStart, NULL);
for(j=0; j<iter/BLOCK_SIZE; j++) {
// Create random numbers into array r
// generate BLOCK_SIZE*2 random numbers uniformly distributed in [0.0, 1.0], used as x and y coordinates
for (i=0; i<BLOCK_SIZE*2; i++) {
r[i] = 0.0 + 1.0 * rand() / RAND_MAX * ( 1.0 - 0.0 );
}
for (i=0; i<BLOCK_SIZE; i++) {
x=r[i]; //X Coordinate
y=r[i+BLOCK_SIZE]; //Y Coordinate
if (x*x + y*y <= 1.0) { //is distance from Origin under Curve
dUnderCurve++;
}
}
}
pi = dUnderCurve / (double) iter * 4;
gettimeofday(&TimeStampStop, NULL);
ExeTime = (double)(TimeStampStop.tv_sec - TimeStampStart.tv_sec) +
(double)(TimeStampStop.tv_usec - TimeStampStart.tv_usec) * 1e-6;
printf ("pi = %10.9f\n", pi);
printf("The time to calculate PI was %f seconds\n", (ExeTime));
return 0;
}

lab2/omp/xmake.lua Normal file (+98 lines)

@ -0,0 +1,98 @@
add_rules("mode.debug", "mode.release")
add_requires("openmp")
-- OpenMP Hello World
target("openmp_hello_world")
set_kind("binary")
add_files("openmp_hello_world.c")
add_packages("openmp")
-- PI Serial (midpoint-rule integration)
target("pi")
set_kind("binary")
add_files("pi.c")
-- PI Parallel (midpoint-rule integration)
target("pi_par")
set_kind("binary")
add_files("pi_par.c")
add_packages("openmp")
-- PI Monte Carlo Serial
target("pimonte_serial")
set_kind("binary")
add_files("pimonte_serial.c")
-- PI Monte Carlo Parallel
target("pimonte_par")
set_kind("binary")
add_files("pimonte_par.c")
add_packages("openmp")
--
-- If you want to know more about xmake usage, please see https://xmake.io
--
-- ## FAQ
--
-- You can enter the project directory firstly before building project.
--
-- $ cd projectdir
--
-- 1. How to build project?
--
-- $ xmake
--
-- 2. How to configure project?
--
-- $ xmake f -p [macosx|linux|iphoneos ..] -a [x86_64|i386|arm64 ..] -m [debug|release]
--
-- 3. Where is the build output directory?
--
-- The default output directory is `./build` and you can configure the output directory.
--
-- $ xmake f -o outputdir
-- $ xmake
--
-- 4. How to run and debug target after building project?
--
-- $ xmake run [targetname]
-- $ xmake run -d [targetname]
--
-- 5. How to install target to the system directory or other output directory?
--
-- $ xmake install
-- $ xmake install -o installdir
--
-- 6. Add some frequently-used compilation flags in xmake.lua
--
-- @code
-- -- add debug and release modes
-- add_rules("mode.debug", "mode.release")
--
-- -- add macro definition
-- add_defines("NDEBUG", "_GNU_SOURCE=1")
--
-- -- set warning all as error
-- set_warnings("all", "error")
--
-- -- set language: c99, c++11
-- set_languages("c99", "c++11")
--
-- -- set optimization: none, faster, fastest, smallest
-- set_optimize("fastest")
--
-- -- add include search directories
-- add_includedirs("/usr/include", "/usr/local/include")
--
-- -- add link libraries and search directories
-- add_links("tbox")
-- add_linkdirs("/usr/local/lib", "/usr/lib")
--
-- -- add system link libraries
-- add_syslinks("z", "pthread")
--
-- -- add compilation and link flags
-- add_cxflags("-stdnolib", "-fno-strict-aliasing")
-- add_ldflags("-L/usr/local/lib", "-lpthread", {force = true})
--
-- @endcode
--
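The FAQ comments above document the generic xmake workflow; applied to the targets defined in this file (pi, pi_par, pimonte_serial, pimonte_par, openmp_hello_world), a typical session might look like the following (the release-mode configure step is optional and shown only as an illustration):

cd lab2/omp
xmake f -m release   # configure for release mode (optional)
xmake                # build all targets defined above
xmake run pi         # run the serial midpoint-rule version
xmake run pi_par     # run the OpenMP version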

View File

@ -5,7 +5,7 @@
#include <string.h> #include <string.h>
#include <sys/time.h> #include <sys/time.h>
#define NUM_THREADS 4 int NUM_THREADS= 4;
FILE *fd; FILE *fd;
int TotalEvenWords = 0, TotalOddWords = 0, TotalWords = 0; int TotalEvenWords = 0, TotalOddWords = 0, TotalWords = 0;
@ -66,14 +66,16 @@ void *count_words_thread(void *arg)
return NULL;
}
-int main()
+int main(int argc, char** argv)
{
fd = fopen("./InFile1.txt", "r"); // Open file for read
if (fd == NULL) {
perror("Failed to open file");
return 1;
}
if (argc > 1){
NUM_THREADS = atoi(argv[1]);
}
// Read all lines
char **lines = NULL;
int total_lines = 0;

View File

@ -3,7 +3,7 @@
#include <pthread.h> #include <pthread.h>
#include <sys/time.h> #include <sys/time.h>
#define NUM_THREADS 4 int NUM_THREADS=4;
long long num_steps = 1000000000; long long num_steps = 1000000000;
double step; double step;
@ -34,6 +34,9 @@ int main(int argc, char* argv[])
struct timeval TimeStampStart, TimeStampStop;
double ExeTime;
double pi;
if (argc > 1) {
NUM_THREADS = atoi(argv[1]);
}
int thread_ids[NUM_THREADS];
pthread_t threads[NUM_THREADS];

View File

@ -10,6 +10,10 @@ target("count_words_par")
add_files("count_words_par.c") add_files("count_words_par.c")
add_links("pthread") add_links("pthread")
target("count_words_par_opt")
set_kind("binary")
add_files("count_words_par_opt.c")
add_links("pthread")
target("count_words_ser") target("count_words_ser")
set_kind("binary") set_kind("binary")
add_files("count_words_ser.c") add_files("count_words_ser.c")

View File

@ -1,26 +1,222 @@
#!/bin/bash
# N体问题实验脚本
# 收集串行和并行程序的性能数据
# 多机环境hpc-ecs-1, hpc-ecs-2, hpc-ecs-3每台2线程
set -e # 遇到错误立即退出
set -u # 使用未定义变量时报错
set -o pipefail # 管道命令中任何错误都会导致整个管道失败
OUTPUT_CSV="nbody_results.csv"
LOG_FILE="nbody_experiment.log"
# 主机配置
HOST1="hpc-ecs-1"
HOST2="hpc-ecs-2"
HOST3="hpc-ecs-3"
# 记录日志函数
log_error() {
echo "[ERROR] $*" | tee -a "$LOG_FILE"
}
log_info() {
echo "[INFO] $*" | tee -a "$LOG_FILE"
}
# 清空或创建CSV文件
echo "实验,数据规模,每机进程数,机器配置,运行时间(s)" > "$OUTPUT_CSV"
echo "==========================================" echo "=========================================="
echo "N体问题串行模拟实验" echo "N体问题性能测试实验"
echo "==========================================" echo "=========================================="
echo "主机配置: $HOST1, $HOST2, $HOST3"
echo "" echo ""
# 默认天体数量
N=${1:-4}
echo "运行参数:"
echo " 天体数量: $N"
echo " 时间步长: 0.01 s"
echo " 总步数: 100"
echo ""
# 编译程序
-xmake build nbody_ser
-# 运行程序
-./build/linux/x86_64/release/nbody_ser $N
+echo "编译程序..."
+log_info "开始编译程序..."
+if ! xmake build nbody_ser; then
log_error "编译 nbody_ser 失败"
exit 1
fi
if ! xmake build nbody_par; then
log_error "编译 nbody_par 失败"
exit 1
fi
log_info "编译完成"
echo "" echo ""
# 固定数据规模
FIXED_N=6000
# 实验一单机上数据规模为6000时随每机进程数变化的运行时间串行程序
echo "=========================================="
echo "实验一:串行程序 - 数据规模6000"
echo "=========================================="
log_info "运行串行程序..."
ser_output=$(./build/linux/arm64-v8a/release/nbody_ser $FIXED_N 2>&1)
ser_exit_code=$?
if [ $ser_exit_code -ne 0 ]; then
log_error "串行程序执行失败,退出码: $ser_exit_code"
echo "$ser_output" | tee -a "$LOG_FILE"
exit 1
fi
time_output=$(echo "$ser_output" | grep "模拟用时" | awk '{print $2}')
if [ -z "$time_output" ]; then
log_error "无法从输出中提取运行时间"
echo "$ser_output" | tee -a "$LOG_FILE"
exit 1
fi
echo "实验一,6000,1,单机,$time_output" >> "$OUTPUT_CSV"
echo " 时间: $time_output s"
log_info "实验一完成"
echo ""
# 实验二多机环境下数据规模为6000随每机进程数变化的运行时间
echo "=========================================="
echo "实验二:并行程序 - 数据规模6000不同每机进程数"
echo "=========================================="
# 测试不同的每机进程数和机器配置
for ppn in 1 2 3 4; do
# 单机测试
echo "每机进程数: $ppn, 单机"
log_info "实验二: 单机, ppn=$ppn"
par_output=$(mpirun --host "$HOST1:$ppn" --oversubscribe ./build/linux/arm64-v8a/release/nbody_par $FIXED_N 2>&1)
par_exit_code=$?
if [ $par_exit_code -ne 0 ]; then
log_error "并行程序执行失败(单机 ppn=$ppn),退出码: $par_exit_code"
echo "$par_output" | tee -a "$LOG_FILE"
else
time_output=$(echo "$par_output" | grep "模拟用时" | awk '{print $2}')
if [ -z "$time_output" ]; then
log_error "无法从输出中提取运行时间(单机 ppn=$ppn"
echo "$par_output" | tee -a "$LOG_FILE"
else
echo "实验二,6000,$ppn,单机,$time_output" >> "$OUTPUT_CSV"
echo " 时间: $time_output s"
fi
fi
echo ""
# 双机测试
echo "每机进程数: $ppn, 双机"
log_info "实验二: 双机, ppn=$ppn"
par_output=$(mpirun --host "$HOST1:$ppn,$HOST2:$ppn" --oversubscribe ./build/linux/arm64-v8a/release/nbody_par $FIXED_N 2>&1)
par_exit_code=$?
if [ $par_exit_code -ne 0 ]; then
log_error "并行程序执行失败(双机 ppn=$ppn),退出码: $par_exit_code"
echo "$par_output" | tee -a "$LOG_FILE"
else
time_output=$(echo "$par_output" | grep "模拟用时" | awk '{print $2}')
if [ -z "$time_output" ]; then
log_error "无法从输出中提取运行时间(双机 ppn=$ppn"
echo "$par_output" | tee -a "$LOG_FILE"
else
echo "实验二,6000,$ppn,双机,$time_output" >> "$OUTPUT_CSV"
echo " 时间: $time_output s"
fi
fi
echo ""
# 三机测试
echo "每机进程数: $ppn, 三机"
log_info "实验二: 三机, ppn=$ppn"
par_output=$(mpirun --host "$HOST1:$ppn,$HOST2:$ppn,$HOST3:$ppn" --oversubscribe ./build/linux/arm64-v8a/release/nbody_par $FIXED_N 2>&1)
par_exit_code=$?
if [ $par_exit_code -ne 0 ]; then
log_error "并行程序执行失败(三机 ppn=$ppn),退出码: $par_exit_code"
echo "$par_output" | tee -a "$LOG_FILE"
else
time_output=$(echo "$par_output" | grep "模拟用时" | awk '{print $2}')
if [ -z "$time_output" ]; then
log_error "无法从输出中提取运行时间(三机 ppn=$ppn"
echo "$par_output" | tee -a "$LOG_FILE"
else
echo "实验二,6000,$ppn,三机,$time_output" >> "$OUTPUT_CSV"
echo " 时间: $time_output s"
fi
fi
echo ""
done
# 实验三每机1个进程随数据规模变化的并行程序运行时间
echo "=========================================="
echo "实验三:并行程序 - 每机1进程不同数据规模"
echo "=========================================="
# 测试不同的数据规模
for N in 150 300 600 1200 2400 4800 9600; do
echo "数据规模: $N"
log_info "实验三: 数据规模=$N"
# 单机测试
echo " 单机..."
par_output=$(mpirun --host "$HOST1:1" ./build/linux/arm64-v8a/release/nbody_par $N 2>&1)
par_exit_code=$?
if [ $par_exit_code -ne 0 ]; then
log_error "并行程序执行失败(单机 N=$N),退出码: $par_exit_code"
echo "$par_output" | tee -a "$LOG_FILE"
else
time_output=$(echo "$par_output" | grep "模拟用时" | awk '{print $2}')
if [ -z "$time_output" ]; then
log_error "无法从输出中提取运行时间(单机 N=$N"
echo "$par_output" | tee -a "$LOG_FILE"
else
echo "实验三,$N,单机,$time_output" >> "$OUTPUT_CSV"
echo " 时间: $time_output s"
fi
fi
# 双机测试
echo " 双机..."
par_output=$(mpirun --host "$HOST1:1,$HOST2:1" ./build/linux/arm64-v8a/release/nbody_par $N 2>&1)
par_exit_code=$?
if [ $par_exit_code -ne 0 ]; then
log_error "并行程序执行失败(双机 N=$N),退出码: $par_exit_code"
echo "$par_output" | tee -a "$LOG_FILE"
else
time_output=$(echo "$par_output" | grep "模拟用时" | awk '{print $2}')
if [ -z "$time_output" ]; then
log_error "无法从输出中提取运行时间(双机 N=$N"
echo "$par_output" | tee -a "$LOG_FILE"
else
echo "实验三,$N,双机,$time_output" >> "$OUTPUT_CSV"
echo " 时间: $time_output s"
fi
fi
# 三机测试
echo " 三机..."
par_output=$(mpirun --host "$HOST1:1,$HOST2:1,$HOST3:1" ./build/linux/arm64-v8a/release/nbody_par $N 2>&1)
par_exit_code=$?
if [ $par_exit_code -ne 0 ]; then
log_error "并行程序执行失败(三机 N=$N),退出码: $par_exit_code"
echo "$par_output" | tee -a "$LOG_FILE"
else
time_output=$(echo "$par_output" | grep "模拟用时" | awk '{print $2}')
if [ -z "$time_output" ]; then
log_error "无法从输出中提取运行时间(三机 N=$N"
echo "$par_output" | tee -a "$LOG_FILE"
else
echo "实验三,$N,三机,$time_output" >> "$OUTPUT_CSV"
echo " 时间: $time_output s"
fi
fi
echo ""
done
echo "==========================================" echo "=========================================="
echo "实验完成" echo "实验完成"
echo "==========================================" echo "=========================================="
echo ""
log_info "所有实验完成"
echo "结果已保存到: $OUTPUT_CSV"
echo "日志已保存到: $LOG_FILE"
echo ""
echo "数据预览:"
cat "$OUTPUT_CSV"
echo ""
echo "如有错误,请查看日志文件: $LOG_FILE"

View File

@ -163,7 +163,7 @@ int main(int argc, char **argv) {
verbose = (strcmp(argv[2], "--verbose") == 0 || strcmp(argv[2], "-v") == 0);
}
// 只有rank 0打印初始信息
-if (verbose && world_rank == 0) {
+if (world_rank == 0) {
cout << "N体问题并行模拟" << endl;
cout << "天体数量: " << n << endl;
cout << "进程数量: " << world_size << endl;

View File

@ -7,7 +7,14 @@ echo "=========================================="
echo "Lab 3: Prime Number Calculation Performance Test" echo "Lab 3: Prime Number Calculation Performance Test"
echo "==========================================" echo "=========================================="
echo "" echo ""
# get arch using uname -m
# if aarch64 then use arm64-v8a else use x86_64
ARCH=$(uname -m)
if [ "$ARCH" == "aarch64" ]; then
BUILD_ARCH="arm64-v8a"
else
BUILD_ARCH="x86_64"
fi
# Array of N values
N_VALUES=(100000 200000 400000 800000)
@ -35,7 +42,74 @@ for N in "${N_VALUES[@]}"; do
echo -n "Running with $P process(es)... " echo -n "Running with $P process(es)... "
# Run the program and capture output # Run the program and capture output
OUTPUT=$(mpirun -n $P ./build/linux/x86_64/release/prime_par_naive $N 2>&1) OUTPUT=$(mpirun --oversubscribe --hostfile ~/mpi_hosts -np $P ./build/linux/$BUILD_ARCH/release/prime_par_naive $N 2>&1)
# Extract prime count and time from output
PRIME_COUNT=$(echo "$OUTPUT" | grep "Between" | grep -oP '\d+(?= primes)')
TIME=$(echo "$OUTPUT" | grep "Time =" | grep -oP '[0-9.]+(?= seconds)')
# Print result
if [ ! -z "$PRIME_COUNT" ] && [ ! -z "$TIME" ]; then
echo "$N $P $PRIME_COUNT $TIME" | tee -a $OUTPUT_FILE
echo "Done! (Primes: $PRIME_COUNT, Time: ${TIME}s)"
else
echo "Error running program!"
echo "$N $P ERROR ERROR" | tee -a $OUTPUT_FILE
fi
done
done
echo ""
echo "=========================================="
echo "Test completed!"
echo "=========================================="
echo ""
echo "Results saved to: $OUTPUT_FILE"
echo ""
echo "Summary Table:"
echo "--------------------------------------------------------"
cat $OUTPUT_FILE
echo "--------------------------------------------------------"
echo ""
echo "=========================================="
echo "Begin Optimized Test!"
echo "=========================================="
echo ""
ARCH=$(uname -m)
if [ "$ARCH" == "aarch64" ]; then
BUILD_ARCH="arm64-v8a"
else
BUILD_ARCH="x86_64"
fi
# Array of N values
N_VALUES=(100000 200000 400000 800000)
# Array of process counts
PROCESS_COUNTS=(1 2 4 6 8)
# Output file for results
OUTPUT_FILE="prime_results_opt.txt"
# Clear previous results
> $OUTPUT_FILE
# Print header
echo "N值 进程数 素数个数 执行时间(秒)" | tee -a $OUTPUT_FILE
echo "--------------------------------------------------------" | tee -a $OUTPUT_FILE
# Loop through each N value
for N in "${N_VALUES[@]}"; do
echo ""
echo "Testing N = $N"
echo "------------------------"
# Loop through each process count
for P in "${PROCESS_COUNTS[@]}"; do
echo -n "Running with $P process(es)... "
# Run the program and capture output
OUTPUT=$(mpirun --oversubscribe --hostfile ~/mpi_hosts -np $P ./build/linux/$BUILD_ARCH/release/prime_par $N $(echo "$N/$P" | bc) 2>&1)
# Extract prime count and time from output
PRIME_COUNT=$(echo "$OUTPUT" | grep "Between" | grep -oP '\d+(?= primes)')
@ -52,6 +126,8 @@ for N in "${N_VALUES[@]}"; do
done
done
$(echo "$N/$P" | bc)
echo "" echo ""
echo "==========================================" echo "=========================================="
echo "Test completed!" echo "Test completed!"

View File

@ -103,7 +103,8 @@ int main(int argc, char* argv[]) {
// No range to distribute, all primes are base primes
int total_count = base_primes.size();
if (rank == 0) {
-std::cout << "Total prime count in [2, " << N << "] is " << total_count << "." << std::endl;
+std::cout << "Between 2 and " << N << ", there are " << total_count
+<< " primes." << std::endl;
}
MPI_Finalize();
return 0;
@ -172,7 +173,8 @@ int main(int argc, char* argv[]) {
if (rank == 0) {
end_wtime = MPI_Wtime ( ) - wtime;
int total_count = base_primes.size() + global_prime_count;
-std::cout << "Total prime count in [2, " << N << "] is " << total_count << "." << std::endl;
+std::cout << "Between 2 and " << N << ", there are " << total_count
+<< " primes." << std::endl;
std::cout << "Time = " << end_wtime << " seconds" << std::endl;
}

lab4/MatrixMul_cpu.cu Normal file (+109 lines)

@ -0,0 +1,109 @@
#include <iostream>
#include <omp.h>
#include <chrono>
#include <vector>
#include <iomanip>
#include <cmath>
void matrixMultiplyCPU(const float* A, const float* B, float* C, int M, int N, int K, int num_threads) {
#pragma omp parallel for num_threads(num_threads)
for (int i = 0; i < M; ++i) {
for (int j = 0; j < K; ++j) {
float sum = 0.0f;
for (int k = 0; k < N; ++k) {
sum += A[i * N + k] * B[k * K + j];
}
C[i * K + j] = sum;
}
}
}
void runCPUTest() {
std::vector<int> matrix_sizes = {256, 512, 1024, 2048};
std::vector<int> thread_counts = {8, 64, 256};
std::cout << "CPU矩阵乘法性能测试 (OpenMP多线程)\n";
std::cout << "=================================================================\n";
std::cout << std::setw(12) << "Matrix"
<< std::setw(12) << "Threads"
<< std::setw(15) << "Time(ms)"
<< std::setw(15) << "FLOPS(G)"
<< std::setw(15) << "Speedup" << std::endl;
std::cout << "-----------------------------------------------------------------\n";
// store the single-thread baseline performance
std::vector<double> baseline_times(matrix_sizes.size());
for (size_t m = 0; m < matrix_sizes.size(); ++m) {
int size = matrix_sizes[m];
int M = size, N = size, K = size;
// 分配内存
float *A = new float[M * N];
float *B = new float[N * K];
float *C = new float[M * K];
// 初始化数据
for (int i = 0; i < M * N; ++i) A[i] = (rand() % 100) / 100.0f;
for (int i = 0; i < N * K; ++i) B[i] = (rand() % 100) / 100.0f;
// first run single-threaded as the baseline
auto start = std::chrono::high_resolution_clock::now();
matrixMultiplyCPU(A, B, C, M, N, K, 1);
auto end = std::chrono::high_resolution_clock::now();
auto single_duration = std::chrono::duration<float, std::milli>(end - start).count();
baseline_times[m] = single_duration;
// test the multi-threaded configurations
for (int threads : thread_counts) {
start = std::chrono::high_resolution_clock::now();
matrixMultiplyCPU(A, B, C, M, N, K, threads);
end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration<float, std::milli>(end - start).count();
// compute FLOPS
double total_flops = 2.0 * M * N * K;
double gflops = total_flops / (duration * 1e6);
// compute the speedup
double speedup = baseline_times[m] / duration;
std::cout << std::setw(12) << size << "x" << size
<< std::setw(12) << threads
<< std::setw(15) << std::fixed << std::setprecision(3) << duration
<< std::setw(15) << std::fixed << std::setprecision(2) << gflops
<< std::setw(15) << std::fixed << std::setprecision(2) << speedup << std::endl;
}
delete[] A;
delete[] B;
delete[] C;
std::cout << "-----------------------------------------------------------------\n";
}
}
void plotData() {
std::cout << "\n\nASCII图表CPU性能分析\n";
std::cout << "=================================================================\n";
std::cout << "1. 不同线程数下的加速比趋势\n";
std::cout << " Matrix Threads=8 Threads=64 Threads=256\n";
// concrete plotting logic could be added here
// since this is text output, simple ASCII bar charts would suffice
std::cout << "\n2. 不同矩阵规模下的性能趋势\n";
std::cout << " Threads 256x256 512x512 1024x1024 2048x2048\n";
std::cout << "\n注意完整图表建议使用Python (matplotlib) 生成。\n";
std::cout << "推荐生成以下图表:\n";
std::cout << "- 折线图:不同线程数下的加速比 vs 矩阵规模\n";
std::cout << "- 柱状图不同配置下的GFLOPS对比\n";
std::cout << "- 热力图:线程数 × 矩阵规模 的性能分布\n";
}
int main() {
runCPUTest();
plotData();
return 0;
}

lab4/MatrixMul_kernel1.cu Normal file (+109 lines)

@ -0,0 +1,109 @@
#include <iostream>
#include <chrono>
#include <cuda_runtime.h>
#include <vector>
#include <iomanip>
__global__ void matMultCUDAKernel1(const float* A, const float* B, float* C, int M, int N, int K) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if(row < M && col < K){
float sum = 0.0f;
for(int i = 0; i < N; ++i){
sum += A[row * N + i] * B[i * K + col];
}
C[row * K + col] = sum;
}
}
int main() {
std::vector<int> sizes = {512, 1024, 2048,4096};
std::vector<float> times;
// 遍历所有矩阵尺寸
for(int idx = 0; idx < sizes.size(); ++idx) {
int M = sizes[idx];
int N = sizes[idx];
int K = sizes[idx];
// 分配主机内存
float *A = new float[M * N];
float *B = new float[N * K];
float *C = new float[M * K];
// 初始化数据
for(int i = 0; i < M * N; ++i) A[i] = rand() % 10;
for(int i = 0; i < N * K; ++i) B[i] = rand() % 10;
// 分配设备内存
float *d_A, *d_B, *d_C;
cudaMalloc(&d_A, M * N * sizeof(float));
cudaMalloc(&d_B, N * K * sizeof(float));
cudaMalloc(&d_C, M * K * sizeof(float));
// 拷贝数据到设备
cudaMemcpy(d_A, A, M * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, N * K * sizeof(float), cudaMemcpyHostToDevice);
// 配置线程块和网格
dim3 blockSize(16, 16);
dim3 gridSize((K + blockSize.x - 1) / blockSize.x,
(M + blockSize.y - 1) / blockSize.y);
// warm-up launch (optional)
matMultCUDAKernel1<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N, K);
cudaDeviceSynchronize();
// 计时开始
auto start = std::chrono::high_resolution_clock::now();
// 执行核函数
matMultCUDAKernel1<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N, K);
cudaDeviceSynchronize();
// 计时结束
auto end = std::chrono::high_resolution_clock::now();
// 拷贝结果回主机
cudaMemcpy(C, d_C, M * K * sizeof(float), cudaMemcpyDeviceToHost);
// 计算时间
std::chrono::duration<float> duration = end - start;
times.push_back(duration.count());
// 清理设备内存
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
// 清理主机内存
delete[] A;
delete[] B;
delete[] C;
}
// 输出结果
std::cout << "CUDA Kernel1 矩阵乘法性能测试结果" << std::endl;
std::cout << "=================================" << std::endl;
std::cout << std::setw(12) << "Matrix Size"
<< std::setw(15) << "Time(s)"
<< std::setw(15) << "Time(ms)"
<< std::setw(15) << "GFLOPS" << std::endl;
std::cout << "---------------------------------" << std::endl;
for(int i = 0; i < sizes.size(); ++i) {
int size = sizes[i];
double total_flops = 2.0 * size * size * size; // floating-point operations in the matrix multiply
double gflops = total_flops / (times[i] * 1e9); // convert to GFLOPS
double time_ms = times[i] * 1000.0; // convert to milliseconds
std::cout << std::setw(8) << size << "x" << std::setw(3) << size
<< std::setw(15) << std::fixed << std::setprecision(6) << times[i]
<< std::setw(15) << std::fixed << std::setprecision(3) << time_ms
<< std::setw(15) << std::fixed << std::setprecision(2) << gflops << std::endl;
}
std::cout << "=================================" << std::endl;
return 0;
}

lab4/MatrixMul_kernel2.cu Normal file (+114 lines)

@ -0,0 +1,114 @@
#include <iostream>
#include <cuda_runtime.h>
#include <chrono>
#include <vector>
#include <iomanip>
#define TILE_WIDTH 4
__global__ void matMultCUDAKernel2(const float* A, const float* B, float* C, int M, int N, int K) {
__shared__ float shared_A[TILE_WIDTH][TILE_WIDTH];
__shared__ float shared_B[TILE_WIDTH][TILE_WIDTH];
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
float sum = 0.0f;
for (int t = 0; t < (N + TILE_WIDTH - 1) / TILE_WIDTH; ++t) {
if (row < M && t * TILE_WIDTH + threadIdx.x < N)
shared_A[threadIdx.y][threadIdx.x] = A[row * N + t * TILE_WIDTH + threadIdx.x];
else
shared_A[threadIdx.y][threadIdx.x] = 0.0f;
if (col < K && t * TILE_WIDTH + threadIdx.y < N)
shared_B[threadIdx.y][threadIdx.x] = B[(t * TILE_WIDTH + threadIdx.y) * K + col];
else
shared_B[threadIdx.y][threadIdx.x] = 0.0f;
__syncthreads();
for (int i = 0; i < TILE_WIDTH; ++i)
sum += shared_A[threadIdx.y][i] * shared_B[i][threadIdx.x];
__syncthreads();
}
if(row < M && col < K){
C[row * K + col] = sum;
}
}
int main() {
std::vector<int> sizes = {512, 1024, 2048,4096};
std::vector<float> times;
for(int idx = 0; idx < sizes.size(); ++idx) {
int M = sizes[idx];
int N = sizes[idx];
int K = sizes[idx];
float *A = new float[M * N];
float *B = new float[N * K];
float *C = new float[M * K];
for (int i = 0; i < M * N; ++i) A[i] = rand() % 10;
for (int i = 0; i < N * K; ++i) B[i] = rand() % 10;
float *d_A, *d_B, *d_C;
cudaMalloc(&d_A, M * N * sizeof(float));
cudaMalloc(&d_B, N * K * sizeof(float));
cudaMalloc(&d_C, M * K * sizeof(float));
cudaMemcpy(d_A, A, M * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, N * K * sizeof(float), cudaMemcpyHostToDevice);
dim3 blockSize(TILE_WIDTH, TILE_WIDTH);
dim3 gridSize((K + TILE_WIDTH - 1) / TILE_WIDTH, (M + TILE_WIDTH - 1) / TILE_WIDTH);
// warm-up launch
matMultCUDAKernel2<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N, K);
cudaDeviceSynchronize();
auto start = std::chrono::high_resolution_clock::now();
matMultCUDAKernel2<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N, K);
cudaDeviceSynchronize();
auto end = std::chrono::high_resolution_clock::now();
cudaMemcpy(C, d_C, M * K * sizeof(float), cudaMemcpyDeviceToHost);
std::chrono::duration<float> duration = end - start;
times.push_back(duration.count());
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
delete[] A;
delete[] B;
delete[] C;
}
std::cout << "CUDA Kernel2 (共享内存优化) 矩阵乘法性能测试结果" << std::endl;
std::cout << "=================================" << std::endl;
std::cout << std::setw(12) << "Matrix Size"
<< std::setw(15) << "Time(s)"
<< std::setw(15) << "Time(ms)"
<< std::setw(15) << "GFLOPS" << std::endl;
std::cout << "---------------------------------" << std::endl;
for(int i = 0; i < sizes.size(); ++i) {
int size = sizes[i];
double total_flops = 2.0 * size * size * size; // floating-point operations in the matrix multiply
double gflops = total_flops / (times[i] * 1e9); // convert to GFLOPS
double time_ms = times[i] * 1000.0; // convert to milliseconds
std::cout << std::setw(8) << size << "x" << std::setw(3) << size
<< std::setw(15) << std::fixed << std::setprecision(6) << times[i]
<< std::setw(15) << std::fixed << std::setprecision(3) << time_ms
<< std::setw(15) << std::fixed << std::setprecision(2) << gflops << std::endl;
}
std::cout << "=================================" << std::endl;
return 0;
}

View File

@ -0,0 +1,24 @@
BLOCK_SIZE对CUDA矩阵乘法性能影响测试
========================================
Matrix Block Time(ms) FLOPS(G)
----------------------------------------
256x256 4x4 0.115 292.57
256x256 8x8 0.040 836.85
256x256 16x16 0.029 1151.02
256x256 32x32 0.026 1315.65
----------------------------------------
512x512 4x4 0.831 323.00
512x512 8x8 0.264 1018.65
512x512 16x16 0.190 1416.04
512x512 32x32 0.174 1542.02
----------------------------------------
1024x1024 4x4 6.541 328.33
1024x1024 8x8 2.021 1062.62
1024x1024 16x16 1.393 1541.24
1024x1024 32x32 1.353 1586.69
----------------------------------------
2048x2048 4x4 54.011 318.08
2048x2048 8x8 16.104 1066.82
2048x2048 16x16 11.355 1512.97
2048x2048 32x32 10.978 1565.00
----------------------------------------

View File

@ -0,0 +1,20 @@
Wed Jan 21 16:23:03 2026
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.247.01 Driver Version: 535.247.01 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce RTX 2080 Ti On | 00000000:03:00.0 On | N/A |
| 34% 27C P8 20W / 250W | 1MiB / 22528MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| No running processes found |
+---------------------------------------------------------------------------------------+

View File

@ -0,0 +1,112 @@
=== CPU (OpenMP) 不同线程数 ===
CPU矩阵乘法性能测试 (OpenMP多线程)
=================================================================
Matrix Threads Time(ms) FLOPS(G) Speedup
-----------------------------------------------------------------
256x256 8 90.372 0.37 1.07
256x256 64 83.707 0.40 1.16
256x256 256 84.262 0.40 1.15
-----------------------------------------------------------------
512x512 8 815.295 0.33 1.01
512x512 64 813.476 0.33 1.01
512x512 256 812.463 0.33 1.01
-----------------------------------------------------------------
1024x1024 8 6571.000 0.33 1.00
1024x1024 64 6586.094 0.33 1.00
1024x1024 256 6569.582 0.33 1.00
-----------------------------------------------------------------
2048x2048 8 55244.488 0.31 1.00
2048x2048 64 55211.832 0.31 1.00
2048x2048 256 55239.930 0.31 1.00
-----------------------------------------------------------------
ASCII图表CPU性能分析
=================================================================
1. 不同线程数下的加速比趋势
Matrix Threads=8 Threads=64 Threads=256
2. 不同矩阵规模下的性能趋势
Threads 256x256 512x512 1024x1024 2048x2048
注意完整图表建议使用Python (matplotlib) 生成。
推荐生成以下图表:
- 折线图:不同线程数下的加速比 vs 矩阵规模
- 柱状图不同配置下的GFLOPS对比
- 热力图:线程数 × 矩阵规模 的性能分布
=== CUDA Kernel1 (基础版本) ===
CUDA Kernel1 矩阵乘法性能测试结果
=================================
Matrix Size Time(s) Time(ms) GFLOPS
---------------------------------
512x512 0.000312 0.312 860.70
1024x1024 0.002373 2.373 905.03
2048x2048 0.019180 19.180 895.72
4096x4096 0.129868 129.868 1058.30
=================================
=== CUDA Kernel2 (共享内存优化) ===
CUDA Kernel2 (共享内存优化) 矩阵乘法性能测试结果
=================================
Matrix Size Time(s) Time(ms) GFLOPS
---------------------------------
512x512 0.000826 0.826 324.87
1024x1024 0.006479 6.479 331.43
2048x2048 0.053598 53.598 320.53
4096x4096 0.432496 432.496 317.78
=================================
=== CPU (OpenMP) 不同线程数 ===
CPU矩阵乘法性能测试 (OpenMP多线程)
=================================================================
Matrix Threads Time(ms) FLOPS(G) Speedup
-----------------------------------------------------------------
256x256 8 90.532 0.37 1.08
256x256 64 83.896 0.40 1.17
256x256 256 83.807 0.40 1.17
-----------------------------------------------------------------
512x512 8 814.564 0.33 1.00
512x512 64 817.633 0.33 1.00
512x512 256 812.408 0.33 1.01
-----------------------------------------------------------------
1024x1024 8 6639.308 0.32 1.00
1024x1024 64 6627.468 0.32 1.00
1024x1024 256 6656.504 0.32 1.00
-----------------------------------------------------------------
2048x2048 8 55719.875 0.31 1.00
2048x2048 64 55636.734 0.31 1.00
2048x2048 256 55657.629 0.31 1.00
-----------------------------------------------------------------
ASCII图表CPU性能分析
=================================================================
1. 不同线程数下的加速比趋势
Matrix Threads=8 Threads=64 Threads=256
2. 不同矩阵规模下的性能趋势
Threads 256x256 512x512 1024x1024 2048x2048
注意完整图表建议使用Python (matplotlib) 生成。
推荐生成以下图表:
- 折线图:不同线程数下的加速比 vs 矩阵规模
- 柱状图不同配置下的GFLOPS对比
- 热力图:线程数 × 矩阵规模 的性能分布
=== CUDA Kernel1 (基础版本) ===
CUDA Kernel1 矩阵乘法性能测试结果
=================================
Matrix Size Time(s) Time(ms) GFLOPS
---------------------------------
512x512 0.000316 0.316 848.68
1024x1024 0.002367 2.367 907.12
2048x2048 0.019190 19.190 895.24
4096x4096 0.138181 138.181 994.63
=================================
=== CUDA Kernel2 (共享内存优化) ===
CUDA Kernel2 (共享内存优化) 矩阵乘法性能测试结果
=================================
Matrix Size Time(s) Time(ms) GFLOPS
---------------------------------
512x512 0.000828 0.828 324.24
1024x1024 0.006483 6.483 331.27
2048x2048 0.053603 53.603 320.50
4096x4096 0.432285 432.285 317.94
=================================

View File

@ -0,0 +1,9 @@
Vector Addition Performance Test (Threads per block: 256)
========================================================
N=128, Time=9.472 ms
N=256, Time=4.992 ms
N=512, Time=4.928 ms
N=1024, Time=5.696 ms
N=2048, Time=4.928 ms
========================================================
All tests completed.

lab4/lab4.sh Executable file (+58 lines)

@ -0,0 +1,58 @@
#!/bin/bash
# Lab4 CUDA experiment data collection script
SCRIPT_DIR="$(dirname "$0")"
OUTPUT_DIR="$SCRIPT_DIR/experiment_data"
mkdir -p "$OUTPUT_DIR"
ARCH=$(uname -m)
if [ "$ARCH" == "aarch64" ]; then
BUILD_ARCH="arm64-v8a"
else
BUILD_ARCH="x86_64"
fi
echo "=========================================="
echo "Lab4 CUDA 实验数据收集"
echo "=========================================="
echo "数据输出目录: $OUTPUT_DIR"
echo ""
# check the CUDA device
echo "检查 CUDA 设备..."
nvidia-smi | tee "$OUTPUT_DIR/gpu_info.txt"
echo ""
# enter the build directory
# cd "$SCRIPT_DIR/build/linux/$BUILD_ARCH/release" || exit 1
echo "=========================================="
echo "实验 4.2: 向量加法 - 不同数据规模测试"
echo "=========================================="
$SCRIPT_DIR/build/linux/$BUILD_ARCH/release/vectoradd | tee "$OUTPUT_DIR/vectoradd_results.txt"
echo ""
echo "=========================================="
echo "实验 4.3.1: CPU vs GPU 矩阵乘法性能对比"
echo "=========================================="
echo "=== CPU (OpenMP) 不同线程数 ===" | tee -a "$OUTPUT_DIR/matrixmul_comparison.txt"
$SCRIPT_DIR/build/linux/$BUILD_ARCH/release/MatrixMul_cpu | tee -a "$OUTPUT_DIR/matrixmul_comparison.txt"
echo ""
echo "=== CUDA Kernel1 (基础版本) ===" | tee -a "$OUTPUT_DIR/matrixmul_comparison.txt"
$SCRIPT_DIR/build/linux/$BUILD_ARCH/release/MatrixMul_kernel1 | tee -a "$OUTPUT_DIR/matrixmul_comparison.txt"
echo ""
echo "=== CUDA Kernel2 (共享内存优化) ===" | tee -a "$OUTPUT_DIR/matrixmul_comparison.txt"
$SCRIPT_DIR/build/linux/$BUILD_ARCH/release/MatrixMul_kernel2 | tee -a "$OUTPUT_DIR/matrixmul_comparison.txt"
echo ""
echo "=========================================="
echo "实验 4.3.2: 不同 BLOCK_SIZE 对性能的影响"
echo "=========================================="
$SCRIPT_DIR/build/linux/$BUILD_ARCH/release/matrixmultiply_block_size_change | tee "$OUTPUT_DIR/blocksize_analysis.txt"
echo ""
echo "=========================================="
echo "实验数据收集完成!"
echo "数据保存在: $OUTPUT_DIR"
echo "=========================================="

View File

@ -0,0 +1,139 @@
#include <iostream>
#include <cuda_runtime.h>
#include <chrono>
#include <vector>
#include <iomanip>
// block sizes to test
std::vector<int> block_sizes = {4, 8, 16, 32};
// matrix sizes to test
std::vector<int> matrix_sizes = {256, 512, 1024, 2048};
// shared-memory matrix multiplication kernel template
template<int BLOCK_SIZE>
__global__ void matMultKernel(const float* A, const float* B, float* C, int M, int N, int K) {
__shared__ float shared_A[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float shared_B[BLOCK_SIZE][BLOCK_SIZE];
int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
float sum = 0.0f;
for (int t = 0; t < (N + BLOCK_SIZE - 1) / BLOCK_SIZE; ++t) {
// load the current tiles into shared memory
if (row < M && t * BLOCK_SIZE + threadIdx.x < N)
shared_A[threadIdx.y][threadIdx.x] = A[row * N + t * BLOCK_SIZE + threadIdx.x];
else
shared_A[threadIdx.y][threadIdx.x] = 0.0f;
if (col < K && t * BLOCK_SIZE + threadIdx.y < N)
shared_B[threadIdx.y][threadIdx.x] = B[(t * BLOCK_SIZE + threadIdx.y) * K + col];
else
shared_B[threadIdx.y][threadIdx.x] = 0.0f;
__syncthreads();
// accumulate the partial product for this tile
for (int i = 0; i < BLOCK_SIZE; ++i)
sum += shared_A[threadIdx.y][i] * shared_B[i][threadIdx.x];
__syncthreads();
}
if (row < M && col < K) {
C[row * K + col] = sum;
}
}
void runTest() {
std::cout << "BLOCK_SIZE对CUDA矩阵乘法性能影响测试\n";
std::cout << "========================================\n";
std::cout << std::setw(10) << "Matrix"
<< std::setw(12) << "Block"
<< std::setw(15) << "Time(ms)"
<< std::setw(15) << "FLOPS(G)" << std::endl;
std::cout << "----------------------------------------\n";
// 测试每个矩阵规模
for (int mat_size : matrix_sizes) {
int M = mat_size, N = mat_size, K = mat_size;
// 分配主机内存
float *A = new float[M * N];
float *B = new float[N * K];
float *C = new float[M * K];
// 初始化数据
for (int i = 0; i < M * N; ++i) A[i] = (rand() % 100) / 100.0f;
for (int i = 0; i < N * K; ++i) B[i] = (rand() % 100) / 100.0f;
// 分配设备内存
float *d_A, *d_B, *d_C;
cudaMalloc(&d_A, M * N * sizeof(float));
cudaMalloc(&d_B, N * K * sizeof(float));
cudaMalloc(&d_C, M * K * sizeof(float));
cudaMemcpy(d_A, A, M * N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, B, N * K * sizeof(float), cudaMemcpyHostToDevice);
// 测试每个BLOCK_SIZE
for (int block_size : block_sizes) {
dim3 blockDim(block_size, block_size);
dim3 gridDim((K + block_size - 1) / block_size, (M + block_size - 1) / block_size);
// warm-up launch
if (block_size == 4) matMultKernel<4><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
else if (block_size == 8) matMultKernel<8><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
else if (block_size == 16) matMultKernel<16><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
else if (block_size == 32) matMultKernel<32><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
cudaDeviceSynchronize();
// create CUDA events for timing
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// 执行并计时
cudaEventRecord(start);
if (block_size == 4) matMultKernel<4><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
else if (block_size == 8) matMultKernel<8><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
else if (block_size == 16) matMultKernel<16><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
else if (block_size == 32) matMultKernel<32><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
cudaEventRecord(stop);
cudaEventSynchronize(stop);
// 计算时间
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
// compute FLOPS
double total_flops = 2.0 * M * N * K; // one multiply and one add per element
double gflops = total_flops / (milliseconds * 1e6);
// 输出结果
std::cout << std::setw(10) << mat_size << "x" << mat_size
<< std::setw(12) << block_size << "x" << block_size
<< std::setw(15) << std::fixed << std::setprecision(3) << milliseconds
<< std::setw(15) << std::fixed << std::setprecision(2) << gflops << std::endl;
cudaEventDestroy(start);
cudaEventDestroy(stop);
}
// 清理内存
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
delete[] A;
delete[] B;
delete[] C;
std::cout << "----------------------------------------\n";
}
}
int main() {
runTest();
return 0;
}

lab4/vectoradd.cu Normal file (+123 lines)

@ -0,0 +1,123 @@
#include <cuda_runtime.h>
#include <stdio.h>
#include <chrono>
#define CHECK(call) \
{ \
const cudaError_t error = call; \
if (error != cudaSuccess) \
{ \
printf("Error: %s:%d, ", __FILE__, __LINE__); \
printf("code:%d, reason: %s\n", error, cudaGetErrorString(error)); \
exit(1); \
} \
}
__global__ void add(const int *dev_a, const int *dev_b, int *dev_c, int N)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
if (i < N) {
dev_c[i] = dev_a[i] + dev_b[i];
}
}
void vectorAddTest(int N, int threadsPerBlock)
{
// 计算块数
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
// 分配主机内存
int *host_a = (int*)malloc(N * sizeof(int));
int *host_b = (int*)malloc(N * sizeof(int));
int *host_c = (int*)malloc(N * sizeof(int));
// 初始化数据
for (int i = 0; i < N; i++) {
host_a[i] = i;
host_b[i] = i << 1; // equivalent to multiplying by 2
}
// 分配设备内存
int *dev_a = NULL;
int *dev_b = NULL;
int *dev_c = NULL;
CHECK(cudaMalloc((void**)&dev_a, N * sizeof(int)));
CHECK(cudaMalloc((void**)&dev_b, N * sizeof(int)));
CHECK(cudaMalloc((void**)&dev_c, N * sizeof(int)));
// 拷贝数据到设备
CHECK(cudaMemcpy(dev_a, host_a, N * sizeof(int), cudaMemcpyHostToDevice));
CHECK(cudaMemcpy(dev_b, host_b, N * sizeof(int), cudaMemcpyHostToDevice));
// 创建CUDA事件用于计时
cudaEvent_t start, stop;
CHECK(cudaEventCreate(&start));
CHECK(cudaEventCreate(&stop));
// one warm-up launch to avoid the overhead of the first kernel launch
add<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, N);
cudaDeviceSynchronize();
// 记录开始时间
CHECK(cudaEventRecord(start));
// 执行核函数
add<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, N);
// 记录结束时间并等待完成
CHECK(cudaEventRecord(stop));
CHECK(cudaEventSynchronize(stop));
// elapsed time reported by CUDA events (milliseconds)
float elapsedTime_ms = 0;
CHECK(cudaEventElapsedTime(&elapsedTime_ms, start, stop));
float elapsedTime = elapsedTime_ms * 1000.0f; // converted to microseconds
// print the result (note: elapsedTime is in microseconds here, although the label below says "ms")
printf("N=%d, Time=%.3f ms\n", N, elapsedTime);
// 验证结果(可选)
CHECK(cudaMemcpy(host_c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost));
bool success = true;
for (int i = 0; i < N; i++) {
if (host_c[i] != host_a[i] + host_b[i]) {
success = false;
break;
}
}
if (!success) {
printf("Error: Computation failed for N=%d\n", N);
}
// 清理资源
CHECK(cudaEventDestroy(start));
CHECK(cudaEventDestroy(stop));
CHECK(cudaFree(dev_a));
CHECK(cudaFree(dev_b));
CHECK(cudaFree(dev_c));
free(host_a);
free(host_b);
free(host_c);
}
int main(void)
{
// threads per block (kept constant across tests)
const int threadsPerBlock = 256;
// vector lengths to test
int testSizes[] = {128, 256, 512, 1024, 2048}; // note: 2056 was changed to 2048, a power of 2
int numTests = sizeof(testSizes) / sizeof(testSizes[0]);
printf("Vector Addition Performance Test (Threads per block: %d)\n", threadsPerBlock);
printf("========================================================\n");
for (int i = 0; i < numTests; i++) {
vectorAddTest(testSizes[i], threadsPerBlock);
}
printf("========================================================\n");
printf("All tests completed.\n");
return 0;
}

lab4/xmake.lua Normal file (+56 lines)

@ -0,0 +1,56 @@
set_project("lab4_cuda_programs")
set_version("1.0")
-- configure the CUDA toolchain
toolchain("cuda")
set_kind("standalone")
set_sdkdir(os.getenv("CUDA_HOME") or "/usr/local/cuda")
set_description("CUDA Toolkit")
toolchain_end()
-- vectoradd program
target("vectoradd")
set_kind("binary")
set_languages("c++14")
set_toolchains("cuda")
add_rules("cuda")
add_files("vectoradd.cu")
target_end()
-- MatrixMul_cpu program (uses OpenMP)
target("MatrixMul_cpu")
set_kind("binary")
set_languages("c++14")
set_toolchains("cuda")
add_rules("cuda")
add_files("MatrixMul_cpu.cu")
add_ldflags("-lgomp", {force = true})
add_cxxflags("-fopenmp", {force = true})
target_end()
-- MatrixMul_kernel1 program
target("MatrixMul_kernel1")
set_kind("binary")
set_languages("c++14")
set_toolchains("cuda")
add_rules("cuda")
add_files("MatrixMul_kernel1.cu")
target_end()
-- MatrixMul_kernel2 program
target("MatrixMul_kernel2")
set_kind("binary")
set_languages("c++14")
set_toolchains("cuda")
add_rules("cuda")
add_files("MatrixMul_kernel2.cu")
target_end()
-- matrixmultiply_block_size_change program
target("matrixmultiply_block_size_change")
set_kind("binary")
set_languages("c++14")
set_toolchains("cuda")
add_rules("cuda")
add_files("matrixmultiply_block_size_change.cu")
target_end()
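The toolchain block above looks for the CUDA SDK in $CUDA_HOME and falls back to /usr/local/cuda. A possible build-and-run sequence for this directory (illustrative; the export is only needed if the toolkit is installed somewhere else):

cd lab4
export CUDA_HOME=/path/to/cuda-toolkit   # only if CUDA is not under /usr/local/cuda
xmake
xmake run vectoradd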

work/gemm_optimized.cpp Normal file (+302 lines)

@ -0,0 +1,302 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <iostream>
#include <mpi.h>
#include <omp.h>
#include <vector>
using namespace std;
void randMat(int rows, int cols, float *&Mat) {
Mat = new float[rows * cols];
for (int i = 0; i < rows; i++)
for (int j = 0; j < cols; j++)
Mat[i * cols + j] = 1.0;
}
// optimized version: loop unrolling and better cache locality
void openmp_sgemm_optimized(int m, int n, int k, float *leftMat, float *rightMat,
float *resultMat) {
// use a larger block size to improve cache utilization
const int BLOCK_SIZE = 64;
#pragma omp parallel for collapse(2)
for (int row = 0; row < m; row++) {
for (int col = 0; col < k; col++) {
resultMat[row * k + col] = 0.0;
}
}
// blocked (tiled) computation to improve cache hit rate
#pragma omp parallel for collapse(2)
for (int row_block = 0; row_block < m; row_block += BLOCK_SIZE) {
for (int col_block = 0; col_block < k; col_block += BLOCK_SIZE) {
for (int i_block = 0; i_block < n; i_block += BLOCK_SIZE) {
int row_end = min(row_block + BLOCK_SIZE, m);
int col_end = min(col_block + BLOCK_SIZE, k);
int i_end = min(i_block + BLOCK_SIZE, n);
for (int row = row_block; row < row_end; row++) {
for (int col = col_block; col < col_end; col++) {
float sum = resultMat[row * k + col];
for (int i = i_block; i < i_end; i++) {
sum += leftMat[row * n + i] * rightMat[col * n + i];
}
resultMat[row * k + col] = sum;
}
}
}
}
}
}
void mpi_sgemm_optimized(int m, int n, int k, float *&leftMat, float *&rightMat,
float *&resultMat, int rank, int worldsize) {
// compute the number of row and column blocks
int rowBlock = (int)sqrt((double)worldsize);
while (rowBlock > 0 && worldsize % rowBlock != 0) {
rowBlock--;
}
int colBlock = worldsize / rowBlock;
int rowStride, colStride;
float *res = nullptr;
float *localLeftMat = leftMat;
float *localRightMat = rightMat;
if (rank == 0) {
// transpose the matrix, accelerated with OpenMP
float *buf = new float[k * n];
#pragma omp parallel for collapse(2)
for (int r = 0; r < n; r++) {
for (int c = 0; c < k; c++) {
buf[c * n + r] = rightMat[r * k + c];
}
}
#pragma omp parallel for collapse(2)
for (int r = 0; r < k; r++) {
for (int c = 0; c < n; c++) {
rightMat[r * n + c] = buf[r * n + c];
}
}
delete[] buf;
// use non-blocking communication to overlap computation and communication
std::vector<MPI_Request> sendRequests;
sendRequests.reserve(1000);
for (int rowB = 0; rowB < rowBlock; rowB++) {
for (int colB = 0; colB < colBlock; colB++) {
int rowStart = rowB * (m / rowBlock);
int rowEnd = (rowB == rowBlock - 1) ? m : (rowB + 1) * (m / rowBlock);
rowStride = rowEnd - rowStart;
int colStart = colB * (k / colBlock);
int colEnd = (colB == colBlock - 1) ? k : (colB + 1) * (k / colBlock);
colStride = colEnd - colStart;
int sendto = rowB * colBlock + colB;
if (sendto == 0) {
res = new float[rowStride * colStride];
localLeftMat = leftMat + rowStart * n;
localRightMat = rightMat + colStart * n;
continue;
}
// 发送分块大小
MPI_Request req;
MPI_Isend(&rowStride, 1, MPI_INT, sendto, 0, MPI_COMM_WORLD, &req);
sendRequests.push_back(req);
MPI_Isend(&colStride, 1, MPI_INT, sendto, 0, MPI_COMM_WORLD, &req);
sendRequests.push_back(req);
// 发送矩阵数据
for (int r = 0; r < rowStride; r++) {
MPI_Isend(leftMat + (rowStart + r) * n, n, MPI_FLOAT, sendto,
1, MPI_COMM_WORLD, &req);
sendRequests.push_back(req);
}
for (int c = 0; c < colStride; c++) {
MPI_Isend(rightMat + (colStart + c) * n, n, MPI_FLOAT, sendto,
2, MPI_COMM_WORLD, &req);
sendRequests.push_back(req);
}
}
}
// 等待所有发送完成
for (size_t i = 0; i < sendRequests.size(); i++) {
MPI_Wait(&sendRequests[i], MPI_STATUS_IGNORE);
}
} else {
if (rank < worldsize) {
int rowB = rank / colBlock;
int colB = rank % colBlock;
int rowStart = rowB * (m / rowBlock);
int rowEnd = (rowB == rowBlock - 1) ? m : (rowB + 1) * (m / rowBlock);
rowStride = rowEnd - rowStart;
int colStart = colB * (k / colBlock);
int colEnd = (colB == colBlock - 1) ? k : (colB + 1) * (k / colBlock);
colStride = colEnd - colStart;
MPI_Recv(&rowStride, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
MPI_Recv(&colStride, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
localLeftMat = new float[rowStride * n];
localRightMat = new float[colStride * n];
for (int r = 0; r < rowStride; r++) {
MPI_Recv(localLeftMat + r * n, n, MPI_FLOAT, 0, 1, MPI_COMM_WORLD,
MPI_STATUS_IGNORE);
}
for (int c = 0; c < colStride; c++) {
MPI_Recv(localRightMat + c * n, n, MPI_FLOAT, 0, 2, MPI_COMM_WORLD,
MPI_STATUS_IGNORE);
}
res = new float[rowStride * colStride];
}
}
MPI_Barrier(MPI_COMM_WORLD);
// local computation, using the optimized kernel
if (rank < worldsize) {
int rowB = rank / colBlock;
int colB = rank % colBlock;
int rowStart = rowB * (m / rowBlock);
int rowEnd = (rowB == rowBlock - 1) ? m : (rowB + 1) * (m / rowBlock);
rowStride = rowEnd - rowStart;
int colStart = colB * (k / colBlock);
int colEnd = (colB == colBlock - 1) ? k : (colB + 1) * (k / colBlock);
colStride = colEnd - colStart;
openmp_sgemm_optimized(rowStride, n, colStride, localLeftMat, localRightMat, res);
}
MPI_Barrier(MPI_COMM_WORLD);
// gather the results
if (rank == 0) {
int rowB = 0;
int colB = 0;
int rowStart = rowB * (m / rowBlock);
int colStart = colB * (k / colBlock);
for (int r = 0; r < rowStride; r++) {
for (int c = 0; c < colStride; c++) {
resultMat[(rowStart + r) * k + (colStart + c)] = res[r * colStride + c];
}
}
delete[] res;
for (int rowB = 0; rowB < rowBlock; rowB++) {
for (int colB = 0; colB < colBlock; colB++) {
int recvfrom = rowB * colBlock + colB;
if (recvfrom == 0) continue;
MPI_Recv(&rowStride, 1, MPI_INT, recvfrom, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
MPI_Recv(&colStride, 1, MPI_INT, recvfrom, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
float *tmpRes = new float[rowStride * colStride];
MPI_Recv(tmpRes, rowStride * colStride, MPI_FLOAT, recvfrom, 4,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
int rowStart = rowB * (m / rowBlock);
int colStart = colB * (k / colBlock);
for (int r = 0; r < rowStride; r++) {
for (int c = 0; c < colStride; c++) {
resultMat[(rowStart + r) * k + (colStart + c)] = tmpRes[r * colStride + c];
}
}
delete[] tmpRes;
}
}
} else {
if (rank < worldsize) {
MPI_Send(&rowStride, 1, MPI_INT, 0, 3, MPI_COMM_WORLD);
MPI_Send(&colStride, 1, MPI_INT, 0, 3, MPI_COMM_WORLD);
MPI_Send(res, rowStride * colStride, MPI_FLOAT, 0, 4, MPI_COMM_WORLD);
delete[] res;
delete[] localLeftMat;
delete[] localRightMat;
}
}
MPI_Barrier(MPI_COMM_WORLD);
}
int main(int argc, char *argv[]) {
if (argc != 4) {
cout << "Usage: " << argv[0] << " M N K\n";
exit(-1);
}
int rank;
int worldSize;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &worldSize);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int m = atoi(argv[1]);
int n = atoi(argv[2]);
int k = atoi(argv[3]);
float *leftMat, *rightMat, *resMat;
struct timeval start, stop;
if (rank == 0) {
randMat(m, n, leftMat);
randMat(n, k, rightMat);
randMat(m, k, resMat);
}
gettimeofday(&start, NULL);
mpi_sgemm_optimized(m, n, k, leftMat, rightMat, resMat, rank, worldSize);
gettimeofday(&stop, NULL);
if (rank == 0) {
double elapsed = (stop.tv_sec - start.tv_sec) * 1000.0 +
(stop.tv_usec - start.tv_usec) / 1000.0;
cout << "optimized mpi matmul: " << elapsed << " ms" << endl;
bool correct = true;
for (int i = 0; i < m; i++) {
for (int j = 0; j < k; j++){
if (int(resMat[i * k + j]) != n) {
cout << "Error at [" << i << "][" << j << "]: "
<< resMat[i * k + j] << " (expected " << n << ")\n";
correct = false;
goto end_check;
}
}
}
end_check:
if (correct) {
cout << "Result verification: PASSED" << endl;
} else {
cout << "Result verification: FAILED" << endl;
}
delete[] leftMat;
delete[] rightMat;
delete[] resMat;
}
MPI_Finalize();
return 0;
}
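gemm_optimized takes the three matrix dimensions M N K on the command line and combines MPI ranks with OpenMP threads inside each rank. A possible launch that reuses the hostfile from the lab scripts (the target name and output path are assumptions; the file is presumably built by a work/xmake.lua that is part of this commit but not shown in this view):

cd work
xmake
# 3 ranks (one per host), 2 OpenMP threads per rank; dimensions are illustrative
mpirun --hostfile ~/mpi_hosts -np 3 -x OMP_NUM_THREADS=2 \
    ./build/linux/x86_64/release/gemm_optimized 1024 1024 1024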

work/gemm_parallel.cpp Normal file (+312 lines)

@ -0,0 +1,312 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <iostream>
#include <mpi.h>
#include <omp.h>
#include <vector>
using namespace std;
void randMat(int rows, int cols, float *&Mat) {
Mat = new float[rows * cols];
for (int i = 0; i < rows; i++)
for (int j = 0; j < cols; j++)
Mat[i * cols + j] = 1.0;
}
void openmp_sgemm(int m, int n, int k, float *leftMat, float *rightMat,
float *resultMat) {
// rightMat is transposed
// parallelize the outer loops with OpenMP
#pragma omp parallel for collapse(2)
for (int row = 0; row < m; row++) {
for (int col = 0; col < k; col++) {
resultMat[row * k + col] = 0.0;
for (int i = 0; i < n; i++) {
resultMat[row * k + col] +=
leftMat[row * n + i] * rightMat[col * n + i];
}
}
}
}
void mpi_sgemm(int m, int n, int k, float *&leftMat, float *&rightMat,
float *&resultMat, int rank, int worldsize) {
// compute row/column block counts (as close to a square grid as possible)
int rowBlock = (int)sqrt((double)worldsize);
while (rowBlock > 0 && worldsize % rowBlock != 0) {
rowBlock--;
}
int colBlock = worldsize / rowBlock;
int rowStride, colStride;
float *res = nullptr;
float *localLeftMat = leftMat;
float *localRightMat = rightMat;
if (rank == 0) {
// transpose the matrix
float *buf = new float[k * n];
#pragma omp parallel for collapse(2)
for (int r = 0; r < n; r++) {
for (int c = 0; c < k; c++) {
buf[c * n + r] = rightMat[r * k + c];
}
}
#pragma omp parallel for collapse(2)
for (int r = 0; r < k; r++) {
for (int c = 0; c < n; c++) {
rightMat[r * n + c] = buf[r * n + c];
}
}
delete[] buf;
// master-slave scheme: send submatrices to each worker process
// use a vector so enough request slots can be allocated dynamically
std::vector<MPI_Request> sendRequests;
sendRequests.reserve(1000); // pre-allocate enough space
for (int rowB = 0; rowB < rowBlock; rowB++) {
for (int colB = 0; colB < colBlock; colB++) {
// Compute the block size (banded blocking)
int rowStart = rowB * (m / rowBlock);
int rowEnd = (rowB == rowBlock - 1) ? m : (rowB + 1) * (m / rowBlock);
rowStride = rowEnd - rowStart;
int colStart = colB * (k / colBlock);
int colEnd = (colB == colBlock - 1) ? k : (colB + 1) * (k / colBlock);
colStride = colEnd - colStart;
int sendto = rowB * colBlock + colB;
if (sendto == 0) {
// Rank 0 keeps its own block
res = new float[rowStride * colStride];
localLeftMat = leftMat + rowStart * n;
localRightMat = rightMat + colStart * n;
continue;
}
// Send the block dimensions
MPI_Request req;
MPI_Isend(&rowStride, 1, MPI_INT, sendto, 0, MPI_COMM_WORLD, &req);
sendRequests.push_back(req);
MPI_Isend(&colStride, 1, MPI_INT, sendto, 0, MPI_COMM_WORLD, &req);
sendRequests.push_back(req);
// Send the left-matrix rows of this block
for (int r = 0; r < rowStride; r++) {
MPI_Isend(leftMat + (rowStart + r) * n, n, MPI_FLOAT, sendto,
1, MPI_COMM_WORLD, &req);
sendRequests.push_back(req);
}
// Send the right-matrix data for this block (rows of the transposed matrix)
for (int c = 0; c < colStride; c++) {
MPI_Isend(rightMat + (colStart + c) * n, n, MPI_FLOAT, sendto,
2, MPI_COMM_WORLD, &req);
sendRequests.push_back(req);
}
}
}
// Wait for all sends to complete
for (size_t i = 0; i < sendRequests.size(); i++) {
MPI_Wait(&sendRequests[i], MPI_STATUS_IGNORE);
}
} else {
// Receive the data sent by the master process
if (rank < worldsize) {
// Compute this rank's block position
int rowB = rank / colBlock;
int colB = rank % colBlock;
int rowStart = rowB * (m / rowBlock);
int rowEnd = (rowB == rowBlock - 1) ? m : (rowB + 1) * (m / rowBlock);
rowStride = rowEnd - rowStart;
int colStart = colB * (k / colBlock);
int colEnd = (colB == colBlock - 1) ? k : (colB + 1) * (k / colBlock);
colStride = colEnd - colStart;
// Receive the block dimensions
MPI_Recv(&rowStride, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
MPI_Recv(&colStride, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
// Allocate buffers and receive the data
localLeftMat = new float[rowStride * n];
localRightMat = new float[colStride * n];
for (int r = 0; r < rowStride; r++) {
MPI_Recv(localLeftMat + r * n, n, MPI_FLOAT, 0, 1, MPI_COMM_WORLD,
MPI_STATUS_IGNORE);
}
for (int c = 0; c < colStride; c++) {
MPI_Recv(localRightMat + c * n, n, MPI_FLOAT, 0, 2, MPI_COMM_WORLD,
MPI_STATUS_IGNORE);
}
res = new float[rowStride * colStride];
}
}
MPI_Barrier(MPI_COMM_WORLD);
// Multiply the local sub-matrices
if (rank < worldsize) {
// Recompute the block dimensions
int rowB = rank / colBlock;
int colB = rank % colBlock;
int rowStart = rowB * (m / rowBlock);
int rowEnd = (rowB == rowBlock - 1) ? m : (rowB + 1) * (m / rowBlock);
rowStride = rowEnd - rowStart;
int colStart = colB * (k / colBlock);
int colEnd = (colB == colBlock - 1) ? k : (colB + 1) * (k / colBlock);
colStride = colEnd - colStart;
// Use OpenMP to accelerate the local sub-matrix multiplication
openmp_sgemm(rowStride, n, colStride, localLeftMat, localRightMat, res);
}
MPI_Barrier(MPI_COMM_WORLD);
// Send the results back to rank 0
if (rank == 0) {
// Rank 0 copies its own result directly
int rowB = 0;
int colB = 0;
int rowStart = rowB * (m / rowBlock);
int colStart = colB * (k / colBlock);
for (int r = 0; r < rowStride; r++) {
for (int c = 0; c < colStride; c++) {
resultMat[(rowStart + r) * k + (colStart + c)] = res[r * colStride + c];
}
}
delete[] res;
// Receive the results from the other processes
for (int rowB = 0; rowB < rowBlock; rowB++) {
for (int colB = 0; colB < colBlock; colB++) {
int recvfrom = rowB * colBlock + colB;
if (recvfrom == 0) continue;
// Receive the block dimensions
MPI_Recv(&rowStride, 1, MPI_INT, recvfrom, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
MPI_Recv(&colStride, 1, MPI_INT, recvfrom, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
// Receive the result data
float *tmpRes = new float[rowStride * colStride];
MPI_Recv(tmpRes, rowStride * colStride, MPI_FLOAT, recvfrom, 4,
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
// Assemble into the global result matrix
int rowStart = rowB * (m / rowBlock);
int colStart = colB * (k / colBlock);
for (int r = 0; r < rowStride; r++) {
for (int c = 0; c < colStride; c++) {
resultMat[(rowStart + r) * k + (colStart + c)] = tmpRes[r * colStride + c];
}
}
delete[] tmpRes;
}
}
} else {
if (rank < worldsize) {
// Send the block dimensions
MPI_Send(&rowStride, 1, MPI_INT, 0, 3, MPI_COMM_WORLD);
MPI_Send(&colStride, 1, MPI_INT, 0, 3, MPI_COMM_WORLD);
// Send the result data
MPI_Send(res, rowStride * colStride, MPI_FLOAT, 0, 4, MPI_COMM_WORLD);
delete[] res;
delete[] localLeftMat;
delete[] localRightMat;
}
}
MPI_Barrier(MPI_COMM_WORLD);
}
int main(int argc, char *argv[]) {
if (argc != 4) {
if (argc == 0) {
cout << "Usage: program M N K" << endl;
} else {
cout << "Usage: " << argv[0] << " M N K\n";
}
exit(-1);
}
int rank;
int worldSize;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &worldSize);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
// Matrix dimensions
int m = atoi(argv[1]);
int n = atoi(argv[2]);
int k = atoi(argv[3]);
float *leftMat = nullptr, *rightMat = nullptr, *resMat = nullptr;  // allocated only on rank 0
struct timeval start, stop;
// Matrix initialization
if (rank == 0) {
randMat(m, n, leftMat);
randMat(n, k, rightMat);
randMat(m, k, resMat);
}
gettimeofday(&start, NULL);
// Multiply the matrices with the hybrid MPI-OpenMP implementation
mpi_sgemm(m, n, k, leftMat, rightMat, resMat, rank, worldSize);
gettimeofday(&stop, NULL);
// Print the results
if (rank == 0) {
double elapsed = (stop.tv_sec - start.tv_sec) * 1000.0 +
(stop.tv_usec - start.tv_usec) / 1000.0;
cout << "mpi matmul: " << elapsed << " ms" << endl;
// Verify the result (with all-ones inputs every entry should equal n)
bool correct = true;
for (int i = 0; i < m; i++) {
for (int j = 0; j < k; j++){
if (int(resMat[i * k + j]) != n) {
cout << "Error at [" << i << "][" << j << "]: "
<< resMat[i * k + j] << " (expected " << n << ")\n";
correct = false;
goto end_check;
}
}
}
end_check:
if (correct) {
cout << "Result verification: PASSED" << endl;
} else {
cout << "Result verification: FAILED" << endl;
}
delete[] leftMat;
delete[] rightMat;
delete[] resMat;
}
MPI_Finalize();
return 0;
}
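A note on the decomposition used by mpi_sgemm above: rank rowB * colBlock + colB owns the row band starting at rowB * (m / rowBlock) and the column band starting at colB * (k / colBlock), and the last block in each direction absorbs the remainder. The following standalone sketch (hypothetical, not part of the lab code; it assumes worldsize = 6 and a 10x8 result) reproduces that grid logic without launching MPI, so the mapping can be checked by eye:

#include <cmath>
#include <cstdio>

// Hypothetical helper: prints the process grid and per-rank block bounds
// produced by the same rowBlock/colBlock logic as mpi_sgemm.
int main() {
    const int worldsize = 6, m = 10, k = 8;  // assumed example values
    int rowBlock = (int)std::sqrt((double)worldsize);
    while (rowBlock > 0 && worldsize % rowBlock != 0) rowBlock--;  // largest divisor <= sqrt(worldsize)
    int colBlock = worldsize / rowBlock;                           // 6 processes -> 2 x 3 grid
    for (int rank = 0; rank < worldsize; rank++) {
        int rowB = rank / colBlock, colB = rank % colBlock;
        int rowStart = rowB * (m / rowBlock);
        int rowEnd = (rowB == rowBlock - 1) ? m : (rowB + 1) * (m / rowBlock);
        int colStart = colB * (k / colBlock);
        int colEnd = (colB == colBlock - 1) ? k : (colB + 1) * (k / colBlock);
        printf("rank %d -> rows [%d,%d) x cols [%d,%d)\n", rank, rowStart, rowEnd, colStart, colEnd);
    }
    return 0;
}

With these values the last column block receives columns 4-7 rather than 2, which is why both the sender and the receivers recompute rowEnd/colEnd with the (rowB == rowBlock - 1) check.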

97
work/gemm_serial.cpp Normal file
View File

@ -0,0 +1,97 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <iostream>
using namespace std;
void randMat(int rows, int cols, float *&Mat) {
Mat = new float[rows * cols];
for (int i = 0; i < rows; i++)
for (int j = 0; j < cols; j++)
Mat[i * cols + j] = 1.0;
}
void serial_sgemm(int m, int n, int k, float *&leftMat, float *&rightMat,
float *&resultMat) {
// rightMat is transposed
float *buf = new float[k * n];
// transpose right Mat
for (int r = 0; r < n; r++) {
for (int c = 0; c < k; c++) {
buf[c * n + r] = rightMat[r * k + c];
}
}
for (int r = 0; r < k; r++) {
for (int c = 0; c < n; c++) {
rightMat[r * n + c] = buf[r * n + c];
}
}
for (int row = 0; row < m; row++) {
for (int col = 0; col < k; col++) {
resultMat[row * k + col] = 0.0;
for (int i = 0; i < n; i++) {
resultMat[row * k + col] +=
leftMat[row * n + i] * rightMat[col * n + i];
}
}
}
delete[] buf;
return;
}
int main(int argc, char *argv[]) {
if (argc != 5) {
cout << "Usage: " << argv[0] << " M N K use-blas\n";
exit(-1);
}
int m = atoi(argv[1]);
int n = atoi(argv[2]);
int k = atoi(argv[3]);
int blas = atoi(argv[4]); // use-blas flag is parsed but not used by this reference implementation
float *leftMat, *rightMat, *resMat;
struct timeval start, stop;
randMat(m, n, leftMat);
randMat(n, k, rightMat);
randMat(m, k, resMat);
gettimeofday(&start, NULL);
serial_sgemm(m, n, k, leftMat, rightMat, resMat);
gettimeofday(&stop, NULL);
cout << "matmul: "
<< (stop.tv_sec - start.tv_sec) * 1000.0 +
(stop.tv_usec - start.tv_usec) / 1000.0
<< " ms" << endl;
// Verify the result (with all-ones inputs every entry should equal n)
bool correct = true;
for (int i = 0; i < m; i++) {
for (int j = 0; j < k; j++){
if (int(resMat[i * k + j]) != n) {
cout << "Error at [" << i << "][" << j << "]: "
<< resMat[i * k + j] << " (expected " << n << ")\n";
correct = false;
goto end_check;
}
}
}
end_check:
if (correct) {
cout << "Result verification: PASSED" << endl;
} else {
cout << "Result verification: FAILED" << endl;
}
delete[] leftMat;
delete[] rightMat;
delete[] resMat;
return 0;
}
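The transpose performed at the top of serial_sgemm costs one extra pass over rightMat but lets both operands stream contiguously in the inner product loop. For comparison, here is a minimal sketch of the untransposed variant (serial_sgemm_naive and the tiny all-ones test are hypothetical, not part of the lab code); in it the right-hand operand is read with stride k in the inner loop:

#include <cstdio>

// For comparison only: direct row/col/i loop without transposing rightMat.
static void serial_sgemm_naive(int m, int n, int k, const float *leftMat,
                               const float *rightMat, float *resultMat) {
    for (int row = 0; row < m; row++) {
        for (int col = 0; col < k; col++) {
            float acc = 0.0f;
            for (int i = 0; i < n; i++) {
                acc += leftMat[row * n + i] * rightMat[i * k + col];  // stride-k access
            }
            resultMat[row * k + col] = acc;
        }
    }
}

int main() {
    const int m = 4, n = 3, k = 2;  // tiny all-ones example
    float A[m * n], B[n * k], C[m * k];
    for (int i = 0; i < m * n; i++) A[i] = 1.0f;
    for (int i = 0; i < n * k; i++) B[i] = 1.0f;
    serial_sgemm_naive(m, n, k, A, B, C);
    printf("C[0] = %.1f (expected %d)\n", C[0], n);  // every entry equals n
    return 0;
}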

207
work/run_experiments.sh Executable file
View File

@ -0,0 +1,207 @@
#!/bin/bash
# MPI-OpenMP matrix multiplication performance test script
# Used to collect the experiment data
# Environment variables
export OMP_NUM_THREADS=${OMP_NUM_THREADS:-1}
# 输出文件
OUTPUT_FILE="experiment_results.csv"
SERIAL_OUTPUT="serial_results.csv"
# get arch using uname -m
# if aarch64 then use arm64-v8a else use x86_64
ARCH=$(uname -m)
if [ "$ARCH" == "aarch64" ]; then
BUILD_ARCH="arm64-v8a"
else
BUILD_ARCH="x86_64"
fi
# Build directory
BUILD_DIR="./build/linux/$BUILD_ARCH/release"
# Create the output files and write the CSV headers
echo "Experiment,M,N,K,MPI_Processes,OpenMP_Threads,Time_ms,Speedup,Efficiency" > $OUTPUT_FILE
echo "M,N,K,Time_ms" > $SERIAL_OUTPUT
# Matrix size configuration (adjust as needed)
MATRIX_SIZES="512 1024 2048 4096"
# MPI process count configuration
MPI_PROCESSES="1 2 3 6 9 12"
# OpenMP thread count configuration
OPENMP_THREADS="1 2 4 8"
echo "=========================================="
echo "MPI-OpenMP矩阵乘法性能测试"
echo "=========================================="
# Build the programs
echo "Building..."
xmake
if [ $? -ne 0 ]; then
echo "编译失败!"
exit 1
fi
echo "编译完成!"
echo ""
# Collect the serial baseline times
echo "=========================================="
echo "实验0: 串行基准测试"
echo "=========================================="
for SIZE in $MATRIX_SIZES; do
echo "测试矩阵尺寸: ${SIZE}x${SIZE}x${SIZE}"
TIME=$($BUILD_DIR/gemm_serial $SIZE $SIZE $SIZE 0 | grep "matmul:" | awk '{print $2}')
echo " 时间: ${TIME} ms"
echo "$SIZE,$SIZE,$SIZE,$TIME" >> $SERIAL_OUTPUT
done
echo ""
# Experiment 1: fix the OpenMP thread count at 1 and vary the number of MPI processes
echo "=========================================="
echo "实验一: OpenMP线程数=1改变MPI进程数"
echo "=========================================="
export OMP_NUM_THREADS=1
for SIZE in $MATRIX_SIZES; do
# Look up the serial baseline time
SERIAL_TIME=$(grep "^$SIZE," $SERIAL_OUTPUT | cut -d',' -f4)
echo "矩阵尺寸: ${SIZE}x${SIZE}x${SIZE}"
echo "串行时间: ${SERIAL_TIME} ms"
for NP in $MPI_PROCESSES; do
echo " MPI进程数: $NP"
TIME=$(mpirun --hostfile ~/mpi_hosts --oversubscribe -np $NP $BUILD_DIR/gemm_parallel $SIZE $SIZE $SIZE | grep "mpi matmul:" | awk '{print $3}')
if [ ! -z "$TIME" ]; then
SPEEDUP=$(echo "scale=4; $SERIAL_TIME / $TIME" | bc)
EFFICIENCY=$(echo "scale=4; $SPEEDUP / $NP" | bc)
echo " 时间: ${TIME} ms, 加速比: $SPEEDUP, 效率: $EFFICIENCY"
echo "Exp1,$SIZE,$SIZE,$SIZE,$NP,1,$TIME,$SPEEDUP,$EFFICIENCY" >> $OUTPUT_FILE
fi
done
echo ""
done
# Experiment 2: vary the MPI process count and the OpenMP thread count together
echo "=========================================="
echo "实验二: 改变MPI进程数和OpenMP线程数"
echo "=========================================="
for SIZE in $MATRIX_SIZES; do
# Look up the serial baseline time
SERIAL_TIME=$(grep "^$SIZE," $SERIAL_OUTPUT | cut -d',' -f4)
echo "矩阵尺寸: ${SIZE}x${SIZE}x${SIZE}"
for NTHREADS in $OPENMP_THREADS; do
export OMP_NUM_THREADS=$NTHREADS
echo " OpenMP线程数: $NTHREADS"
for NP in $MPI_PROCESSES; do
TOTAL_PROCS=$((NP * NTHREADS))
echo " MPI进程数: $NP (总处理器数: $TOTAL_PROCS)"
TIME=$(mpirun --hostfile ~/mpi_hosts --oversubscribe -np $NP $BUILD_DIR/gemm_parallel $SIZE $SIZE $SIZE | grep "mpi matmul:" | awk '{print $3}')
if [ ! -z "$TIME" ]; then
SPEEDUP=$(echo "scale=4; $SERIAL_TIME / $TIME" | bc)
EFFICIENCY=$(echo "scale=4; $SPEEDUP / $TOTAL_PROCS" | bc)
echo " 时间: ${TIME} ms, 加速比: $SPEEDUP, 效率: $EFFICIENCY"
echo "Exp2,$SIZE,$SIZE,$SIZE,$NP,$NTHREADS,$TIME,$SPEEDUP,$EFFICIENCY" >> $OUTPUT_FILE
fi
done
done
echo ""
done
# Experiment 3: fix the total processor count and vary the MPI/OpenMP combination
echo "=========================================="
echo "实验三: 固定总处理器数改变MPI/OpenMP组合"
echo "=========================================="
TOTAL_PROCS_TARGET=16
echo "目标总处理器数: $TOTAL_PROCS_TARGET"
for SIZE in $MATRIX_SIZES; do
# Look up the serial baseline time
SERIAL_TIME=$(grep "^$SIZE," $SERIAL_OUTPUT | cut -d',' -f4)
echo "矩阵尺寸: ${SIZE}x${SIZE}x${SIZE}"
# 不同的MPI/OpenMP组合使得总处理器数接近16
declare -a COMBOS=("1:16" "2:8" "4:4" "8:2" "16:1")
for COMBO in "${COMBOS[@]}"; do
NP=$(echo $COMBO | cut -d':' -f1)
NTHREADS=$(echo $COMBO | cut -d':' -f2)
TOTAL_PROCS=$((NP * NTHREADS))
export OMP_NUM_THREADS=$NTHREADS
echo " MPI: $NP, OpenMP: $NTHREADS (总处理器: $TOTAL_PROCS)"
TIME=$(mpirun --hostfile ~/mpi_hosts --oversubscribe -np $NP $BUILD_DIR/gemm_parallel $SIZE $SIZE $SIZE | grep "mpi matmul:" | awk '{print $3}')
if [ ! -z "$TIME" ]; then
SPEEDUP=$(echo "scale=4; $SERIAL_TIME / $TIME" | bc)
EFFICIENCY=$(echo "scale=4; $SPEEDUP / $TOTAL_PROCS" | bc)
echo " 时间: ${TIME} ms, 加速比: $SPEEDUP, 效率: $EFFICIENCY"
echo "Exp3,$SIZE,$SIZE,$SIZE,$NP,$NTHREADS,$TIME,$SPEEDUP,$EFFICIENCY" >> $OUTPUT_FILE
fi
done
echo ""
done
# Experiment 3 (optimized implementation): fixed total processor count, using gemm_optimized; results are labelled Exp3-opt
echo "=========================================="
echo "实验三(优化): 固定总处理器数,使用 gemm_optimized 的 MPI/OpenMP 组合测试"
echo "=========================================="
for SIZE in $MATRIX_SIZES; do
# Look up the serial baseline time
SERIAL_TIME=$(grep "^$SIZE," $SERIAL_OUTPUT | cut -d',' -f4)
echo "矩阵尺寸: ${SIZE}x${SIZE}x${SIZE}"
# 与之前相同的组合
declare -a COMBOS_OPT=("1:16" "2:8" "4:4" "8:2" "16:1")
for COMBO in "${COMBOS_OPT[@]}"; do
NP=$(echo $COMBO | cut -d':' -f1)
NTHREADS=$(echo $COMBO | cut -d':' -f2)
TOTAL_PROCS=$((NP * NTHREADS))
export OMP_NUM_THREADS=$NTHREADS
echo " MPI: $NP, OpenMP: $NTHREADS (总处理器: $TOTAL_PROCS)"
TIME=$(mpirun --hostfile ~/mpi_hosts --oversubscribe -np $NP $BUILD_DIR/gemm_optimized $SIZE $SIZE $SIZE | grep "optimized mpi matmul:" | awk '{print $4}')
if [ ! -z "$TIME" ]; then
SPEEDUP=$(echo "scale=4; $SERIAL_TIME / $TIME" | bc)
EFFICIENCY=$(echo "scale=4; $SPEEDUP / $TOTAL_PROCS" | bc)
echo " 时间: ${TIME} ms, 加速比: $SPEEDUP, 效率: $EFFICIENCY"
echo "Exp3-opt,$SIZE,$SIZE,$SIZE,$NP,$NTHREADS,$TIME,$SPEEDUP,$EFFICIENCY" >> $OUTPUT_FILE
fi
done
echo ""
done
echo "=========================================="
echo "测试完成!"
echo "结果已保存到: $OUTPUT_FILE"
echo "串行基准已保存到: $SERIAL_OUTPUT"
echo "=========================================="
echo ""
echo "数据处理说明:"
echo "1. 使用Excel、Python或R读取CSV文件"
echo "2. 绘制图表:"
echo " - 实验一: X轴=MPI进程数, Y轴=加速比/效率, 不同矩阵尺寸用不同颜色"
echo " - 实验二: X轴=总处理器数, Y轴=加速比/效率, 不同OpenMP线程数用不同颜色"
echo " - 实验三: X轴=MPI进程数, Y轴=效率, 不同矩阵尺寸用不同颜色"
echo "3. 分析加速比和效率的变化趋势"
echo "4. 讨论MPI/OpenMP组合对性能的影响"

33
work/xmake.lua Normal file
View File

@ -0,0 +1,33 @@
set_project("gemm")
set_version("1.0")
add_requires("openmp")
add_rules("mode.debug", "mode.release")
-- Find MPI package
add_requires("mpi", {system = true})
add_requires("mpi_cxx", {system = true})
-- Serial version
target("gemm_serial")
set_kind("binary")
add_files("gemm_serial.cpp")
add_cxxflags("-O3", "-march=native")
-- Parallel version
target("gemm_parallel")
set_kind("binary")
add_files("gemm_parallel.cpp")
add_cxxflags("-O3", "-march=native")
add_packages("openmp")
-- Use the system MPI packages (i.e. build as mpic++ would)
add_packages("mpi")
add_packages("mpi_cxx")
-- Optimized version
target("gemm_optimized")
set_kind("binary")
add_files("gemm_optimized.cpp")
add_cxxflags("-O3", "-march=native")
add_packages("openmp")
-- Use the system MPI packages (i.e. build as mpic++ would)
add_packages("mpi")
add_packages("mpi_cxx")