Compare commits

3 Commits

| Author | SHA1 | Date |
|---|---|---|
|  | 08e8addef1 |  |
|  | 4291d88e3b |  |
|  | 734edd2aa6 |  |
buildall.sh (new executable file, 66 lines added)
@@ -0,0 +1,66 @@
|
||||
#!/bin/bash
|
||||
|
||||
# HPC Lab 统一构建脚本
|
||||
# 使用 xmake 构建所有实验项目
|
||||
|
||||
set -e # 遇到错误立即退出
|
||||
|
||||
SCRIPT_DIR="$(dirname "$0")"
|
||||
cd "$SCRIPT_DIR"
|
||||
|
||||
# 检查 xmake 是否安装
|
||||
if ! command -v xmake &> /dev/null; then
|
||||
echo "错误: xmake 未安装,请先安装 xmake"
|
||||
echo "安装方法: curl -fsSL https://xmake.io/shget.text | bash"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "=========================================="
|
||||
echo "HPC Lab Code - 全项目构建"
|
||||
echo "=========================================="
|
||||
|
||||
# 构建 lab1
|
||||
echo "构建 lab1..."
|
||||
cd lab1
|
||||
xmake
|
||||
cd ..
|
||||
|
||||
# 构建 lab2/omp
|
||||
echo "构建 lab2/omp..."
|
||||
cd lab2/omp
|
||||
xmake
|
||||
cd ../..
|
||||
|
||||
# 构建 lab2/pthread
|
||||
echo "构建 lab2/pthread..."
|
||||
cd lab2/pthread
|
||||
xmake
|
||||
cd ../..
|
||||
|
||||
# 构建 lab3/nbody
|
||||
echo "构建 lab3/nbody..."
|
||||
cd lab3/nbody
|
||||
xmake
|
||||
cd ../..
|
||||
|
||||
# 构建 lab3/prime
|
||||
echo "构建 lab3/prime..."
|
||||
cd lab3/prime
|
||||
xmake
|
||||
cd ../..
|
||||
|
||||
# 构建 lab4
|
||||
echo "构建 lab4..."
|
||||
cd lab4
|
||||
xmake
|
||||
cd ..
|
||||
|
||||
# 构建 work
|
||||
echo "构建 work..."
|
||||
cd work
|
||||
xmake
|
||||
cd ..
|
||||
|
||||
echo "=========================================="
|
||||
echo "构建完成!"
|
||||
echo "=========================================="
|
||||
lab1/lab1.sh (15 changed lines)
@@ -1,9 +1,16 @@
|
||||
#!/bin/bash
|
||||
|
||||
echo "Current directory: $PWD"
|
||||
|
||||
# get arch using uname -m
|
||||
# if aarch64 then use arm64-v8a else use x86_64
|
||||
ARCH=$(uname -m)
|
||||
if [ "$ARCH" == "aarch64" ]; then
|
||||
BUILD_ARCH="arm64-v8a"
|
||||
else
|
||||
BUILD_ARCH="x86_64"
|
||||
fi
|
||||
# Build directory
|
||||
BUILD_DIR="./build/linux/x86_64/release"
|
||||
BUILD_DIR="./build/linux/$BUILD_ARCH/release"
|
||||
|
||||
# Programs
|
||||
MPI_HELLO="$BUILD_DIR/mpi_hello_world"
|
||||
@@ -28,7 +35,7 @@ echo "Programs found. Starting tests..."
|
||||
|
||||
# Test mpi_hello_world
|
||||
echo "Testing mpi_hello_world with default settings:"
|
||||
mpirun "$MPI_HELLO"
|
||||
mpirun --hostfile ~/mpi_hosts "$MPI_HELLO"
|
||||
echo "mpi_hello_world test completed."
|
||||
|
||||
# Terms to test
|
||||
@@ -41,7 +48,7 @@ echo "Testing mpi_pi with different terms and processes:"
|
||||
for procs in "${PROCS[@]}"; do
|
||||
for terms in "${TERMS[@]}"; do
|
||||
echo "Running mpi_pi with $procs processes and $terms terms:"
|
||||
mpirun -np $procs "$MPI_PI" <<< $terms
|
||||
mpirun --hostfile ~/mpi_hosts -np $procs "$MPI_PI" <<< $terms
|
||||
echo ""
|
||||
done
|
||||
done
|
||||
|
||||
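Note: the updated lab1.sh runs each test twice, once locally and once with --hostfile ~/mpi_hosts. The hostfile itself is not part of this diff; the sketch below only illustrates the usual Open MPI hostfile format, borrowing the hpc-ecs-1/2/3 names and the two-slots-per-machine setup that the lab3 nbody script later in this comparison assumes.

```bash
# Hypothetical ~/mpi_hosts (not included in this diff); hostnames and slot counts
# are assumptions taken from the lab3 nbody script, format per Open MPI hostfiles.
cat > ~/mpi_hosts <<'EOF'
hpc-ecs-1 slots=2
hpc-ecs-2 slots=2
hpc-ecs-3 slots=2
EOF
# After which the new invocations in lab1.sh apply, e.g.:
#   mpirun --hostfile ~/mpi_hosts "$MPI_HELLO"
```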
lab2/omp/main.cpp (new file, 6 lines added)
@@ -0,0 +1,6 @@
#include <iostream>

int main(int argc, char** argv) {
    std::cout << "hello world!" << std::endl;
    return 0;
}
lab2/omp/openmp_hello_world.c (new file, 17 lines added)
@@ -0,0 +1,17 @@
#include <stdio.h>
#include <omp.h>

int main() {
    int i;

    #pragma omp parallel
    {
        printf("Hello World\n");
        for(i=0; i<4; i++) {
            printf("Iter:%d\n",i);
        }
        printf("GoodBye World\n");
    }

    return 0;
}
lab2/omp/pi.c (new file, 33 lines added)
@@ -0,0 +1,33 @@
|
||||
#include <stdio.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
long long num_steps = 1000000000;
|
||||
double step;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
struct timeval TimeStampStart, TimeStampStop;
|
||||
double ExeTime;
|
||||
double x, pi, sum=0.0;
|
||||
int i;
|
||||
step = 1./(double)num_steps;
|
||||
|
||||
gettimeofday(&TimeStampStart, NULL);
|
||||
|
||||
for (i=0; i<num_steps; i++)
|
||||
{
|
||||
x = (i + .5)*step;
|
||||
sum = sum + 4.0/(1.+ x*x);
|
||||
}
|
||||
|
||||
pi = sum*step;
|
||||
|
||||
gettimeofday(&TimeStampStop, NULL);
|
||||
ExeTime = (double)(TimeStampStop.tv_sec - TimeStampStart.tv_sec) +
|
||||
(double)(TimeStampStop.tv_usec - TimeStampStart.tv_usec) * 1e-6;
|
||||
|
||||
printf("The value of PI is %15.12f\n",pi);
|
||||
printf("The time to calculate PI was %f seconds\n", (ExeTime));
|
||||
|
||||
return 0;
|
||||
}
|
||||
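Note: the loop in pi.c is a midpoint-rule approximation of the arctangent integral, which is why summing 4/(1 + x*x) at the interval midpoints recovers pi:

$$\pi = \int_0^1 \frac{4}{1+x^2}\,dx \;\approx\; \sum_{i=0}^{n-1} \frac{4}{1+x_i^2}\,\Delta x, \qquad x_i = (i+0.5)\,\Delta x,\quad \Delta x = \frac{1}{n},\quad n = \texttt{num\_steps}.$$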
lab2/omp/pi_par.c (new file, 38 lines added)
@@ -0,0 +1,38 @@
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
long long num_steps = 1000000000;
|
||||
double step;
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
struct timeval TimeStampStart, TimeStampStop;
|
||||
double ExeTime;
|
||||
double x, pi, sum=0.0;
|
||||
int i;
|
||||
step = 1./(double)num_steps;
|
||||
|
||||
gettimeofday(&TimeStampStart, NULL);
|
||||
|
||||
#pragma omp parallel private(x) reduction(+:sum)
|
||||
{
|
||||
#pragma omp for
|
||||
for (i=0; i<num_steps; i++)
|
||||
{
|
||||
x = (i + .5)*step;
|
||||
sum = sum + 4.0/(1.+ x*x);
|
||||
}
|
||||
}
|
||||
|
||||
pi = sum*step;
|
||||
|
||||
gettimeofday(&TimeStampStop, NULL);
|
||||
ExeTime = (double)(TimeStampStop.tv_sec - TimeStampStart.tv_sec) +
|
||||
(double)(TimeStampStop.tv_usec - TimeStampStart.tv_usec) * 1e-6;
|
||||
|
||||
printf("The value of PI is %15.12f\n",pi);
|
||||
printf("The time to calculate PI was %f seconds\n", (ExeTime));
|
||||
|
||||
return 0;
|
||||
}
|
||||
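Note: pi_par.c parallelizes the same midpoint sum with #pragma omp parallel plus #pragma omp for and a reduction(+:sum) clause, so each thread accumulates a private partial sum that OpenMP combines at the end. A minimal way to try it outside the project build (a sketch, assuming gcc with libgomp; the repository itself builds this target through the xmake.lua shown further down):

```bash
# Sketch only: standalone compile of pi_par.c and a thread-count sweep.
gcc -O2 -fopenmp -o pi_par pi_par.c
for t in 1 2 4 8; do
    echo "OMP_NUM_THREADS=$t"
    OMP_NUM_THREADS=$t ./pi_par
done
```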
lab2/omp/pimonte_par.c (new file, 53 lines added)
@@ -0,0 +1,53 @@
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
#include <sys/time.h>
|
||||
#include <time.h>
|
||||
|
||||
#define BLOCK_SIZE 500
|
||||
|
||||
int main(){
|
||||
struct timeval TimeStampStart, TimeStampStop;
|
||||
double ExeTime;
|
||||
unsigned int iter=200000000;
|
||||
int i, j;
|
||||
double x, y;
|
||||
double dUnderCurve=0.0;
|
||||
double pi=0.0;
|
||||
double r[BLOCK_SIZE*2];
|
||||
|
||||
gettimeofday(&TimeStampStart, NULL);
|
||||
|
||||
#pragma omp parallel private(i, j, x, y, r) reduction(+:dUnderCurve)
|
||||
{
|
||||
unsigned int seed = omp_get_thread_num() + 1;
|
||||
|
||||
#pragma omp for
|
||||
for(j=0; j<iter/BLOCK_SIZE; j++) {
|
||||
// Create random numbers into array r
|
||||
// 生成 BLOCK_SIZE*2 个在 0.0-1.0 内均匀分布的随机数, 作为横纵坐标
|
||||
for (i=0; i<BLOCK_SIZE*2; i++) {
|
||||
r[i] = 0.0 + 1.0 * rand_r(&seed) / RAND_MAX * ( 1.0 - 0.0 );
|
||||
}
|
||||
|
||||
for (i=0; i<BLOCK_SIZE; i++) {
|
||||
x=r[i]; //X Coordinate
|
||||
y=r[i+BLOCK_SIZE]; //Y Coordinate
|
||||
if (x*x + y*y <= 1.0) { //is distance from Origin under Curve
|
||||
dUnderCurve++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pi = dUnderCurve / (double) iter * 4;
|
||||
|
||||
gettimeofday(&TimeStampStop, NULL);
|
||||
ExeTime = (double)(TimeStampStop.tv_sec - TimeStampStart.tv_sec) +
|
||||
(double)(TimeStampStop.tv_usec - TimeStampStart.tv_usec) * 1e-6;
|
||||
|
||||
printf ("pi = %10.9f\n", pi);
|
||||
printf("The time to calculate PI was %f seconds\n", (ExeTime));
|
||||
|
||||
return 0;
|
||||
}
|
||||
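Note: pimonte_par.c switches from rand() to rand_r() with a per-thread seed because rand() keeps shared global state; calling it from several OpenMP threads would either race or serialize on an internal lock. The estimator itself is the usual quarter-circle area ratio,

$$\hat{\pi} \;=\; 4 \cdot \frac{\#\{(x_i, y_i) : x_i^2 + y_i^2 \le 1\}}{\texttt{iter}},$$

whose statistical error shrinks like $1/\sqrt{\texttt{iter}}$.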
lab2/omp/pimonte_serial.c (new file, 49 lines added)
@@ -0,0 +1,49 @@
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/time.h>
|
||||
#include <time.h>
|
||||
|
||||
#define BLOCK_SIZE 500
|
||||
|
||||
int main(){
|
||||
struct timeval TimeStampStart, TimeStampStop;
|
||||
double ExeTime;
|
||||
unsigned int iter=200000000;
|
||||
int i, j;
|
||||
double x, y;
|
||||
double dUnderCurve=0.0;
|
||||
double pi=0.0;
|
||||
double r[BLOCK_SIZE*2]; //Careful!!!
|
||||
//you need a private copy of whole array for each thread
|
||||
|
||||
srand((unsigned)time(NULL));
|
||||
|
||||
gettimeofday(&TimeStampStart, NULL);
|
||||
|
||||
for(j=0; j<iter/BLOCK_SIZE; j++) {
|
||||
// Create random numbers into array r
|
||||
// 生成 BLOCK_SIZE*2 个在 0.0-1.0 内均匀分布的随机数, 作为横纵坐标
|
||||
for (i=0; i<BLOCK_SIZE*2; i++) {
|
||||
r[i] = 0.0 + 1.0 * rand() / RAND_MAX * ( 1.0 - 0.0 );
|
||||
}
|
||||
|
||||
for (i=0; i<BLOCK_SIZE; i++) {
|
||||
x=r[i]; //X Coordinate
|
||||
y=r[i+BLOCK_SIZE]; //Y Coordinate
|
||||
if (x*x + y*y <= 1.0) { //is distance from Origin under Curve
|
||||
dUnderCurve++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pi = dUnderCurve / (double) iter * 4;
|
||||
|
||||
gettimeofday(&TimeStampStop, NULL);
|
||||
ExeTime = (double)(TimeStampStop.tv_sec - TimeStampStart.tv_sec) +
|
||||
(double)(TimeStampStop.tv_usec - TimeStampStart.tv_usec) * 1e-6;
|
||||
|
||||
printf ("pi = %10.9f\n", pi);
|
||||
printf("The time to calculate PI was %f seconds\n", (ExeTime));
|
||||
|
||||
return 0;
|
||||
}
|
||||
lab2/omp/xmake.lua (new file, 98 lines added)
@@ -0,0 +1,98 @@
|
||||
add_rules("mode.debug", "mode.release")
|
||||
add_requires("openmp")
|
||||
-- OpenMP Hello World
|
||||
target("openmp_hello_world")
|
||||
set_kind("binary")
|
||||
add_files("openmp_hello_world.c")
|
||||
add_packages("openmp")
|
||||
-- PI Serial (中值积分)
|
||||
target("pi")
|
||||
set_kind("binary")
|
||||
add_files("pi.c")
|
||||
|
||||
-- PI Parallel (中值积分)
|
||||
target("pi_par")
|
||||
set_kind("binary")
|
||||
add_files("pi_par.c")
|
||||
add_packages("openmp")
|
||||
|
||||
-- PI Monte Carlo Serial
|
||||
target("pimonte_serial")
|
||||
set_kind("binary")
|
||||
add_files("pimonte_serial.c")
|
||||
|
||||
-- PI Monte Carlo Parallel
|
||||
target("pimonte_par")
|
||||
set_kind("binary")
|
||||
add_files("pimonte_par.c")
|
||||
add_packages("openmp")
|
||||
|
||||
--
|
||||
-- If you want to known more usage about xmake, please see https://xmake.io
|
||||
--
|
||||
-- ## FAQ
|
||||
--
|
||||
-- You can enter the project directory firstly before building project.
|
||||
--
|
||||
-- $ cd projectdir
|
||||
--
|
||||
-- 1. How to build project?
|
||||
--
|
||||
-- $ xmake
|
||||
--
|
||||
-- 2. How to configure project?
|
||||
--
|
||||
-- $ xmake f -p [macosx|linux|iphoneos ..] -a [x86_64|i386|arm64 ..] -m [debug|release]
|
||||
--
|
||||
-- 3. Where is the build output directory?
|
||||
--
|
||||
-- The default output directory is `./build` and you can configure the output directory.
|
||||
--
|
||||
-- $ xmake f -o outputdir
|
||||
-- $ xmake
|
||||
--
|
||||
-- 4. How to run and debug target after building project?
|
||||
--
|
||||
-- $ xmake run [targetname]
|
||||
-- $ xmake run -d [targetname]
|
||||
--
|
||||
-- 5. How to install target to the system directory or other output directory?
|
||||
--
|
||||
-- $ xmake install
|
||||
-- $ xmake install -o installdir
|
||||
--
|
||||
-- 6. Add some frequently-used compilation flags in xmake.lua
|
||||
--
|
||||
-- @code
|
||||
-- -- add debug and release modes
|
||||
-- add_rules("mode.debug", "mode.release")
|
||||
--
|
||||
-- -- add macro definition
|
||||
-- add_defines("NDEBUG", "_GNU_SOURCE=1")
|
||||
--
|
||||
-- -- set warning all as error
|
||||
-- set_warnings("all", "error")
|
||||
--
|
||||
-- -- set language: c99, c++11
|
||||
-- set_languages("c99", "c++11")
|
||||
--
|
||||
-- -- set optimization: none, faster, fastest, smallest
|
||||
-- set_optimize("fastest")
|
||||
--
|
||||
-- -- add include search directories
|
||||
-- add_includedirs("/usr/include", "/usr/local/include")
|
||||
--
|
||||
-- -- add link libraries and search directories
|
||||
-- add_links("tbox")
|
||||
-- add_linkdirs("/usr/local/lib", "/usr/lib")
|
||||
--
|
||||
-- -- add system link libraries
|
||||
-- add_syslinks("z", "pthread")
|
||||
--
|
||||
-- -- add compilation and link flags
|
||||
-- add_cxflags("-stdnolib", "-fno-strict-aliasing")
|
||||
-- add_ldflags("-L/usr/local/lib", "-lpthread", {force = true})
|
||||
--
|
||||
-- @endcode
|
||||
--
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
#include <string.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#define NUM_THREADS 4
|
||||
int NUM_THREADS= 4;
|
||||
|
||||
FILE *fd;
|
||||
int TotalEvenWords = 0, TotalOddWords = 0, TotalWords = 0;
|
||||
@@ -66,14 +66,16 @@ void *count_words_thread(void *arg)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int main()
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
fd = fopen("./InFile1.txt", "r"); // Open file for read
|
||||
if (fd == NULL) {
|
||||
perror("Failed to open file");
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (argc > 1){
|
||||
NUM_THREADS = atoi(argv[1]);
|
||||
}
|
||||
// Read all lines
|
||||
char **lines = NULL;
|
||||
int total_lines = 0;
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#include <pthread.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#define NUM_THREADS 4
|
||||
int NUM_THREADS=4;
|
||||
|
||||
long long num_steps = 1000000000;
|
||||
double step;
|
||||
@@ -34,6 +34,9 @@ int main(int argc, char* argv[])
|
||||
struct timeval TimeStampStart, TimeStampStop;
|
||||
double ExeTime;
|
||||
double pi;
|
||||
if (argc > 1) {
|
||||
NUM_THREADS = atoi(argv[1]);
|
||||
}
|
||||
int thread_ids[NUM_THREADS];
|
||||
pthread_t threads[NUM_THREADS];
|
||||
|
||||
|
||||
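Note: the two hunks above replace the compile-time #define NUM_THREADS 4 with a global int NUM_THREADS that the first command-line argument can override. A usage sketch follows (the build path mirrors the build/linux/<arch>/release layout used by lab1.sh and lab3.sh and is an assumption here; count_words_par is one of the targets listed in the xmake hunk below and expects ./InFile1.txt to exist):

```bash
# Sketch: sweep the new runtime thread-count argument for count_words_par.
cd lab2/pthread && xmake
for t in 1 2 4 8; do
    ./build/linux/x86_64/release/count_words_par "$t"
done
```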
@@ -10,6 +10,10 @@ target("count_words_par")
|
||||
add_files("count_words_par.c")
|
||||
add_links("pthread")
|
||||
|
||||
target("count_words_par_opt")
|
||||
set_kind("binary")
|
||||
add_files("count_words_par_opt.c")
|
||||
add_links("pthread")
|
||||
target("count_words_ser")
|
||||
set_kind("binary")
|
||||
add_files("count_words_ser.c")
|
||||
|
||||
@@ -1,26 +1,222 @@
|
||||
#!/bin/bash
|
||||
|
||||
# N体问题实验脚本
|
||||
# 收集串行和并行程序的性能数据
|
||||
# 多机环境:hpc-ecs-1, hpc-ecs-2, hpc-ecs-3(每台2线程)
|
||||
|
||||
set -e # 遇到错误立即退出
|
||||
set -u # 使用未定义变量时报错
|
||||
set -o pipefail # 管道命令中任何错误都会导致整个管道失败
|
||||
|
||||
OUTPUT_CSV="nbody_results.csv"
|
||||
LOG_FILE="nbody_experiment.log"
|
||||
|
||||
# 主机配置
|
||||
HOST1="hpc-ecs-1"
|
||||
HOST2="hpc-ecs-2"
|
||||
HOST3="hpc-ecs-3"
|
||||
|
||||
# 记录日志函数
|
||||
log_error() {
|
||||
echo "[ERROR] $*" | tee -a "$LOG_FILE"
|
||||
}
|
||||
|
||||
log_info() {
|
||||
echo "[INFO] $*" | tee -a "$LOG_FILE"
|
||||
}
|
||||
|
||||
# 清空或创建CSV文件
|
||||
echo "实验,数据规模,每机进程数,机器配置,运行时间(s)" > "$OUTPUT_CSV"
|
||||
|
||||
echo "=========================================="
|
||||
echo "N体问题串行模拟实验"
|
||||
echo "N体问题性能测试实验"
|
||||
echo "=========================================="
|
||||
echo "主机配置: $HOST1, $HOST2, $HOST3"
|
||||
echo ""
|
||||
|
||||
# 默认天体数量
|
||||
N=${1:-4}
|
||||
|
||||
echo "运行参数:"
|
||||
echo " 天体数量: $N"
|
||||
echo " 时间步长: 0.01 s"
|
||||
echo " 总步数: 100"
|
||||
echo ""
|
||||
# 编译程序
|
||||
xmake build nbody_ser
|
||||
# 运行程序
|
||||
./build/linux/x86_64/release/nbody_ser $N
|
||||
|
||||
echo "编译程序..."
|
||||
log_info "开始编译程序..."
|
||||
if ! xmake build nbody_ser; then
|
||||
log_error "编译 nbody_ser 失败"
|
||||
exit 1
|
||||
fi
|
||||
if ! xmake build nbody_par; then
|
||||
log_error "编译 nbody_par 失败"
|
||||
exit 1
|
||||
fi
|
||||
log_info "编译完成"
|
||||
echo ""
|
||||
|
||||
# 固定数据规模
|
||||
FIXED_N=6000
|
||||
|
||||
# 实验一:单机上,数据规模为6000时,随每机进程数变化的运行时间(串行程序)
|
||||
echo "=========================================="
|
||||
echo "实验一:串行程序 - 数据规模6000"
|
||||
echo "=========================================="
|
||||
log_info "运行串行程序..."
|
||||
ser_output=$(./build/linux/arm64-v8a/release/nbody_ser $FIXED_N 2>&1)
|
||||
ser_exit_code=$?
|
||||
if [ $ser_exit_code -ne 0 ]; then
|
||||
log_error "串行程序执行失败,退出码: $ser_exit_code"
|
||||
echo "$ser_output" | tee -a "$LOG_FILE"
|
||||
exit 1
|
||||
fi
|
||||
time_output=$(echo "$ser_output" | grep "模拟用时" | awk '{print $2}')
|
||||
if [ -z "$time_output" ]; then
|
||||
log_error "无法从输出中提取运行时间"
|
||||
echo "$ser_output" | tee -a "$LOG_FILE"
|
||||
exit 1
|
||||
fi
|
||||
echo "实验一,6000,1,单机,$time_output" >> "$OUTPUT_CSV"
|
||||
echo " 时间: $time_output s"
|
||||
log_info "实验一完成"
|
||||
echo ""
|
||||
|
||||
# 实验二:多机环境下,数据规模为6000,随每机进程数变化的运行时间
|
||||
echo "=========================================="
|
||||
echo "实验二:并行程序 - 数据规模6000,不同每机进程数"
|
||||
echo "=========================================="
|
||||
|
||||
# 测试不同的每机进程数和机器配置
|
||||
for ppn in 1 2 3 4; do
|
||||
# 单机测试
|
||||
echo "每机进程数: $ppn, 单机"
|
||||
log_info "实验二: 单机, ppn=$ppn"
|
||||
par_output=$(mpirun --host "$HOST1:$ppn" --oversubscribe ./build/linux/arm64-v8a/release/nbody_par $FIXED_N 2>&1)
|
||||
par_exit_code=$?
|
||||
if [ $par_exit_code -ne 0 ]; then
|
||||
log_error "并行程序执行失败(单机 ppn=$ppn),退出码: $par_exit_code"
|
||||
echo "$par_output" | tee -a "$LOG_FILE"
|
||||
else
|
||||
time_output=$(echo "$par_output" | grep "模拟用时" | awk '{print $2}')
|
||||
if [ -z "$time_output" ]; then
|
||||
log_error "无法从输出中提取运行时间(单机 ppn=$ppn)"
|
||||
echo "$par_output" | tee -a "$LOG_FILE"
|
||||
else
|
||||
echo "实验二,6000,$ppn,单机,$time_output" >> "$OUTPUT_CSV"
|
||||
echo " 时间: $time_output s"
|
||||
fi
|
||||
fi
|
||||
echo ""
|
||||
|
||||
# 双机测试
|
||||
echo "每机进程数: $ppn, 双机"
|
||||
log_info "实验二: 双机, ppn=$ppn"
|
||||
par_output=$(mpirun --host "$HOST1:$ppn,$HOST2:$ppn" --oversubscribe ./build/linux/arm64-v8a/release/nbody_par $FIXED_N 2>&1)
|
||||
par_exit_code=$?
|
||||
if [ $par_exit_code -ne 0 ]; then
|
||||
log_error "并行程序执行失败(双机 ppn=$ppn),退出码: $par_exit_code"
|
||||
echo "$par_output" | tee -a "$LOG_FILE"
|
||||
else
|
||||
time_output=$(echo "$par_output" | grep "模拟用时" | awk '{print $2}')
|
||||
if [ -z "$time_output" ]; then
|
||||
log_error "无法从输出中提取运行时间(双机 ppn=$ppn)"
|
||||
echo "$par_output" | tee -a "$LOG_FILE"
|
||||
else
|
||||
echo "实验二,6000,$ppn,双机,$time_output" >> "$OUTPUT_CSV"
|
||||
echo " 时间: $time_output s"
|
||||
fi
|
||||
fi
|
||||
echo ""
|
||||
|
||||
# 三机测试
|
||||
echo "每机进程数: $ppn, 三机"
|
||||
log_info "实验二: 三机, ppn=$ppn"
|
||||
par_output=$(mpirun --host "$HOST1:$ppn,$HOST2:$ppn,$HOST3:$ppn" --oversubscribe ./build/linux/arm64-v8a/release/nbody_par $FIXED_N 2>&1)
|
||||
par_exit_code=$?
|
||||
if [ $par_exit_code -ne 0 ]; then
|
||||
log_error "并行程序执行失败(三机 ppn=$ppn),退出码: $par_exit_code"
|
||||
echo "$par_output" | tee -a "$LOG_FILE"
|
||||
else
|
||||
time_output=$(echo "$par_output" | grep "模拟用时" | awk '{print $2}')
|
||||
if [ -z "$time_output" ]; then
|
||||
log_error "无法从输出中提取运行时间(三机 ppn=$ppn)"
|
||||
echo "$par_output" | tee -a "$LOG_FILE"
|
||||
else
|
||||
echo "实验二,6000,$ppn,三机,$time_output" >> "$OUTPUT_CSV"
|
||||
echo " 时间: $time_output s"
|
||||
fi
|
||||
fi
|
||||
echo ""
|
||||
done
|
||||
|
||||
# 实验三:每机1个进程,随数据规模变化的并行程序运行时间
|
||||
echo "=========================================="
|
||||
echo "实验三:并行程序 - 每机1进程,不同数据规模"
|
||||
echo "=========================================="
|
||||
|
||||
# 测试不同的数据规模
|
||||
for N in 150 300 600 1200 2400 4800 9600; do
|
||||
echo "数据规模: $N"
|
||||
log_info "实验三: 数据规模=$N"
|
||||
|
||||
# 单机测试
|
||||
echo " 单机..."
|
||||
par_output=$(mpirun --host "$HOST1:1" ./build/linux/arm64-v8a/release/nbody_par $N 2>&1)
|
||||
par_exit_code=$?
|
||||
if [ $par_exit_code -ne 0 ]; then
|
||||
log_error "并行程序执行失败(单机 N=$N),退出码: $par_exit_code"
|
||||
echo "$par_output" | tee -a "$LOG_FILE"
|
||||
else
|
||||
time_output=$(echo "$par_output" | grep "模拟用时" | awk '{print $2}')
|
||||
if [ -z "$time_output" ]; then
|
||||
log_error "无法从输出中提取运行时间(单机 N=$N)"
|
||||
echo "$par_output" | tee -a "$LOG_FILE"
|
||||
else
|
||||
echo "实验三,$N,单机,$time_output" >> "$OUTPUT_CSV"
|
||||
echo " 时间: $time_output s"
|
||||
fi
|
||||
fi
|
||||
|
||||
# 双机测试
|
||||
echo " 双机..."
|
||||
par_output=$(mpirun --host "$HOST1:1,$HOST2:1" ./build/linux/arm64-v8a/release/nbody_par $N 2>&1)
|
||||
par_exit_code=$?
|
||||
if [ $par_exit_code -ne 0 ]; then
|
||||
log_error "并行程序执行失败(双机 N=$N),退出码: $par_exit_code"
|
||||
echo "$par_output" | tee -a "$LOG_FILE"
|
||||
else
|
||||
time_output=$(echo "$par_output" | grep "模拟用时" | awk '{print $2}')
|
||||
if [ -z "$time_output" ]; then
|
||||
log_error "无法从输出中提取运行时间(双机 N=$N)"
|
||||
echo "$par_output" | tee -a "$LOG_FILE"
|
||||
else
|
||||
echo "实验三,$N,双机,$time_output" >> "$OUTPUT_CSV"
|
||||
echo " 时间: $time_output s"
|
||||
fi
|
||||
fi
|
||||
|
||||
# 三机测试
|
||||
echo " 三机..."
|
||||
par_output=$(mpirun --host "$HOST1:1,$HOST2:1,$HOST3:1" ./build/linux/arm64-v8a/release/nbody_par $N 2>&1)
|
||||
par_exit_code=$?
|
||||
if [ $par_exit_code -ne 0 ]; then
|
||||
log_error "并行程序执行失败(三机 N=$N),退出码: $par_exit_code"
|
||||
echo "$par_output" | tee -a "$LOG_FILE"
|
||||
else
|
||||
time_output=$(echo "$par_output" | grep "模拟用时" | awk '{print $2}')
|
||||
if [ -z "$time_output" ]; then
|
||||
log_error "无法从输出中提取运行时间(三机 N=$N)"
|
||||
echo "$par_output" | tee -a "$LOG_FILE"
|
||||
else
|
||||
echo "实验三,$N,三机,$time_output" >> "$OUTPUT_CSV"
|
||||
echo " 时间: $time_output s"
|
||||
fi
|
||||
fi
|
||||
echo ""
|
||||
done
|
||||
|
||||
echo "=========================================="
|
||||
echo "实验完成"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
log_info "所有实验完成"
|
||||
echo "结果已保存到: $OUTPUT_CSV"
|
||||
echo "日志已保存到: $LOG_FILE"
|
||||
echo ""
|
||||
echo "数据预览:"
|
||||
cat "$OUTPUT_CSV"
|
||||
echo ""
|
||||
echo "如有错误,请查看日志文件: $LOG_FILE"
|
||||
|
||||
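Note: the CSV written above records wall-clock times for the serial run (实验一, experiment 1), for 1-4 processes per machine on one to three machines (实验二, experiment 2), and for varying problem sizes at one process per machine (实验三, experiment 3). The quantities usually derived from such a table are speedup and parallel efficiency,

$$S(p) = \frac{T_{\text{serial}}}{T_p}, \qquad E(p) = \frac{S(p)}{p},$$

with $T_{\text{serial}}$ taken from experiment 1 and $T_p$ the parallel time on $p$ total processes.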
@@ -163,7 +163,7 @@ int main(int argc, char **argv) {
|
||||
verbose = (strcmp(argv[2], "--verbose") == 0 || strcmp(argv[2], "-v") == 0);
|
||||
}
|
||||
// 只有rank 0打印初始信息
|
||||
if (verbose && world_rank == 0) {
|
||||
if (world_rank == 0) {
|
||||
cout << "N体问题并行模拟" << endl;
|
||||
cout << "天体数量: " << n << endl;
|
||||
cout << "进程数量: " << world_size << endl;
|
||||
|
||||
@@ -7,7 +7,14 @@ echo "=========================================="
|
||||
echo "Lab 3: Prime Number Calculation Performance Test"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
|
||||
# get arch using uname -m
|
||||
# if aarch64 then use arm64-v8a else use x86_64
|
||||
ARCH=$(uname -m)
|
||||
if [ "$ARCH" == "aarch64" ]; then
|
||||
BUILD_ARCH="arm64-v8a"
|
||||
else
|
||||
BUILD_ARCH="x86_64"
|
||||
fi
|
||||
# Array of N values
|
||||
N_VALUES=(100000 200000 400000 800000)
|
||||
|
||||
@@ -21,7 +28,7 @@ OUTPUT_FILE="prime_results.txt"
|
||||
> $OUTPUT_FILE
|
||||
|
||||
# Print header
|
||||
echo "N值 进程数 素数个数 执行时间(秒)" | tee -a $OUTPUT_FILE
|
||||
echo "N值 进程数 素数个数 执行时间(秒)" | tee -a $OUTPUT_FILE
|
||||
echo "--------------------------------------------------------" | tee -a $OUTPUT_FILE
|
||||
|
||||
# Loop through each N value
|
||||
@@ -35,7 +42,7 @@ for N in "${N_VALUES[@]}"; do
|
||||
echo -n "Running with $P process(es)... "
|
||||
|
||||
# Run the program and capture output
|
||||
OUTPUT=$(mpirun -n $P ./build/linux/x86_64/release/prime_par_naive $N 2>&1)
|
||||
OUTPUT=$(mpirun --oversubscribe --hostfile ~/mpi_hosts -np $P ./build/linux/$BUILD_ARCH/release/prime_par_naive $N 2>&1)
|
||||
|
||||
# Extract prime count and time from output
|
||||
PRIME_COUNT=$(echo "$OUTPUT" | grep "Between" | grep -oP '\d+(?= primes)')
|
||||
@@ -43,15 +50,84 @@ for N in "${N_VALUES[@]}"; do
|
||||
|
||||
# Print result
|
||||
if [ ! -z "$PRIME_COUNT" ] && [ ! -z "$TIME" ]; then
|
||||
echo "$N $P $PRIME_COUNT $TIME" | tee -a $OUTPUT_FILE
|
||||
echo "$N $P $PRIME_COUNT $TIME" | tee -a $OUTPUT_FILE
|
||||
echo "Done! (Primes: $PRIME_COUNT, Time: ${TIME}s)"
|
||||
else
|
||||
echo "Error running program!"
|
||||
echo "$N $P ERROR ERROR" | tee -a $OUTPUT_FILE
|
||||
echo "$N $P ERROR ERROR" | tee -a $OUTPUT_FILE
|
||||
fi
|
||||
done
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "=========================================="
|
||||
echo "Test completed!"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
echo "Results saved to: $OUTPUT_FILE"
|
||||
echo ""
|
||||
echo "Summary Table:"
|
||||
echo "--------------------------------------------------------"
|
||||
cat $OUTPUT_FILE
|
||||
echo "--------------------------------------------------------"
|
||||
|
||||
|
||||
echo ""
|
||||
echo "=========================================="
|
||||
echo "Begin Optimized Test!"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
ARCH=$(uname -m)
|
||||
if [ "$ARCH" == "aarch64" ]; then
|
||||
BUILD_ARCH="arm64-v8a"
|
||||
else
|
||||
BUILD_ARCH="x86_64"
|
||||
fi
|
||||
# Array of N values
|
||||
N_VALUES=(100000 200000 400000 800000)
|
||||
|
||||
# Array of process counts
|
||||
PROCESS_COUNTS=(1 2 4 6 8)
|
||||
|
||||
# Output file for results
|
||||
OUTPUT_FILE="prime_results_opt.txt"
|
||||
|
||||
# Clear previous results
|
||||
> $OUTPUT_FILE
|
||||
|
||||
# Print header
|
||||
echo "N值 进程数 素数个数 执行时间(秒)" | tee -a $OUTPUT_FILE
|
||||
echo "--------------------------------------------------------" | tee -a $OUTPUT_FILE
|
||||
|
||||
# Loop through each N value
|
||||
for N in "${N_VALUES[@]}"; do
|
||||
echo ""
|
||||
echo "Testing N = $N"
|
||||
echo "------------------------"
|
||||
|
||||
# Loop through each process count
|
||||
for P in "${PROCESS_COUNTS[@]}"; do
|
||||
echo -n "Running with $P process(es)... "
|
||||
|
||||
# Run the program and capture output
|
||||
OUTPUT=$(mpirun --oversubscribe --hostfile ~/mpi_hosts -np $P ./build/linux/$BUILD_ARCH/release/prime_par $N $(echo "$N/$P" | bc) 2>&1)
|
||||
|
||||
# Extract prime count and time from output
|
||||
PRIME_COUNT=$(echo "$OUTPUT" | grep "Between" | grep -oP '\d+(?= primes)')
|
||||
TIME=$(echo "$OUTPUT" | grep "Time =" | grep -oP '[0-9.]+(?= seconds)')
|
||||
|
||||
# Print result
|
||||
if [ ! -z "$PRIME_COUNT" ] && [ ! -z "$TIME" ]; then
|
||||
echo "$N $P $PRIME_COUNT $TIME" | tee -a $OUTPUT_FILE
|
||||
echo "Done! (Primes: $PRIME_COUNT, Time: ${TIME}s)"
|
||||
else
|
||||
echo "Error running program!"
|
||||
echo "$N $P ERROR ERROR" | tee -a $OUTPUT_FILE
|
||||
fi
|
||||
done
|
||||
done
|
||||
|
||||
|
||||
$(echo "$N/$P" | bc)
|
||||
echo ""
|
||||
echo "=========================================="
|
||||
echo "Test completed!"
|
||||
|
||||
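Note: the optimized run passes a second argument, a per-process block size computed with $(echo "$N/$P" | bc). An equivalent form that avoids the external bc dependency is the shell's built-in integer arithmetic (a sketch; both truncate toward zero):

```bash
# Sketch: same block-size computation without bc, mirroring the script's mpirun line.
BLOCK=$(( N / P ))
OUTPUT=$(mpirun --oversubscribe --hostfile ~/mpi_hosts -np $P \
    ./build/linux/$BUILD_ARCH/release/prime_par $N $BLOCK 2>&1)
```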
@@ -103,7 +103,8 @@ int main(int argc, char* argv[]) {
|
||||
// No range to distribute, all primes are base primes
|
||||
int total_count = base_primes.size();
|
||||
if (rank == 0) {
|
||||
std::cout << "Total prime count in [2, " << N << "] is " << total_count << "." << std::endl;
|
||||
std::cout << "Between 2 and " << N << ", there are " << total_count
|
||||
<< " primes." << std::endl;
|
||||
}
|
||||
MPI_Finalize();
|
||||
return 0;
|
||||
@@ -172,7 +173,8 @@ int main(int argc, char* argv[]) {
|
||||
if (rank == 0) {
|
||||
end_wtime = MPI_Wtime ( ) - wtime;
|
||||
int total_count = base_primes.size() + global_prime_count;
|
||||
std::cout << "Total prime count in [2, " << N << "] is " << total_count << "." << std::endl;
|
||||
std::cout << "Between 2 and " << N << ", there are " << total_count
|
||||
<< " primes." << std::endl;
|
||||
std::cout << "Time = " << end_wtime << " seconds" << std::endl;
|
||||
}
|
||||
|
||||
|
||||
lab4/MatrixMul_cpu.cu (new file, 109 lines added)
@@ -0,0 +1,109 @@
|
||||
#include <iostream>
|
||||
#include <omp.h>
|
||||
#include <chrono>
|
||||
#include <vector>
|
||||
#include <iomanip>
|
||||
#include <cmath>
|
||||
|
||||
void matrixMultiplyCPU(const float* A, const float* B, float* C, int M, int N, int K, int num_threads) {
|
||||
#pragma omp parallel for num_threads(num_threads)
|
||||
for (int i = 0; i < M; ++i) {
|
||||
for (int j = 0; j < K; ++j) {
|
||||
float sum = 0.0f;
|
||||
for (int k = 0; k < N; ++k) {
|
||||
sum += A[i * N + k] * B[k * K + j];
|
||||
}
|
||||
C[i * K + j] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void runCPUTest() {
|
||||
std::vector<int> matrix_sizes = {256, 512, 1024, 2048};
|
||||
std::vector<int> thread_counts = {8, 64, 256};
|
||||
|
||||
std::cout << "CPU矩阵乘法性能测试 (OpenMP多线程)\n";
|
||||
std::cout << "=================================================================\n";
|
||||
std::cout << std::setw(12) << "Matrix"
|
||||
<< std::setw(12) << "Threads"
|
||||
<< std::setw(15) << "Time(ms)"
|
||||
<< std::setw(15) << "FLOPS(G)"
|
||||
<< std::setw(15) << "Speedup" << std::endl;
|
||||
std::cout << "-----------------------------------------------------------------\n";
|
||||
|
||||
// 存储基准性能(单线程)
|
||||
std::vector<double> baseline_times(matrix_sizes.size());
|
||||
|
||||
for (size_t m = 0; m < matrix_sizes.size(); ++m) {
|
||||
int size = matrix_sizes[m];
|
||||
int M = size, N = size, K = size;
|
||||
|
||||
// 分配内存
|
||||
float *A = new float[M * N];
|
||||
float *B = new float[N * K];
|
||||
float *C = new float[M * K];
|
||||
|
||||
// 初始化数据
|
||||
for (int i = 0; i < M * N; ++i) A[i] = (rand() % 100) / 100.0f;
|
||||
for (int i = 0; i < N * K; ++i) B[i] = (rand() % 100) / 100.0f;
|
||||
|
||||
// 首先测试单线程作为基准
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
matrixMultiplyCPU(A, B, C, M, N, K, 1);
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
auto single_duration = std::chrono::duration<float, std::milli>(end - start).count();
|
||||
baseline_times[m] = single_duration;
|
||||
|
||||
// 测试多线程
|
||||
for (int threads : thread_counts) {
|
||||
start = std::chrono::high_resolution_clock::now();
|
||||
matrixMultiplyCPU(A, B, C, M, N, K, threads);
|
||||
end = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration<float, std::milli>(end - start).count();
|
||||
|
||||
// 计算FLOPS
|
||||
double total_flops = 2.0 * M * N * K;
|
||||
double gflops = total_flops / (duration * 1e6);
|
||||
|
||||
// 计算加速比
|
||||
double speedup = baseline_times[m] / duration;
|
||||
|
||||
std::cout << std::setw(12) << size << "x" << size
|
||||
<< std::setw(12) << threads
|
||||
<< std::setw(15) << std::fixed << std::setprecision(3) << duration
|
||||
<< std::setw(15) << std::fixed << std::setprecision(2) << gflops
|
||||
<< std::setw(15) << std::fixed << std::setprecision(2) << speedup << std::endl;
|
||||
}
|
||||
|
||||
delete[] A;
|
||||
delete[] B;
|
||||
delete[] C;
|
||||
|
||||
std::cout << "-----------------------------------------------------------------\n";
|
||||
}
|
||||
}
|
||||
|
||||
void plotData() {
|
||||
std::cout << "\n\nASCII图表:CPU性能分析\n";
|
||||
std::cout << "=================================================================\n";
|
||||
std::cout << "1. 不同线程数下的加速比趋势\n";
|
||||
std::cout << " Matrix Threads=8 Threads=64 Threads=256\n";
|
||||
|
||||
// 这里可以添加具体的绘图逻辑
|
||||
// 由于是文本输出,可以使用简单的ASCII字符绘制柱状图
|
||||
|
||||
std::cout << "\n2. 不同矩阵规模下的性能趋势\n";
|
||||
std::cout << " Threads 256x256 512x512 1024x1024 2048x2048\n";
|
||||
|
||||
std::cout << "\n注意:完整图表建议使用Python (matplotlib) 生成。\n";
|
||||
std::cout << "推荐生成以下图表:\n";
|
||||
std::cout << "- 折线图:不同线程数下的加速比 vs 矩阵规模\n";
|
||||
std::cout << "- 柱状图:不同配置下的GFLOPS对比\n";
|
||||
std::cout << "- 热力图:线程数 × 矩阵规模 的性能分布\n";
|
||||
}
|
||||
|
||||
int main() {
|
||||
runCPUTest();
|
||||
plotData();
|
||||
return 0;
|
||||
}
|
||||
lab4/MatrixMul_kernel1.cu (new file, 109 lines added)
@@ -0,0 +1,109 @@
|
||||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include <cuda_runtime.h>
|
||||
#include <vector>
|
||||
#include <iomanip>
|
||||
|
||||
__global__ void matMultCUDAKernel1(const float* A, const float* B, float* C, int M, int N, int K) {
|
||||
int row = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
int col = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if(row < M && col < K){
|
||||
float sum = 0.0f;
|
||||
for(int i = 0; i < N; ++i){
|
||||
sum += A[row * N + i] * B[i * K + col];
|
||||
}
|
||||
C[row * K + col] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
std::vector<int> sizes = {512, 1024, 2048,4096};
|
||||
std::vector<float> times;
|
||||
|
||||
// 遍历所有矩阵尺寸
|
||||
for(int idx = 0; idx < sizes.size(); ++idx) {
|
||||
int M = sizes[idx];
|
||||
int N = sizes[idx];
|
||||
int K = sizes[idx];
|
||||
|
||||
// 分配主机内存
|
||||
float *A = new float[M * N];
|
||||
float *B = new float[N * K];
|
||||
float *C = new float[M * K];
|
||||
|
||||
// 初始化数据
|
||||
for(int i = 0; i < M * N; ++i) A[i] = rand() % 10;
|
||||
for(int i = 0; i < N * K; ++i) B[i] = rand() % 10;
|
||||
|
||||
// 分配设备内存
|
||||
float *d_A, *d_B, *d_C;
|
||||
cudaMalloc(&d_A, M * N * sizeof(float));
|
||||
cudaMalloc(&d_B, N * K * sizeof(float));
|
||||
cudaMalloc(&d_C, M * K * sizeof(float));
|
||||
|
||||
// 拷贝数据到设备
|
||||
cudaMemcpy(d_A, A, M * N * sizeof(float), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_B, B, N * K * sizeof(float), cudaMemcpyHostToDevice);
|
||||
|
||||
// 配置线程块和网格
|
||||
dim3 blockSize(16, 16);
|
||||
dim3 gridSize((K + blockSize.x - 1) / blockSize.x,
|
||||
(M + blockSize.y - 1) / blockSize.y);
|
||||
|
||||
// 预热(可选)
|
||||
matMultCUDAKernel1<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N, K);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
// 计时开始
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// 执行核函数
|
||||
matMultCUDAKernel1<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N, K);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
// 计时结束
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// 拷贝结果回主机
|
||||
cudaMemcpy(C, d_C, M * K * sizeof(float), cudaMemcpyDeviceToHost);
|
||||
|
||||
// 计算时间
|
||||
std::chrono::duration<float> duration = end - start;
|
||||
times.push_back(duration.count());
|
||||
|
||||
// 清理设备内存
|
||||
cudaFree(d_A);
|
||||
cudaFree(d_B);
|
||||
cudaFree(d_C);
|
||||
|
||||
// 清理主机内存
|
||||
delete[] A;
|
||||
delete[] B;
|
||||
delete[] C;
|
||||
}
|
||||
|
||||
// 输出结果
|
||||
std::cout << "CUDA Kernel1 矩阵乘法性能测试结果" << std::endl;
|
||||
std::cout << "=================================" << std::endl;
|
||||
std::cout << std::setw(12) << "Matrix Size"
|
||||
<< std::setw(15) << "Time(s)"
|
||||
<< std::setw(15) << "Time(ms)"
|
||||
<< std::setw(15) << "GFLOPS" << std::endl;
|
||||
std::cout << "---------------------------------" << std::endl;
|
||||
|
||||
for(int i = 0; i < sizes.size(); ++i) {
|
||||
int size = sizes[i];
|
||||
double total_flops = 2.0 * size * size * size; // 矩阵乘法的浮点运算数
|
||||
double gflops = total_flops / (times[i] * 1e9); // 转换为 GFLOPS
|
||||
double time_ms = times[i] * 1000.0; // 转换为毫秒
|
||||
|
||||
std::cout << std::setw(8) << size << "x" << std::setw(3) << size
|
||||
<< std::setw(15) << std::fixed << std::setprecision(6) << times[i]
|
||||
<< std::setw(15) << std::fixed << std::setprecision(3) << time_ms
|
||||
<< std::setw(15) << std::fixed << std::setprecision(2) << gflops << std::endl;
|
||||
}
|
||||
std::cout << "=================================" << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
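Note: all of the matrix-multiply programs in this comparison convert measured time to throughput with the standard dense-GEMM operation count, which is what the 2.0 * size * size * size expression in the code encodes:

$$\text{FLOPs} = 2MNK, \qquad \text{GFLOP/s} = \frac{2MNK}{t_{\mathrm{s}} \cdot 10^{9}} = \frac{2MNK}{t_{\mathrm{ms}} \cdot 10^{6}}.$$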
lab4/MatrixMul_kernel2.cu (new file, 114 lines added)
@@ -0,0 +1,114 @@
|
||||
#include <iostream>
|
||||
#include <cuda_runtime.h>
|
||||
#include <chrono>
|
||||
#include <vector>
|
||||
#include <iomanip>
|
||||
|
||||
#define TILE_WIDTH 4
|
||||
|
||||
__global__ void matMultCUDAKernel2(const float* A, const float* B, float* C, int M, int N, int K) {
|
||||
__shared__ float shared_A[TILE_WIDTH][TILE_WIDTH];
|
||||
__shared__ float shared_B[TILE_WIDTH][TILE_WIDTH];
|
||||
|
||||
int row = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
int col = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
for (int t = 0; t < (N + TILE_WIDTH - 1) / TILE_WIDTH; ++t) {
|
||||
if (row < M && t * TILE_WIDTH + threadIdx.x < N)
|
||||
shared_A[threadIdx.y][threadIdx.x] = A[row * N + t * TILE_WIDTH + threadIdx.x];
|
||||
else
|
||||
shared_A[threadIdx.y][threadIdx.x] = 0.0f;
|
||||
|
||||
if (col < K && t * TILE_WIDTH + threadIdx.y < N)
|
||||
shared_B[threadIdx.y][threadIdx.x] = B[(t * TILE_WIDTH + threadIdx.y) * K + col];
|
||||
else
|
||||
shared_B[threadIdx.y][threadIdx.x] = 0.0f;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (int i = 0; i < TILE_WIDTH; ++i)
|
||||
sum += shared_A[threadIdx.y][i] * shared_B[i][threadIdx.x];
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if(row < M && col < K){
|
||||
C[row * K + col] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
std::vector<int> sizes = {512, 1024, 2048,4096};
|
||||
std::vector<float> times;
|
||||
|
||||
for(int idx = 0; idx < sizes.size(); ++idx) {
|
||||
int M = sizes[idx];
|
||||
int N = sizes[idx];
|
||||
int K = sizes[idx];
|
||||
|
||||
float *A = new float[M * N];
|
||||
float *B = new float[N * K];
|
||||
float *C = new float[M * K];
|
||||
|
||||
for (int i = 0; i < M * N; ++i) A[i] = rand() % 10;
|
||||
for (int i = 0; i < N * K; ++i) B[i] = rand() % 10;
|
||||
|
||||
float *d_A, *d_B, *d_C;
|
||||
cudaMalloc(&d_A, M * N * sizeof(float));
|
||||
cudaMalloc(&d_B, N * K * sizeof(float));
|
||||
cudaMalloc(&d_C, M * K * sizeof(float));
|
||||
|
||||
cudaMemcpy(d_A, A, M * N * sizeof(float), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_B, B, N * K * sizeof(float), cudaMemcpyHostToDevice);
|
||||
|
||||
dim3 blockSize(TILE_WIDTH, TILE_WIDTH);
|
||||
dim3 gridSize((K + TILE_WIDTH - 1) / TILE_WIDTH, (M + TILE_WIDTH - 1) / TILE_WIDTH);
|
||||
|
||||
// 预热
|
||||
matMultCUDAKernel2<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N, K);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
matMultCUDAKernel2<<<gridSize, blockSize>>>(d_A, d_B, d_C, M, N, K);
|
||||
cudaDeviceSynchronize();
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
cudaMemcpy(C, d_C, M * K * sizeof(float), cudaMemcpyDeviceToHost);
|
||||
|
||||
std::chrono::duration<float> duration = end - start;
|
||||
times.push_back(duration.count());
|
||||
|
||||
cudaFree(d_A);
|
||||
cudaFree(d_B);
|
||||
cudaFree(d_C);
|
||||
|
||||
delete[] A;
|
||||
delete[] B;
|
||||
delete[] C;
|
||||
}
|
||||
|
||||
std::cout << "CUDA Kernel2 (共享内存优化) 矩阵乘法性能测试结果" << std::endl;
|
||||
std::cout << "=================================" << std::endl;
|
||||
std::cout << std::setw(12) << "Matrix Size"
|
||||
<< std::setw(15) << "Time(s)"
|
||||
<< std::setw(15) << "Time(ms)"
|
||||
<< std::setw(15) << "GFLOPS" << std::endl;
|
||||
std::cout << "---------------------------------" << std::endl;
|
||||
|
||||
for(int i = 0; i < sizes.size(); ++i) {
|
||||
int size = sizes[i];
|
||||
double total_flops = 2.0 * size * size * size; // 矩阵乘法的浮点运算数
|
||||
double gflops = total_flops / (times[i] * 1e9); // 转换为 GFLOPS
|
||||
double time_ms = times[i] * 1000.0; // 转换为毫秒
|
||||
|
||||
std::cout << std::setw(8) << size << "x" << std::setw(3) << size
|
||||
<< std::setw(15) << std::fixed << std::setprecision(6) << times[i]
|
||||
<< std::setw(15) << std::fixed << std::setprecision(3) << time_ms
|
||||
<< std::setw(15) << std::fixed << std::setprecision(2) << gflops << std::endl;
|
||||
}
|
||||
std::cout << "=================================" << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
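Note: the purpose of the shared-memory (tiled) kernel is to cut global-memory traffic. With tile width $T$, each input element is loaded once per tile pass instead of once per output element, so global loads drop roughly from $2MNK$ to $2MNK/T$. The TILE_WIDTH of 4 used here gives only 16 threads per block, which limits occupancy; that is consistent with the measurements later in this comparison, where matrixmul_comparison.txt shows Kernel2 slower than the 16x16-block Kernel1 and blocksize_analysis.txt shows throughput still rising at 32x32 blocks.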
lab4/experiment_data/blocksize_analysis.txt (new file, 24 lines added)
@@ -0,0 +1,24 @@
BLOCK_SIZE对CUDA矩阵乘法性能影响测试
========================================
    Matrix       Block      Time(ms)     FLOPS(G)
----------------------------------------
   256x256         4x4         0.115       292.57
   256x256         8x8         0.040       836.85
   256x256       16x16         0.029      1151.02
   256x256       32x32         0.026      1315.65
----------------------------------------
   512x512         4x4         0.831       323.00
   512x512         8x8         0.264      1018.65
   512x512       16x16         0.190      1416.04
   512x512       32x32         0.174      1542.02
----------------------------------------
 1024x1024         4x4         6.541       328.33
 1024x1024         8x8         2.021      1062.62
 1024x1024       16x16         1.393      1541.24
 1024x1024       32x32         1.353      1586.69
----------------------------------------
 2048x2048         4x4        54.011       318.08
 2048x2048         8x8        16.104      1066.82
 2048x2048       16x16        11.355      1512.97
 2048x2048       32x32        10.978      1565.00
----------------------------------------
lab4/experiment_data/gpu_info.txt (new file, 20 lines added)
@@ -0,0 +1,20 @@
Wed Jan 21 16:23:03 2026
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.247.01             Driver Version: 535.247.01    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 2080 Ti     On  | 00000000:03:00.0  On |                  N/A |
| 34%   27C    P8              20W / 250W |      1MiB / 22528MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes:                                                                             |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|  No running processes found                                                           |
+---------------------------------------------------------------------------------------+
lab4/experiment_data/matrixmul_comparison.txt (new file, 112 lines added)
@@ -0,0 +1,112 @@
|
||||
=== CPU (OpenMP) 不同线程数 ===
|
||||
CPU矩阵乘法性能测试 (OpenMP多线程)
|
||||
=================================================================
|
||||
Matrix Threads Time(ms) FLOPS(G) Speedup
|
||||
-----------------------------------------------------------------
|
||||
256x256 8 90.372 0.37 1.07
|
||||
256x256 64 83.707 0.40 1.16
|
||||
256x256 256 84.262 0.40 1.15
|
||||
-----------------------------------------------------------------
|
||||
512x512 8 815.295 0.33 1.01
|
||||
512x512 64 813.476 0.33 1.01
|
||||
512x512 256 812.463 0.33 1.01
|
||||
-----------------------------------------------------------------
|
||||
1024x1024 8 6571.000 0.33 1.00
|
||||
1024x1024 64 6586.094 0.33 1.00
|
||||
1024x1024 256 6569.582 0.33 1.00
|
||||
-----------------------------------------------------------------
|
||||
2048x2048 8 55244.488 0.31 1.00
|
||||
2048x2048 64 55211.832 0.31 1.00
|
||||
2048x2048 256 55239.930 0.31 1.00
|
||||
-----------------------------------------------------------------
|
||||
|
||||
|
||||
ASCII图表:CPU性能分析
|
||||
=================================================================
|
||||
1. 不同线程数下的加速比趋势
|
||||
Matrix Threads=8 Threads=64 Threads=256
|
||||
|
||||
2. 不同矩阵规模下的性能趋势
|
||||
Threads 256x256 512x512 1024x1024 2048x2048
|
||||
|
||||
注意:完整图表建议使用Python (matplotlib) 生成。
|
||||
推荐生成以下图表:
|
||||
- 折线图:不同线程数下的加速比 vs 矩阵规模
|
||||
- 柱状图:不同配置下的GFLOPS对比
|
||||
- 热力图:线程数 × 矩阵规模 的性能分布
|
||||
=== CUDA Kernel1 (基础版本) ===
|
||||
CUDA Kernel1 矩阵乘法性能测试结果
|
||||
=================================
|
||||
Matrix Size Time(s) Time(ms) GFLOPS
|
||||
---------------------------------
|
||||
512x512 0.000312 0.312 860.70
|
||||
1024x1024 0.002373 2.373 905.03
|
||||
2048x2048 0.019180 19.180 895.72
|
||||
4096x4096 0.129868 129.868 1058.30
|
||||
=================================
|
||||
=== CUDA Kernel2 (共享内存优化) ===
|
||||
CUDA Kernel2 (共享内存优化) 矩阵乘法性能测试结果
|
||||
=================================
|
||||
Matrix Size Time(s) Time(ms) GFLOPS
|
||||
---------------------------------
|
||||
512x512 0.000826 0.826 324.87
|
||||
1024x1024 0.006479 6.479 331.43
|
||||
2048x2048 0.053598 53.598 320.53
|
||||
4096x4096 0.432496 432.496 317.78
|
||||
=================================
|
||||
=== CPU (OpenMP) 不同线程数 ===
|
||||
CPU矩阵乘法性能测试 (OpenMP多线程)
|
||||
=================================================================
|
||||
Matrix Threads Time(ms) FLOPS(G) Speedup
|
||||
-----------------------------------------------------------------
|
||||
256x256 8 90.532 0.37 1.08
|
||||
256x256 64 83.896 0.40 1.17
|
||||
256x256 256 83.807 0.40 1.17
|
||||
-----------------------------------------------------------------
|
||||
512x512 8 814.564 0.33 1.00
|
||||
512x512 64 817.633 0.33 1.00
|
||||
512x512 256 812.408 0.33 1.01
|
||||
-----------------------------------------------------------------
|
||||
1024x1024 8 6639.308 0.32 1.00
|
||||
1024x1024 64 6627.468 0.32 1.00
|
||||
1024x1024 256 6656.504 0.32 1.00
|
||||
-----------------------------------------------------------------
|
||||
2048x2048 8 55719.875 0.31 1.00
|
||||
2048x2048 64 55636.734 0.31 1.00
|
||||
2048x2048 256 55657.629 0.31 1.00
|
||||
-----------------------------------------------------------------
|
||||
|
||||
|
||||
ASCII图表:CPU性能分析
|
||||
=================================================================
|
||||
1. 不同线程数下的加速比趋势
|
||||
Matrix Threads=8 Threads=64 Threads=256
|
||||
|
||||
2. 不同矩阵规模下的性能趋势
|
||||
Threads 256x256 512x512 1024x1024 2048x2048
|
||||
|
||||
注意:完整图表建议使用Python (matplotlib) 生成。
|
||||
推荐生成以下图表:
|
||||
- 折线图:不同线程数下的加速比 vs 矩阵规模
|
||||
- 柱状图:不同配置下的GFLOPS对比
|
||||
- 热力图:线程数 × 矩阵规模 的性能分布
|
||||
=== CUDA Kernel1 (基础版本) ===
|
||||
CUDA Kernel1 矩阵乘法性能测试结果
|
||||
=================================
|
||||
Matrix Size Time(s) Time(ms) GFLOPS
|
||||
---------------------------------
|
||||
512x512 0.000316 0.316 848.68
|
||||
1024x1024 0.002367 2.367 907.12
|
||||
2048x2048 0.019190 19.190 895.24
|
||||
4096x4096 0.138181 138.181 994.63
|
||||
=================================
|
||||
=== CUDA Kernel2 (共享内存优化) ===
|
||||
CUDA Kernel2 (共享内存优化) 矩阵乘法性能测试结果
|
||||
=================================
|
||||
Matrix Size Time(s) Time(ms) GFLOPS
|
||||
---------------------------------
|
||||
512x512 0.000828 0.828 324.24
|
||||
1024x1024 0.006483 6.483 331.27
|
||||
2048x2048 0.053603 53.603 320.50
|
||||
4096x4096 0.432285 432.285 317.94
|
||||
=================================
|
||||
lab4/experiment_data/vectoradd_results.txt (new file, 9 lines added)
@@ -0,0 +1,9 @@
Vector Addition Performance Test (Threads per block: 256)
========================================================
N=128, Time=9.472 ms
N=256, Time=4.992 ms
N=512, Time=4.928 ms
N=1024, Time=5.696 ms
N=2048, Time=4.928 ms
========================================================
All tests completed.
lab4/lab4.sh (new executable file, 58 lines added)
@@ -0,0 +1,58 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Lab4 CUDA 程序实验数据收集脚本
|
||||
|
||||
SCRIPT_DIR="$(dirname "$0")"
|
||||
OUTPUT_DIR="$SCRIPT_DIR/experiment_data"
|
||||
mkdir -p "$OUTPUT_DIR"
|
||||
ARCH=$(uname -m)
|
||||
if [ "$ARCH" == "aarch64" ]; then
|
||||
BUILD_ARCH="arm64-v8a"
|
||||
else
|
||||
BUILD_ARCH="x86_64"
|
||||
fi
|
||||
echo "=========================================="
|
||||
echo "Lab4 CUDA 实验数据收集"
|
||||
echo "=========================================="
|
||||
echo "数据输出目录: $OUTPUT_DIR"
|
||||
echo ""
|
||||
|
||||
# 检查 CUDA 设备
|
||||
echo "检查 CUDA 设备..."
|
||||
nvidia-smi | tee "$OUTPUT_DIR/gpu_info.txt"
|
||||
echo ""
|
||||
|
||||
# 进入构建目录
|
||||
# cd "$SCRIPT_DIR/build/linux/$BUILD_ARCH/release" || exit 1
|
||||
|
||||
echo "=========================================="
|
||||
echo "实验 4.2: 向量加法 - 不同数据规模测试"
|
||||
echo "=========================================="
|
||||
$SCRIPT_DIR/build/linux/$BUILD_ARCH/release/vectoradd | tee "$OUTPUT_DIR/vectoradd_results.txt"
|
||||
echo ""
|
||||
|
||||
echo "=========================================="
|
||||
echo "实验 4.3.1: CPU vs GPU 矩阵乘法性能对比"
|
||||
echo "=========================================="
|
||||
echo "=== CPU (OpenMP) 不同线程数 ===" | tee -a "$OUTPUT_DIR/matrixmul_comparison.txt"
|
||||
$SCRIPT_DIR/build/linux/$BUILD_ARCH/release/MatrixMul_cpu | tee -a "$OUTPUT_DIR/matrixmul_comparison.txt"
|
||||
echo ""
|
||||
|
||||
echo "=== CUDA Kernel1 (基础版本) ===" | tee -a "$OUTPUT_DIR/matrixmul_comparison.txt"
|
||||
$SCRIPT_DIR/build/linux/$BUILD_ARCH/release/MatrixMul_kernel1 | tee -a "$OUTPUT_DIR/matrixmul_comparison.txt"
|
||||
echo ""
|
||||
|
||||
echo "=== CUDA Kernel2 (共享内存优化) ===" | tee -a "$OUTPUT_DIR/matrixmul_comparison.txt"
|
||||
$SCRIPT_DIR/build/linux/$BUILD_ARCH/release/MatrixMul_kernel2 | tee -a "$OUTPUT_DIR/matrixmul_comparison.txt"
|
||||
echo ""
|
||||
|
||||
echo "=========================================="
|
||||
echo "实验 4.3.2: 不同 BLOCK_SIZE 对性能的影响"
|
||||
echo "=========================================="
|
||||
$SCRIPT_DIR/build/linux/$BUILD_ARCH/release/matrixmultiply_block_size_change | tee "$OUTPUT_DIR/blocksize_analysis.txt"
|
||||
echo ""
|
||||
|
||||
echo "=========================================="
|
||||
echo "实验数据收集完成!"
|
||||
echo "数据保存在: $OUTPUT_DIR"
|
||||
echo "=========================================="
|
||||
lab4/matrixmultiply_block_size_change.cu (new file, 139 lines added)
@@ -0,0 +1,139 @@
|
||||
#include <iostream>
|
||||
#include <cuda_runtime.h>
|
||||
#include <chrono>
|
||||
#include <vector>
|
||||
#include <iomanip>
|
||||
|
||||
// 测试不同的BLOCK_SIZE
|
||||
std::vector<int> block_sizes = {4, 8, 16, 32};
|
||||
// 测试不同的矩阵规模
|
||||
std::vector<int> matrix_sizes = {256, 512, 1024, 2048};
|
||||
|
||||
// 共享内存矩阵乘法核函数模板
|
||||
template<int BLOCK_SIZE>
|
||||
__global__ void matMultKernel(const float* A, const float* B, float* C, int M, int N, int K) {
|
||||
__shared__ float shared_A[BLOCK_SIZE][BLOCK_SIZE];
|
||||
__shared__ float shared_B[BLOCK_SIZE][BLOCK_SIZE];
|
||||
|
||||
int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
|
||||
int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
for (int t = 0; t < (N + BLOCK_SIZE - 1) / BLOCK_SIZE; ++t) {
|
||||
// 加载到共享内存
|
||||
if (row < M && t * BLOCK_SIZE + threadIdx.x < N)
|
||||
shared_A[threadIdx.y][threadIdx.x] = A[row * N + t * BLOCK_SIZE + threadIdx.x];
|
||||
else
|
||||
shared_A[threadIdx.y][threadIdx.x] = 0.0f;
|
||||
|
||||
if (col < K && t * BLOCK_SIZE + threadIdx.y < N)
|
||||
shared_B[threadIdx.y][threadIdx.x] = B[(t * BLOCK_SIZE + threadIdx.y) * K + col];
|
||||
else
|
||||
shared_B[threadIdx.y][threadIdx.x] = 0.0f;
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// 计算当前tile
|
||||
for (int i = 0; i < BLOCK_SIZE; ++i)
|
||||
sum += shared_A[threadIdx.y][i] * shared_B[i][threadIdx.x];
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if (row < M && col < K) {
|
||||
C[row * K + col] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
void runTest() {
|
||||
std::cout << "BLOCK_SIZE对CUDA矩阵乘法性能影响测试\n";
|
||||
std::cout << "========================================\n";
|
||||
std::cout << std::setw(10) << "Matrix"
|
||||
<< std::setw(12) << "Block"
|
||||
<< std::setw(15) << "Time(ms)"
|
||||
<< std::setw(15) << "FLOPS(G)" << std::endl;
|
||||
std::cout << "----------------------------------------\n";
|
||||
|
||||
// 测试每个矩阵规模
|
||||
for (int mat_size : matrix_sizes) {
|
||||
int M = mat_size, N = mat_size, K = mat_size;
|
||||
|
||||
// 分配主机内存
|
||||
float *A = new float[M * N];
|
||||
float *B = new float[N * K];
|
||||
float *C = new float[M * K];
|
||||
|
||||
// 初始化数据
|
||||
for (int i = 0; i < M * N; ++i) A[i] = (rand() % 100) / 100.0f;
|
||||
for (int i = 0; i < N * K; ++i) B[i] = (rand() % 100) / 100.0f;
|
||||
|
||||
// 分配设备内存
|
||||
float *d_A, *d_B, *d_C;
|
||||
cudaMalloc(&d_A, M * N * sizeof(float));
|
||||
cudaMalloc(&d_B, N * K * sizeof(float));
|
||||
cudaMalloc(&d_C, M * K * sizeof(float));
|
||||
|
||||
cudaMemcpy(d_A, A, M * N * sizeof(float), cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(d_B, B, N * K * sizeof(float), cudaMemcpyHostToDevice);
|
||||
|
||||
// 测试每个BLOCK_SIZE
|
||||
for (int block_size : block_sizes) {
|
||||
dim3 blockDim(block_size, block_size);
|
||||
dim3 gridDim((K + block_size - 1) / block_size, (M + block_size - 1) / block_size);
|
||||
|
||||
// 预热
|
||||
if (block_size == 4) matMultKernel<4><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
|
||||
else if (block_size == 8) matMultKernel<8><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
|
||||
else if (block_size == 16) matMultKernel<16><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
|
||||
else if (block_size == 32) matMultKernel<32><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
// 创建CUDA事件计时
|
||||
cudaEvent_t start, stop;
|
||||
cudaEventCreate(&start);
|
||||
cudaEventCreate(&stop);
|
||||
|
||||
// 执行并计时
|
||||
cudaEventRecord(start);
|
||||
if (block_size == 4) matMultKernel<4><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
|
||||
else if (block_size == 8) matMultKernel<8><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
|
||||
else if (block_size == 16) matMultKernel<16><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
|
||||
else if (block_size == 32) matMultKernel<32><<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
|
||||
cudaEventRecord(stop);
|
||||
cudaEventSynchronize(stop);
|
||||
|
||||
// 计算时间
|
||||
float milliseconds = 0;
|
||||
cudaEventElapsedTime(&milliseconds, start, stop);
|
||||
|
||||
// 计算FLOPS
|
||||
double total_flops = 2.0 * M * N * K; // 乘加各一次
|
||||
double gflops = total_flops / (milliseconds * 1e6);
|
||||
|
||||
// 输出结果
|
||||
std::cout << std::setw(10) << mat_size << "x" << mat_size
|
||||
<< std::setw(12) << block_size << "x" << block_size
|
||||
<< std::setw(15) << std::fixed << std::setprecision(3) << milliseconds
|
||||
<< std::setw(15) << std::fixed << std::setprecision(2) << gflops << std::endl;
|
||||
|
||||
cudaEventDestroy(start);
|
||||
cudaEventDestroy(stop);
|
||||
}
|
||||
|
||||
// 清理内存
|
||||
cudaFree(d_A);
|
||||
cudaFree(d_B);
|
||||
cudaFree(d_C);
|
||||
delete[] A;
|
||||
delete[] B;
|
||||
delete[] C;
|
||||
|
||||
std::cout << "----------------------------------------\n";
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
runTest();
|
||||
return 0;
|
||||
}
|
||||
lab4/vectoradd.cu (new file, 123 lines added)
@@ -0,0 +1,123 @@
|
||||
#include <cuda_runtime.h>
|
||||
#include <stdio.h>
|
||||
#include <chrono>
|
||||
|
||||
#define CHECK(call) \
|
||||
{ \
|
||||
const cudaError_t error = call; \
|
||||
if (error != cudaSuccess) \
|
||||
{ \
|
||||
printf("Error: %s:%d, ", __FILE__, __LINE__); \
|
||||
printf("code:%d, reason: %s\n", error, cudaGetErrorString(error)); \
|
||||
exit(1); \
|
||||
} \
|
||||
}
|
||||
|
||||
__global__ void add(const int *dev_a, const int *dev_b, int *dev_c, int N)
|
||||
{
|
||||
int i = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (i < N) {
|
||||
dev_c[i] = dev_a[i] + dev_b[i];
|
||||
}
|
||||
}
|
||||
|
||||
void vectorAddTest(int N, int threadsPerBlock)
|
||||
{
|
||||
// 计算块数
|
||||
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
|
||||
|
||||
// 分配主机内存
|
||||
int *host_a = (int*)malloc(N * sizeof(int));
|
||||
int *host_b = (int*)malloc(N * sizeof(int));
|
||||
int *host_c = (int*)malloc(N * sizeof(int));
|
||||
|
||||
// 初始化数据
|
||||
for (int i = 0; i < N; i++) {
|
||||
host_a[i] = i;
|
||||
host_b[i] = i << 1; // 相当于乘以2
|
||||
}
|
||||
|
||||
// 分配设备内存
|
||||
int *dev_a = NULL;
|
||||
int *dev_b = NULL;
|
||||
int *dev_c = NULL;
|
||||
CHECK(cudaMalloc((void**)&dev_a, N * sizeof(int)));
|
||||
CHECK(cudaMalloc((void**)&dev_b, N * sizeof(int)));
|
||||
CHECK(cudaMalloc((void**)&dev_c, N * sizeof(int)));
|
||||
|
||||
// 拷贝数据到设备
|
||||
CHECK(cudaMemcpy(dev_a, host_a, N * sizeof(int), cudaMemcpyHostToDevice));
|
||||
CHECK(cudaMemcpy(dev_b, host_b, N * sizeof(int), cudaMemcpyHostToDevice));
|
||||
|
||||
// 创建CUDA事件用于计时
|
||||
cudaEvent_t start, stop;
|
||||
CHECK(cudaEventCreate(&start));
|
||||
CHECK(cudaEventCreate(&stop));
|
||||
|
||||
// 预热一次,避免首次启动的额外开销
|
||||
add<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, N);
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
// 记录开始时间
|
||||
CHECK(cudaEventRecord(start));
|
||||
|
||||
// 执行核函数
|
||||
add<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c, N);
|
||||
|
||||
// 记录结束时间并等待完成
|
||||
CHECK(cudaEventRecord(stop));
|
||||
CHECK(cudaEventSynchronize(stop));
|
||||
|
||||
// 计算耗时(毫秒)
|
||||
float elapsedTime_ms = 0;
|
||||
CHECK(cudaEventElapsedTime(&elapsedTime_ms, start, stop));
|
||||
float elapsedTime = elapsedTime_ms * 1000.0f; // 转换为微秒
|
||||
|
||||
// 输出结果
|
||||
printf("N=%d, Time=%.3f ms\n", N, elapsedTime);
|
||||
|
||||
// 验证结果(可选)
|
||||
CHECK(cudaMemcpy(host_c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost));
|
||||
bool success = true;
|
||||
for (int i = 0; i < N; i++) {
|
||||
if (host_c[i] != host_a[i] + host_b[i]) {
|
||||
success = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!success) {
|
||||
printf("Error: Computation failed for N=%d\n", N);
|
||||
}
|
||||
|
||||
// 清理资源
|
||||
CHECK(cudaEventDestroy(start));
|
||||
CHECK(cudaEventDestroy(stop));
|
||||
CHECK(cudaFree(dev_a));
|
||||
CHECK(cudaFree(dev_b));
|
||||
CHECK(cudaFree(dev_c));
|
||||
free(host_a);
|
||||
free(host_b);
|
||||
free(host_c);
|
||||
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
// 设置线程数(保持不变)
|
||||
const int threadsPerBlock = 256;
|
||||
|
||||
// 测试不同向量长度
|
||||
int testSizes[] = {128, 256, 512, 1024, 2048}; // 注意:2056改为2048(2的幂次)
|
||||
int numTests = sizeof(testSizes) / sizeof(testSizes[0]);
|
||||
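// With 256 threads per block these sizes launch 1, 1, 2, 4 and 8 blocks respectively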
|
||||
printf("Vector Addition Performance Test (Threads per block: %d)\n", threadsPerBlock);
|
||||
printf("========================================================\n");
|
||||
|
||||
for (int i = 0; i < numTests; i++) {
|
||||
vectorAddTest(testSizes[i], threadsPerBlock);
|
||||
}
|
||||
|
||||
printf("========================================================\n");
|
||||
printf("All tests completed.\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
56
lab4/xmake.lua
Normal file
@ -0,0 +1,56 @@
|
||||
set_project("lab4_cuda_programs")
|
||||
set_version("1.0")
|
||||
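-- Build everything in this directory with `xmake`; individual targets can typically be run with `xmake run <name>`, e.g. `xmake run vectoradd`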
|
||||
-- Configure the CUDA toolchain
|
||||
toolchain("cuda")
|
||||
set_kind("standalone")
|
||||
set_sdkdir(os.getenv("CUDA_HOME") or "/usr/local/cuda")
|
||||
set_description("CUDA Toolkit")
|
||||
toolchain_end()
|
||||
|
||||
-- vectoradd program
|
||||
target("vectoradd")
|
||||
set_kind("binary")
|
||||
set_languages("c++14")
|
||||
set_toolchains("cuda")
|
||||
add_rules("cuda")
|
||||
add_files("vectoradd.cu")
|
||||
target_end()
|
||||
|
||||
-- MatrixMul_cpu program (uses OpenMP)
|
||||
target("MatrixMul_cpu")
|
||||
set_kind("binary")
|
||||
set_languages("c++14")
|
||||
set_toolchains("cuda")
|
||||
add_rules("cuda")
|
||||
add_files("MatrixMul_cpu.cu")
|
||||
add_ldflags("-lgomp", {force = true})
|
||||
add_cxxflags("-fopenmp", {force = true})
|
||||
target_end()
|
||||
|
||||
-- MatrixMul_kernel1 program
|
||||
target("MatrixMul_kernel1")
|
||||
set_kind("binary")
|
||||
set_languages("c++14")
|
||||
set_toolchains("cuda")
|
||||
add_rules("cuda")
|
||||
add_files("MatrixMul_kernel1.cu")
|
||||
target_end()
|
||||
|
||||
-- MatrixMul_kernel2 program
|
||||
target("MatrixMul_kernel2")
|
||||
set_kind("binary")
|
||||
set_languages("c++14")
|
||||
set_toolchains("cuda")
|
||||
add_rules("cuda")
|
||||
add_files("MatrixMul_kernel2.cu")
|
||||
target_end()
|
||||
|
||||
-- matrixmultiply_block_size_change program
|
||||
target("matrixmultiply_block_size_change")
|
||||
set_kind("binary")
|
||||
set_languages("c++14")
|
||||
set_toolchains("cuda")
|
||||
add_rules("cuda")
|
||||
add_files("matrixmultiply_block_size_change.cu")
|
||||
target_end()
|
||||
302
work/gemm_optimized.cpp
Normal file
@ -0,0 +1,302 @@
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/time.h>
|
||||
#include <iostream>
|
||||
#include <mpi.h>
|
||||
#include <omp.h>
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
|
||||
void randMat(int rows, int cols, float *&Mat) {
|
||||
Mat = new float[rows * cols];
|
||||
for (int i = 0; i < rows; i++)
|
||||
for (int j = 0; j < cols; j++)
|
||||
Mat[i * cols + j] = 1.0;
|
||||
}
|
||||
|
||||
// Optimized version: cache-blocked (tiled) multiplication for better locality
|
||||
void openmp_sgemm_optimized(int m, int n, int k, float *leftMat, float *rightMat,
|
||||
float *resultMat) {
|
||||
// Use a larger tile size to improve cache utilization
|
||||
const int BLOCK_SIZE = 64;
|
||||
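// A 64x64 tile of floats is 16 KB, so a few tiles at a time stay cache-resident on typical CPUs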
|
||||
#pragma omp parallel for collapse(2)
|
||||
for (int row = 0; row < m; row++) {
|
||||
for (int col = 0; col < k; col++) {
|
||||
resultMat[row * k + col] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
// Tiled computation to improve the cache hit rate
|
||||
#pragma omp parallel for collapse(2)
|
||||
for (int row_block = 0; row_block < m; row_block += BLOCK_SIZE) {
|
||||
for (int col_block = 0; col_block < k; col_block += BLOCK_SIZE) {
|
||||
for (int i_block = 0; i_block < n; i_block += BLOCK_SIZE) {
|
||||
|
||||
int row_end = min(row_block + BLOCK_SIZE, m);
|
||||
int col_end = min(col_block + BLOCK_SIZE, k);
|
||||
int i_end = min(i_block + BLOCK_SIZE, n);
|
||||
|
||||
for (int row = row_block; row < row_end; row++) {
|
||||
for (int col = col_block; col < col_end; col++) {
|
||||
float sum = resultMat[row * k + col];
|
||||
for (int i = i_block; i < i_end; i++) {
|
||||
sum += leftMat[row * n + i] * rightMat[col * n + i];
|
||||
}
|
||||
resultMat[row * k + col] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void mpi_sgemm_optimized(int m, int n, int k, float *&leftMat, float *&rightMat,
|
||||
float *&resultMat, int rank, int worldsize) {
|
||||
|
||||
// Compute the process-grid dimensions (row blocks x column blocks)
|
||||
int rowBlock = (int)sqrt((double)worldsize);
|
||||
while (rowBlock > 0 && worldsize % rowBlock != 0) {
|
||||
rowBlock--;
|
||||
}
|
||||
int colBlock = worldsize / rowBlock;
|
||||
|
||||
int rowStride, colStride;
|
||||
float *res = nullptr;
|
||||
float *localLeftMat = leftMat;
|
||||
float *localRightMat = rightMat;
|
||||
|
||||
if (rank == 0) {
|
||||
// Transpose the right matrix, accelerated with OpenMP
|
||||
float *buf = new float[k * n];
|
||||
#pragma omp parallel for collapse(2)
|
||||
for (int r = 0; r < n; r++) {
|
||||
for (int c = 0; c < k; c++) {
|
||||
buf[c * n + r] = rightMat[r * k + c];
|
||||
}
|
||||
}
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for (int r = 0; r < k; r++) {
|
||||
for (int c = 0; c < n; c++) {
|
||||
rightMat[r * n + c] = buf[r * n + c];
|
||||
}
|
||||
}
|
||||
delete[] buf;
|
||||
|
||||
// Use non-blocking communication to overlap computation and communication
|
||||
std::vector<MPI_Request> sendRequests;
|
||||
sendRequests.reserve(1000);
|
||||
|
||||
for (int rowB = 0; rowB < rowBlock; rowB++) {
|
||||
for (int colB = 0; colB < colBlock; colB++) {
|
||||
int rowStart = rowB * (m / rowBlock);
|
||||
int rowEnd = (rowB == rowBlock - 1) ? m : (rowB + 1) * (m / rowBlock);
|
||||
rowStride = rowEnd - rowStart;
|
||||
|
||||
int colStart = colB * (k / colBlock);
|
||||
int colEnd = (colB == colBlock - 1) ? k : (colB + 1) * (k / colBlock);
|
||||
colStride = colEnd - colStart;
|
||||
|
||||
int sendto = rowB * colBlock + colB;
|
||||
if (sendto == 0) {
|
||||
res = new float[rowStride * colStride];
|
||||
localLeftMat = leftMat + rowStart * n;
|
||||
localRightMat = rightMat + colStart * n;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Send the block dimensions
|
||||
MPI_Request req;
|
||||
MPI_Isend(&rowStride, 1, MPI_INT, sendto, 0, MPI_COMM_WORLD, &req);
|
||||
sendRequests.push_back(req);
|
||||
MPI_Isend(&colStride, 1, MPI_INT, sendto, 0, MPI_COMM_WORLD, &req);
|
||||
sendRequests.push_back(req);
|
||||
|
||||
// Send the matrix data
|
||||
for (int r = 0; r < rowStride; r++) {
|
||||
MPI_Isend(leftMat + (rowStart + r) * n, n, MPI_FLOAT, sendto,
|
||||
1, MPI_COMM_WORLD, &req);
|
||||
sendRequests.push_back(req);
|
||||
}
|
||||
|
||||
for (int c = 0; c < colStride; c++) {
|
||||
MPI_Isend(rightMat + (colStart + c) * n, n, MPI_FLOAT, sendto,
|
||||
2, MPI_COMM_WORLD, &req);
|
||||
sendRequests.push_back(req);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for all sends to complete
|
||||
for (size_t i = 0; i < sendRequests.size(); i++) {
|
||||
MPI_Wait(&sendRequests[i], MPI_STATUS_IGNORE);
|
||||
}
|
||||
} else {
|
||||
if (rank < worldsize) {
|
||||
int rowB = rank / colBlock;
|
||||
int colB = rank % colBlock;
|
||||
|
||||
int rowStart = rowB * (m / rowBlock);
|
||||
int rowEnd = (rowB == rowBlock - 1) ? m : (rowB + 1) * (m / rowBlock);
|
||||
rowStride = rowEnd - rowStart;
|
||||
|
||||
int colStart = colB * (k / colBlock);
|
||||
int colEnd = (colB == colBlock - 1) ? k : (colB + 1) * (k / colBlock);
|
||||
colStride = colEnd - colStart;
|
||||
|
||||
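// The tag-0 receives below consume the size messages from rank 0; they carry the same values as the strides computed above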
MPI_Recv(&rowStride, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
|
||||
MPI_Recv(&colStride, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
|
||||
|
||||
localLeftMat = new float[rowStride * n];
|
||||
localRightMat = new float[colStride * n];
|
||||
|
||||
for (int r = 0; r < rowStride; r++) {
|
||||
MPI_Recv(localLeftMat + r * n, n, MPI_FLOAT, 0, 1, MPI_COMM_WORLD,
|
||||
MPI_STATUS_IGNORE);
|
||||
}
|
||||
|
||||
for (int c = 0; c < colStride; c++) {
|
||||
MPI_Recv(localRightMat + c * n, n, MPI_FLOAT, 0, 2, MPI_COMM_WORLD,
|
||||
MPI_STATUS_IGNORE);
|
||||
}
|
||||
|
||||
res = new float[rowStride * colStride];
|
||||
}
|
||||
}
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
// Local computation using the optimized kernel
|
||||
if (rank < worldsize) {
|
||||
int rowB = rank / colBlock;
|
||||
int colB = rank % colBlock;
|
||||
|
||||
int rowStart = rowB * (m / rowBlock);
|
||||
int rowEnd = (rowB == rowBlock - 1) ? m : (rowB + 1) * (m / rowBlock);
|
||||
rowStride = rowEnd - rowStart;
|
||||
|
||||
int colStart = colB * (k / colBlock);
|
||||
int colEnd = (colB == colBlock - 1) ? k : (colB + 1) * (k / colBlock);
|
||||
colStride = colEnd - colStart;
|
||||
|
||||
openmp_sgemm_optimized(rowStride, n, colStride, localLeftMat, localRightMat, res);
|
||||
}
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
// Collect the results
|
||||
if (rank == 0) {
|
||||
int rowB = 0;
|
||||
int colB = 0;
|
||||
int rowStart = rowB * (m / rowBlock);
|
||||
int colStart = colB * (k / colBlock);
|
||||
|
||||
for (int r = 0; r < rowStride; r++) {
|
||||
for (int c = 0; c < colStride; c++) {
|
||||
resultMat[(rowStart + r) * k + (colStart + c)] = res[r * colStride + c];
|
||||
}
|
||||
}
|
||||
delete[] res;
|
||||
|
||||
for (int rowB = 0; rowB < rowBlock; rowB++) {
|
||||
for (int colB = 0; colB < colBlock; colB++) {
|
||||
int recvfrom = rowB * colBlock + colB;
|
||||
if (recvfrom == 0) continue;
|
||||
|
||||
MPI_Recv(&rowStride, 1, MPI_INT, recvfrom, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
|
||||
MPI_Recv(&colStride, 1, MPI_INT, recvfrom, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
|
||||
|
||||
float *tmpRes = new float[rowStride * colStride];
|
||||
MPI_Recv(tmpRes, rowStride * colStride, MPI_FLOAT, recvfrom, 4,
|
||||
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
|
||||
|
||||
int rowStart = rowB * (m / rowBlock);
|
||||
int colStart = colB * (k / colBlock);
|
||||
|
||||
for (int r = 0; r < rowStride; r++) {
|
||||
for (int c = 0; c < colStride; c++) {
|
||||
resultMat[(rowStart + r) * k + (colStart + c)] = tmpRes[r * colStride + c];
|
||||
}
|
||||
}
|
||||
delete[] tmpRes;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (rank < worldsize) {
|
||||
MPI_Send(&rowStride, 1, MPI_INT, 0, 3, MPI_COMM_WORLD);
|
||||
MPI_Send(&colStride, 1, MPI_INT, 0, 3, MPI_COMM_WORLD);
|
||||
MPI_Send(res, rowStride * colStride, MPI_FLOAT, 0, 4, MPI_COMM_WORLD);
|
||||
|
||||
delete[] res;
|
||||
delete[] localLeftMat;
|
||||
delete[] localRightMat;
|
||||
}
|
||||
}
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
if (argc != 4) {
|
||||
cout << "Usage: " << argv[0] << " M N K\n";
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
int rank;
|
||||
int worldSize;
|
||||
MPI_Init(&argc, &argv);
|
||||
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &worldSize);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
|
||||
int m = atoi(argv[1]);
|
||||
int n = atoi(argv[2]);
|
||||
int k = atoi(argv[3]);
|
||||
|
||||
float *leftMat, *rightMat, *resMat;
|
||||
struct timeval start, stop;
|
||||
|
||||
if (rank == 0) {
|
||||
randMat(m, n, leftMat);
|
||||
randMat(n, k, rightMat);
|
||||
randMat(m, k, resMat);
|
||||
}
|
||||
|
||||
gettimeofday(&start, NULL);
|
||||
mpi_sgemm_optimized(m, n, k, leftMat, rightMat, resMat, rank, worldSize);
|
||||
gettimeofday(&stop, NULL);
|
||||
|
||||
if (rank == 0) {
|
||||
double elapsed = (stop.tv_sec - start.tv_sec) * 1000.0 +
|
||||
(stop.tv_usec - start.tv_usec) / 1000.0;
|
||||
cout << "optimized mpi matmul: " << elapsed << " ms" << endl;
|
||||
|
||||
bool correct = true;
|
||||
for (int i = 0; i < m; i++) {
|
||||
for (int j = 0; j < k; j++){
|
||||
if (int(resMat[i * k + j]) != n) {
|
||||
cout << "Error at [" << i << "][" << j << "]: "
|
||||
<< resMat[i * k + j] << " (expected " << n << ")\n";
|
||||
correct = false;
|
||||
goto end_check;
|
||||
}
|
||||
}
|
||||
}
|
||||
end_check:
|
||||
if (correct) {
|
||||
cout << "Result verification: PASSED" << endl;
|
||||
} else {
|
||||
cout << "Result verification: FAILED" << endl;
|
||||
}
|
||||
|
||||
delete[] leftMat;
|
||||
delete[] rightMat;
|
||||
delete[] resMat;
|
||||
}
|
||||
|
||||
MPI_Finalize();
|
||||
return 0;
|
||||
}
|
||||
312
work/gemm_parallel.cpp
Normal file
@ -0,0 +1,312 @@
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/time.h>
|
||||
#include <iostream>
|
||||
#include <mpi.h>
|
||||
#include <omp.h>
|
||||
#include <vector>
|
||||
|
||||
using namespace std;
|
||||
|
||||
void randMat(int rows, int cols, float *&Mat) {
|
||||
Mat = new float[rows * cols];
|
||||
for (int i = 0; i < rows; i++)
|
||||
for (int j = 0; j < cols; j++)
|
||||
Mat[i * cols + j] = 1.0;
|
||||
}
|
||||
|
||||
void openmp_sgemm(int m, int n, int k, float *leftMat, float *rightMat,
|
||||
float *resultMat) {
|
||||
// rightMat is transposed
|
||||
// Parallelize the outer loops with OpenMP
|
||||
#pragma omp parallel for collapse(2)
|
||||
for (int row = 0; row < m; row++) {
|
||||
for (int col = 0; col < k; col++) {
|
||||
resultMat[row * k + col] = 0.0;
|
||||
for (int i = 0; i < n; i++) {
|
||||
resultMat[row * k + col] +=
|
||||
leftMat[row * n + i] * rightMat[col * n + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void mpi_sgemm(int m, int n, int k, float *&leftMat, float *&rightMat,
|
||||
float *&resultMat, int rank, int worldsize) {
|
||||
|
||||
// Compute the process-grid dimensions (as close to a square as possible)
|
||||
int rowBlock = (int)sqrt((double)worldsize);
|
||||
while (rowBlock > 0 && worldsize % rowBlock != 0) {
|
||||
rowBlock--;
|
||||
}
|
||||
int colBlock = worldsize / rowBlock;
|
||||
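// e.g. worldsize = 12 gives rowBlock = 3, colBlock = 4; worldsize = 8 gives rowBlock = 2, colBlock = 4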
|
||||
int rowStride, colStride;
|
||||
|
||||
float *res = nullptr;
|
||||
float *localLeftMat = leftMat;
|
||||
float *localRightMat = rightMat;
|
||||
|
||||
if (rank == 0) {
|
||||
// Transpose the right matrix
|
||||
float *buf = new float[k * n];
|
||||
#pragma omp parallel for collapse(2)
|
||||
for (int r = 0; r < n; r++) {
|
||||
for (int c = 0; c < k; c++) {
|
||||
buf[c * n + r] = rightMat[r * k + c];
|
||||
}
|
||||
}
|
||||
|
||||
#pragma omp parallel for collapse(2)
|
||||
for (int r = 0; r < k; r++) {
|
||||
for (int c = 0; c < n; c++) {
|
||||
rightMat[r * n + c] = buf[r * n + c];
|
||||
}
|
||||
}
|
||||
delete[] buf;
|
||||
|
||||
// Master-worker scheme: send each sub-matrix block to its worker process
|
||||
// Use a vector so enough request slots can be allocated dynamically
|
||||
std::vector<MPI_Request> sendRequests;
|
||||
sendRequests.reserve(1000); // pre-allocate enough space
|
||||
|
||||
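// Message tags: 0 = block dimensions, 1 = left-matrix rows, 2 = right-matrix rows (transposed); tags 3 and 4 return result sizes and data later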
for (int rowB = 0; rowB < rowBlock; rowB++) {
|
||||
for (int colB = 0; colB < colBlock; colB++) {
|
||||
// Compute the block extents (strip-wise partitioning)
|
||||
int rowStart = rowB * (m / rowBlock);
|
||||
int rowEnd = (rowB == rowBlock - 1) ? m : (rowB + 1) * (m / rowBlock);
|
||||
rowStride = rowEnd - rowStart;
|
||||
|
||||
int colStart = colB * (k / colBlock);
|
||||
int colEnd = (colB == colBlock - 1) ? k : (colB + 1) * (k / colBlock);
|
||||
colStride = colEnd - colStart;
|
||||
|
||||
int sendto = rowB * colBlock + colB;
|
||||
if (sendto == 0) {
|
||||
// Rank 0 keeps its own block
|
||||
res = new float[rowStride * colStride];
|
||||
localLeftMat = leftMat + rowStart * n;
|
||||
localRightMat = rightMat + colStart * n;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Send the block dimensions for this destination
|
||||
MPI_Request req;
|
||||
MPI_Isend(&rowStride, 1, MPI_INT, sendto, 0, MPI_COMM_WORLD, &req);
|
||||
sendRequests.push_back(req);
|
||||
MPI_Isend(&colStride, 1, MPI_INT, sendto, 0, MPI_COMM_WORLD, &req);
|
||||
sendRequests.push_back(req);
|
||||
|
||||
// Send the left-matrix rows
|
||||
for (int r = 0; r < rowStride; r++) {
|
||||
MPI_Isend(leftMat + (rowStart + r) * n, n, MPI_FLOAT, sendto,
|
||||
1, MPI_COMM_WORLD, &req);
|
||||
sendRequests.push_back(req);
|
||||
}
|
||||
|
||||
// Send the right-matrix rows (already transposed)
|
||||
for (int c = 0; c < colStride; c++) {
|
||||
MPI_Isend(rightMat + (colStart + c) * n, n, MPI_FLOAT, sendto,
|
||||
2, MPI_COMM_WORLD, &req);
|
||||
sendRequests.push_back(req);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for all sends to complete
|
||||
for (size_t i = 0; i < sendRequests.size(); i++) {
|
||||
MPI_Wait(&sendRequests[i], MPI_STATUS_IGNORE);
|
||||
}
|
||||
} else {
|
||||
// Receive the data sent by the master process
|
||||
if (rank < worldsize) {
|
||||
// Compute this rank's block position
|
||||
int rowB = rank / colBlock;
|
||||
int colB = rank % colBlock;
|
||||
|
||||
int rowStart = rowB * (m / rowBlock);
|
||||
int rowEnd = (rowB == rowBlock - 1) ? m : (rowB + 1) * (m / rowBlock);
|
||||
rowStride = rowEnd - rowStart;
|
||||
|
||||
int colStart = colB * (k / colBlock);
|
||||
int colEnd = (colB == colBlock - 1) ? k : (colB + 1) * (k / colBlock);
|
||||
colStride = colEnd - colStart;
|
||||
|
||||
// Receive the block dimensions
|
||||
MPI_Recv(&rowStride, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
|
||||
MPI_Recv(&colStride, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
|
||||
|
||||
// Allocate memory and receive the data
|
||||
localLeftMat = new float[rowStride * n];
|
||||
localRightMat = new float[colStride * n];
|
||||
|
||||
for (int r = 0; r < rowStride; r++) {
|
||||
MPI_Recv(localLeftMat + r * n, n, MPI_FLOAT, 0, 1, MPI_COMM_WORLD,
|
||||
MPI_STATUS_IGNORE);
|
||||
}
|
||||
|
||||
for (int c = 0; c < colStride; c++) {
|
||||
MPI_Recv(localRightMat + c * n, n, MPI_FLOAT, 0, 2, MPI_COMM_WORLD,
|
||||
MPI_STATUS_IGNORE);
|
||||
}
|
||||
|
||||
res = new float[rowStride * colStride];
|
||||
}
|
||||
}
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
// Multiply the local sub-matrices
|
||||
if (rank < worldsize) {
|
||||
// Recompute the block extents
|
||||
int rowB = rank / colBlock;
|
||||
int colB = rank % colBlock;
|
||||
|
||||
int rowStart = rowB * (m / rowBlock);
|
||||
int rowEnd = (rowB == rowBlock - 1) ? m : (rowB + 1) * (m / rowBlock);
|
||||
rowStride = rowEnd - rowStart;
|
||||
|
||||
int colStart = colB * (k / colBlock);
|
||||
int colEnd = (colB == colBlock - 1) ? k : (colB + 1) * (k / colBlock);
|
||||
colStride = colEnd - colStart;
|
||||
|
||||
// Use the OpenMP kernel for the local sub-matrix product
|
||||
openmp_sgemm(rowStride, n, colStride, localLeftMat, localRightMat, res);
|
||||
}
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
// Send the results back to rank 0
|
||||
if (rank == 0) {
|
||||
// Rank 0 copies its own result directly
|
||||
int rowB = 0;
|
||||
int colB = 0;
|
||||
int rowStart = rowB * (m / rowBlock);
|
||||
int colStart = colB * (k / colBlock);
|
||||
|
||||
for (int r = 0; r < rowStride; r++) {
|
||||
for (int c = 0; c < colStride; c++) {
|
||||
resultMat[(rowStart + r) * k + (colStart + c)] = res[r * colStride + c];
|
||||
}
|
||||
}
|
||||
delete[] res;
|
||||
|
||||
// Receive the results from the other processes
|
||||
for (int rowB = 0; rowB < rowBlock; rowB++) {
|
||||
for (int colB = 0; colB < colBlock; colB++) {
|
||||
int recvfrom = rowB * colBlock + colB;
|
||||
if (recvfrom == 0) continue;
|
||||
|
||||
// Receive the block dimensions
|
||||
MPI_Recv(&rowStride, 1, MPI_INT, recvfrom, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
|
||||
MPI_Recv(&colStride, 1, MPI_INT, recvfrom, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
|
||||
|
||||
// Receive the result data
|
||||
float *tmpRes = new float[rowStride * colStride];
|
||||
MPI_Recv(tmpRes, rowStride * colStride, MPI_FLOAT, recvfrom, 4,
|
||||
MPI_COMM_WORLD, MPI_STATUS_IGNORE);
|
||||
|
||||
// Assemble into the global result matrix
|
||||
int rowStart = rowB * (m / rowBlock);
|
||||
int colStart = colB * (k / colBlock);
|
||||
|
||||
for (int r = 0; r < rowStride; r++) {
|
||||
for (int c = 0; c < colStride; c++) {
|
||||
resultMat[(rowStart + r) * k + (colStart + c)] = tmpRes[r * colStride + c];
|
||||
}
|
||||
}
|
||||
delete[] tmpRes;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (rank < worldsize) {
|
||||
// Send the block dimensions
|
||||
MPI_Send(&rowStride, 1, MPI_INT, 0, 3, MPI_COMM_WORLD);
|
||||
MPI_Send(&colStride, 1, MPI_INT, 0, 3, MPI_COMM_WORLD);
|
||||
|
||||
// Send the result data
|
||||
MPI_Send(res, rowStride * colStride, MPI_FLOAT, 0, 4, MPI_COMM_WORLD);
|
||||
|
||||
delete[] res;
|
||||
delete[] localLeftMat;
|
||||
delete[] localRightMat;
|
||||
}
|
||||
}
|
||||
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
if (argc != 4) {
|
||||
cout << "Usage: " << argv[0] << " M N K\n";
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
int rank;
|
||||
int worldSize;
|
||||
MPI_Init(&argc, &argv);
|
||||
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &worldSize);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
|
||||
// Matrix dimensions
|
||||
int m = atoi(argv[1]);
|
||||
int n = atoi(argv[2]);
|
||||
int k = atoi(argv[3]);
|
||||
|
||||
float *leftMat, *rightMat, *resMat;
|
||||
|
||||
struct timeval start, stop;
|
||||
|
||||
// Initialize the matrices
|
||||
if (rank == 0) {
|
||||
randMat(m, n, leftMat);
|
||||
randMat(n, k, rightMat);
|
||||
randMat(m, k, resMat);
|
||||
}
|
||||
|
||||
gettimeofday(&start, NULL);
|
||||
|
||||
// Run the hybrid MPI + OpenMP matrix multiplication
|
||||
mpi_sgemm(m, n, k, leftMat, rightMat, resMat, rank, worldSize);
|
||||
|
||||
gettimeofday(&stop, NULL);
|
||||
|
||||
// 打印结果
|
||||
if (rank == 0) {
|
||||
double elapsed = (stop.tv_sec - start.tv_sec) * 1000.0 +
|
||||
(stop.tv_usec - start.tv_usec) / 1000.0;
|
||||
cout << "mpi matmul: " << elapsed << " ms" << endl;
|
||||
|
||||
// Verify the result
|
||||
bool correct = true;
|
||||
for (int i = 0; i < m; i++) {
|
||||
for (int j = 0; j < k; j++){
|
||||
if (int(resMat[i * k + j]) != n) {
|
||||
cout << "Error at [" << i << "][" << j << "]: "
|
||||
<< resMat[i * k + j] << " (expected " << n << ")\n";
|
||||
correct = false;
|
||||
goto end_check;
|
||||
}
|
||||
}
|
||||
}
|
||||
end_check:
|
||||
if (correct) {
|
||||
cout << "Result verification: PASSED" << endl;
|
||||
} else {
|
||||
cout << "Result verification: FAILED" << endl;
|
||||
}
|
||||
|
||||
delete[] leftMat;
|
||||
delete[] rightMat;
|
||||
delete[] resMat;
|
||||
}
|
||||
|
||||
MPI_Finalize();
|
||||
return 0;
|
||||
}
|
||||
97
work/gemm_serial.cpp
Normal file
@ -0,0 +1,97 @@
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/time.h>
|
||||
#include <iostream>
|
||||
|
||||
using namespace std;
|
||||
|
||||
void randMat(int rows, int cols, float *&Mat) {
|
||||
Mat = new float[rows * cols];
|
||||
for (int i = 0; i < rows; i++)
|
||||
for (int j = 0; j < cols; j++)
|
||||
Mat[i * cols + j] = 1.0;
|
||||
}
|
||||
|
||||
void serial_sgemm(int m, int n, int k, float *&leftMat, float *&rightMat,
|
||||
float *&resultMat) {
|
||||
// rightMat is transposed
|
||||
float *buf = new float[k * n];
|
||||
// transpose right Mat
|
||||
for (int r = 0; r < n; r++) {
|
||||
for (int c = 0; c < k; c++) {
|
||||
buf[c * n + r] = rightMat[r * k + c];
|
||||
}
|
||||
}
|
||||
for (int r = 0; r < k; r++) {
|
||||
for (int c = 0; c < n; c++) {
|
||||
rightMat[r * n + c] = buf[r * n + c];
|
||||
}
|
||||
}
|
||||
|
||||
for (int row = 0; row < m; row++) {
|
||||
for (int col = 0; col < k; col++) {
|
||||
resultMat[row * k + col] = 0.0;
|
||||
for (int i = 0; i < n; i++) {
|
||||
resultMat[row * k + col] +=
|
||||
leftMat[row * n + i] * rightMat[col * n + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
delete[] buf;
|
||||
return;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
if (argc != 5) {
|
||||
cout << "Usage: " << argv[0] << " M N K use-blas\n";
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
int m = atoi(argv[1]);
|
||||
int n = atoi(argv[2]);
|
||||
int k = atoi(argv[3]);
|
||||
int blas = atoi(argv[4]);
|
||||
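// the use-blas flag is parsed but not used anywhere in this serial implementation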
|
||||
float *leftMat, *rightMat, *resMat;
|
||||
|
||||
struct timeval start, stop;
|
||||
randMat(m, n, leftMat);
|
||||
randMat(n, k, rightMat);
|
||||
randMat(m, k, resMat);
|
||||
|
||||
gettimeofday(&start, NULL);
|
||||
|
||||
serial_sgemm(m, n, k, leftMat, rightMat, resMat);
|
||||
|
||||
gettimeofday(&stop, NULL);
|
||||
cout << "matmul: "
|
||||
<< (stop.tv_sec - start.tv_sec) * 1000.0 +
|
||||
(stop.tv_usec - start.tv_usec) / 1000.0
|
||||
<< " ms" << endl;
|
||||
|
||||
// Verify the result
|
||||
bool correct = true;
|
||||
for (int i = 0; i < m; i++) {
|
||||
for (int j = 0; j < k; j++){
|
||||
if (int(resMat[i * k + j]) != n) {
|
||||
cout << "Error at [" << i << "][" << j << "]: "
|
||||
<< resMat[i * k + j] << " (expected " << n << ")\n";
|
||||
correct = false;
|
||||
goto end_check;
|
||||
}
|
||||
}
|
||||
}
|
||||
end_check:
|
||||
if (correct) {
|
||||
cout << "Result verification: PASSED" << endl;
|
||||
} else {
|
||||
cout << "Result verification: FAILED" << endl;
|
||||
}
|
||||
|
||||
delete[] leftMat;
|
||||
delete[] rightMat;
|
||||
delete[] resMat;
|
||||
|
||||
return 0;
|
||||
}
|
||||
207
work/run_experiments.sh
Executable file
@ -0,0 +1,207 @@
|
||||
#!/bin/bash
|
||||
|
||||
# MPI-OpenMP matrix multiplication performance test script
|
||||
# Collects the experiment data
|
||||
|
||||
# Set environment variables
|
||||
export OMP_NUM_THREADS=${OMP_NUM_THREADS:-1}
|
||||
|
||||
# Output files
|
||||
OUTPUT_FILE="experiment_results.csv"
|
||||
SERIAL_OUTPUT="serial_results.csv"
|
||||
# get arch using uname -m
|
||||
# if aarch64 then use arm64-v8a else use x86_64
|
||||
ARCH=$(uname -m)
|
||||
if [ "$ARCH" == "aarch64" ]; then
|
||||
BUILD_ARCH="arm64-v8a"
|
||||
else
|
||||
BUILD_ARCH="x86_64"
|
||||
fi
|
||||
# Build directory
|
||||
BUILD_DIR="./build/linux/$BUILD_ARCH/release"
|
||||
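# Note: every mpirun invocation below uses --hostfile ~/mpi_hosts and --oversubscribe, so that hostfile must exist on the launch machine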
# Create the output files and write the CSV headers
|
||||
echo "Experiment,M,N,K,MPI_Processes,OpenMP_Threads,Time_ms,Speedup,Efficiency" > $OUTPUT_FILE
|
||||
echo "M,N,K,Time_ms" > $SERIAL_OUTPUT
|
||||
|
||||
# Matrix size configuration (adjust as needed)
|
||||
MATRIX_SIZES="512 1024 2048 4096"
|
||||
|
||||
# MPI process counts
|
||||
MPI_PROCESSES="1 2 3 6 9 12"
|
||||
|
||||
# OpenMP thread counts
|
||||
OPENMP_THREADS="1 2 4 8"
|
||||
|
||||
echo "=========================================="
|
||||
echo "MPI-OpenMP矩阵乘法性能测试"
|
||||
echo "=========================================="
|
||||
|
||||
# Build the programs
|
||||
echo "编译程序..."
|
||||
xmake
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "编译失败!"
|
||||
exit 1
|
||||
fi
|
||||
echo "编译完成!"
|
||||
echo ""
|
||||
|
||||
# Collect the serial baseline times
|
||||
echo "=========================================="
|
||||
echo "实验0: 串行基准测试"
|
||||
echo "=========================================="
|
||||
|
||||
for SIZE in $MATRIX_SIZES; do
|
||||
echo "测试矩阵尺寸: ${SIZE}x${SIZE}x${SIZE}"
|
||||
TIME=$($BUILD_DIR/gemm_serial $SIZE $SIZE $SIZE 0 | grep "matmul:" | awk '{print $2}')
|
||||
echo " 时间: ${TIME} ms"
|
||||
echo "$SIZE,$SIZE,$SIZE,$TIME" >> $SERIAL_OUTPUT
|
||||
done
|
||||
echo ""
|
||||
|
||||
# Experiment 1: fix the OpenMP thread count at 1 and vary the number of MPI processes
|
||||
echo "=========================================="
|
||||
echo "实验一: OpenMP线程数=1,改变MPI进程数"
|
||||
echo "=========================================="
|
||||
|
||||
export OMP_NUM_THREADS=1
|
||||
|
||||
for SIZE in $MATRIX_SIZES; do
|
||||
# Look up the serial baseline time
|
||||
SERIAL_TIME=$(grep "^$SIZE," $SERIAL_OUTPUT | cut -d',' -f4)
|
||||
|
||||
echo "矩阵尺寸: ${SIZE}x${SIZE}x${SIZE}"
|
||||
echo "串行时间: ${SERIAL_TIME} ms"
|
||||
|
||||
for NP in $MPI_PROCESSES; do
|
||||
echo " MPI进程数: $NP"
|
||||
TIME=$(mpirun --hostfile ~/mpi_hosts --oversubscribe -np $NP $BUILD_DIR/gemm_parallel $SIZE $SIZE $SIZE | grep "mpi matmul:" | awk '{print $3}')
|
||||
|
||||
if [ ! -z "$TIME" ]; then
|
||||
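# speedup = serial time / parallel time; efficiency = speedup / number of MPI processes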
SPEEDUP=$(echo "scale=4; $SERIAL_TIME / $TIME" | bc)
|
||||
EFFICIENCY=$(echo "scale=4; $SPEEDUP / $NP" | bc)
|
||||
echo " 时间: ${TIME} ms, 加速比: $SPEEDUP, 效率: $EFFICIENCY"
|
||||
echo "Exp1,$SIZE,$SIZE,$SIZE,$NP,1,$TIME,$SPEEDUP,$EFFICIENCY" >> $OUTPUT_FILE
|
||||
fi
|
||||
done
|
||||
echo ""
|
||||
done
|
||||
|
||||
# Experiment 2: vary both the MPI process count and the OpenMP thread count
|
||||
echo "=========================================="
|
||||
echo "实验二: 改变MPI进程数和OpenMP线程数"
|
||||
echo "=========================================="
|
||||
|
||||
for SIZE in $MATRIX_SIZES; do
|
||||
# Look up the serial baseline time
|
||||
SERIAL_TIME=$(grep "^$SIZE," $SERIAL_OUTPUT | cut -d',' -f4)
|
||||
|
||||
echo "矩阵尺寸: ${SIZE}x${SIZE}x${SIZE}"
|
||||
|
||||
for NTHREADS in $OPENMP_THREADS; do
|
||||
export OMP_NUM_THREADS=$NTHREADS
|
||||
echo " OpenMP线程数: $NTHREADS"
|
||||
|
||||
for NP in $MPI_PROCESSES; do
|
||||
TOTAL_PROCS=$((NP * NTHREADS))
|
||||
echo " MPI进程数: $NP (总处理器数: $TOTAL_PROCS)"
|
||||
|
||||
TIME=$(mpirun --hostfile ~/mpi_hosts --oversubscribe -np $NP $BUILD_DIR/gemm_parallel $SIZE $SIZE $SIZE | grep "mpi matmul:" | awk '{print $3}')
|
||||
|
||||
if [ ! -z "$TIME" ]; then
|
||||
SPEEDUP=$(echo "scale=4; $SERIAL_TIME / $TIME" | bc)
|
||||
EFFICIENCY=$(echo "scale=4; $SPEEDUP / $TOTAL_PROCS" | bc)
|
||||
echo " 时间: ${TIME} ms, 加速比: $SPEEDUP, 效率: $EFFICIENCY"
|
||||
echo "Exp2,$SIZE,$SIZE,$SIZE,$NP,$NTHREADS,$TIME,$SPEEDUP,$EFFICIENCY" >> $OUTPUT_FILE
|
||||
fi
|
||||
done
|
||||
done
|
||||
echo ""
|
||||
done
|
||||
|
||||
# Experiment 3: fix the total processor count and vary the MPI/OpenMP combination
|
||||
echo "=========================================="
|
||||
echo "实验三: 固定总处理器数,改变MPI/OpenMP组合"
|
||||
echo "=========================================="
|
||||
|
||||
TOTAL_PROCS_TARGET=16
|
||||
echo "目标总处理器数: $TOTAL_PROCS_TARGET"
|
||||
|
||||
for SIZE in $MATRIX_SIZES; do
|
||||
# Look up the serial baseline time
|
||||
SERIAL_TIME=$(grep "^$SIZE," $SERIAL_OUTPUT | cut -d',' -f4)
|
||||
|
||||
echo "矩阵尺寸: ${SIZE}x${SIZE}x${SIZE}"
|
||||
|
||||
# Different MPI/OpenMP combinations whose total processor count is 16
|
||||
declare -a COMBOS=("1:16" "2:8" "4:4" "8:2" "16:1")
|
||||
|
||||
for COMBO in "${COMBOS[@]}"; do
|
||||
NP=$(echo $COMBO | cut -d':' -f1)
|
||||
NTHREADS=$(echo $COMBO | cut -d':' -f2)
|
||||
TOTAL_PROCS=$((NP * NTHREADS))
|
||||
|
||||
export OMP_NUM_THREADS=$NTHREADS
|
||||
echo " MPI: $NP, OpenMP: $NTHREADS (总处理器: $TOTAL_PROCS)"
|
||||
|
||||
TIME=$(mpirun --hostfile ~/mpi_hosts --oversubscribe -np $NP $BUILD_DIR/gemm_parallel $SIZE $SIZE $SIZE | grep "mpi matmul:" | awk '{print $3}')
|
||||
|
||||
if [ ! -z "$TIME" ]; then
|
||||
SPEEDUP=$(echo "scale=4; $SERIAL_TIME / $TIME" | bc)
|
||||
EFFICIENCY=$(echo "scale=4; $SPEEDUP / $TOTAL_PROCS" | bc)
|
||||
echo " 时间: ${TIME} ms, 加速比: $SPEEDUP, 效率: $EFFICIENCY"
|
||||
echo "Exp3,$SIZE,$SIZE,$SIZE,$NP,$NTHREADS,$TIME,$SPEEDUP,$EFFICIENCY" >> $OUTPUT_FILE
|
||||
fi
|
||||
done
|
||||
echo ""
|
||||
done
|
||||
|
||||
# Experiment 3 (optimized implementation): fixed total processor count, using gemm_optimized; results are labelled Exp3-opt
|
||||
echo "=========================================="
|
||||
echo "实验三(优化): 固定总处理器数,使用 gemm_optimized 的 MPI/OpenMP 组合测试"
|
||||
echo "=========================================="
|
||||
|
||||
for SIZE in $MATRIX_SIZES; do
|
||||
# Look up the serial baseline time
|
||||
SERIAL_TIME=$(grep "^$SIZE," $SERIAL_OUTPUT | cut -d',' -f4)
|
||||
|
||||
echo "矩阵尺寸: ${SIZE}x${SIZE}x${SIZE}"
|
||||
|
||||
# Same combinations as before
|
||||
declare -a COMBOS_OPT=("1:16" "2:8" "4:4" "8:2" "16:1")
|
||||
|
||||
for COMBO in "${COMBOS_OPT[@]}"; do
|
||||
NP=$(echo $COMBO | cut -d':' -f1)
|
||||
NTHREADS=$(echo $COMBO | cut -d':' -f2)
|
||||
TOTAL_PROCS=$((NP * NTHREADS))
|
||||
|
||||
export OMP_NUM_THREADS=$NTHREADS
|
||||
echo " MPI: $NP, OpenMP: $NTHREADS (总处理器: $TOTAL_PROCS)"
|
||||
|
||||
TIME=$(mpirun --hostfile ~/mpi_hosts --oversubscribe -np $NP $BUILD_DIR/gemm_optimized $SIZE $SIZE $SIZE | grep "optimized mpi matmul:" | awk '{print $4}')
|
||||
|
||||
if [ ! -z "$TIME" ]; then
|
||||
SPEEDUP=$(echo "scale=4; $SERIAL_TIME / $TIME" | bc)
|
||||
EFFICIENCY=$(echo "scale=4; $SPEEDUP / $TOTAL_PROCS" | bc)
|
||||
echo " 时间: ${TIME} ms, 加速比: $SPEEDUP, 效率: $EFFICIENCY"
|
||||
echo "Exp3-opt,$SIZE,$SIZE,$SIZE,$NP,$NTHREADS,$TIME,$SPEEDUP,$EFFICIENCY" >> $OUTPUT_FILE
|
||||
fi
|
||||
done
|
||||
echo ""
|
||||
done
|
||||
|
||||
echo "=========================================="
|
||||
echo "测试完成!"
|
||||
echo "结果已保存到: $OUTPUT_FILE"
|
||||
echo "串行基准已保存到: $SERIAL_OUTPUT"
|
||||
echo "=========================================="
|
||||
echo ""
|
||||
echo "数据处理说明:"
|
||||
echo "1. 使用Excel、Python或R读取CSV文件"
|
||||
echo "2. 绘制图表:"
|
||||
echo " - 实验一: X轴=MPI进程数, Y轴=加速比/效率, 不同矩阵尺寸用不同颜色"
|
||||
echo " - 实验二: X轴=总处理器数, Y轴=加速比/效率, 不同OpenMP线程数用不同颜色"
|
||||
echo " - 实验三: X轴=MPI进程数, Y轴=效率, 不同矩阵尺寸用不同颜色"
|
||||
echo "3. 分析加速比和效率的变化趋势"
|
||||
echo "4. 讨论MPI/OpenMP组合对性能的影响"
|
||||
|
||||
33
work/xmake.lua
Normal file
@ -0,0 +1,33 @@
|
||||
set_project("gemm")
|
||||
set_version("1.0")
|
||||
add_requires("openmp")
|
||||
|
||||
add_rules("mode.debug", "mode.release")
|
||||
-- Find MPI package
|
||||
add_requires("mpi", {system = true})
|
||||
add_requires("mpi_cxx", {system = true})
|
||||
-- Serial version
|
||||
target("gemm_serial")
|
||||
set_kind("binary")
|
||||
add_files("gemm_serial.cpp")
|
||||
add_cxxflags("-O3", "-march=native")
|
||||
|
||||
-- Parallel version
|
||||
target("gemm_parallel")
|
||||
set_kind("binary")
|
||||
add_files("gemm_parallel.cpp")
|
||||
add_cxxflags("-O3", "-march=native")
|
||||
add_packages("openmp")
|
||||
-- pull in MPI via the system packages (instead of invoking mpic++ directly)
|
||||
add_packages("mpi")
|
||||
add_packages("mpi_cxx")
|
||||
|
||||
-- Optimized version
|
||||
target("gemm_optimized")
|
||||
set_kind("binary")
|
||||
add_files("gemm_optimized.cpp")
|
||||
add_cxxflags("-O3", "-march=native")
|
||||
add_packages("openmp")
|
||||
-- pull in MPI via the system packages (instead of invoking mpic++ directly)
|
||||
add_packages("mpi")
|
||||
add_packages("mpi_cxx")
|
||||