linpack实验:MPI代码调优
代碼調優其實屬于編譯優化,是編譯器干的事情,但為了研究,我們用手動的方式簡單地做三個小實驗
先介紹mpi的相關知識:
mpicc:類似于gcc編譯器,可以編譯c文件為一個可執行文件;mpic++:類似于g++編譯器,可以編譯cpp文件為一個可執行文件;mpirun:運行可執行文件,可以調整進程數目,但需要代碼中含有mpi的一些函數。在這里我們選用mpic++來編譯cpp文件
mpi實際上是一個庫,可以被c++,c和fortran三種語言調用(作者已知的),這里我們使用c++的庫,也就是一個含有很多頭文件的文件夾,我們不用關心他在哪里,因為我們使用的mpic++會自動找到mpi庫,在cpp中首先要引入頭文件:
#include <mpi.h>
下面這段代碼目的是啟用MPI環境,具體含義作者母雞,就不深究了,總之所有測試程序都要寫
int numprocs, myid, source; MPI_Status status; MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &myid); MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
我們還需知道兩個函數:
MPI_Wtime();//獲取當前時間
MPI_Wtick();//獲取MPI_Wtime的時間精度(相鄰兩次時鐘滴答之間的秒數)
我們通過時間計算出執行時間來衡量性能
開始實驗:
① 循環交換
老師的PPT給出了如下例子:
我們不妨就按照這個例子書寫代碼:
尚未優化的代碼:
#include<iostream> #include <mpi.h> using namespace std;int main(int argc, char* argv[]){int numprocs, myid, source;MPI_Status status;MPI_Init(&argc, &argv);MPI_Comm_rank(MPI_COMM_WORLD, &myid);MPI_Comm_size(MPI_COMM_WORLD, &numprocs);int row=50000,col=1000;int **x = new int*[row];for(int i=0;i<row;i++)x[i]=new int[col];double begin = MPI_Wtime();for(int j=0;j<col;j++){for(int i=0;i<row;i++){x[i][j]=2*x[i][j];}}double end = MPI_Wtime();double diff = end - begin;printf("%d process time is %9.16f\n", myid, diff);printf("%d process tick is %9.16f\n", myid, MPI_Wtick());MPI_Finalize(); }優(yōu)化后的代碼:
#include<iostream> #include <mpi.h> using namespace std;int main(int argc, char* argv[]){int numprocs, myid, source;MPI_Status status;MPI_Init(&argc, &argv);MPI_Comm_rank(MPI_COMM_WORLD, &myid);MPI_Comm_size(MPI_COMM_WORLD, &numprocs);int row=50000,col=1000;int **x = new int*[row];for(int i=0;i<row;i++)x[i]=new int[col];double begin = MPI_Wtime();//在這里進行了循環(huán)交換for(int i=0;i<row;i++){for(int j=0;j<col;j++){x[i][j]=2*x[i][j];}}double end = MPI_Wtime();double diff = end - begin;printf("%d process time is %9.16f\n", myid, diff);printf("%d process tick is %9.16f\n", myid, MPI_Wtick());MPI_Finalize(); }下面是性能對比:
可見進行循環交換讓程序性能提高了7-8倍
② 數組合并
老師的PPT給出了如下例子:
我們不妨就按照這個例子書寫代碼:
尚未優化的代碼:
#include<iostream> #include <mpi.h> #define SIZE 1000000 using namespace std;int main(int argc, char* argv[]){int numprocs, myid, source;MPI_Status status;MPI_Init(&argc, &argv);MPI_Comm_rank(MPI_COMM_WORLD, &myid);MPI_Comm_size(MPI_COMM_WORLD, &numprocs);int val[SIZE];int key[SIZE];double begin = MPI_Wtime();for(int i=0;i<SIZE;i++){val[i] = val[i]+key[i];}double end = MPI_Wtime();double diff = end - begin;printf("%d process time is %9.16f\n", myid, diff);printf("%d process tick is %9.16f\n", myid, MPI_Wtick());MPI_Finalize(); }優(yōu)化后的代碼:
#include<iostream> #include <mpi.h> #define SIZE 1000000 using namespace std;int main(int argc, char* argv[]){int numprocs, myid, source;MPI_Status status;MPI_Init(&argc, &argv);MPI_Comm_rank(MPI_COMM_WORLD, &myid);MPI_Comm_size(MPI_COMM_WORLD, &numprocs);//這里兩個數(shù)組合并為一個結(jié)構(gòu)體數(shù)組struct merge{int val;int key;};struct merge merge_array[SIZE];double begin = MPI_Wtime();for(int i=0;i<SIZE;i++){merge_array->val = merge_array->val+merge_array->key;}double end = MPI_Wtime();double diff = end - begin;printf("%d process time is %9.16f\n", myid, diff);printf("%d process tick is %9.16f\n", myid, MPI_Wtick());MPI_Finalize(); }下面是性能對比:
可見,數組合并讓程序性能提高了3倍左右
③ 循環融合
老師的PPT給出了如下例子:
我們不妨就按照這個例子書寫代碼:
尚未優化的代碼:
#include<iostream> #include <mpi.h> #define N 700 using namespace std;int main(int argc, char* argv[]){int numprocs, myid, source;MPI_Status status;MPI_Init(&argc, &argv);MPI_Comm_rank(MPI_COMM_WORLD, &myid);MPI_Comm_size(MPI_COMM_WORLD, &numprocs);int a[N][N],b[N][N],c[N][N],d[N][N];for(int i=0;i<N;i++){for(int j=0;j<N;j++){a[i][j]=1;b[i][j]=1;c[i][j]=1;d[i][j]=1;}}double begin = MPI_Wtime();for(int i=0;i<N;i++)for(int j=0;j<N;j++)a[i][j]=1/b[i][j]*c[i][j];for(int i=0;i<N;i++)for(int j=0;j<N;j++)d[i][j]=a[i][j]+c[i][j]; double end = MPI_Wtime();double diff = end - begin;printf("%d process time is %9.16f\n", myid, diff);printf("%d process tick is %9.16f\n", myid, MPI_Wtick());MPI_Finalize(); }優(yōu)化后的代碼:
#include<iostream> #include <mpi.h> #define N 700 using namespace std;int main(int argc, char* argv[]){int numprocs, myid, source;MPI_Status status;MPI_Init(&argc, &argv);MPI_Comm_rank(MPI_COMM_WORLD, &myid);MPI_Comm_size(MPI_COMM_WORLD, &numprocs);int a[N][N],b[N][N],c[N][N],d[N][N];for(int i=0;i<N;i++){for(int j=0;j<N;j++){a[i][j]=1;b[i][j]=1;c[i][j]=1;d[i][j]=1;}}double begin = MPI_Wtime();//兩個循環(huán)合成一個循環(huán)for(int i=0;i<N;i++)for(int j=0;j<N;j++){a[i][j]=1/b[i][j]*c[i][j];d[i][j]=a[i][j]+c[i][j]; }double end = MPI_Wtime();double diff = end - begin;printf("%d process time is %9.16f\n", myid, diff);printf("%d process tick is %9.16f\n", myid, MPI_Wtick());MPI_Finalize(); }下面是性能對比:
可見循環融合對程序性能的提升非常有限
總結
以上是生活随笔為你收集整理的linpack实验:MPI代码调优的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 变电站仪器仪表检测图像数据集
- 下一篇: OneNMP路由器、交换机监控