Cannon矩阵乘法的MPI实现及性能分析.doc
#include <stdio.h>#include <mpi.h>#include <math.h>#include <stdlib.h>#include <time.h>#include <memory.h>MPI_Status status;double *A, *B, *C; /C=A*Bdouble *a,*b,*c; /各个进程的缓冲区int n; /矩阵的行列数int np; /每个进程控制的小矩阵的行列数int p,rank; /进程个个数、当前进程的编号,笛卡尔进程编号double *tempa, *tempb;void ProduceABC(); /在根处理器中生成矩阵AB,初始化矩阵Cvoid PrintABC();/输出结果void ScatterAB();/ 分发矩阵AB中的元素到各个进程中void MainProcess(); /cannon算法的主过程void collectC(); /收集结果矩阵Cvoid Mutiply(); /矩阵相乘void Printab();void Printc();int main(int argc, char *argv) int i;double starttime,endtime;MPI_Init(&argc, &argv);MPI_Comm_size(MPI_COMM_WORLD, &p);MPI_Comm_rank(MPI_COMM_WORLD, &rank);if(rank = 0)printf("请输入矩阵的行列数n= "); fflush(stdout);scanf_s("%d", &n);printf("n");MPI_Bcast(&n, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);/ n = atoi(argv1);np = n/(int)sqrt(p);a = (double*)malloc(np*np*sizeof(double);b = (double*)malloc(np*np*sizeof(double);c = (double*)malloc(np*np*sizeof(double);memset(c, 0, np*np*sizeof(double);tempa = (double*)malloc(np*np*sizeof(double);tempb = (double*)malloc(np*np*sizeof(double); if(rank = 0) /在根处理器中为矩阵ABC分配空间A = (double*)malloc(n*sizeof(double*);B = (double*)malloc(n*sizeof(double*);C = (double*)malloc(n*sizeof(double*); for(i = 0; i < n; i+)Ai = (double*)malloc(n*sizeof(double);Bi = (double*)malloc(n*sizeof(double);Ci = (double*)malloc(n*sizeof(double);ProduceABC(); /在根处理器中随机生成矩阵AB,初始化矩阵CScatterAB();/ 分发矩阵AB中的元素到各个进程中 elseMPI_Recv(a, np*np, MPI_DOUBLE, 0, 1, MPI_COMM_WORLD, &status);MPI_Recv(b, np*np, MPI_DOUBLE, 0, 2, MPI_COMM_WORLD, &status); starttime=MPI_Wtime(); /开始时间MainProcess(); /cannon算法的主过程if(rank = 0)collectC(); /收集结果矩阵CPrintABC(); /输出结果endtime=MPI_Wtime(); printf("time used: %lfn",endtime - starttime);for(i = 0; i < n; i+)free(Ai);free(Bi);free(Ci);free(A);free(B);free(C);else MPI_Send(c, np*np, MPI_DOUBLE, 0, 1, MPI_COMM_WORLD);free(a);free(b);free(c);free(tempa);free(tempb);MPI_Finalize();return 0;void ProduceABC()/在根处理器中生成矩阵AB int i,j;for(i=0; i<n; i+)for(j=0; j<n; j+)Aij=1.0;Bij=1.0;Cij=0.0;void PrintABC()/输出结果printf("A00=%fnB00=%fnC00=%fn",A00,B00,C00); void ScatterAB()/ 分发矩阵AB中的元素到各个进程中int imin,imax,jmin,jmax;int sp;int i,j,k,m;for(k=0; k<p; k+)/*计算相应处理器所分得的矩阵块在总矩阵中的坐标范围*/sp = (int)sqrt(p);imin = (k / sp) * np;imax = imin + np - 1; jmin = (k % sp) * np;jmax = jmin + np -1;/*rank=0的处理器将A,B中的相应块拷至tempa,tempb,准备向其他处理器发送*/m = 0; for(i=imin; i<=imax; i+)for(j=jmin; j<=jmax; j+) tempam = Aij; tempbm = Bji; /矩阵B按列优先存储 m+; /*根处理器将自己对应的矩阵块从tempa,tempb拷至a,b*/if(k=0)memcpy(a, tempa, np*np*sizeof(double);memcpy(b, tempb, np*np*sizeof(double); else /*根处理器向其他处理器发送tempa,tempb中相关的矩阵块*/MPI_Send(tempa, np*np, MPI_DOUBLE, k, 1, MPI_COMM_WORLD);MPI_Send(tempb, np*np, MPI_DOUBLE, k, 2, MPI_COMM_WORLD);void MainProcess() /cannon算法的主过程MPI_Comm comm; /笛卡尔结构通讯器int crank;int dims2,periods2, coords2;int source, dest, up, down, right, left;int i;dims0 = dims1 = (int)sqrt(p);periods0 = periods1 = 1;MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &comm);MPI_Comm_rank(comm, &crank);MPI_Cart_coords(comm, crank, 2, coords);MPI_Cart_shift(comm, 1, -1, &right, &left);MPI_Cart_shift(comm, 0, -1, &down, &up);MPI_Cart_shift(comm, 1, -coords0, &source, &dest);MPI_Sendrecv_replace(a, np*np, MPI_DOUBLE, dest, 1, source, 1, comm, &status);MPI_Cart_shift(comm, 0, -coords1, &source, &dest);MPI_Sendrecv_replace(b, np*np, MPI_DOUBLE, dest, 1, source, 1, comm, &status);Mutiply(); /矩阵相乘for(i = 1; i < dims0; i+)MPI_Sendrecv_replace(a, np*np, MPI_DOUBLE, left, 1, right, 1, comm, &status);MPI_Sendrecv_replace(b, np*np, MPI_DOUBLE, up, 1, down, 1, comm, &status);Mutiply(); /矩阵相乘MPI_Comm_free(&comm);void collectC() /收集结果矩阵Cint i,j,k,s,m;int imin,imax,jmin,jmax;int sp= (int)sqrt(p);/* 根处理器中的c赋给总矩阵C */for (i=0;i<np;i+)for(j=0;j<np;j+)Cij=ci*np+j;for (k=1;k<p;k+)/*根处理器从其他处理器接收相应的分块c*/MPI_Recv(c, np*np, MPI_DOUBLE, k, 1, MPI_COMM_WORLD, &status);/printf("rank = %dn", k);Printc();imin = (k / sp) * np;imax = imin + np - 1; jmin = (k % sp) * np;jmax = jmin + np -1; /*将接收到的c拷至C中的相应位置,从而构造出C*/for(i=imin,m=0; i<=imax; i+,m+)for(j=jmin,s=0; j<=jmax; j+,s+)Cij=cm*np+s; void Mutiply() /矩阵相乘int i,j,k;for(i=0; i<np; i+) for(j=0; j<np; j+)for(k=0; k<np; k+)ci*np+j += ai*np+k*bj*np+k; /b按列优先来搞注意:1.以.c来编译2.realease需单独配环境,并以realease跑比较快3.64位的mpich2要以x64平台编译,否则通不过4.mpiexec中一定要勾选以单独窗口运行,否则跑不动5.矩阵乘法,第二个矩阵按列优先存储性能提升数倍。若以二维存储,先转置再乘,同样的效果。