# Introduction to Parallel and Distributed Computing: Homework 3

> Written in Markdown. A Markdown reader is recommended, or visit [Introduction to Parallel and Distributed Computing: Homework 3](https://www.ruixiaolu.com/archives/66/).

## Assignment Requirements

Complete the following MPI programs and submit a report in PDF format, with the source code attached at the end.

["Big" and "Small"](http://linhucaotang.com/problem/52#MPI)

[Finding Primes](http://linhucaotang.com/problem/4#) (the reference program for this problem is broken, so no comparison is required; you may write your own serial function to verify correctness)

(The pi problem ran into issues and has been replaced by "Big" and "Small"; the file-view concept is described in the attachment.)

The report may include:

1. The core parallel code or key ideas;
2. A comparison against the reference serial/parallel programs, or between runs with different numbers of processes, with a brief analysis;
3. Lessons learned or difficulties encountered while using MPI.

## "Big" and "Small"

### Linhucaotang Results Comparison

| Problem size $N$ | Processes | Speedup | Processes | Speedup | Processes | Speedup |
| :--------------- | :-------- | :------ | :-------- | :------ | :-------- | :------ |
| $10$             | 1         | 0.569   | 2         | 0.209   | 4         | 0.044   |
| $12$             | 1         | 0.691   | 2         | 0.937   | 4         | 1.269   |
| $14$             | 1         | 1.035   | 2         | 1.406   | 4         | 3.180   |

The table shows that the parallel version only pays off once the problem is large enough: at $N = 10$ every configuration is slower than the serial reference, while at $N = 14$ the speedup grows with the process count, reaching 3.18 with 4 processes.

### Linhucaotang Sample Output

[http://linhucaotang.com/submission/33993](http://linhucaotang.com/submission/33993)

```
array-size = 2^14
**************stdout from your impl.****************
k2 bufLen:32768, 32768 0
k2 bufLen:32768, 32768 1
k2 bufLen:32768, 32768 2
k5 bufLen:81920, 81920 3

===================================================================================
=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
=   PID 121570 RUNNING AT ailab-fx2-server-02
=   EXIT CODE: 134
=   CLEANING UP REMAINING PROCESSES
=   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Aborted (signal 6)
This typically refers to a problem with your application.
Please see the FAQ page for debugging suggestions
******************result evaluation*****************
congratulations! the anwser is correct.
***************performance evaluation***************
reference serial impl.  : time_cost=8.2295e-05
reference parallel impl.: time_cost=2.3413e-04 speedup=0.351
your impl.              : time_cost=1.9550e-05 speedup=4.209
```

### Difficulties

1. The input and output formats are strictly specified, so correctly understanding the problem statement takes some effort.
2. Almost every quantity in the program is counted in bytes, which requires care.
3. When the number of processes is not a power of two, the data cannot be divided evenly, so the last rank needs special handling (see the sketch below).
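To make point 3 concrete, here is a standalone sketch (no MPI needed) of how each rank's file segment is computed. The formulas mirror the solution below; the example values `N = 4` and `world = 3` are purely illustrative and are not part of the judge's input.

```c
#include <stdio.h>
#include <stdint.h>

/* Minimal sketch: for an array of num = 2^N int64_t values, print the
 * byte-level segment (offset/width/stride) each rank would read.
 * When world does not divide num/2 evenly, the last rank absorbs the
 * remaining elements, matching the special case in the solution below. */
int main(void) {
    int N = 4, world = 3;               /* illustrative values only */
    int64_t num  = (int64_t)1 << N;     /* total number of int64_t elements */
    int64_t size = num / 2 / world;     /* front-half elements per rank     */

    for (int rank = 0; rank < world; rank++) {
        int64_t offset = size * rank * sizeof(int64_t);
        int64_t width  = size * sizeof(int64_t);
        /* the stride jumps from a front block to its mirrored back block */
        int64_t stride = sizeof(int64_t) * (num - size * (2 * rank + 1));
        if (rank == world - 1) {         /* last rank takes the leftovers */
            width  = (num / 2 - size * rank) * sizeof(int64_t);
            stride = width;
        }
        printf("rank %d: offset=%lld width=%lld stride=%lld (bytes)\n",
               rank, (long long)offset, (long long)width, (long long)stride);
    }
    return 0;
}
```

For `N = 4` and `world = 3` this gives ranks 0 and 1 two front-half elements each and rank 2 the remaining four, which is exactly the uneven split that point 3 refers to.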
### Source Code

```cpp
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <mpi.h>
#include "fio.h"

// fio.h provides FILE_SEG, a byte-addressed strided view of the data file:
//   offset - where the first block starts
//   width  - length of each block
//   stride - distance between the starts of consecutive blocks
typedef unsigned char byte;

extern int64_t input_data(void *buf, int64_t count, FILE_SEG fseg);
extern int64_t output_data(void *buf, int64_t count, FILE_SEG fseg);

int main(int argc, char **argv) {
    int N = atoi(argv[1]);
    int num = pow(2, N);                  // total number of int64_t elements

    MPI_Init(NULL, NULL);
    int rank, world;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world);

    // Each rank owns `size` elements of the front half plus the mirrored
    // block at the back of the array; the last rank absorbs the remainder.
    int size = num / 2 / world;
    FILE_SEG fseg;
    fseg.offset = size * rank * sizeof(int64_t);
    fseg.width  = size * sizeof(int64_t);
    fseg.stride = sizeof(int64_t) * (num - size * (2 * rank + 1));
    if (rank == world - 1) {
        fseg.width  = (num / 2 - size * rank) * sizeof(int64_t);
        fseg.stride = fseg.width;
    }

    // k = number of strided blocks of this view that fit in the file.
    int64_t length = sizeof(int64_t) * num;
    int64_t k = 0;
    while (k * fseg.stride + fseg.offset < length) k++;
    printf("k%lld\n", (long long)k);

    int64_t bufLen = k * fseg.width;
    byte *data = new byte[bufLen];
    int64_t x = input_data(data, bufLen, fseg);
    printf("bufLen:%lld, %lld %d \n", (long long)bufLen, (long long)x, rank);

    // Compare each element of the front block with its mirror in the back
    // block and swap so that the smaller value ends up in the front half.
    int64_t *v = (int64_t *) data;
    int wid = fseg.width / sizeof(int64_t);
    for (int i = 0; i < wid; i++) {
        if (v[i] > v[2 * wid - 1 - i]) {
            int64_t tmp = v[i];
            v[i] = v[2 * wid - 1 - i];
            v[2 * wid - 1 - i] = tmp;
        }
    }

    output_data(data, 2 * fseg.width, fseg);
    printf("Hello: rank %d, world: %d\n", rank, world);

    delete[] data;
    MPI_Finalize();
    return 0;
}
```

## Finding Primes

### Result Comparison (local machine: 8th-gen 45 W Core i7, 6 cores)

| Problem size $k$ | Processes | Time (ms) | Processes | Time (ms) | Processes | Time (ms) | Processes | Time (ms) |
| :--------------- | :-------- | :-------- | :-------- | :-------- | :-------- | :-------- | :-------- | :-------- |
| $8$              | 1         | 86        | 2         | 91        | 4         | 105       | 6         | 119       |
| $12$             | 1         | 91        | 2         | 96        | 4         | 103       | 6         | 118       |
| $15$             | 1         | 91        | 2         | 99        | 4         | 110       | 6         | 127       |
| $18$             | 1         | 161       | 2         | 154       | 4         | 166       | 6         | 189       |

The table shows that the parallel version achieves no speedup: MPI message passing takes up a large share of the total time, so the parallel gain is drowned out. In addition, the shell output below shows that running $n$ processes locally does not push CPU utilization anywhere near $n \times 100\%$, which also makes the timings unreliable. A lower-communication variant is sketched after the source code.

### Local Sample Output

```
# rainshaw @ gaoruixodeMBP15 in ~/Desktop/Code/CLionProjects/Parallel&DistributedProgramming/homework3/findPrime/cmake-build-debug [3:14:49]
$ time mpirun -np 6 findPrime 18
mpirun -np 6 findPrime 18  0.35s user 0.13s system 254% cpu 0.189 total

# rainshaw @ gaoruixodeMBP15 in ~/Desktop/Code/CLionProjects/Parallel&DistributedProgramming/homework3/findPrime/cmake-build-debug [3:15:04]
$ time mpirun -np 4 findPrime 18
mpirun -np 4 findPrime 18  0.25s user 0.08s system 199% cpu 0.166 total

# rainshaw @ gaoruixodeMBP15 in ~/Desktop/Code/CLionProjects/Parallel&DistributedProgramming/homework3/findPrime/cmake-build-debug [3:15:14]
$ time mpirun -np 2 findPrime 18
mpirun -np 2 findPrime 18  0.17s user 0.05s system 141% cpu 0.154 total

# rainshaw @ gaoruixodeMBP15 in ~/Desktop/Code/CLionProjects/Parallel&DistributedProgramming/homework3/findPrime/cmake-build-debug [3:15:31]
$ time mpirun -np 1 findPrime 18
mpirun -np 1 findPrime 18  0.13s user 0.03s system 97% cpu 0.161 total
```

### Source Code

```c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>

/* Trial-division primality test: 0 and 1 are not prime; 2 and 3 fall
 * through the loop and return 1. */
int isPrime(int n) {
    if (n < 2) return 0;
    for (int m = 2; m <= sqrt(n * 1.0); m++) {
        if (n % m == 0) return 0;
    }
    return 1;
}

int main(int argc, char **argv) {
    int k = atoi(argv[1]);               /* test the numbers up to 2^k */

    MPI_Init(NULL, NULL);
    int rank, world;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world);

    const int n = 1 << k;
    static int prime[1 << 20];           /* flag array, zero-initialized */
    prime[0] = 0;
    prime[1] = 0;

    /* Each rank tests one contiguous block of `size` numbers. */
    int size = n / world + 1;
    for (int i = 1; i <= size && rank * size + i <= n; i++)
        prime[rank * size + i] = isPrime(rank * size + i);

    if (rank != 0) {
        /* Workers send their whole flag array to rank 0. */
        MPI_Send(prime, n, MPI_INT, 0, 0, MPI_COMM_WORLD);
    } else {
        /* Rank 0 merges the incoming flag arrays with a logical OR. */
        static int buffer[1 << 20];
        for (int source = 1; source < world; source++) {
            MPI_Recv(buffer, n, MPI_INT, source, 0, MPI_COMM_WORLD,
                     MPI_STATUS_IGNORE);
            for (int i = 0; i < n; i++)
                prime[i] = (prime[i] || buffer[i]);
        }

        /* The judge's reference program is broken, so verify against a
         * serial recomputation instead (the primes could also be printed
         * or written to a file here). */
        for (int i = 2; i < n; i++)
            if (prime[i] != isPrime(i))
                printf("False check at %d\n", i);
    }

    MPI_Finalize();
    return 0;
}
```
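Since the measurements above suggest that communication, where every worker ships the entire $2^k$-entry flag array to rank 0, plus process start-up dominates the runtime, one possible variant is to gather only each rank's own block with `MPI_Gatherv`. The sketch below is my own assumption about such a design, not the submitted solution and not something the judge requires; the block partition and the final count printout are only illustrative.

```c
/* Sketch: gather only each rank's own block of flags, so the total data
 * moved drops from roughly (world-1) * 2^k ints to about 2^k ints. */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>

static int isPrime(int n) {
    if (n < 2) return 0;
    for (int m = 2; m <= (int)sqrt((double)n); m++)
        if (n % m == 0) return 0;
    return 1;
}

int main(int argc, char **argv) {
    int k = (argc > 1) ? atoi(argv[1]) : 18;
    MPI_Init(&argc, &argv);
    int rank, world;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world);

    int n = 1 << k;
    /* Split [0, n) into contiguous blocks; the last rank takes the remainder. */
    int base = n / world;
    int lo   = rank * base;
    int cnt  = (rank == world - 1) ? n - lo : base;

    int *local = malloc(cnt * sizeof(int));
    for (int i = 0; i < cnt; i++)
        local[i] = isPrime(lo + i);

    int *counts = NULL, *displs = NULL, *flags = NULL;
    if (rank == 0) {
        counts = malloc(world * sizeof(int));
        displs = malloc(world * sizeof(int));
        flags  = malloc(n * sizeof(int));
        for (int r = 0; r < world; r++) {
            displs[r] = r * base;
            counts[r] = (r == world - 1) ? n - displs[r] : base;
        }
    }
    /* Each rank contributes only its own cnt flags. */
    MPI_Gatherv(local, cnt, MPI_INT, flags, counts, displs, MPI_INT,
                0, MPI_COMM_WORLD);

    if (rank == 0) {
        int total = 0;
        for (int i = 0; i < n; i++) total += flags[i];
        printf("primes below 2^%d: %d\n", k, total);
        free(counts); free(displs); free(flags);
    }
    free(local);
    MPI_Finalize();
    return 0;
}
```

It builds with `mpicc` and runs the same way as the submitted program (for example `mpirun -np 4 ./a.out 18`); at these problem sizes the start-up cost of the MPI processes would still dominate, so the wall-clock benefit is likely small.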
Last modified: March 13, 2021. © Reproduction prohibited.