Hi Everyone!!
I am paralleling Jacobi algorithm and I got same results as sequential code with two processors and four processors but with eight and sixteen I've got wrong results.
So, could anyone help me with this issue?
Here's my code:
#include <math.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include "timer.h"
#include "mpi.h"
#define M 4096
#define N 4096
int main(int argc, char** argv)
{
StartTimer();
int size, rank, i, j;
int local_rows = 0;
int iter_max = 1000;
int iter = 0;
float error = 1.0f;
const float tol = 1.0e-5f;
const float pi = 2.0f * asinf(1.0f);
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Request request;
MPI_Status status;
local_rows = (int) floor(N/size);
float **my_u;
my_u = (float**) malloc((local_rows) * sizeof(float*));
for(j = 0; j < local_rows; j++){
my_u[j] = (float*) malloc(M * sizeof(float));
memset(my_u[j], 0, M * sizeof(float));
}
float **my_u_temp;
my_u_temp = (float**) malloc((local_rows) * sizeof(float*));
for(j = 0; j < local_rows; j++){
my_u_temp[j] = (float*) malloc(M * sizeof(float));
memset(my_u_temp[j], 0, M * sizeof(float));
}
float g0[M];
for(j = 0; j < N; j++){
g0[j] = sinf(pi * j / (N-1));
}
for(j = 0; j < local_rows; j++){
my_u[j][0] = g0[(rank * local_rows) + j];
my_u[j][M-1] = g0[(rank * local_rows) + j] * expf(-pi);
my_u_temp[j][0] = g0[(rank * local_rows) + j];
my_u_temp[j][M-1] = g0[(rank * local_rows) + j] * expf(-pi);
}
float vtemp[M];
float vtemp2[M];
for(j = 0; j < M; j++){
vtemp[j] = 0.f;
vtemp2[j] = 0.f;
}
printf("Jacobi relaxation Calculation: %d x %d mesh\n", N, M);
float m_error = 0.f;
while ( error > tol && iter < iter_max ){
error = 0.f;
if(rank == 0){
for(j = 1; j < local_rows; j++){
for(i = 1; i < M-1; i++){
if(j == local_rows-1){
my_u_temp[j][i] = 0.25f * (my_u[j][i+1] + my_u[j][i-1] + my_u[j-1][i] + vtemp[i]);
}
else{
my_u_temp[j][i] = 0.25f * (my_u[j][i+1] + my_u[j][i-1] + my_u[j-1][i] + my_u[j+1][i]);
}
error = fmaxf( error, fabsf(my_u_temp[j][i]-my_u[j][i]));
}
}
}
else if(rank == size-1){
for(j = 0; j < local_rows-1;j++){
for(i = 1; i < M-1; i++){
if(j == 0){
my_u_temp[j][i] = 0.25f * (my_u[j][i+1] + my_u[j][i-1] + vtemp[i] + my_u[j+1][i]);
}
else{
my_u_temp[j][i] = 0.25f * (my_u[j][i+1] + my_u[j][i-1] + my_u[j-1][i] + my_u[j+1][i]);
}
error = fmaxf( error, fabsf(my_u_temp[j][i]-my_u[j][i]));
}
}
}
else{
for(j = 0; j < local_rows; j++){
for(i = 1; i < M-1;i++){
if(j == 0){
my_u_temp[j][i] = 0.25f * (my_u[j][i+1] + my_u[j][i-1] + vtemp[i] + my_u[j+1][i]);
}
else if(j == local_rows - 1){
my_u_temp[j][i] = 0.25f * (my_u[j][i+1] + my_u[j][i-1] + my_u[j-1][i] + vtemp2[i]);
}
else{
my_u_temp[j][i] = 0.25f * (my_u[j][i+1] + my_u[j][i-1] + my_u[j-1][i] + my_u[j+1][i]);
}
error = fmaxf( error, fabsf(my_u_temp[j][i]-my_u[j][i]));
}
}
}
for( j = 0; j < local_rows; j++){
for( i = 0; i < M; i++ ){
my_u[j][i] = my_u_temp[j][i];
}
}
if(rank % 2 == 0){
MPI_Send(my_u[local_rows-1], M, MPI_FLOAT, rank+1, 0, MPI_COMM_WORLD);
MPI_Recv(vtemp, M, MPI_FLOAT, rank+1, 0, MPI_COMM_WORLD, &status);
}
else if(rank % 2 == 1){
MPI_Recv(vtemp, M, MPI_FLOAT, rank-1, 0, MPI_COMM_WORLD, &status);
MPI_Send(my_u[0], M, MPI_FLOAT, rank-1, 0, MPI_COMM_WORLD);
}
if(rank != 0 && rank != size-1){
if(rank % 2 == 1){
MPI_Recv(vtemp2, M, MPI_FLOAT, rank+1, 0, MPI_COMM_WORLD, &status);
MPI_Send(my_u[local_rows-1], M, MPI_FLOAT, rank+1, 0, MPI_COMM_WORLD);
}
else if(rank % 2 == 0){
MPI_Send(my_u[0], M, MPI_FLOAT, rank-1, 0, MPI_COMM_WORLD);
MPI_Recv(vtemp2, M, MPI_FLOAT, rank-1, 0, MPI_COMM_WORLD, &status);
}
}
MPI_Allreduce(&error, &m_error, 8, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD);
error = m_error;
if(iter % 100 == 0){
printf("%5d, %0.6f\n", iter, error);
}
MPI_Barrier(MPI_COMM_WORLD);
iter++;
}
float **u;
u = (float**) malloc(N * sizeof(float*));
for(j = 0; j < N; j++){
u[j] = (float*) malloc(M * sizeof(float));
memset(u[j], 0, M * sizeof(float));
}
for (j = 0; j < N; j++)
{
u[j][0] = g0[j];
u[j][M-1] = g0[j]*expf(-pi);
}
MPI_Allgather(*my_u, local_rows * M, MPI_FLOAT, *u, local_rows * M, MPI_FLOAT, MPI_COMM_WORLD);
double runtime = GetTimer();
printf("Rank %d: total: %f s\n",rank, runtime / 1000.f);
double totalRunTime;
MPI_Allreduce(&runtime, &totalRunTime, 8, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
printf("************** Maximum running time is: %f s\n", totalRunTime / 1000.f);
FILE *output;
output = fopen("laplace_Par_Sample_1.txt", "w");
for(j = 3000; j < 4096; j++)
{
for(i = 4; i < 20; i++)
{
fprintf(output, "%0.6f\t", u[j][i]);
}
fprintf(output, "\n");
}
fclose(output);
MPI_Finalize();
return 0;
}
Thanks