A: 

I'm not sure that I fully understand what you are trying to do, nor how, so this may be off the mark:

It looks as if you are trying to distribute the columns of a 2D array across a number of processes. Perhaps you have a 10-column array and 4 processes, so 2 processes will get 3 columns each and 2 processes will get 2 columns each? Of course, process 0 'gets' 3 columns from itself.

Your first step, which you've taken, is to define an MPI_Type_vector which defines a column of the array.

Next, and here I get a bit puzzled by your use of MPI_Scatter, why not simply write a loop which sends columns 1,2,3,4,5,6,7,8,9,10 to processes 0,1,2,3,0,1,2,3,0,1 (having previously arranged that the receiving processes have a 2D array to put the columns into as they arrive)?
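In case it is useful, here is a rough, untested sketch of what such a loop might look like, assuming a contiguous, row-major nrows x ncols array on rank 0 and that every rank stores each of its columns as nrows contiguous doubles (all the names here are mine, not from your code):

#include <mpi.h>

/* Sketch only: rank 0 deals the columns of a contiguous, row-major
 * nrows x ncols array A out cyclically; rank r receives columns
 * r, r+p, r+2p, ... and stores each as nrows contiguous doubles in mycols. */
void distribute_columns(double *A, double *mycols, int nrows, int ncols,
                        int my_rank, int p, MPI_Comm comm)
{
  MPI_Datatype coltype;
  int r, c;

  /* one column of the row-major array: nrows elements with stride ncols */
  MPI_Type_vector(nrows, 1, ncols, MPI_DOUBLE, &coltype);
  MPI_Type_commit(&coltype);

  for (c = 0; c < ncols; c++)
  {
    int dest = c % p;            /* column c goes to rank c mod p       */
    int slot = c / p;            /* and is that rank's slot-th column   */
    if (my_rank == 0 && dest == 0)
    {
      for (r = 0; r < nrows; r++)   /* rank 0 just copies its own columns */
        mycols[slot*nrows + r] = A[r*ncols + c];
    }
    else if (my_rank == 0)
    {
      MPI_Send(&A[c], 1, coltype, dest, c, comm);
    }
    else if (my_rank == dest)
    {
      MPI_Recv(&mycols[slot*nrows], nrows, MPI_DOUBLE, 0, c,
               comm, MPI_STATUS_IGNORE);
    }
  }
  MPI_Type_free(&coltype);
}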

The problem with using MPI_Scatter for this is that you have (I think) to send the same amount of data to each of the receiving processes, which, as you have observed, will not work if the number of processes does not exactly divide into the number of columns in your array. If you had to use MPI_Scatter then you might have to pad your array with extra columns, but that seems a bit pointless.

Finally, you might be able to do what you want in 1 statement with MPI_Type_create_subarray or MPI_Type_create_darray, but I have no experience of using these.
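For reference, a quick (untested) sketch of how MPI_Type_create_subarray might describe one column of a row-major array -- treat the surrounding names as assumptions rather than a recipe:

#include <mpi.h>

/* Sketch only: build a datatype for column `col` of a row-major
 * nrows x ncols array of doubles using MPI_Type_create_subarray. */
MPI_Datatype make_column_type(int nrows, int ncols, int col)
{
  int sizes[2]    = { nrows, ncols };  /* shape of the full array        */
  int subsizes[2] = { nrows, 1 };      /* one whole column               */
  int starts[2]   = { 0, col };        /* starting at row 0, column col  */
  MPI_Datatype coltype;

  MPI_Type_create_subarray(2, sizes, subsizes, starts,
                           MPI_ORDER_C, MPI_DOUBLE, &coltype);
  MPI_Type_commit(&coltype);
  return coltype;   /* caller should MPI_Type_free it when finished */
}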

High Performance Mark
A: 

Sorry for my bad and confusing English, and thanks for your reply. I tried that before with only two processes involved, sending one column of a 2D array from process 0 into another 2D array on proc 1, without much success. That was my first attempt at dealing with my problem, but I also thought it's not the best way. So here is my latest attempt, where proc 0 distributes the elements in the wrong order.

#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>

int main(int argc, char** argv)
{
  double **A ;   /*2D array initialised on process 0 */
  int i,j ;      
  double **column ; /*2D array for distributed columns */

  int my_rank, p ;
  MPI_Status status ;

  int block_lengths[2] ;
  MPI_Aint displacements[2] ;
  MPI_Datatype types[2] ;
  MPI_Datatype vector_mpi_t ;
  MPI_Datatype ub_mpi_t;
  int n_bar = 1 ;

  MPI_Init(&argc,&argv) ;

  MPI_Comm_rank(MPI_COMM_WORLD,&my_rank) ;
  MPI_Comm_size(MPI_COMM_WORLD,&p) ;

  /*initialise 2D array on process 0 and allocate memory*/
  if(my_rank==0)
  {
    A = (double**)malloc(5*sizeof(double*)) ;
    for(i=0;i<5;i++)
    {
      A[i] = (double*)malloc(5*sizeof(double)) ;
      for(j=0;j<5;j++)
        A[i][j] = (double)(i+1) ;

    }
    /* print 2D array to screen */
    for(i=0;i<5;i++)
    {

      for(j=0;j<5;j++)
    printf( "%lf " , A[i][j]) ;
      printf( "\n") ;

    }
    printf( "\n") ;
    printf( "\n") ;
  }
  /* initialise and allocate memory for column 2D array on every process */
  column = (double**)malloc(5*sizeof(double*)) ;
    for(i=0;i<5;i++)
    {
      column[i] = (double*)malloc(5*sizeof(double)) ;
      for(j=0;j<5;j++)
        column[i][j] = 0 ;

    }
    /*derived datatype for 2D array columns*/
    MPI_Type_vector(5,1,5,MPI_DOUBLE,&vector_mpi_t) ;

    /*override extent calculation for the case sizeof(double) != extent(vector_mpi_t)*/
    types[0] = vector_mpi_t ;
    types[1] = MPI_UB ;

    displacements[0] = 0 ;
    displacements[1] = sizeof(double) ;

    block_lengths[0] = block_lengths[1] = 1 ;

    MPI_Type_struct(2,block_lengths,displacements,types,&ub_mpi_t);
    MPI_Type_commit(&ub_mpi_t) ;

    if(my_rank==0)
    {

      MPI_Send(&(A[0][0]),1,ub_mpi_t,1,0,MPI_COMM_WORLD) ;
    }
    else
    {
      MPI_Recv(&(column[0][0]),1,ub_mpi_t,0,0,MPI_COMM_WORLD,&status) ;
    }

    /*print 2d array of columns on every process */
    for(i=0;i<5;i++)
    {

      for(j=0;j<5;j++)
    printf( "%lf " , column[i][j]) ;
      printf( "\n") ;

    }
    printf( "\n") ;
    printf( "\n") ;



  MPI_Finalize() ;

  return 0;
}
Stepp
OK, I had problems posting the code. I've done a little bit of fine-tuning and added a few comments. Hope it's now easier to read.
Stepp
Do I actually have to commit vector_mpi_t, given that I only use it to create the derived datatype ub_mpi_t?!
Stepp
A: 

Ah, it must be a problem with the dynamic 2D arrays. If I initialise each 2D array statically rather than as a pointer to a pointer, it works, but my arrays must be dynamic. What's wrong with my dynamic arrays?!

Stepp
A: 

Ok, so let's try the simple case first -- that there's exactly one column per process. Below is my slightly edited version of what you have above; the differences I want to point out are just that we've changed how the array A is allocated, and we're just using the one vector data type:

#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>

int main(int argc, char** argv)
{
  double **A = NULL ;   /*2D array initialised on process 0 */
  double *Adata = NULL;
  double *sendbufptr = NULL;
  int i,j ;
  double *column ; /*1D array for column */
  const int columnlen=6;
  int my_rank, p ;
  MPI_Datatype vector_mpi_t ;

  MPI_Init(&argc,&argv) ;

  MPI_Comm_rank(MPI_COMM_WORLD,&my_rank) ;
  MPI_Comm_size(MPI_COMM_WORLD,&p) ;

  /*initialise 2D array on process 0 and allocate memory*/
  if(my_rank==0)
  {
    A = (double**)malloc(p*sizeof(double *)) ;
    Adata = (double *)malloc(p*columnlen*sizeof(double));
    for(i=0;i<p;i++)
      A[i] = &(Adata[i*columnlen]);

    for (i=0; i<p; i++) 
        for (j=0; j<columnlen; j++)
            A[i][j] = i;

    /* print 2D array to screen */
    printf("Rank 0's 2D array:\n");
    for(i=0;i<p;i++)
    {
      for(j=0;j<columnlen;j++)
        printf( "%lf " , A[i][j]) ;
      printf( "\n") ;
    }
    printf( "\n") ;
    printf( "\n") ;
  }
  /* initialise and allocate memory for 1d column array on every process */
  column = (double*)malloc(columnlen*sizeof(double)) ;
  for(i=0;i<columnlen;i++)
  {
    column[i] = 0 ;
  }

  /*derived datatype for 2D array columns*/
  MPI_Type_vector(columnlen,1,1,MPI_DOUBLE,&vector_mpi_t) ;
  MPI_Type_commit(&vector_mpi_t);

  sendbufptr = NULL;
  if (my_rank == 0) sendbufptr=&(A[0][0]);
  MPI_Scatter(sendbufptr, 1, vector_mpi_t, column, 1, vector_mpi_t, 0, MPI_COMM_WORLD);
  /*print column on every process */

   printf("Rank %d's column: \n", my_rank);
   for(i=0;i<columnlen;i++)
   {
      printf( "%lf " , column[i]) ;
   }
   printf( "\n") ;


  MPI_Finalize() ;

  free(column);
  free(Adata);
  free(A);

  return 0;
}

The key here is that MPI_Scatter takes a pointer to a block of data - not pointers to pointers. So it won't dereference A[1] and then send what it points to, and then A[2] and what that points to, etc. It expects a contiguous block of data. So we've arranged that in how A's data is laid out in memory (note that this is usually the right way to do things anyway for numerical computation) - it has a column of data followed by the next column of data, etc. (Although the way I'm printing out the data it's more like rows, but whatever.)
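As an aside, that allocation pattern can be wrapped up in a small helper; this is just a sketch, not code from the programs above:

#include <stdlib.h>

/* Sketch only: allocate an n x m array of doubles as one contiguous block
 * plus a row-pointer index, so that &(a[0][0]) can be handed to MPI. */
double **alloc2d(int n, int m)
{
  double  *data = malloc(n * m * sizeof(double));
  double **rows = malloc(n * sizeof(double *));
  int i;
  if (data == NULL || rows == NULL) { free(data); free(rows); return NULL; }
  for (i = 0; i < n; i++)
    rows[i] = &(data[i*m]);
  return rows;          /* free later with free(rows[0]); free(rows); */
}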

Note too that in the MPI_Scatter call I can't just use &(A[0][0]), because that's dereferencing a null pointer in all but one of the processes.

Going from one column to several is pretty straightforward; the column data structure goes from being a 1d array to a 2d array laid out like A is.

#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>

int main(int argc, char** argv)
{ 
  double **A = NULL ;   /*2D array initialised on process 0 */
  double *Adata = NULL;
  double *sendbufptr = NULL;
  int i,j ;
  double **columns ; /*2D array for column */
  double *columndata;
  const int columnlen=6;
  int ncolumns;
  int my_rank, p ;
  MPI_Datatype vector_mpi_t ;

  MPI_Init(&argc,&argv) ;

  MPI_Comm_rank(MPI_COMM_WORLD,&my_rank) ;
  MPI_Comm_size(MPI_COMM_WORLD,&p) ;

  ncolumns = 2*p;

  /*initialise 2D array on process 0 and allocate memory*/
  if(my_rank==0)
  {
    A = (double**)malloc(ncolumns*sizeof(double *)) ;
    Adata = (double *)malloc(ncolumns*columnlen*sizeof(double));
    for(i=0;i<ncolumns;i++)
      A[i] = &(Adata[i*columnlen]);

    for (i=0; i<ncolumns; i++) 
        for (j=0; j<columnlen; j++)
            A[i][j] = i;

    /* print 2D array to screen */
    printf("Rank 0's 2D array:\n");
    for(i=0;i<ncolumns;i++)
    {
      for(j=0;j<columnlen;j++)
        printf( "%lf " , A[i][j]) ;
      printf( "\n") ;
    }
    printf( "\n") ;
    printf( "\n") ;
  }
  /* initialise and allocate memory for the local 2D columns array on every process */
  columndata = (double*)malloc((ncolumns/p)*columnlen*sizeof(double)) ;
  columns = (double **)malloc((ncolumns/p)*sizeof(double *));
  for(i=0;i<(ncolumns/p);i++)
  {
    columns[i] = &(columndata[i*columnlen]);
  }

  /*derived datatype for 2D array columns*/
  MPI_Type_vector(columnlen,1,1,MPI_DOUBLE,&vector_mpi_t) ;
  MPI_Type_commit(&vector_mpi_t);

  sendbufptr = NULL;
  if (my_rank == 0) sendbufptr=&(A[0][0]);
  MPI_Scatter(sendbufptr, (ncolumns/p), vector_mpi_t, &(columns[0][0]), (ncolumns/p), vector_mpi_t, 0, MPI_COMM_WORLD);

  /*print columns on every process */

   printf("Rank %d's columns: \n", my_rank);
   for(i=0;i<ncolumns/p;i++)
   {
     printf( "[%d]: ", my_rank) ;
     for(j=0;j<columnlen;j++)
     {
        printf( "%lf " , columns[i][j]) ;
     }
     printf( "\n") ;
  }

  MPI_Finalize() ;

  free(columndata);
  free(columns);
  free(Adata);
  free(A);

  return 0;
}

And then going to differing number of columns per processor requires using MPI_Scatterv rather than MPI_Scatter.
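A rough, untested sketch of that bookkeeping, reusing the names from the code above (each rank's columns array would then need room for sendcounts[my_rank] columns):

/* Sketch only: unequal numbers of columns per rank with MPI_Scatterv.
 * vector_mpi_t is the contiguous column type from above, so counts and
 * displacements are measured in whole columns. */
int *sendcounts = (int *)malloc(p*sizeof(int));
int *displs     = (int *)malloc(p*sizeof(int));
int offset = 0;
for (i=0; i<p; i++)
{
  sendcounts[i] = ncolumns/p + (i < ncolumns%p ? 1 : 0);  /* spread the remainder */
  displs[i]     = offset;
  offset       += sendcounts[i];
}

MPI_Scatterv(sendbufptr, sendcounts, displs, vector_mpi_t,
             &(columns[0][0]), sendcounts[my_rank], vector_mpi_t,
             0, MPI_COMM_WORLD);

free(sendcounts);
free(displs);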

Jonathan Dursi
A: 

Hi Jonathan,

thanks for your detailed reply. I hope I'm not wrong, but your solution distributes the rows of the matrix to the local arrays. I resolved it on my own a few days ago. Since, as you pointed out, scatter expects a contiguous block of data and my attempts with a 2D array led nowhere, I simplified things by using one 1D array for the matrix and a 1D array for the columns on each process. Scatterv combined with an MPI_Type_vector distributes the columns perfectly:

columns = 3, rows = 3, processes = 2,

A:

1 2 3
4 5 6 
7 8 9


array on
proc 0: 1 3 4 6 7 9   (columns (1 4 7) and (3 6 9))
proc 1: 2 0 5 0 8 0   (columns (2 5 8) and (0 0 0))

Because of the cyclic distribution I had to add one column of zeros to the matrix A, otherwise the derived datatype won't work correctly. Everything works fine and I can even gather the columns on another process to rebuild the matrix A locally. But I have big trouble transposing the matrix using MPI_Alltoallv. Because there are only 2 processes involved, MPI_Alltoallv only distributes two elements to the local array. Is there a better way to transpose the matrix by transposing the distributed arrays? For my problem MPI_Alltoallv seems to be rather complicated.
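For reference, here is a rough sketch of how this scheme can be wired up (a reconstruction of what is described above, not the exact code); it assumes the matrix is stored on rank 0 as a single row-major 1D array whose number of columns has been padded to a multiple of p:

#include <mpi.h>
#include <stdlib.h>

/* Sketch only: deal the columns of a row-major nrows x ncols_pad matrix
 * (ncols_pad a multiple of p, padded with zero columns) out cyclically with
 * MPI_Scatterv and a strided vector type.  Rank r ends up with columns
 * r, r+p, r+2p, ... stored row by row, side by side, as in the example above. */
void scatter_columns_cyclic(double *A, double *mycols, int nrows, int ncols_pad,
                            int my_rank, int p, MPI_Comm comm)
{
  int nlocal = nrows*(ncols_pad/p);   /* doubles each rank receives */
  int r;
  MPI_Datatype strided, col_block;
  int *counts = (int *)malloc(p*sizeof(int));
  int *displs = (int *)malloc(p*sizeof(int));

  /* every p-th element of the padded, row-major matrix belongs to one rank */
  MPI_Type_vector(nlocal, 1, p, MPI_DOUBLE, &strided);
  /* shrink the extent to one double so the displacements below can pick the
     starting column: rank r starts at element r */
  MPI_Type_create_resized(strided, 0, sizeof(double), &col_block);
  MPI_Type_commit(&col_block);

  for (r=0; r<p; r++)
  {
    counts[r] = 1;     /* one col_block per rank ...   */
    displs[r] = r;     /* ... starting at element r    */
  }

  MPI_Scatterv(A, counts, displs, col_block,
               mycols, nlocal, MPI_DOUBLE, 0, comm);

  MPI_Type_free(&col_block);
  MPI_Type_free(&strided);
  free(counts);
  free(displs);
}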

Stepp
A: 

I don't know why I have so many problems here. For the last few hours I have been trying to transpose the arrays and at the moment I am out of ideas. Maybe someone could point me in the right direction. The matrix is distributed among the processes by columns. Now I want to transpose the whole matrix by transposing the local arrays of columns. As an example, take a square matrix of dimension 3 with 2 processes involved. The matrix is simply:

1 2 3
4 5 6
7 8 9

and the array on

proc 0: {1 3 4 6 7 9}
proc 1: {2 0 5 0 8 0}

should, after transposing, look like this:

proc 0: {1 7 2 8 3 9}
proc 1: {4 0 5 0 6 0}

For testing I directly initialised different arrays on every process with their corresponding columns, as above. I use Alltoallv. As a result the first two columns of the matrix are transposed correctly, but as soon as one process has more than one column stored locally I am stuck. I tried derived datatypes and thought about just sending blocks and later sorting the array elements into their right positions, but I think that would be even more complicated.

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

int main(int argc, char** argv)
{
  int my_rank, p, i,j,k ;
  double *column ;
  double *column_t ;
  double *tmp ;
  int *send_displ, *recv_displ, *send_count, *recv_count ;
  int n =3 ;

  MPI_Init(&argc,&argv) ;

  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank ) ;
  MPI_Comm_size(MPI_COMM_WORLD, &p) ;


  column = (double*)malloc(10*sizeof(double)) ;
  column_t = (double*)malloc(6*sizeof(double)) ;
  tmp =  (double*)malloc(6*sizeof(double)) ;
  if(my_rank==0)
  {
    column[0] = 1 ;
    column[1] = 3 ;
    column[2] = 4 ;
    column[3] = 6 ;
    column[4] = 7 ;
    column[5] = 9 ;
  }
  if(my_rank==1)
  {
    column[0] = 2 ;
    column[1] = 0 ;
    column[2] = 5 ;
    column[3] = 0 ;
    column[4] = 8 ;
    column[5] = 0 ;
  }

  send_displ = (int*)malloc(p*sizeof(int)) ;
  recv_displ = (int*)malloc(p*sizeof(int)) ;
  send_count = (int*)malloc(p*sizeof(int)) ;
  recv_count = (int*)malloc(p*sizeof(int)) ;

  for(i=0;i<p;i++)
  {
    send_displ[i] = i*(1+n/p) ; 
    recv_displ[i] = i ;
    send_count[i] = 1 ;
    recv_count[i] = 2 ;
  }
  for(i=0;i<3;i++)
  {
    MPI_Alltoallv(&column[i],send_count,send_displ,MPI_DOUBLE,
                  tmp,recv_count,recv_displ,MPI_DOUBLE,MPI_COMM_WORLD) ;

    for(j=0;j<p;j++)
    {
      column_t[i*4+j*2] = tmp[j] ;
      tmp[j] = 0.0 ;
    }
  }


  if(my_rank==0)
  {
    printf( "my_rank: %d \n", my_rank) ;
    for(i=0;i<6;i++)
      printf( "%lf ", column_t[i]) ;
  }
  printf( "\n") ;
  if(my_rank==1)
  {
    printf( "my_rank: %d \n", my_rank) ;
    for(i=0;i<6;i++)
      printf( "%lf ", column_t[i]) ;
  }
  printf( "\n") ;

  MPI_Finalize() ;
  return 0 ;
}
Stepp