Ok, so let's try the simple case first -- that there's exactly one column per process. Below is my slightly edited version of what you have above; the differences I want to point out are just that we've changed how the array A is allocated, and we're just using the one vector data type:
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Scatter a 2D array held on rank 0 so that every rank receives exactly one
 * column of `columnlen` doubles, then print what each rank received.
 *
 * The array has one column per process (p columns in total).  Its elements
 * live in a single contiguous allocation (Adata); A is only an array of
 * pointers into that allocation, so A[i][j] indexing still works while
 * MPI_Scatter can be handed one flat buffer.
 */
int main(int argc, char **argv)
{
    double **A = NULL;          /* per-column pointers into Adata (rank 0 only) */
    double *Adata = NULL;       /* contiguous storage behind A (rank 0 only) */
    double *sendbufptr = NULL;  /* send buffer; stays NULL on non-root ranks */
    double *column = NULL;      /* the one column this rank receives */
    const int columnlen = 6;
    int i, j;
    int my_rank, p;
    MPI_Datatype vector_mpi_t;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &p);

    /* Allocate, fill and print the 2D array on rank 0 only. */
    if (my_rank == 0) {
        A = malloc((size_t)p * sizeof *A);
        Adata = malloc((size_t)p * (size_t)columnlen * sizeof *Adata);
        if (A == NULL || Adata == NULL) {
            fprintf(stderr, "Rank 0: out of memory allocating the 2D array\n");
            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
        }

        /* Column i starts columnlen doubles after column i-1. */
        for (i = 0; i < p; i++) {
            A[i] = &(Adata[i * columnlen]);
        }

        /* Every element of column i holds the value i. */
        for (i = 0; i < p; i++) {
            for (j = 0; j < columnlen; j++) {
                A[i][j] = i;
            }
        }

        printf("Rank 0's 2D array:\n");
        for (i = 0; i < p; i++) {
            for (j = 0; j < columnlen; j++) {
                printf("%lf ", A[i][j]);
            }
            printf("\n");
        }
        printf("\n");
        printf("\n");
    }

    /* Every rank needs a zero-initialised buffer for its own column. */
    column = malloc((size_t)columnlen * sizeof *column);
    if (column == NULL) {
        fprintf(stderr, "Rank %d: out of memory allocating the column\n", my_rank);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }
    for (i = 0; i < columnlen; i++) {
        column[i] = 0;
    }

    /* Derived datatype describing one column: columnlen doubles, stride 1. */
    MPI_Type_vector(columnlen, 1, 1, MPI_DOUBLE, &vector_mpi_t);
    MPI_Type_commit(&vector_mpi_t);

    /*
     * Only rank 0 may evaluate &(A[0][0]); A is NULL everywhere else, so the
     * send buffer pointer is computed conditionally.
     */
    sendbufptr = NULL;
    if (my_rank == 0) {
        sendbufptr = &(A[0][0]);
    }
    MPI_Scatter(sendbufptr, 1, vector_mpi_t,
                column, 1, vector_mpi_t,
                0, MPI_COMM_WORLD);

    /* Print the received column on every rank. */
    printf("Rank %d's column: \n", my_rank);
    for (i = 0; i < columnlen; i++) {
        printf("%lf ", column[i]);
    }
    printf("\n");

    /* Release the committed datatype while MPI is still initialised. */
    MPI_Type_free(&vector_mpi_t);
    MPI_Finalize();

    free(column);
    free(Adata);   /* NULL on non-root ranks; free(NULL) is a no-op */
    free(A);
    return 0;
}
The key here is that MPI_Scatter takes a pointer to a block of data, not a pointer to pointers. It won't dereference A[1] and send what that points to, then A[2] and what that points to, and so on; it expects a single contiguous block of data. So we've arranged for A's data to be laid out that way in memory (which is usually the right way to do things for numerical computation anyway): one column of data, followed by the next column of data, etc. (The way I'm printing out the data makes the columns look more like rows, but whatever.)
Note too that in the MPI_Scatter call I can't just pass &(A[0][0]) directly: A is NULL on every process except rank 0, so evaluating that expression there would dereference a null pointer.
Going from one column to several is pretty straightforward; the column data structure goes from being a 1d array to a 2d array laid out like A is.
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Scatter a 2D array held on rank 0 so that every rank receives the same
 * number of columns (ncolumns / p, here 2), each `columnlen` doubles long,
 * then print what each rank received.
 *
 * Both the full array (A / Adata, rank 0 only) and the per-rank receive
 * buffer (columns / columndata) keep their elements in one contiguous
 * allocation, with a separate array of pointers for [i][j] indexing.
 */
int main(int argc, char **argv)
{
    double **A = NULL;          /* per-column pointers into Adata (rank 0 only) */
    double *Adata = NULL;       /* contiguous storage behind A (rank 0 only) */
    double *sendbufptr = NULL;  /* send buffer; stays NULL on non-root ranks */
    double **columns = NULL;    /* per-column pointers into columndata */
    double *columndata = NULL;  /* contiguous storage for the received columns */
    const int columnlen = 6;
    int ncolumns;               /* total number of columns across all ranks */
    int mycolumns;              /* number of columns each rank receives */
    int i, j;
    int my_rank, p;
    MPI_Datatype vector_mpi_t;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &p);

    ncolumns = 2 * p;
    mycolumns = ncolumns / p;

    /* Allocate, fill and print the 2D array on rank 0 only. */
    if (my_rank == 0) {
        A = malloc((size_t)ncolumns * sizeof *A);
        Adata = malloc((size_t)ncolumns * (size_t)columnlen * sizeof *Adata);
        if (A == NULL || Adata == NULL) {
            fprintf(stderr, "Rank 0: out of memory allocating the 2D array\n");
            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
        }

        /* Column i starts columnlen doubles after column i-1. */
        for (i = 0; i < ncolumns; i++) {
            A[i] = &(Adata[i * columnlen]);
        }

        /* Every element of column i holds the value i. */
        for (i = 0; i < ncolumns; i++) {
            for (j = 0; j < columnlen; j++) {
                A[i][j] = i;
            }
        }

        printf("Rank 0's 2D array:\n");
        for (i = 0; i < ncolumns; i++) {
            for (j = 0; j < columnlen; j++) {
                printf("%lf ", A[i][j]);
            }
            printf("\n");
        }
        printf("\n");
        printf("\n");
    }

    /* Allocate the 2D receive buffer (mycolumns x columnlen) on every rank. */
    columndata = malloc((size_t)mycolumns * (size_t)columnlen * sizeof *columndata);
    columns = malloc((size_t)mycolumns * sizeof *columns);
    if (columndata == NULL || columns == NULL) {
        fprintf(stderr, "Rank %d: out of memory allocating the columns\n", my_rank);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }
    for (i = 0; i < mycolumns; i++) {
        columns[i] = &(columndata[i * columnlen]);
    }

    /* Derived datatype describing one column: columnlen doubles, stride 1. */
    MPI_Type_vector(columnlen, 1, 1, MPI_DOUBLE, &vector_mpi_t);
    MPI_Type_commit(&vector_mpi_t);

    /*
     * Only rank 0 may evaluate &(A[0][0]); A is NULL everywhere else, so the
     * send buffer pointer is computed conditionally.
     */
    sendbufptr = NULL;
    if (my_rank == 0) {
        sendbufptr = &(A[0][0]);
    }
    MPI_Scatter(sendbufptr, mycolumns, vector_mpi_t,
                &(columns[0][0]), mycolumns, vector_mpi_t,
                0, MPI_COMM_WORLD);

    /* Print the received columns on every rank. */
    printf("Rank %d's columns: \n", my_rank);
    for (i = 0; i < mycolumns; i++) {
        printf("[%d]: ", my_rank);
        for (j = 0; j < columnlen; j++) {
            printf("%lf ", columns[i][j]);
        }
        printf("\n");
    }

    /* Release the committed datatype while MPI is still initialised. */
    MPI_Type_free(&vector_mpi_t);
    MPI_Finalize();

    free(columns);
    free(columndata);  /* storage behind columns; must be freed separately */
    free(Adata);       /* NULL on non-root ranks; free(NULL) is a no-op */
    free(A);
    return 0;
}
And then going to a differing number of columns per process requires using MPI_Scatterv rather than MPI_Scatter.