views:

231

answers:

2

Hi, The function below contains nested for loops. There are 3 of them. I have given the whole function below for easy understanding. I want to parallelize the code in the innermost for loop as it takes maximum CPU time. Then i can think about outer 2 for loops. I can see dependencies and internal inline functions in the innermost for loop . Can the innermost for loop be rewritten to enable parallelization using openmp pragmas. Please tell how. I am writing just the loop which i am interested in first and then the full function where this loop exists for referance.

Interested in parallelizing the loop mentioned below.

//* LOOP WHICH I WANT TO PARALLELIZE *//

 for (y = 0; y < 4; y++)    
 {
  refptr = PelYline_11 (ref_pic, abs_y++, abs_x, img_height, img_width);

  LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];

  LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];

  LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];

  LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
  LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
 }

The full function where this loop exists is below for referance. /*!

***********************************************************************

* \brief

* Setup the fast search for an macroblock

***********************************************************************

*/

void SetupFastFullPelSearch (short ref, int list) // <-- reference frame parameter, list0 or 1

{

short pmv[2];
    pel_t orig_blocks[256], *orgptr=orig_blocks, *refptr, *tem; // created pointer tem
    int offset_x, offset_y, x, y, range_partly_outside, ref_x, ref_y, pos, abs_x, abs_y, bindex, blky;
    int LineSadBlk0, LineSadBlk1, LineSadBlk2, LineSadBlk3;
    int max_width, max_height;
    int img_width, img_height;
    StorablePicture *ref_picture;
    pel_t *ref_pic;


int** block_sad = BlockSAD[list][ref][7];
int search_range = max_search_range[list][ref];
int max_pos = (2*search_range+1) * (2*search_range+1);

int list_offset = ((img->MbaffFrameFlag)&&(img->mb_data[img->current_mb_nr].mb_field))? img->current_mb_nr%2 ? 4 : 2 : 0;

int apply_weights = ( (active_pps->weighted_pred_flag && (img->type == P_SLICE || img->type == SP_SLICE)) ||
(active_pps->weighted_bipred_idc && (img->type == B_SLICE)));

ref_picture = listX[list+list_offset][ref];

//===== Use weighted Reference for ME ====

if (apply_weights && input->UseWeightedReferenceME)

ref_pic = ref_picture->imgY_11_w;

else

ref_pic = ref_picture->imgY_11;


max_width = ref_picture->size_x - 17;
max_height = ref_picture->size_y - 17;
img_width = ref_picture->size_x;
img_height = ref_picture->size_y;

//===== get search center: predictor of 16x16 block =====

SetMotionVectorPredictor (pmv, enc_picture->ref_idx, enc_picture->mv, ref, list, 0, 0, 16, 16);

search_center_x[list][ref] = pmv[0] / 4;
search_center_y[list][ref] = pmv[1] / 4;

if (!input->rdopt)
{

//--- correct center so that (0,0) vector is inside ---

search_center_x[list][ref] = max(-search_range, min(search_range, search_center_x[list][ref]));

search_center_y[list][ref] = max(-search_range, min(search_range, search_center_y[list][ref]));

}

search_center_x[list][ref] += img->opix_x;
search_center_y[list][ref] += img->opix_y;
offset_x = search_center_x[list][ref];
offset_y = search_center_y[list][ref];

//===== copy original block for fast access =====

for (y = img->opix_y; y < img->opix_y+16; y++)
for (x = img->opix_x; x < img->opix_x+16; x++)

*orgptr++ = imgY_org [y][x];

//===== check if whole search range is inside image =====

if (offset_x >= search_range && offset_x <= max_width - search_range &&

offset_y >= search_range && offset_y <= max_height - search_range )

{
 range_partly_outside = 0; PelYline_11 = FastLine16Y_11;
}

else
{
 range_partly_outside = 1;
}

//===== determine position of (0,0)-vector =====

if (!input->rdopt)

{

ref_x = img->opix_x - offset_x;

ref_y = img->opix_y - offset_y;

for (pos = 0; pos < max_pos; pos++)
{
 if (ref_x == spiral_search_x[pos] &&

ref_y == spiral_search_y[pos])
  {
   pos_00[list][ref] = pos;
   break;
  }

}

  }



//===== loop over search range (spiral search): get blockwise SAD =====
**// =====THIS IS THE PART WHERE NESTED FOR STARTS=====**

for (pos = 0; pos < max_pos; pos++) // OUTERMOST FOR LOOP

{
abs_y = offset_y + spiral_search_y[pos];
abs_x = offset_x + spiral_search_x[pos];

if (range_partly_outside)
{
  if (abs_y >= 0 && abs_y <= max_height && abs_x >= 0 && abs_x <= max_width )
     {
      PelYline_11 = FastLine16Y_11;
     }

  else
     {
      PelYline_11 = UMVLine16Y_11;
     }
 }

orgptr = orig_blocks;
bindex = 0;

for (blky = 0; blky < 4; blky++)    // SECOND FOR LOOP
{
 LineSadBlk0 = LineSadBlk1 = LineSadBlk2 = LineSadBlk3 = 0;

   for (y = 0; y < 4; y++)    //INNERMOST FOR LOOP WHICH I WANT TO PARALLELIZE
   {
     refptr = PelYline_11 (ref_pic, abs_y++, abs_x, img_height, img_width);

      LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk0 += byte_abs [*refptr++ - *orgptr++];

      LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk1 += byte_abs [*refptr++ - *orgptr++];

      LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk2 += byte_abs [*refptr++ - *orgptr++];

      LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
      LineSadBlk3 += byte_abs [*refptr++ - *orgptr++];
     }

    block_sad[bindex++][pos] = LineSadBlk0;
    block_sad[bindex++][pos] = LineSadBlk1;
    block_sad[bindex++][pos] = LineSadBlk2;
    block_sad[bindex++][pos] = LineSadBlk3;

  }

 }


//===== combine SAD's for larger block types =====

SetupLargerBlocks (list, ref, max_pos);

//===== set flag marking that search setup have been done =====

search_setup_done[list][ref] = 1;

}

#endif // _FAST_FULL_ME_
A: 

Yes the inner most loop appears at first blush to be parallelizable. I don't know what the function PelYline_11 does nor can I deduce its purpose given its name. I do see that it returns a pointer so this will more than likely suffice.

  1. Turn the inner for loop into a function.
  2. Add a variable so that you can increment a local variable copy of "orgptr" to the correct spaces as if it were at the Nth iteration of the loop.
  3. Use the fact that you've changed the for loop as a guide in the preprocessor definitions open mp requires.

1rst edit: err. that wasn't so clear...

for( ..; ..; .. ){
    for( ..; ..; .. ){
        function()
    }
}

2nd edit: changed step #3 to remove "refptr" as needing to be incremented a certain amount in passing in the correct function places. Instead notice this...

first pass through inner loop "orgptr" is incremented 16 times. After 2nd pass it is now been incremented 32 times from its original value. Hence, the function will need a parameter to indicate to the function how many places past its original value it should be.

wheaties
Thanks for a quick response. You are right that PelYline_11 returns a pointer at runtime and i am not bothered of it. I am sorry for my ignorance but can you help me in the code of the loop how exactly to make the function. Actually i tried incrementing the pointer but if i do *(refptr+2) or *(refptr+i) instead of *refptr++ as in the loop the pointer doesnt get incremented and i think the value the pointer is pointing to gets incremented. can u help me to make that function as u said with a local variable. How exactly to do it in this code. Thanks in Advance.
anubhav
Sure can. I'll edit step #3. You are correct in looking at me funny about "refptr." You shouldn't be incrementing "refptr," it's "orgptr" that needs the incrementing. Notice that after each series of inner loops that the value "orgptr" points to has been incremented 4 steps? You'll want to replicate that.
wheaties
Here I am lapsing into assumptions again. I'm thinking about this program in the way I would have written PelYline_11 and not thinking how you would have written it. Let me ask you this, if x = PelYline_11(a, b, c, d), then y = PelYline_11(a, b, c, d), then *x++ would *x == *y be true?
wheaties
you very correctly observed that orgptr is increasing 16 times after first pass. Actually after first pass both refptr and orgptr have been incremented 16 times but refptr is initialised again with function pelYline and argument abs_y++ to next value which is to be taken for second pass.
anubhav
However orgptr is not initialised again. I have actually tried to rewrite the code which i am posting here in the answer but i am not sure whether it will also have dependencies when executing in parallel. My email is [email protected]. Please help if u see the code is still having dependencies by correcting them for me
anubhav
I think instead of doing that for you I'll instead give you insight into how to make code better for parallelization. Your major task is always reducing side-effects. Give me some time though as I need to focus on the job.
wheaties
A: 

I have rewitten the code to try to resolve dependencies in the innermost for loop ie the for(y=0;y<4;y++) and lot of LineSadBlk's. Please comment if it is wrong. I think the refptr and orgptr is sorted out by this and dependencies are resolved but LineSadBlk0,1,2,3 are still having dependencies as if we run the first and second iteration in parallel which value of LineSadBlk0,1,2,3 will be taken by the thread. How to resolve this please.

/*!
***********************************************************************
* \brief
*    Setup the fast search for an macroblock
***********************************************************************
*/
void SetupFastFullPelSearch (short ref, int list)  // <--  reference frame parameter,   list0 or 1
{
 short   pmv[2];
 pel_t   orig_blocks[256];
 //pel_t   *orgptr, *refptr[4];
 pel_t *orgptr[4],*refptr[4];  //defined by me new
 int     offset_x, offset_y, x, y, range_partly_outside, ref_x, ref_y, pos, abs_x, abs_y, bindex, blky;
 int     LineSadBlk0, LineSadBlk1, LineSadBlk2, LineSadBlk3;
 int     max_width, max_height;
 int     img_width, img_height;

 StorablePicture *ref_picture;
 pel_t   *ref_pic;

 int**   block_sad     = BlockSAD[list][ref][7];
 int     search_range  = max_search_range[list][ref];
 int     max_pos       = (2*search_range+1) * (2*search_range+1);

 int     list_offset   = ((img->MbaffFrameFlag)&&(img->mb_data[img->current_mb_nr].mb_field))? img->current_mb_nr%2 ? 4 : 2 : 0;

 int     apply_weights = ( (active_pps->weighted_pred_flag && (img->type == P_SLICE || img->type == SP_SLICE)) ||
                        (active_pps->weighted_bipred_idc && (img->type == B_SLICE)));


 ref_picture     = listX[list+list_offset][ref];

 //===== Use weighted Reference for ME ====
 if (apply_weights && input->UseWeightedReferenceME)
 ref_pic       = ref_picture->imgY_11_w;
 else
 ref_pic       = ref_picture->imgY_11;

 max_width     = ref_picture->size_x - 17;
 max_height    = ref_picture->size_y - 17;

 img_width     = ref_picture->size_x;
 img_height    = ref_picture->size_y;

 //===== get search center: predictor of 16x16 block =====
 SetMotionVectorPredictor (pmv, enc_picture->ref_idx, enc_picture->mv, ref, list, 0, 0, 16, 16);     //call 1
 search_center_x[list][ref] = pmv[0] / 4;
 search_center_y[list][ref] = pmv[1] / 4;

 if (!input->rdopt)
 {
  //--- correct center so that (0,0) vector is inside ---
  search_center_x[list][ref] = max(-search_range, min(search_range, search_center_x[list][ref]));
  search_center_y[list][ref] = max(-search_range, min(search_range,    search_center_y[list][ref]));
  }

  search_center_x[list][ref] += img->opix_x;
  search_center_y[list][ref] += img->opix_y;

  offset_x = search_center_x[list][ref];
  offset_y = search_center_y[list][ref];


// orgptr=orig_blocks;
orgptr[0]= orig_blocks   //all org pointers defined orig blocks
orgptr[1]= orig_blocks;
orgptr[2]= orig_blocks;
orgptr[3]= orig_blocks;



   //===== copy original block for fast access =====
  for   (y = img->opix_y; y < img->opix_y+16; y++)
  for (x = img->opix_x; x < img->opix_x+16; x++)
{       
//*orgptr++ = imgY_org [y][x];
*(orgptr[0])++ = imgY_org [y][x];     // img stored in all orgptr
*(orgptr[1])++ = imgY_org [y][x];
*(orgptr[2])++ = imgY_org [y][x];
*(orgptr[3])++ = imgY_org [y][x];
}

   //===== check if whole search range is inside image =====
   if (offset_x >= search_range && offset_x <= max_width  - search_range &&
  offset_y >= search_range && offset_y <= max_height - search_range   )
   {
    range_partly_outside = 0; PelYline_11 = FastLine16Y_11;  //search range is fully inside image
    }
   else
   {
    range_partly_outside     //search range is partly outside image
    }

     //===== determine position of (0,0)-vector =====
   if (!input->rdopt)
   {
    ref_x = img->opix_x - offset_x;
    ref_y = img->opix_y - offset_y;

    for (pos = 0; pos < max_pos; pos++)
    {
     if (ref_x == spiral_search_x[pos] &&
      ref_y == spiral_search_y[pos])
      {
       pos_00[list][ref] = pos;
       break;
      }
    }
   }

   //===== loop over search range (spiral search): get blockwise SAD =====
  for (pos = 0; pos < max_pos; pos++)
  {
   abs_y = offset_y + spiral_search_y[pos];
   abs_x = offset_x + spiral_search_x[pos];

    if (range_partly_outside)
    {
     if (abs_y >= 0 && abs_y <= max_height &&
      abs_x >= 0 && abs_x <= max_width    )
      {
       PelYline_11 = FastLine16Y_11;      //call 2
      }
      else
      {
       PelYline_11 = UMVLine16Y_11;      //call 3
      }
     }

    //orgptr=orig_blocks;
      orgptr[0]=orig_blocks;
  orgptr[1]=orgptr[0]+16;
  orgptr[2]=orgptr[1]+16;
  orgptr[3]=orgptr[2]+16;
      bindex = 0;


    for (blky = 0; blky < 4; blky++)
    {
      LineSadBlk0 = LineSadBlk1 = LineSadBlk2 = LineSadBlk3 = 0;

       // i added the following to take refptr out of loop
  refptr[0] = PelYline_11 (ref_pic, abs_y, abs_x, img_height, img_width);   //call either 2 or 3
  abs_y++;
  refptr[1] = PelYline_11 (ref_pic, abs_y, abs_x, img_height, img_width);   //call either 2 or 3
   abs_y++;
  refptr[2] = PelYline_11 (ref_pic, abs_y, abs_x, img_height, img_width);   //call either 2 or 3
   abs_y++;
   refptr[3] = PelYline_11 (ref_pic, abs_y, abs_x, img_height, img_width);   //call either 2 or 3
   abs_y++;

omp_set_num_threads(4);
#pragma omp parallel for reduction(+:LineSadBlk0,LineSadBlk1,LineSadBlk2,LineSadBlk3)
     for (y = 0; y < 4; y++)
     {

{
LineSadBlk0 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk0 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk0 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk0 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    }


    {
    LineSadBlk1 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk1 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk1 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk1 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
}


    {
    LineSadBlk2 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk2 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk2 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk2 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
}


    {
    LineSadBlk3 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk3 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk3 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
    LineSadBlk3 += byte_abs [*(refptr[y])++ - *(orgptr[y])++];
}
  }

  }


  block_sad[bindex++][pos] = LineSadBlk0;
  block_sad[bindex++][pos] = LineSadBlk1;
  block_sad[bindex++][pos] = LineSadBlk2;
  block_sad[bindex++][pos] = LineSadBlk3;

  }
 }





 //===== combine SAD's for larger block types =====
  SetupLargerBlocks (list, ref, max_pos);           //call4


  //===== set flag marking that search setup have been done =====
  search_setup_done[list][ref] = 1;
  }
  #endif // _FAST_FULL_ME_
anubhav
Hi wheaties,Please copy the copy and make modifications and u can mail back to me at [email protected] after resolving dependencies on LineSadBlk's. or if can rewrite it in your way so that the innermost loop is parallelizable then also it is fine. I am so thankful to you.
anubhav