views:

629

answers:

1

i am trying to write a memcpy function that does not load the source memory to the cpu cache. The purpose is to avoid cache pollution. The memcpy function below works, but pollutes the cache like the standard memcpy does. i am using P8700 proccesoor with visual C++ 2008 express. i see the cpu cache usage with intel vtune.

void memcpy(char *dst,char*src,unsigned size){
    char *dst_end=dst+size;
    while(dst!=dst_end){
     __m128i res = _mm_stream_load_si128((__m128i *)src);
     *((__m128i *)dst)=res;
     src+=16;
     dst+=16;
    }
}

i have another version, that have the same results - works but pollutes the cache.

void memcpy(char *dst,char*src,unsigned size){

        char *dst_end = dst+size;

        __asm{
        mov edi, dst 
        mov edx, dst_end 
        mov esi,src
        inner_start: 
        LFENCE 
      MOVNTDQA xmm0,    [esi ]
      MOVNTDQA xmm1, [esi+16] 
      MOVNTDQA xmm2, [esi+32] 
      MOVNTDQA xmm3, [esi+48] 
      //19. ; Copy data to buffer 
      MOVDQA [edi], xmm0 
      MOVDQA  [edi+16], xmm1 
      MOVDQA  [edi+32], xmm2 
      MOVDQA  [edi+48], xmm3 
    //  25. ; Increment pointers by cache line size and test for end of loop 
      add esi, 040h 
      add edi, 040h 
      cmp edi, edx 
      jne inner_start 


}
}

update: this is the test program

        void test(int table_size,int num_iter,int item_size){
            char *src_table=alloc_aligned(table_size*item_size);//return value is aligned on 64 bytes
            char *dst=alloc_aligned(item_size); //destination is always the same buffer
            for (int i=0;i<num_iter;i++){
                int location=my_rand()%table_size;
                char *src=src_table+location*item_size;//selecting a different src every time
                memcpy(dst,src,item_size);
            }

        }
main(){
       test(1024*32,1024*1024,1024*32)
}
+1  A: 

quoting from intel

"The streaming load instruction is intended to accelerate data transfers from the USWC memory type. For other memory types such as cacheable (WB) or Uncacheable (UC), the instruction behaves as a typical 16-byte MOVDQA load instruction. However, future processors may use the streaming load instruction for other memory types (such as WB) as a hint that the intended cache line should be streamed from memory directly to the core while minimizing cache pollution."

that explains why the code does not work - the memory is of type wb.

yigal