i am trying to write a memcpy function that does not load the source memory to the cpu cache. The purpose is to avoid cache pollution. The memcpy function below works, but pollutes the cache like the standard memcpy does. i am using P8700 proccesoor with visual C++ 2008 express. i see the cpu cache usage with intel vtune.
void memcpy(char *dst,char*src,unsigned size){
char *dst_end=dst+size;
while(dst!=dst_end){
__m128i res = _mm_stream_load_si128((__m128i *)src);
*((__m128i *)dst)=res;
src+=16;
dst+=16;
}
}
i have another version, that have the same results - works but pollutes the cache.
void memcpy(char *dst,char*src,unsigned size){
char *dst_end = dst+size;
__asm{
mov edi, dst
mov edx, dst_end
mov esi,src
inner_start:
LFENCE
MOVNTDQA xmm0, [esi ]
MOVNTDQA xmm1, [esi+16]
MOVNTDQA xmm2, [esi+32]
MOVNTDQA xmm3, [esi+48]
//19. ; Copy data to buffer
MOVDQA [edi], xmm0
MOVDQA [edi+16], xmm1
MOVDQA [edi+32], xmm2
MOVDQA [edi+48], xmm3
// 25. ; Increment pointers by cache line size and test for end of loop
add esi, 040h
add edi, 040h
cmp edi, edx
jne inner_start
}
}
update: this is the test program
void test(int table_size,int num_iter,int item_size){
char *src_table=alloc_aligned(table_size*item_size);//return value is aligned on 64 bytes
char *dst=alloc_aligned(item_size); //destination is always the same buffer
for (int i=0;i<num_iter;i++){
int location=my_rand()%table_size;
char *src=src_table+location*item_size;//selecting a different src every time
memcpy(dst,src,item_size);
}
}
main(){
test(1024*32,1024*1024,1024*32)
}