definitely yes!
Here is a demonstration of a CRC-32 calculation which I wrote in C++ and then optimized in x86 assembler using Visual Studio.
InitCRC32Table() should be called at program start.
CalcCRC32() will calculate the CRC for a given memory block.
Both functions are implemented in both assembler and C++.
On a typical Pentium machine, you will notice that the assembler CalcCRC32() function is 50% faster than the C++ code.
The assembler implementation is not MMX or SSE, but simple x86 code.
The compiler will never produce code that is as efficient as manually crafted assembler code.
// CRC-32 lookup table: 256 DWORD entries, allocated and filled by
// InitCRC32Table() and released by DoneCRCTables(). NULL while no table exists.
// Entries are generated from the bit-swapped form (0xEDB88320) of the
// polynomial named below.
DWORD* panCRC32Table = NULL; // CRC-32 CCITT 0x04C11DB7
// Frees the CRC-32 lookup table and resets panCRC32Table to NULL.
// InitCRC32Table() registers this function with atexit(); when no table
// has been allocated the call does nothing.
void DoneCRCTables()
{
    if (!panCRC32Table)
        return;                 // nothing was allocated

    delete[] panCRC32Table;
    panCRC32Table = NULL;       // lets InitCRC32Table() rebuild the table later
}
void InitCRC32Table()
{
if (panCRC32Table) return;
panCRC32Table= new DWORD[256];
atexit(DoneCRCTables);
/*
for (int bx=0; bx<256; bx++)
{
DWORD eax= bx;
for (int cx=8; cx>0; cx--)
if (eax & 1)
eax= (eax>>1) ^ 0xEDB88320;
else
eax= (eax>>1) ;
panCRC32Table[bx]= eax;
}
*/
_asm cld
_asm mov edi, panCRC32Table
_asm xor ebx, ebx
p0: _asm mov eax, ebx
_asm mov ecx, 8
p1: _asm shr eax, 1
_asm jnc p2
_asm xor eax, 0xEDB88320 // bit-swapped 0x04C11DB7
p2: _asm loop p1
_asm stosd
_asm inc bl
_asm jnz p0
}
/*
DWORD inline CalcCRC32(UINT nLen, const BYTE* cBuf, DWORD nInitVal= 0)
{
DWORD crc= ~nInitVal;
for (DWORD n=0; n<nLen; n++)
crc= (crc>>8) ^ panCRC32Table[(crc & 0xFF) ^ cBuf[n]];
return ~crc;
}
*/
// Computes the CRC-32 of the nLen bytes starting at cBuf, using the lookup
// table built by InitCRC32Table().
//   nLen     - number of bytes to process; when 0, nInitVal is returned unchanged.
//   cBuf     - bytes to checksum.
//   nInitVal - 0 for a fresh CRC, or the value returned for the previous buffer
//              when checksumming chained buffers.
// Returns the CRC-32 value.
// panCRC32Table is dereferenced without a NULL check, so InitCRC32Table()
// must have been called beforehand.
//
// 32-bit x86 MSVC builds use the naked __fastcall assembler routine. Other
// targets cannot compile __declspec(naked) / _asm and get the equivalent
// C++ loop with the same parameters and default argument.
#if defined(_MSC_VER) && defined(_M_IX86)
DWORD inline __declspec (naked) __fastcall CalcCRC32(UINT nLen ,
const BYTE* cBuf ,
DWORD nInitVal= 0 ) // used to calc CRC of chained bufs
{
    // Naked function: no compiler prolog/epilog, so registers and the stack
    // are managed by hand. nInitVal is read before the pushes below move esp.
    _asm mov eax, [esp+4] // param3: nInitVal
    _asm jecxz p2 // __fastcall param1 ecx: nLen
                  // nLen == 0: return nInitVal as-is (already in eax)
    _asm not eax                    // crc = ~nInitVal
    _asm push esi                   // esi and ebp are used below, so save them
    _asm push ebp
    _asm mov esi, edx // __fastcall param2 edx: cBuf
    _asm xor edx, edx               // clear edx so dl alone forms the table index
    _asm mov ebp, panCRC32Table     // ebp = table base
    // NOTE(review): no string instruction follows in this routine, so cld
    // looks redundant here - confirm before removing.
    _asm cld
p1: _asm mov dl , al                // dl = low byte of crc
    _asm shr eax, 8                 // crc >>= 8
    _asm xor dl , [esi]             // dl ^= current input byte
    _asm xor eax, [ebp+edx*4]       // crc ^= table[dl]
    _asm inc esi                    // advance to the next input byte
    _asm loop p1                    // ecx--, repeat for all nLen bytes
    _asm pop ebp
    _asm pop esi
    _asm not eax                    // final inversion
p2: _asm ret 4 // eax- returned value. 4 because there is 1 param in stack
}
#else
DWORD inline CalcCRC32(UINT nLen ,
                       const BYTE* cBuf ,
                       DWORD nInitVal= 0 ) // used to calc CRC of chained bufs
{
    DWORD crc= ~nInitVal;
    for (UINT n=0; n<nLen; n++)
        crc= (crc>>8) ^ panCRC32Table[(crc & 0xFF) ^ cBuf[n]];
    return ~crc;
}
#endif
// test code:
#include "mmSystem.h" // timeGetTime
#pragma comment(lib, "Winmm.lib" )
InitCRC32Table();
BYTE* x= new BYTE[1000000];
for (int i= 0; i<1000000; i++) x[i]= 0;
DWORD d1= ::timeGetTime();
for (i= 0; i<1000; i++)
CalcCRC32(1000000, x, 0);
DWORD d2= ::timeGetTime();
TRACE("%d\n", d2-d1);