I have made both version of Mod128by64 'Russian peasant' division function: classic and speed optimised. Speed optimised can do on my 3Ghz PC more than 1000.000 random calculations per second and is more than three times faster than classic function.
If we compare the execution time of calculating 128 by 64 and calculating 64 by 64 bit modulo than this function is only about 50% slower.
Classic Russian peasant:
function Mod128by64Clasic(Dividend: PUInt128; Divisor: PUInt64): UInt64;
//In : eax = @Dividend
// : edx = @Divisor
//Out: eax:edx as Remainder
asm
//Registers inside rutine
//edx:ebp = Divisor
//ecx = Loop counter
//Result = esi:edi
push ebx //Store registers to stack
push esi
push edi
push ebp
mov ebp, [edx] //Load divisor to edx:ebp
mov edx, [edx + 4]
mov ecx, ebp //Div by 0 test
or ecx, edx
jz @DivByZero
push [eax] //Store Divisor to the stack
push [eax + 4]
push [eax + 8]
push [eax + 12]
xor edi, edi //Clear result
xor esi, esi
mov ecx, 128 //Load shift counter
@Do128BitsShift:
shl [esp + 12], 1 //Shift dividend from stack left for one bit
rcl [esp + 8], 1
rcl [esp + 4], 1
rcl [esp], 1
rcl edi, 1
rcl esi, 1
setc bh //Save 65th bit
sub edi, ebp //Compare dividend and divisor
sbb esi, edx //Subtract the divisor
sbb bh, 0 //Use 65th bit in bh
jnc @NoCarryAtCmp //Test...
add edi, ebp //Return privius dividend state
adc esi, edx
@NoCarryAtCmp:
loop @Do128BitsShift
//End of 128 bit division loop
mov eax, edi //Load result to eax:edx
mov edx, esi
@RestoreRegisters:
lea esp, esp + 16 //Restore Divisors space on stack
pop ebp //Restore Registers
pop edi
pop esi
pop ebx
ret
@DivByZero:
xor eax, eax //Here you can raise Div by 0 exception, now function only return 0.
xor edx, edx
jmp @RestoreRegisters
end;
Speed optimised Russian peasant:
function Mod128by64Oprimized(Dividend: PUInt128; Divisor: PUInt64): UInt64;
//In : eax = @Dividend
// : edx = @Divisor
//Out: eax:edx as Remainder
asm
//Registers inside rutine
//Divisor = edx:ebp
//Dividend = ebx:edx //We need 64 bits
//Result = esi:edi
//ecx = Loop counter and Dividend index
push ebx //Store registers to stack
push esi
push edi
push ebp
mov ebp, [edx] //Divisor = edx:ebp
mov edx, [edx + 4]
mov ecx, ebp //Div by 0 test
or ecx, edx
jz @DivByZero
xor edi, edi //Clear result
xor esi, esi
//Start of 64 bit division Loop
mov ecx, 15 //Load byte loop shift counter and Dividend index
@SkipShift8Bits: //Small Dividend numbers shift optimisation
cmp [eax + ecx], ch //Zero test
jnz @EndSkipShiftDividend
loop @SkipShift8Bits //Skip Compute 8 Bits unroled loop ?
@EndSkipShiftDividend:
test edx, $FF000000 //Huge Divisor Numbers Shift Optimisation
jz @Shift8Bits //This Divisor is > $00FFFFFF:FFFFFFFF
mov ecx, 8 //Load byte shift counter
mov esi, [eax + 12] //Do fast 56 bit (7 bytes) shift...
shr esi, cl //esi = $00XXXXXX
mov edi, [eax + 9] //Load for one byte right shifted 32 bit value
@Shift8Bits:
mov bl, [eax + ecx] //Load 8 bit part of Dividend
//Compute 8 Bits unroled loop
shl bl, 1 //Shift dividend left for one bit
rcl edi, 1
rcl esi, 1
jc @DividentAbove0 //dividend hi bit set?
cmp esi, edx //dividend hi part larger?
jb @DividentBelow0
ja @DividentAbove0
cmp edi, ebp //dividend lo part larger?
jb @DividentBelow0
@DividentAbove0:
sub edi, ebp //Return privius dividend state
sbb esi, edx
@DividentBelow0:
shl bl, 1 //Shift dividend left for one bit
rcl edi, 1
rcl esi, 1
jc @DividentAbove1 //dividend hi bit set?
cmp esi, edx //dividend hi part larger?
jb @DividentBelow1
ja @DividentAbove1
cmp edi, ebp //dividend lo part larger?
jb @DividentBelow1
@DividentAbove1:
sub edi, ebp //Return privius dividend state
sbb esi, edx
@DividentBelow1:
shl bl, 1 //Shift dividend left for one bit
rcl edi, 1
rcl esi, 1
jc @DividentAbove2 //dividend hi bit set?
cmp esi, edx //dividend hi part larger?
jb @DividentBelow2
ja @DividentAbove2
cmp edi, ebp //dividend lo part larger?
jb @DividentBelow2
@DividentAbove2:
sub edi, ebp //Return privius dividend state
sbb esi, edx
@DividentBelow2:
shl bl, 1 //Shift dividend left for one bit
rcl edi, 1
rcl esi, 1
jc @DividentAbove3 //dividend hi bit set?
cmp esi, edx //dividend hi part larger?
jb @DividentBelow3
ja @DividentAbove3
cmp edi, ebp //dividend lo part larger?
jb @DividentBelow3
@DividentAbove3:
sub edi, ebp //Return privius dividend state
sbb esi, edx
@DividentBelow3:
shl bl, 1 //Shift dividend left for one bit
rcl edi, 1
rcl esi, 1
jc @DividentAbove4 //dividend hi bit set?
cmp esi, edx //dividend hi part larger?
jb @DividentBelow4
ja @DividentAbove4
cmp edi, ebp //dividend lo part larger?
jb @DividentBelow4
@DividentAbove4:
sub edi, ebp //Return privius dividend state
sbb esi, edx
@DividentBelow4:
shl bl, 1 //Shift dividend left for one bit
rcl edi, 1
rcl esi, 1
jc @DividentAbove5 //dividend hi bit set?
cmp esi, edx //dividend hi part larger?
jb @DividentBelow5
ja @DividentAbove5
cmp edi, ebp //dividend lo part larger?
jb @DividentBelow5
@DividentAbove5:
sub edi, ebp //Return privius dividend state
sbb esi, edx
@DividentBelow5:
shl bl, 1 //Shift dividend left for one bit
rcl edi, 1
rcl esi, 1
jc @DividentAbove6 //dividend hi bit set?
cmp esi, edx //dividend hi part larger?
jb @DividentBelow6
ja @DividentAbove6
cmp edi, ebp //dividend lo part larger?
jb @DividentBelow6
@DividentAbove6:
sub edi, ebp //Return privius dividend state
sbb esi, edx
@DividentBelow6:
shl bl, 1 //Shift dividend left for one bit
rcl edi, 1
rcl esi, 1
jc @DividentAbove7 //dividend hi bit set?
cmp esi, edx //dividend hi part larger?
jb @DividentBelow7
ja @DividentAbove7
cmp edi, ebp //dividend lo part larger?
jb @DividentBelow7
@DividentAbove7:
sub edi, ebp //Return privius dividend state
sbb esi, edx
@DividentBelow7:
//End of Compute 8 Bits (unroled loop)
dec cl //Decrement byte loop shift counter
jns @Shift8Bits //Last jump at cl = 0!!!
//End of division loop
mov eax, edi //Load result to eax:edx
mov edx, esi
@RestoreRegisters:
pop ebp //Restore Registers
pop edi
pop esi
pop ebx
ret
@DivByZero:
xor eax, eax //Here you can raise Div by 0 exception, now function only return 0.
xor edx, edx
jmp @RestoreRegisters
end;