Вот пример из System.pas (Delphi 2006). Думаю, быстрее сделать уже не получится
* The Initial Developer of the Original Code is
* Fastcode
*
* Portions created by the Initial Developer are Copyright (C) 2002-2004
* the Initial Developer. All Rights Reserved.
*
* Contributor(s): AMD, John O'Harrow and Dennis Christensen
// ------------------------------------------------------------------------------
// 64-bit unsigned division
// ------------------------------------------------------------------------------
// Divides an unsigned 64-bit dividend by an unsigned 64-bit divisor and
// returns the 64-bit quotient. The remainder is computed but discarded.
//
// In:   EDX:EAX = dividend (EDX = high dword, EAX = low dword)
//       [ESP+8] = divisor, high dword   (offsets on entry, i.e. before
//       [ESP+4] = divisor, low dword     the register pushes below)
// Out:  EDX:EAX = quotient (EDX = high dword, EAX = low dword)
// Uses: EBX, ESI, EDI and EBP are saved on entry and restored on exit.
//       ECX and the flags are overwritten.
//       The 8 bytes of the divisor are removed from the stack by RET 8.
// A zero divisor always reaches DIV EBX with EBX = 0, which raises the
// CPU divide-error exception.
procedure __lludiv;
asm
push ebp
push ebx
push esi
push edi
//
// Now the stack looks like this:
//
// 24[esp]: divisor (high dword)
// 20[esp]: divisor (low dword)
// 16[esp]: return EIP
// 12[esp]: previous EBP
// 8[esp]: previous EBX
// 4[esp]: previous ESI
// [esp]: previous EDI
//
// Only the divisor is on the stack; the dividend is already in EDX:EAX
// and is never read from memory.
//
mov ebx,20[esp] // EBX = divisor, low dword
mov ecx,24[esp] // ECX = divisor, high dword
or ecx,ecx
jnz @__lludiv@slow_ldiv // divisor high dword is non-zero: use the loop
or edx,edx
jz @__lludiv@quick_ldiv // both high dwords are zero: one 32-bit DIV is enough
or ebx,ebx
jz @__lludiv@quick_ldiv // divisor (ecx:ebx) == 0: go to DIV to force a
// divide-by-zero fault; the DIV is not expected
// to return
// Falling through here means: divisor high dword = 0, divisor low dword <> 0
// and dividend high dword <> 0. This case also takes the loop (with EBP = 0).
// NOTE(review): it could be served by two chained 32-bit DIVs instead of
// 64 loop passes; left untouched here.
@__lludiv@slow_ldiv:
// Bit-by-bit shift-and-subtract division.
// Register roles inside the loop:
//   EBP:EBX = divisor
//   EDI:ESI = partial remainder (starts at zero)
//   EDX:EAX = dividend bits still to be consumed (shifted out at the top),
//             quotient bits produced so far (shifted in at the bottom)
//   ECX     = number of passes left
mov ebp,ecx // EBP = divisor high dword; frees ECX for the counter
mov ecx,64 // one pass per dividend bit
xor edi,edi // partial remainder high dword = 0
xor esi,esi // partial remainder low dword = 0
@__lludiv@xloop:
// Shift the 128-bit value EDI:ESI:EDX:EAX left by one bit. The top dividend
// bit moves into the partial remainder; bit 0 of EAX becomes 0.
shl eax,1
rcl edx,1
rcl esi,1
rcl edi,1
// Unsigned 64-bit compare of the partial remainder against the divisor.
cmp edi,ebp // compare high dwords first
jb @__lludiv@nosub // remainder < divisor: quotient bit stays 0
ja @__lludiv@subtract // remainder > divisor: subtract
cmp esi,ebx // high dwords equal: low dwords decide
jb @__lludiv@nosub
@__lludiv@subtract:
sub esi,ebx
sbb edi,ebp // partial remainder := partial remainder - divisor
inc eax // set quotient bit 0 (it was cleared by the SHL above)
@__lludiv@nosub:
loop @__lludiv@xloop // dec ECX, repeat while ECX <> 0 (64 passes in total)
//
// When the loop is done the four registers hold:
//
// | edi | esi | edx | eax |
// | remainder | quotient |
//
@__lludiv@finish:
// Restore the saved registers (this drops the remainder held in EDI:ESI)
// and return, popping the 8-byte divisor argument.
pop edi
pop esi
pop ebx
pop ebp
ret 8
@__lludiv@quick_ldiv:
// Reached with the divisor high dword = 0 and either the dividend high
// dword = 0 (plain 32-bit division) or EBX = 0 (DIV faults).
div ebx // EAX = EDX:EAX div EBX, EDX = remainder
xor edx,edx // quotient high dword = 0 (the remainder is dropped)
jmp @__lludiv@finish
end;