Just to illustrate a common pitfall regarding branches on x86, building on FredOverflow's answer (which is nice work, kudos and +1). Here is FredOverflow's assembly as output by gcc:
movl 8(%ebp), %edx #1/.5
movl %edx, %eax #1/.5
andl $-16777216, %eax #1/.5
cmpl $1, %eax #1/.5
sbbl %eax, %eax #8/6
addl $4, %eax #1/.5
xorl %ecx, %ecx #1/.5
testl $-65536, %edx #1/.5
sete %cl #5
subl %ecx, %eax #1/.5
andl $-256, %edx #1/.5
sete %dl #5
movzbl %dl, %edx #1/.5
subl %edx, %eax #1/.5
# sum total: 29/21.5 cycles
(the latencies, in cycles, are to be read as Prescott/Northwood)
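For reference, here is what that branchless sequence computes, transcribed literally into C. This is a transcription for illustration only (the name bytes_masked is made up here, and FredOverflow's actual source may look different; gcc is free to turn plain comparisons into mask tests like these):

/* literal C transcription of the mask/sbb/sete sequence above */
unsigned bytes_masked(unsigned x)
{
    return 4u
         - ((x & 0xFF000000u) == 0)   /* cmpl/sbbl: top byte clear      */
         - ((x & 0xFFFF0000u) == 0)   /* sete %cl: top two bytes clear  */
         - ((x & 0xFFFFFF00u) == 0);  /* sete %dl: only the low byte used */
}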
Pascal Cuoq's hand-optimized assembly (also kudos):
cmpl $255, %edi #1/.5
setg %al #5
addb $3, %al #1/.5
cmpl $65535, %edi #1/.5
setle %dl #5
subb %dl, %al #1/.5
cmpl $16777215, %edi #1/.5
setle %dl #5
subb %dl, %al #1/.5
movzbl %al, %eax #1/.5
# sum total: 22/18.5 cycles
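In C terms, that hand-written sequence amounts to the following (again a transcription for illustration, with a made-up name):

/* setg/setle materialize the comparison results as 0/1 byte values */
unsigned bytes_cmp(unsigned x)
{
    return (x > 0xFFu)        /* setg  after cmpl $255      */
         + 3u                 /* addb  $3                   */
         - (x <= 0xFFFFu)     /* setle after cmpl $65535    */
         - (x <= 0xFFFFFFu);  /* setle after cmpl $16777215 */
}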
Edit: FredOverflow's solution using __builtin_clz():
movl 8(%ebp), %eax #1/.5
popl %ebp #1/.5
orb $-1, %al #1/.5
bsrl %eax, %eax #16/8
sarl $3, %eax #1/4
addl $1, %eax #1/.5
ret
# sum total: 20/13.5 cycles
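Reconstructed from that assembly, the __builtin_clz() variant looks roughly like this in C (a sketch only; FredOverflow's exact expression may differ, and the name bytes_clz is made up):

unsigned bytes_clz(unsigned x)
{
    /* or in 0xFF so bsrl always finds a set bit and values below 256 report bit 7 */
    unsigned top_bit = 31u - (unsigned)__builtin_clz(x | 0xFFu);  /* orb + bsrl      */
    return top_bit / 8u + 1u;                                     /* sarl $3; addl $1 */
}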
And the gcc assembly for your code:
movl $1, %eax #1/.5
movl %esp, %ebp #1/.5
movl 8(%ebp), %edx #1/.5
cmpl $255, %edx #1/.5
jbe .L3 #up to 9 cycles
cmpl $65535, %edx #1/.5
movb $2, %al #1/.5
jbe .L3 #up to 9 cycles
cmpl $16777216, %edx #1/.5
sbbl %eax, %eax #8/6
addl $4, %eax #1/.5
.L3:
ret
# sum total: 16/10 cycles - 34/28 cycles
Here the instruction cache line fetches that come as a side effect of the jcc instructions probably cost nothing for such a short function.
Branches can be a reasonable choice, depending on the input distribution.
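For completeness, a plausible shape of the branched source that produces the jbe/sbb mix above (hypothetical, for illustration; your actual code may differ in detail):

unsigned bytes_branched(unsigned x)
{
    if (x <= 0xFFu)   return 1;          /* cmpl $255,   jbe .L3 */
    if (x <= 0xFFFFu) return 2;          /* cmpl $65535, jbe .L3 */
    return (x <= 0xFFFFFFu) ? 3 : 4;     /* gcc folds the last test into cmpl/sbbl/addl */
}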