Hmm, I don't know what happened; it submitted my answer before I had really started.
At first I didn't think I could do it with only two registers, but then I decided I could, and did. These solutions are register-only, with no memory accesses (other than the ldr r0,= which you can replace with four instructions). If you use memory and, hmm, two registers, you can perhaps cut down the number of instructions: str, bic, bic, ldrb, orr lsl, ldrb, orr lsl. Okay, I did it in one instruction fewer, but then you need the memory location, and the stores and loads cost cycles, so for me doing it with memory takes the same amount of memory and more cycles. Someone else may have some good tricks. I think some of the newer cores have an endian-swap instruction, which would make it even easier.
.globl midswap
;@ midswap: swap the two middle bytes of a 32-bit word.
;@   In:    r0 = AABBCCDD (byte letters, most significant first)
;@   Out:   r0 = AACCBBDD
;@   Uses:  r2, r3 as scratch; no memory accesses; condition flags are
;@          not written (no S-suffixed instructions).
;@ Each middle byte is masked out in place and the shift needed to move
;@ it is folded into the shifted second operand of the final orr, so no
;@ separate shift instructions are required.
midswap:
    and r2,r0,#0x0000FF00   ;@ r2 = 0000CC00
    and r3,r0,#0x00FF0000   ;@ r3 = 00BB0000
    bic r0,r0,#0x00FF0000   ;@ r0 = AA00CCDD
    bic r0,r0,#0x0000FF00   ;@ r0 = AA0000DD
    orr r0,r0,r2,lsl #8     ;@ r0 = AACC00DD (CC moved up one byte)
    orr r0,r0,r3,lsr #8     ;@ r0 = AACCBBDD (BB moved down one byte)
    bx lr                   ;@ or mov pc,lr for older arm cores
.globl tworegs
;@ tworegs: swap the two middle bytes of a 32-bit word using only r0
;@ and one scratch register.
;@   In:    r0 = AABBCCDD (byte letters, most significant first)
;@   Out:   r0 = AACCBBDD
;@   Uses:  r2 as scratch (left holding 00CCBB00); no memory accesses;
;@          condition flags are not written.
;@ r2 is built up to hold the middle bytes already swapped, r0 has its
;@ middle bytes cleared, and the two are merged at the end.
tworegs:
    mov r2,r0,ror #8        ;@ r2 = DDAABBCC
    bic r0,r0,#0x0000FF00   ;@ r0 = AABB00DD
    bic r0,r0,#0x00FF0000   ;@ r0 = AA0000DD (outer bytes only)
    bic r2,r2,#0x00FF0000   ;@ r2 = DD00BBCC
    bic r2,r2,#0xFF000000   ;@ r2 = 0000BBCC
    orr r2,r2,r2,ror #16    ;@ r2 = BBCCBBCC (low half copied to the high half)
    bic r2,r2,#0x000000FF   ;@ r2 = BBCCBB00
    bic r2,r2,#0xFF000000   ;@ r2 = 00CCBB00 (middle bytes, swapped)
    orr r0,r2,r0            ;@ r0 = AACCBBDD
    bx lr
testfun:
ldr r0,=0xAABBCCDD
bl midswap