[tremor] ARM ASM performance gains, EVC vs. GCC
Nicolas Pitre
nico at cam.org
Sat Oct 5 09:52:30 PDT 2002
On Fri, 4 Oct 2002, Werner Sharp wrote:
> Hi Nicolas,
>
> mdct386.asm is with #ifdef __i386__
> mdct.asm is with #ifdef 1
>
> the #ifdef 1 version gets a 6% performance boost in the one file I tried.
Okay.
First, the performance loss with my latest changes can be easily explained.
From mdct386.asm:
; 366 : XPROD31( iX[4], iX[6], T[0], T[1], &oX[2], &oX[3] ); T+=step;
[...]
bl XPROD31 ; 000000B0
In misc.h the non __i386__ case defines functions like XPROD31 that are
clearly meant to be inlined. Until someone knows how to convince EVC to
actually inline those functions you won't be able to benefit from the
improvements those functions provide over the macros version.
Let's have a look at the macro version then. From mdct.asm:
|$L1680|
; 363 :
; 364 : do{
; 365 : oX-=4;
; 366 : XPROD31( iX[4], iX[6], T[0], T[1], &oX[2], &oX[3] ); T+=step;
ldr r5, [r2]
sub r3, r3, #0x10 ; 0x10 = 16
ldr r4, [r0, #0x10] ; 0x10 = 16
mov r10, r5
mov r10, r10, asr #31
mov r9, r4
mul r10, r9, r10
mov r11, r4, asr #31
mul r9, r11, r5
add r11, r10, r9
umull r9, r10, r4, r5
mul r9, r4, r5
add r6, r11, r10
ldr r4, [r2, #4]
ldr r5, [r0, #0x18] ; 0x18 = 24
mov r11, r4, asr #31
str r9, [sp, #0x68] ; 0x68 = 104
mov r9, r4
mov r10, r5, asr #31
mul r10, r9, r10
mul r9, r11, r5
add r11, r10, r9
umull r9, r10, r4, r5
mul r9, r4, r5
add r4, r11, r10
add r11, r4, r6
mov r11, r11, lsl #1
str r9, [sp, #0x68] ; 0x68 = 104
str r11, [r3, #8]
ldr r5, [r2]
ldr r4, [r0, #0x18] ; 0x18 = 24
mov r10, r5
mov r10, r10, asr #31
mov r9, r4
mul r10, r9, r10
mov r11, r4, asr #31
mul r9, r11, r5
add r11, r10, r9
umull r9, r10, r4, r5
mul r9, r4, r5
ldr r4, [r2, #4]
add r6, r11, r10
ldr r5, [r0, #0x10] ; 0x10 = 16
mov r11, r4, asr #31
str r9, [sp, #0x68] ; 0x68 = 104
mov r9, r4
mov r10, r5, asr #31
mul r10, r9, r10
mul r9, r11, r5
add r2, lr, r2
add r11, r10, r9
umull r9, r10, r4, r5
mul r9, r4, r5
add r4, r11, r10
sub r11, r6, r4
mov r11, r11, lsl #1
str r9, [sp, #0x68] ; 0x68 = 104
str r11, [r3, #0xC] ; 0xC = 12
Whew! 58 instructions for the above code!
Now let's see what GCC produces for the same code with the _same_ parameters,
i.e. "#ifdef __i386__" changed to "#if 1" and _ARM_ASSEM_ left undefined so as
not to pull in GCC's inline assembly code. We therefore obtain:
.L166:
ldr lr, [r7, #16]
ldr r0, [sl, #0]
ldr ip, [r7, #24]
ldr r3, [sl, #4]
smull r4, r5, lr, r0
smull r1, r2, ip, r3
sub r8, r8, #16
add r3, r2, r5
mov r3, r3, asl #1
str r3, [r8, #8]
ldr lr, [r7, #24]
ldr r0, [sl, #0]
ldr ip, [r7, #16]
ldr r3, [sl, #4]
smull r4, r5, lr, r0
smull r1, r2, ip, r3
rsb r3, r2, r5
mov r3, r3, asl #1
str r3, [r8, #12]
GCC emits 19 instructions for the same code under the same conditions, which is
an obvious performance improvement.
But let's have a look at the code generated by EVC:
First obvious optimisation miss:
mov r10, r5
mov r10, r10, asr #31
Why did EVC not use a simple single-instruction expression like this:
mov r10, r5, asr #31
This is a sign of a suboptimal implementation of the ARM architecture.
Next, why this whole sign fixup on all operands? Why isn't EVC using
the signed long multiply (smull) instruction instead of umull with separate
manual signedness fixups? Go figure.
In my opinion this only shows that EVC is implementing the ARM architecture
quite poorly, and that no blazingly fast assembly code can be expected
from it. At least not before someone manages to 1) convince EVC to honour
the "inline" function specifier and 2) make it work with some sort of inline
assembly like GCC does. And even then, GCC is producing better code even
without any inline assembly, as shown above.
Maybe you guys should try to find a way to have GCC produce binaries
compatible with PocketPC?
Just to give you a hint, here's GCC's output for this whole do {} while loop
but this time with all the optimisations I recently provided turned on:
First the C code:
do{
oX-=4;
XPROD31( iX[4], iX[6], T[0], T[1], &oX[2], &oX[3] ); T+=step;
XPROD31( iX[0], iX[2], T[0], T[1], &oX[0], &oX[1] ); T+=step;
iX-=8;
}while(iX>=in+n4);
GCC's output:
.L94:
ldr r0, [r5, #16]
ldr r1, [r5, #24]
ldmia r8, {r2, r3}
smull r4, ip, r0, r2
smlal r4, ip, r1, r3
rsb r0, r0, #0
smull r4, lr, r1, r2
smlal r4, lr, r0, r3
mov ip, ip, asl #1
sub r6, r6, #16
str ip, [r6, #8]
mov lr, lr, asl #1
str lr, [r6, #12]
ldr r0, [r8, r9]!
ldr r2, [r5, #8]
ldr r1, [r5, #0]
ldr r3, [r8, #4]
smull r4, ip, r1, r0
smlal r4, ip, r2, r3
rsb r1, r1, #0
smull r4, lr, r2, r0
smlal r4, lr, r1, r3
mov ip, ip, asl #1
str ip, [r6, #0]
sub r5, r5, #32
mov lr, lr, asl #1
cmp r5, r7
str lr, [r6, #4]
add r8, r8, r9
bcs .L94
Only 30 instructions! Since the EVC output quoted earlier covers only half of
that C code, we can estimate that EVC will generate over 100 instructions
for that same piece of C code.
What do you think?
Nicolas
--- >8 ----
List archives: http://www.xiph.org/archives/
Ogg project homepage: http://www.xiph.org/ogg/
To unsubscribe from this list, send a message to 'tremor-request at xiph.org'
containing only the word 'unsubscribe' in the body. No subject is needed.
Unsubscribe messages sent to the list will be ignored/filtered.
More information about the Tremor
mailing list