[tremor] ARM ASM performance gains, EVC vs. GCC

Sat Oct 5 09:52:30 PDT 2002

On Fri, 4 Oct 2002, Werner Sharp wrote:

> Hi Nicolas,
> 
> mdct386.asm is with #ifdef __i386__
> mdct.asm is with #ifdef 1
> 
> the #ifdef 1 version gets a 6% performance boost in the one file I tried.

Okay.

First, the performance loss with my latest changes can be easily explained.
From mdct386.asm:

; 366  :     XPROD31( iX[4], iX[6], T[0], T[1], &oX[2], &oX[3] ); T+=step;

        [...]
        bl        XPROD31  ; 000000B0

In misc.h the non-__i386__ case defines functions like XPROD31 that are 
clearly meant to be inlined.  Until someone knows how to convince EVC to 
actually inline those functions, you won't be able to benefit from the 
improvements those functions provide over the macro versions.

Let's have a look at the macro version then.  From mdct.asm:

|$L1680|

; 363  :
; 364  :   do{
; 365  :     oX-=4;
; 366  :     XPROD31( iX[4], iX[6], T[0], T[1], &oX[2], &oX[3] ); T+=step;

        ldr       r5, [r2]
        sub       r3, r3, #0x10  ; 0x10 = 16
        ldr       r4, [r0, #0x10]  ; 0x10 = 16
        mov       r10, r5
        mov       r10, r10, asr #31
        mov       r9, r4
        mul       r10, r9, r10
        mov       r11, r4, asr #31
        mul       r9, r11, r5
        add       r11, r10, r9
        umull     r9, r10, r4, r5
        mul       r9, r4, r5
        add       r6, r11, r10
        ldr       r4, [r2, #4]
        ldr       r5, [r0, #0x18]  ; 0x18 = 24
        mov       r11, r4, asr #31
        str       r9, [sp, #0x68]  ; 0x68 = 104
        mov       r9, r4
        mov       r10, r5, asr #31
        mul       r10, r9, r10
        mul       r9, r11, r5
        add       r11, r10, r9
        umull     r9, r10, r4, r5
        mul       r9, r4, r5
        add       r4, r11, r10
        add       r11, r4, r6
        mov       r11, r11, lsl #1
        str       r9, [sp, #0x68]  ; 0x68 = 104
        str       r11, [r3, #8]
        ldr       r5, [r2]
        ldr       r4, [r0, #0x18]  ; 0x18 = 24
        mov       r10, r5
        mov       r10, r10, asr #31
        mov       r9, r4
        mul       r10, r9, r10
        mov       r11, r4, asr #31
        mul       r9, r11, r5
        add       r11, r10, r9
        umull     r9, r10, r4, r5
        mul       r9, r4, r5
        ldr       r4, [r2, #4]
        add       r6, r11, r10
        ldr       r5, [r0, #0x10]  ; 0x10 = 16
        mov       r11, r4, asr #31
        str       r9, [sp, #0x68]  ; 0x68 = 104
        mov       r9, r4
        mov       r10, r5, asr #31
        mul       r10, r9, r10
        mul       r9, r11, r5
        add       r2, lr, r2
        add       r11, r10, r9
        umull     r9, r10, r4, r5
        mul       r9, r4, r5
        add       r4, r11, r10
        sub       r11, r6, r4
        mov       r11, r11, lsl #1
        str       r9, [sp, #0x68]  ; 0x68 = 104
        str       r11, [r3, #0xC]  ; 0xC = 12

Whew!  58 instructions for the above code!

Now let's see what GCC produces for the same code with the _same_ parameters,
i.e. with "#ifdef __i386__" changed to "#if 1" and _ARM_ASSEM_ left undefined
so that GCC's inline assembly code is not pulled in.  We therefore obtain:

.L166:
        ldr     lr, [r7, #16]
        ldr     r0, [sl, #0]
        ldr     ip, [r7, #24]
        ldr     r3, [sl, #4]
        smull   r4, r5, lr, r0
        smull   r1, r2, ip, r3
        sub     r8, r8, #16
        add     r3, r2, r5
        mov     r3, r3, asl #1
        str     r3, [r8, #8]
        ldr     lr, [r7, #24]
        ldr     r0, [sl, #0]
        ldr     ip, [r7, #16]
        ldr     r3, [sl, #4]
        smull   r4, r5, lr, r0
        smull   r1, r2, ip, r3
        rsb     r3, r2, r5
        mov     r3, r3, asl #1
        str     r3, [r8, #12]

GCC emits 19 instructions for the same code under the same conditions, which
is an obvious performance improvement.

But let's have a look at the code generated by EVC:

First obvious optimisation miss:

        mov       r10, r5
        mov       r10, r10, asr #31

Why did EVC not use a simple single-instruction expression like this:

        mov       r10, r5, asr #31

This is a sign of a suboptimal implementation of the ARM architecture.

Next, why is this whole sign fixup performed on all operands?  Why isn't EVC
using the signed long multiply (smull) instruction instead of umull with
separate manual signedness fixups?  Go figure.

In my opinion this only shows that EVC is implementing the ARM architecture
quite poorly and that no blazingly fast assembly code can be expected
from it.  At least not before someone manages to 1) convince EVC to honour 
the "inline" function specifier and 2) make it work with some sort of inline 
assembly like GCC does.  And even then, GCC is producing better code even 
without any inline assembly, as shown above.

Maybe you guys should try to find a way to have GCC produce binaries 
compatible with PocketPC?

Just to give you a hint, here's GCC's output for this whole do {} while loop
but this time with all the optimisations I recently provided turned on:

First the C code:

  do{
    oX-=4;
    XPROD31( iX[4], iX[6], T[0], T[1], &oX[2], &oX[3] ); T+=step;
    XPROD31( iX[0], iX[2], T[0], T[1], &oX[0], &oX[1] ); T+=step;
    iX-=8;
  }while(iX>=in+n4);

GCC's output:

.L94:
        ldr     r0, [r5, #16]
        ldr     r1, [r5, #24]
        ldmia   r8, {r2, r3}
        smull   r4, ip, r0, r2
        smlal   r4, ip, r1, r3
        rsb     r0, r0, #0
        smull   r4, lr, r1, r2
        smlal   r4, lr, r0, r3
        mov     ip, ip, asl #1
        sub     r6, r6, #16
        str     ip, [r6, #8]
        mov     lr, lr, asl #1
        str     lr, [r6, #12]
        ldr     r0, [r8, r9]!
        ldr     r2, [r5, #8]
        ldr     r1, [r5, #0]
        ldr     r3, [r8, #4]
        smull   r4, ip, r1, r0
        smlal   r4, ip, r2, r3
        rsb     r1, r1, #0
        smull   r4, lr, r2, r0
        smlal   r4, lr, r1, r3
        mov     ip, ip, asl #1
        str     ip, [r6, #0]
        sub     r5, r5, #32
        mov     lr, lr, asl #1
        cmp     r5, r7
        str     lr, [r6, #4]
        add     r8, r8, r9
        bcs     .L94

Only 30 instructions!  With the output from EVC of only half that C code
quoted earlier we can estimate that EVC will generate over 100 instructions
for that same piece of C code.

What do you think?

Nicolas

--- >8 ----
List archives:  http://www.xiph.org/archives/
Ogg project homepage: http://www.xiph.org/ogg/
To unsubscribe from this list, send a message to 'tremor-request at xiph.org'
containing only the word 'unsubscribe' in the body.  No subject is needed.
Unsubscribe messages sent to the list will be ignored/filtered.