[xiph-commits] r17429 - experimental/derf/theora-ptalarbvorm/lib/arm
tterribe at svn.xiph.org
tterribe at svn.xiph.org
Wed Sep 22 13:28:33 PDT 2010
Author: tterribe
Date: 2010-09-22 13:28:33 -0700 (Wed, 22 Sep 2010)
New Revision: 17429
Modified:
experimental/derf/theora-ptalarbvorm/lib/arm/armloop.s
Log:
Use PLD in the NEON loop filter code.
After oc_frag_copy_list, this was the primary source of cache misses in the
codec itself, and preloading actually saved 3.5% on a Cortex A8.
Modified: experimental/derf/theora-ptalarbvorm/lib/arm/armloop.s
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armloop.s 2010-09-22 19:50:32 UTC (rev 17428)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armloop.s 2010-09-22 20:28:33 UTC (rev 17429)
@@ -483,15 +483,15 @@
VTRN.8 D2, D6 ; D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733 1,1
VSUBL.U8 Q0, D0, D6 ; Q0 = 00 - 33 in S16s 1,3
VSUBL.U8 Q8, D2, D4 ; Q8 = 22 - 11 in S16s 1,3
- ; Stall
+ ADD r12,r0, #8
VADD.S16 Q0, Q0, Q8 ; 1,3
- SUB r12, r0, #1
- ; Stall
+ PLD [r12]
VADD.S16 Q0, Q0, Q8 ; 1,3
- ; Stall x2
+ PLD [r12,r1]
VADD.S16 Q0, Q0, Q8 ; Q0 = [0-3]+3*[2-1] 1,3
- ; Stall x2
+ PLD [r12,r1, LSL #1]
VRSHR.S16 Q0, Q0, #3 ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4
+ ADD r12,r12,r1, LSL #2
; We want to do
; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
; = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
@@ -502,20 +502,22 @@
; for a negation.
; Stall x3
VABS.S16 Q9, Q0 ; Q9 = |f| in U16s 1,4
+ PLD [r12,-r1]
VSHR.S16 Q0, Q0, #15 ; Q0 = -1 or 0 according to sign 1,3
- ; Stall x2
+ PLD [r12]
VQSUB.U16 Q10,Q15,Q9 ; Q10= MAX(2L-|f|,0) in U16s 1,4
+ PLD [r12,r1]
VMOVL.U8 Q1, D2 ; Q2 = __UU__QQ__MM__II__EE__AA__66__22 2,3
- ; Stall x2
+ PLD [r12,r1,LSL #1]
VMIN.U16 Q9, Q10,Q9 ; Q9 = MIN(|f|,MAX(2L-|f|)) 1,4
+ ADD r12,r12,r1, LSL #2
; Now we need to correct for the sign of f.
; For negative elements of Q0, we want to subtract the appropriate
; element of Q9. For positive elements we want to add them. No NEON
; instruction exists to do this, so we need to negate the negative
; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
- ; Stall x3
VADD.S16 Q9, Q9, Q0 ; 1,3
- ; Stall x2
+ PLD [r12,-r1]
VEOR.S16 Q9, Q9, Q0 ; Q9 = real value of f 1,3
; Bah. No VRSBW.U8
; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
@@ -523,6 +525,7 @@
VSUB.S16 Q1, Q1, Q9 ; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22 1,3
VQMOVUN.S16 D4, Q2 ; D4 = TTPPLLHHDD995511 1,1
VQMOVUN.S16 D2, Q1 ; D2 = UUQQMMIIEEAA6622 1,1
+ SUB r12,r0, #1
VTRN.8 D4, D2 ; D4 = QQPPIIHHAA992211 D2 = MMLLEEDD6655 1,1
VST1.16 {D4[0]}, [r12], r1
VST1.16 {D2[0]}, [r12], r1
@@ -545,17 +548,16 @@
VLD1.64 {D0}, [r12@64], r1 ; D0 = SSOOKKGGCC884400 2,1
VLD1.64 {D2}, [r12@64], r1 ; D2 = TTPPLLHHDD995511 2,1
VLD1.64 {D4}, [r12@64], r1 ; D4 = UUQQMMIIEEAA6622 2,1
- VLD1.64 {D6}, [r12@64], r1 ; D6 = VVRRNNJJFFBB7733 2,1
+ VLD1.64 {D6}, [r12@64] ; D6 = VVRRNNJJFFBB7733 2,1
VSUBL.U8 Q8, D4, D2 ; Q8 = 22 - 11 in S16s 1,3
VSUBL.U8 Q0, D0, D6 ; Q0 = 00 - 33 in S16s 1,3
- ; Stall
+ ADD r12, #8
VADD.S16 Q0, Q0, Q8 ; 1,3
- SUB r12, r0, r1
- ; Stall
+ PLD [r12]
VADD.S16 Q0, Q0, Q8 ; 1,3
- ; Stall x2
+ PLD [r12,r1]
VADD.S16 Q0, Q0, Q8 ; Q0 = [0-3]+3*[2-1] 1,3
- ; Stall x2
+ SUB r12, r0, r1
VRSHR.S16 Q0, Q0, #3 ; Q0 = f = ([0-3]+3*[2-1]+4)>>3 1,4
; We want to do
; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
More information about the commits
mailing list