[xiph-commits] r17432 - experimental/derf/theora-ptalarbvorm/lib/arm
tterribe at svn.xiph.org
tterribe at svn.xiph.org
Wed Sep 22 21:03:25 PDT 2010
Author: tterribe
Date: 2010-09-22 21:03:25 -0700 (Wed, 22 Sep 2010)
New Revision: 17432
Modified:
experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s
Log:
Do even more aggressive preloading in oc_frag_copy_list_neon.
We now pre-load every fragment.
This gives another 1.5% speed-up at 720p on a Cortex A8, and the code is
simpler as well.
Modified: experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s 2010-09-23 03:56:17 UTC (rev 17431)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s 2010-09-23 04:03:25 UTC (rev 17432)
@@ -444,13 +444,6 @@
EXPORT oc_frag_recon_inter_neon
EXPORT oc_frag_recon_inter2_neon
-; The pre-loading here was an experiment.
-; The result was disappointing, but it _was_ slightly faster, so it might as
-; well be committed.
-; This value gives an 0.5% speed-up at 720p on a Cortex A8.
-; Must be 2**n-1 for n>0.
-OC_FRAG_PRELOAD_DIST_NEON * 3
-
oc_frag_copy_list_neon
; r0 = _dst_frame
; r1 = _src_frame
@@ -467,88 +460,44 @@
; Stall (2 on Xscale)
LDR r6, [r14,r6, LSL #2] ; r6 = _frag_buf_offs[_fragis[fragii]]
; Stall (on XScale)
+ MOV r7, r6 ; Guarantee PLD points somewhere valid.
ofcl_neon_lp
ADD r4, r1, r6
VLD1.64 {D0}, [r4@64], r2
+ ADD r5, r0, r6
VLD1.64 {D1}, [r4@64], r2
- TST r12, #OC_FRAG_PRELOAD_DIST_NEON
+ SUBS r12, r12, #1
VLD1.64 {D2}, [r4@64], r2
+ LDRGT r6, [r3,#4]! ; r6 = _fragis[fragii]
VLD1.64 {D3}, [r4@64], r2
- BEQ ofcl_neon_preload
+ LDRGT r6, [r14,r6, LSL #2] ; r6 = _frag_buf_offs[_fragis[fragii]]
VLD1.64 {D4}, [r4@64], r2
+ ADDGT r7, r1, r6
VLD1.64 {D5}, [r4@64], r2
- ADD r5, r0, r6
+ PLD [r7]
VLD1.64 {D6}, [r4@64], r2
+ PLD [r7, r2]
VLD1.64 {D7}, [r4@64]
- SUBS r12, r12, #1
+ PLD [r7, r2, LSL #1]
VST1.64 {D0}, [r5@64], r2
+ ADDGT r7, r7, r2, LSL #2
VST1.64 {D1}, [r5@64], r2
- LDRGT r6, [r3,#4]! ; r6 = _fragis[fragii]
+ PLD [r7, -r2]
VST1.64 {D2}, [r5@64], r2
+ PLD [r7]
VST1.64 {D3}, [r5@64], r2
- LDRGT r6, [r14,r6, LSL #2] ; r6 = _frag_buf_offs[_fragis[fragii]]
+ PLD [r7, r2]
VST1.64 {D4}, [r5@64], r2
+ PLD [r7, r2, LSL #1]
VST1.64 {D5}, [r5@64], r2
+ ADDGT r7, r7, r2, LSL #2
VST1.64 {D6}, [r5@64], r2
+ PLD [r7, -r2]
VST1.64 {D7}, [r5@64]
BGT ofcl_neon_lp
ofcl_neon_end
LDMFD r13!,{r4-r7,PC}
-ofcl_neon_preload
- ADD r5, r0, r6
- VLD1.64 {D4}, [r4@64], r2
- LDR r6, [r3,#OC_FRAG_PRELOAD_DIST_NEON*4] ; r6 = _fragis[fragii]
- VLD1.64 {D5}, [r4@64], r2
- LDR r6, [r14,r6, LSL #2] ; r6 = _frag_buf_offs[_fragis[fragii]]
- VLD1.64 {D6}, [r4@64], r2
- ADD r7, r1, r6
- VLD1.64 {D7}, [r4@64], r2
- PLD [r7]
- VST1.64 {D0}, [r5@64], r2
- PLD [r7, r2]
- VST1.64 {D1}, [r5@64], r2
- PLD [r7, r2,LSL #1]
- VST1.64 {D2}, [r5@64], r2
- ADD r7, r7, r2, LSL #2
- VST1.64 {D3}, [r5@64], r2
- PLD [r7, -r2]
- VST1.64 {D4}, [r5@64], r2
- PLD [r7]
- VST1.64 {D5}, [r5@64], r2
- LDR r6, [r3,#4]! ; r6 = _fragis[fragii]
- VST1.64 {D6}, [r5@64], r2
- LDR r6, [r14,r6, LSL #2] ; r6 = _frag_buf_offs[_fragis[fragii]]
- VST1.64 {D7}, [r5@64], r2
- ADD r4, r1, r6
- VLD1.64 {D0}, [r4@64], r2
- PLD [r7, r2]
- VLD1.64 {D1}, [r4@64], r2
- PLD [r7, r2,LSL #1]
- VLD1.64 {D2}, [r4@64], r2
- ADD r7, r7, r2, LSL #2
- VLD1.64 {D3}, [r4@64], r2
- PLD [r7, -r2]
- VLD1.64 {D4}, [r4@64], r2
- VLD1.64 {D5}, [r4@64], r2
- ADD r5, r0, r6
- VLD1.64 {D6}, [r4@64], r2
- VLD1.64 {D7}, [r4@64], r2
- SUBS r12, r12, #2
- VST1.64 {D0}, [r5@64], r2
- VST1.64 {D1}, [r5@64], r2
- LDRGT r6, [r3,#4]! ; r6 = _fragis[fragii]
- VST1.64 {D2}, [r5@64], r2
- VST1.64 {D3}, [r5@64], r2
- LDRGT r6, [r14,r6, LSL #2] ; r6 = _frag_buf_offs[_fragis[fragii]]
- VST1.64 {D4}, [r5@64], r2
- VST1.64 {D5}, [r5@64], r2
- VST1.64 {D6}, [r5@64], r2
- VST1.64 {D7}, [r5@64], r2
- BGT ofcl_neon_lp
-; This should be impossible to reach.
- LDMFD r13!,{r4-r7,PC}
-
oc_frag_recon_intra_neon
; r0 = unsigned char *_dst
; r1 = int _ystride
More information about the commits
mailing list