[xiph-commits] r17432 - experimental/derf/theora-ptalarbvorm/lib/arm

tterribe at svn.xiph.org tterribe at svn.xiph.org
Wed Sep 22 21:03:25 PDT 2010


Author: tterribe
Date: 2010-09-22 21:03:25 -0700 (Wed, 22 Sep 2010)
New Revision: 17432

Modified:
   experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s
Log:
Do even more aggressive preloading in oc_frag_copy_list_neon.

We now pre-load every fragment.
This gives another 1.5% speed-up at 720p on an A8, and is simpler code,
 besides.


Modified: experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s	2010-09-23 03:56:17 UTC (rev 17431)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s	2010-09-23 04:03:25 UTC (rev 17432)
@@ -444,13 +444,6 @@
 	EXPORT	oc_frag_recon_inter_neon
 	EXPORT	oc_frag_recon_inter2_neon
 
-; The pre-loading here was an experiment.
-; The result was disappointing, but it _was_ slightly faster, so it might as
-;  well be committed.
-; This value gives an 0.5% speed-up at 720p on a Cortex A8.
-; Must be 2**n-1 for n>0.
-OC_FRAG_PRELOAD_DIST_NEON	*	3
-
 oc_frag_copy_list_neon
 	; r0 = _dst_frame
 	; r1 = _src_frame
@@ -467,88 +460,44 @@
 	; Stall (2 on Xscale)
 	LDR	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
 	; Stall (on XScale)
+	MOV	r7, r6			; Guarantee PLD points somewhere valid.
 ofcl_neon_lp
 	ADD	r4, r1, r6
 	VLD1.64	{D0}, [r4@64], r2
+	ADD	r5, r0, r6
 	VLD1.64	{D1}, [r4@64], r2
-	TST	r12, #OC_FRAG_PRELOAD_DIST_NEON
+	SUBS	r12, r12, #1
 	VLD1.64	{D2}, [r4@64], r2
+	LDRGT	r6, [r3,#4]!		; r6 = _fragis[fragii]
 	VLD1.64	{D3}, [r4@64], r2
-	BEQ	ofcl_neon_preload
+	LDRGT	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
 	VLD1.64	{D4}, [r4@64], r2
+	ADDGT	r7, r1, r6
 	VLD1.64	{D5}, [r4@64], r2
-	ADD	r5, r0, r6
+	PLD	[r7]
 	VLD1.64	{D6}, [r4@64], r2
+	PLD	[r7, r2]
 	VLD1.64	{D7}, [r4@64]
-	SUBS	r12, r12, #1
+	PLD	[r7, r2, LSL #1]
 	VST1.64	{D0}, [r5@64], r2
+	ADDGT	r7, r7, r2, LSL #2
 	VST1.64	{D1}, [r5@64], r2
-	LDRGT	r6, [r3,#4]!		; r6 = _fragis[fragii]
+	PLD	[r7, -r2]
 	VST1.64	{D2}, [r5@64], r2
+	PLD	[r7]
 	VST1.64	{D3}, [r5@64], r2
-	LDRGT	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
+	PLD	[r7, r2]
 	VST1.64	{D4}, [r5@64], r2
+	PLD	[r7, r2, LSL #1]
 	VST1.64	{D5}, [r5@64], r2
+	ADDGT	r7, r7, r2, LSL #2
 	VST1.64	{D6}, [r5@64], r2
+	PLD	[r7, -r2]
 	VST1.64	{D7}, [r5@64]
 	BGT	ofcl_neon_lp
 ofcl_neon_end
 	LDMFD	r13!,{r4-r7,PC}
 
-ofcl_neon_preload
-	ADD	r5, r0, r6
-	VLD1.64	{D4}, [r4@64], r2
-	LDR	r6, [r3,#OC_FRAG_PRELOAD_DIST_NEON*4]	; r6 = _fragis[fragii]
-	VLD1.64	{D5}, [r4@64], r2
-	LDR	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
-	VLD1.64	{D6}, [r4@64], r2
-	ADD	r7, r1, r6
-	VLD1.64	{D7}, [r4@64], r2
-	PLD	[r7]
-	VST1.64	{D0}, [r5@64], r2
-	PLD	[r7, r2]
-	VST1.64	{D1}, [r5@64], r2
-	PLD	[r7, r2,LSL #1]
-	VST1.64	{D2}, [r5@64], r2
-	ADD	r7, r7, r2, LSL #2
-	VST1.64	{D3}, [r5@64], r2
-	PLD	[r7, -r2]
-	VST1.64	{D4}, [r5@64], r2
-	PLD	[r7]
-	VST1.64	{D5}, [r5@64], r2
-	LDR	r6, [r3,#4]!		; r6 = _fragis[fragii]
-	VST1.64	{D6}, [r5@64], r2
-	LDR	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
-	VST1.64	{D7}, [r5@64], r2
-	ADD	r4, r1, r6
-	VLD1.64	{D0}, [r4@64], r2
-	PLD	[r7, r2]
-	VLD1.64	{D1}, [r4@64], r2
-	PLD	[r7, r2,LSL #1]
-	VLD1.64	{D2}, [r4@64], r2
-	ADD	r7, r7, r2, LSL #2
-	VLD1.64	{D3}, [r4@64], r2
-	PLD	[r7, -r2]
-	VLD1.64	{D4}, [r4@64], r2
-	VLD1.64	{D5}, [r4@64], r2
-	ADD	r5, r0, r6
-	VLD1.64	{D6}, [r4@64], r2
-	VLD1.64	{D7}, [r4@64], r2
-	SUBS	r12, r12, #2
-	VST1.64	{D0}, [r5@64], r2
-	VST1.64	{D1}, [r5@64], r2
-	LDRGT	r6, [r3,#4]!		; r6 = _fragis[fragii]
-	VST1.64	{D2}, [r5@64], r2
-	VST1.64	{D3}, [r5@64], r2
-	LDRGT	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
-	VST1.64	{D4}, [r5@64], r2
-	VST1.64	{D5}, [r5@64], r2
-	VST1.64	{D6}, [r5@64], r2
-	VST1.64	{D7}, [r5@64], r2
-	BGT	ofcl_neon_lp
-; This should be impossible to reach.
-	LDMFD	r13!,{r4-r7,PC}
-
 oc_frag_recon_intra_neon
 	; r0 =       unsigned char *_dst
 	; r1 =       int            _ystride



More information about the commits mailing list