[xiph-commits] r17415 - experimental/derf/theora-ptalarbvorm/lib/arm

tterribe at svn.xiph.org tterribe at svn.xiph.org
Tue Sep 21 23:42:10 PDT 2010


Author: tterribe
Date: 2010-09-21 23:42:10 -0700 (Tue, 21 Sep 2010)
New Revision: 17415

Modified:
   experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s
Log:
A simplistic experiment with preloading on ARM.

This didn't work very well, but it did give a 0.5% improvement at 720p, so
 it's probably worth committing.


Modified: experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s	2010-09-22 05:24:53 UTC (rev 17414)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s	2010-09-22 06:42:10 UTC (rev 17415)
@@ -444,6 +444,13 @@
 	EXPORT	oc_frag_recon_inter_neon
 	EXPORT	oc_frag_recon_inter2_neon
 
+; The pre-loading here was an experiment.
+; The result was disappointing, but it _was_ slightly faster, so it might as
+;  well be committed.
+; This value gives a 0.5% speed-up at 720p on a Cortex A8.
+; Must be 2**n-1 for n>0 (it is used as a TST bit mask on the fragment count).
+OC_FRAG_PRELOAD_DIST_NEON	*	3
+
 oc_frag_copy_list_neon
 	; r0 = _dst_frame
 	; r1 = _src_frame
@@ -452,10 +459,10 @@
 	; <> = _nfragis
 	; <> = _frag_buf_offs
 	LDR	r12,[r13]		; r12 = _nfragis
-	STMFD	r13!,{r4-r6,r14}
-	SUBS	r12, r12, #1
-	LDRGE	r6, [r3],#4		; r6 = _fragis[fragii]
-	LDRGE	r14,[r13,#4*5]		; r14 = _frag_buf_offs
+	STMFD	r13!,{r4-r7,r14}
+	CMP	r12, #1
+	LDRGE	r6, [r3]		; r6 = _fragis[fragii]
+	LDRGE	r14,[r13,#4*6]		; r14 = _frag_buf_offs
 	BLT	ofcl_neon_end
 	; Stall (2 on Xscale)
 	LDR	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
@@ -464,27 +471,83 @@
 	ADD	r4, r1, r6
 	VLD1.64	{D0}, [r4@64], r2
 	VLD1.64	{D1}, [r4@64], r2
+	TST	r12, #OC_FRAG_PRELOAD_DIST_NEON
 	VLD1.64	{D2}, [r4@64], r2
 	VLD1.64	{D3}, [r4@64], r2
-	ADD	r5, r6, r0
-	VLD1.64	{D4}, [r4], r2
+	BEQ	ofcl_neon_preload
+	VLD1.64	{D4}, [r4@64], r2
+	VLD1.64	{D5}, [r4@64], r2
+	ADD	r5, r0, r6
+	VLD1.64	{D6}, [r4@64], r2
+	VLD1.64	{D7}, [r4@64]
 	SUBS	r12, r12, #1
+	VST1.64	{D0}, [r5@64], r2
+	VST1.64	{D1}, [r5@64], r2
+	LDRGT	r6, [r3,#4]!		; r6 = _fragis[fragii]
+	VST1.64	{D2}, [r5@64], r2
+	VST1.64	{D3}, [r5@64], r2
+	LDRGT	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
+	VST1.64	{D4}, [r5@64], r2
+	VST1.64	{D5}, [r5@64], r2
+	VST1.64	{D6}, [r5@64], r2
+	VST1.64	{D7}, [r5@64]
+	BGT	ofcl_neon_lp
+ofcl_neon_end
+	LDMFD	r13!,{r4-r7,PC}
+
+ofcl_neon_preload
+	ADD	r5, r0, r6
+	VLD1.64	{D4}, [r4@64], r2
+	LDR	r6, [r3,#OC_FRAG_PRELOAD_DIST_NEON*4]	; r6 = _fragis[fragii]
 	VLD1.64	{D5}, [r4@64], r2
+	LDR	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
 	VLD1.64	{D6}, [r4@64], r2
+	ADD	r7, r1, r6
 	VLD1.64	{D7}, [r4@64], r2
+	PLD	[r7]
 	VST1.64	{D0}, [r5@64], r2
-	LDRGE	r6, [r3],#4		; r6 = _fragis[fragii]
+	PLD	[r7, r2]
 	VST1.64	{D1}, [r5@64], r2
+	PLD	[r7, r2,LSL #1]
 	VST1.64	{D2}, [r5@64], r2
+	ADD	r7, r7, r2, LSL #2
 	VST1.64	{D3}, [r5@64], r2
+	PLD	[r7, -r2]
 	VST1.64	{D4}, [r5@64], r2
-	LDRGE	r6, [r14,r6, LSL #2]	; r5 = _frag_buf_offs[_fragis[fragii]]
+	PLD	[r7]
 	VST1.64	{D5}, [r5@64], r2
+	LDR	r6, [r3,#4]!		; r6 = _fragis[fragii]
 	VST1.64	{D6}, [r5@64], r2
+	LDR	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
 	VST1.64	{D7}, [r5@64], r2
-	BGE	ofcl_neon_lp
-ofcl_neon_end
-	LDMFD	r13!,{r4-r6,PC}
+	ADD	r4, r1, r6
+	VLD1.64	{D0}, [r4@64], r2
+	PLD	[r7, r2]
+	VLD1.64	{D1}, [r4@64], r2
+	PLD	[r7, r2,LSL #1]
+	VLD1.64	{D2}, [r4@64], r2
+	ADD	r7, r7, r2, LSL #2
+	VLD1.64	{D3}, [r4@64], r2
+	PLD	[r7, -r2]
+	VLD1.64	{D4}, [r4@64], r2
+	VLD1.64	{D5}, [r4@64], r2
+	ADD	r5, r0, r6
+	VLD1.64	{D6}, [r4@64], r2
+	VLD1.64	{D7}, [r4@64], r2
+	SUBS	r12, r12, #2
+	VST1.64	{D0}, [r5@64], r2
+	VST1.64	{D1}, [r5@64], r2
+	LDRGT	r6, [r3,#4]!		; r6 = _fragis[fragii]
+	VST1.64	{D2}, [r5@64], r2
+	VST1.64	{D3}, [r5@64], r2
+	LDRGT	r6, [r14,r6, LSL #2]	; r6 = _frag_buf_offs[_fragis[fragii]]
+	VST1.64	{D4}, [r5@64], r2
+	VST1.64	{D5}, [r5@64], r2
+	VST1.64	{D6}, [r5@64], r2
+	VST1.64	{D7}, [r5@64], r2
+	BGT	ofcl_neon_lp
+; This should be impossible to reach.
+	LDMFD	r13!,{r4-r7,PC}
 
 oc_frag_recon_intra_neon
 	; r0 =       unsigned char *_dst



More information about the commits mailing list