[xiph-commits] r17481 - in trunk/theora/lib: . arm

tterribe at svn.xiph.org tterribe at svn.xiph.org
Sun Oct 3 15:49:42 PDT 2010


Author: tterribe
Date: 2010-10-03 15:49:42 -0700 (Sun, 03 Oct 2010)
New Revision: 17481

Modified:
   trunk/theora/lib/arm/armbits.s
   trunk/theora/lib/arm/armfrag.s
   trunk/theora/lib/arm/armidct.s
   trunk/theora/lib/arm/armloop.s
   trunk/theora/lib/decode.c
Log:
Add PROC/ENDP markings to the ARM asm (currently ignored by the GNU toolchain).
Also slightly simplify the MB mode and MV decoding.
The new code uses slightly less cache and fewer lookups.


Modified: trunk/theora/lib/arm/armbits.s
===================================================================
--- trunk/theora/lib/arm/armbits.s	2010-10-03 22:48:49 UTC (rev 17480)
+++ trunk/theora/lib/arm/armbits.s	2010-10-03 22:49:42 UTC (rev 17481)
@@ -21,32 +21,33 @@
 	EXPORT oc_pack_read1_arm
 	EXPORT oc_huff_token_decode_arm
 
-oc_pack_read_arm
+oc_pack_read1_arm PROC
 	; r0 = oc_pack_buf *_b
-	; r1 = int          _bits
 	ADD r12,r0,#8
 	LDMIA r12,{r2,r3}      ; r2 = window
 	; Stall...             ; r3 = available
 	; Stall...
-	SUBS r3,r3,r1          ; r3 = available-_bits, available<_bits => LT
-	BLT oc_pack_read_refill
-	RSB r0,r1,#32          ; r0 = 32-_bits
-	MOV r0,r2,LSR r0       ; r0 = window>>32-_bits
-	MOV r2,r2,LSL r1       ; r2 = window<<=_bits
+	SUBS r3,r3,#1          ; r3 = available-1, available<1 => LT
+	BLT oc_pack_read1_refill
+	MOV r0,r2,LSR #31      ; r0 = window>>31
+	MOV r2,r2,LSL #1       ; r2 = window<<=1
 	STMIA r12,{r2,r3}      ; window = r2
 	                       ; available = r3
 	MOV PC,r14
+	ENDP
 
-oc_pack_read1_arm
+oc_pack_read_arm PROC
 	; r0 = oc_pack_buf *_b
+	; r1 = int          _bits
 	ADD r12,r0,#8
 	LDMIA r12,{r2,r3}      ; r2 = window
 	; Stall...             ; r3 = available
 	; Stall...
-	SUBS r3,r3,#1          ; r3 = available-1, available<1 => LT
-	BLT oc_pack_read1_refill
-	MOV r0,r2,LSR #31      ; r0 = window>>31
-	MOV r2,r2,LSL #1       ; r2 = window<<=1
+	SUBS r3,r3,r1          ; r3 = available-_bits, available<_bits => LT
+	BLT oc_pack_read_refill
+	RSB r0,r1,#32          ; r0 = 32-_bits
+	MOV r0,r2,LSR r0       ; r0 = window>>32-_bits
+	MOV r2,r2,LSL r1       ; r2 = window<<=_bits
 	STMIA r12,{r2,r3}      ; window = r2
 	                       ; available = r3
 	MOV PC,r14
@@ -117,10 +118,11 @@
 	STMIA r12,{r2,r3}      ; window = r2
 	                       ; available = r3
 	LDMFD r13!,{r10,r11,PC}
+	ENDP
 
 
 
-oc_huff_token_decode_arm
+oc_huff_token_decode_arm PROC
 	; r0 = oc_pack_buf       *_b
 	; r1 = const ogg_int16_t *_tree
 	STMFD r13!,{r4,r5,r10,r14}
@@ -223,5 +225,6 @@
 	                       ; available = r5
 	AND r0,r14,#255        ; r0 = node&255
 	LDMFD r13!,{r4,r5,r10,pc}
+	ENDP
 
 	END

Modified: trunk/theora/lib/arm/armfrag.s
===================================================================
--- trunk/theora/lib/arm/armfrag.s	2010-10-03 22:48:49 UTC (rev 17480)
+++ trunk/theora/lib/arm/armfrag.s	2010-10-03 22:49:42 UTC (rev 17481)
@@ -24,7 +24,7 @@
 	EXPORT	oc_frag_recon_inter_arm
 	EXPORT	oc_frag_recon_inter2_arm
 
-oc_frag_copy_list_arm
+oc_frag_copy_list_arm PROC
 	; r0 = _dst_frame
 	; r1 = _src_frame
 	; r2 = _ystride
@@ -132,8 +132,9 @@
 	SUBS	r14,r14,#1
 	BGT	ofrintra_lp_arm
 	LDMFD	r13!,{r4,r5,PC}
+	ENDP
 
-oc_frag_recon_inter_arm
+oc_frag_recon_inter_arm PROC
 	; r0 =       unsigned char *dst
 	; r1 = const unsigned char *src
 	; r2 =       int            ystride
@@ -194,8 +195,9 @@
 	SUBS	r9, r9, #1
 	BGT	ofrinter_lp_arm
 	LDMFD	r13!,{r5,r9-r11,PC}
+	ENDP
 
-oc_frag_recon_inter2_arm
+oc_frag_recon_inter2_arm PROC
 	; r0 =       unsigned char *dst
 	; r1 = const unsigned char *src1
 	; r2 = const unsigned char *src2
@@ -274,11 +276,12 @@
 	SUBS	r14,r14,#1
 	BGT	ofrinter2_lp_arm
 	LDMFD	r13!,{r4-r8,PC}
+	ENDP
 
  [ OC_ARM_ASM_EDSP
 	EXPORT	oc_frag_copy_list_edsp
 
-oc_frag_copy_list_edsp
+oc_frag_copy_list_edsp PROC
 	; r0 = _dst_frame
 	; r1 = _src_frame
 	; r2 = _ystride
@@ -320,6 +323,7 @@
 	BGE	ofcl_edsp_lp
 ofcl_edsp_end
 	LDMFD	r13!,{r4-r11,PC}
+	ENDP
  ]
 
  [ OC_ARM_ASM_MEDIA
@@ -327,7 +331,7 @@
 	EXPORT	oc_frag_recon_inter_v6
 	EXPORT	oc_frag_recon_inter2_v6
 
-oc_frag_recon_intra_v6
+oc_frag_recon_intra_v6 PROC
 	; r0 =       unsigned char *_dst
 	; r1 =       int            _ystride
 	; r2 = const ogg_int16_t    _residue[64]
@@ -356,8 +360,9 @@
 	STRD	r2, [r0], r1
 	BGT	ofrintra_v6_lp
 	LDMFD	r13!,{r4-r6,PC}
+	ENDP
 
-oc_frag_recon_inter_v6
+oc_frag_recon_inter_v6 PROC
 	; r0 =       unsigned char *_dst
 	; r1 = const unsigned char *_src
 	; r2 =       int            _ystride
@@ -395,8 +400,9 @@
 	STRD	r4, [r0], r2
 	BGT	ofrinter_v6_lp
 	LDMFD	r13!,{r4-r7,PC}
+	ENDP
 
-oc_frag_recon_inter2_v6
+oc_frag_recon_inter2_v6 PROC
 	; r0 =       unsigned char *_dst
 	; r1 = const unsigned char *_src1
 	; r2 = const unsigned char *_src2
@@ -436,6 +442,7 @@
 	STRD	r8, [r0], r3
 	BGT	ofrinter2_v6_lp
 	LDMFD	r13!,{r4-r9,PC}
+	ENDP
  ]
 
  [ OC_ARM_ASM_NEON
@@ -444,7 +451,7 @@
 	EXPORT	oc_frag_recon_inter_neon
 	EXPORT	oc_frag_recon_inter2_neon
 
-oc_frag_copy_list_neon
+oc_frag_copy_list_neon PROC
 	; r0 = _dst_frame
 	; r1 = _src_frame
 	; r2 = _ystride
@@ -497,8 +504,9 @@
 	BGT	ofcl_neon_lp
 ofcl_neon_end
 	LDMFD	r13!,{r4-r7,PC}
+	ENDP
 
-oc_frag_recon_intra_neon
+oc_frag_recon_intra_neon PROC
 	; r0 =       unsigned char *_dst
 	; r1 =       int            _ystride
 	; r2 = const ogg_int16_t    _residue[64]
@@ -530,8 +538,9 @@
 	VST1.64	{D22},[r0@64], r1
 	VST1.64	{D23},[r0@64], r1
 	MOV	PC,R14
+	ENDP
 
-oc_frag_recon_inter_neon
+oc_frag_recon_inter_neon PROC
 	; r0 =       unsigned char *_dst
 	; r1 = const unsigned char *_src
 	; r2 =       int            _ystride
@@ -578,8 +587,9 @@
 	VST1.64	{D22},[r0@64], r2
 	VST1.64	{D23},[r0@64], r2
 	MOV	PC,R14
+	ENDP
 
-oc_frag_recon_inter2_neon
+oc_frag_recon_inter2_neon PROC
 	; r0 =       unsigned char *_dst
 	; r1 = const unsigned char *_src1
 	; r2 = const unsigned char *_src2
@@ -640,6 +650,7 @@
 	VST1.64	{D22},[r0@64], r3
 	VST1.64	{D23},[r0@64], r3
 	MOV	PC,R14
+	ENDP
  ]
 
 	END

Modified: trunk/theora/lib/arm/armidct.s
===================================================================
--- trunk/theora/lib/arm/armidct.s	2010-10-03 22:48:49 UTC (rev 17480)
+++ trunk/theora/lib/arm/armidct.s	2010-10-03 22:49:42 UTC (rev 17481)
@@ -21,7 +21,7 @@
 	EXPORT	oc_idct8x8_1_arm
 	EXPORT	oc_idct8x8_arm
 
-oc_idct8x8_1_arm
+oc_idct8x8_1_arm PROC
 	; r0 = ogg_int16_t  *_y
 	; r1 = ogg_uint16_t  _dc
 	ORR	r1, r1, r1, LSL #16
@@ -37,8 +37,9 @@
 	STMIA	r0!,{r1,r2,r3,r12}
 	STMIA	r0!,{r1,r2,r3,r12}
 	MOV	PC, r14
+	ENDP
 
-oc_idct8x8_arm
+oc_idct8x8_arm PROC
 	; r0 = ogg_int16_t *_y
 	; r1 = ogg_int16_t *_x
 	; r2 = int          _last_zzi
@@ -92,8 +93,9 @@
 	BL	idct8core_down_arm
 	ADD	r13,r13,#64*2
 	LDMFD	r13!,{r4-r11,PC}
+	ENDP
 
-oc_idct8x8_10_arm
+oc_idct8x8_10_arm PROC
 	STMFD	r13!,{r4-r11,r14}
 	SUB	r13,r13,#64*2
 ; Row transforms
@@ -128,8 +130,9 @@
 	BL	idct4core_down_arm
 	ADD	r13,r13,#64*2
 	LDMFD	r13!,{r4-r11,PC}
+	ENDP
 
-oc_idct8x8_6_arm
+oc_idct8x8_6_arm PROC
 	STMFD	r13!,{r4-r7,r9-r11,r14}
 	SUB	r13,r13,#64*2
 ; Row transforms
@@ -161,8 +164,9 @@
 	BL	idct3core_down_arm
 	ADD	r13,r13,#64*2
 	LDMFD	r13!,{r4-r7,r9-r11,PC}
+	ENDP
 
-oc_idct8x8_3_arm
+oc_idct8x8_3_arm PROC
 	STMFD	r13!,{r4-r7,r9-r11,r14}
 	SUB	r13,r13,#64*2
 ; Row transforms
@@ -189,8 +193,9 @@
 	BL	idct2core_down_arm
 	ADD	r13,r13,#64*2
 	LDMFD	r13!,{r4-r7,r9-r11,PC}
+	ENDP
 
-idct1core_arm
+idct1core_arm PROC
 	; r0 =       ogg_int16_t *_y (destination)
 	; r1 = const ogg_int16_t *_x (source)
 	LDRSH	r3, [r1], #16
@@ -208,8 +213,9 @@
 	STRH	r3, [r0, #94]
 	STRH	r3, [r0, #110]
 	MOV	PC,R14
+	ENDP
 
-idct2core_arm
+idct2core_arm PROC
 	; r0 =       ogg_int16_t *_y (destination)
 	; r1 = const ogg_int16_t *_x (source)
 	LDRSH	r9, [r1], #16		; r9 = x[0]
@@ -244,8 +250,9 @@
 	STRH	r12,[r0, #94]		; y[6] = t[0]-t[6]
 	STRH	r11,[r0, #110]		; y[7] = t[0]-t[7]
 	MOV	PC,r14
+	ENDP
 
-idct2core_down_arm
+idct2core_down_arm PROC
 	; r0 =       ogg_int16_t *_y (destination)
 	; r1 = const ogg_int16_t *_x (source)
 	LDRSH	r9, [r1], #16		; r9 = x[0]
@@ -292,8 +299,9 @@
 	STRH	r12,[r0, #94]		; y[6] = t[0]-t[6]
 	STRH	r11,[r0, #110]		; y[7] = t[0]-t[7]
 	MOV	PC,r14
+	ENDP
 
-idct3core_arm
+idct3core_arm PROC
 	LDRSH	r9, [r1], #16		; r9 = x[0]
 	LDR	r12,OC_C4S4		; r12= OC_C4S4
 	LDRSH	r3, [r1, #-12]		; r3 = x[2]
@@ -337,8 +345,9 @@
 	STRH	r5, [r0, #94]		; y[6] = t[1]-t2[6]
 	STRH	r11,[r0, #110]		; y[7] = t2[0]-t[7]
 	MOV	PC,R14
+	ENDP
 
-idct3core_down_arm
+idct3core_down_arm PROC
 	LDRSH	r9, [r1], #16		; r9 = x[0]
 	LDR	r12,OC_C4S4		; r12= OC_C4S4
 	LDRSH	r3, [r1, #-12]		; r3 = x[2]
@@ -394,8 +403,9 @@
 	STRH	r10,[r0, #94]		; y[6] = t[1]-t2[6]
 	STRH	r3, [r0, #110]		; y[7] = t2[0]-t[7]
 	MOV	PC,R14
+	ENDP
 
-idct4core_arm
+idct4core_arm PROC
 	; r0 =       ogg_int16_t *_y (destination)
 	; r1 = const ogg_int16_t *_x (source)
 	LDRSH	r9, [r1], #16		; r9 = x[0]
@@ -451,8 +461,9 @@
 	STRH	r6, [r0, #94]		; y[6] = t[1]-t2[6]
 	STRH	r11, [r0, #110]		; y[7] = t2[0]-t[7]
 	MOV	PC,r14
+	ENDP
 
-idct4core_down_arm
+idct4core_down_arm PROC
 	; r0 =       ogg_int16_t *_y (destination)
 	; r1 = const ogg_int16_t *_x (source)
 	LDRSH	r9, [r1], #16		; r9 = x[0]
@@ -520,8 +531,9 @@
 	STRH	r6, [r0, #94]		; y[6] = t[1]-t2[6]
 	STRH	r11,[r0, #110]		; y[7] = t2[0]-t[7]
 	MOV	PC,r14
+	ENDP
 
-idct8core_arm
+idct8core_arm PROC
 	; r0 =       ogg_int16_t *_y (destination)
 	; r1 = const ogg_int16_t *_x (source)
 	LDRSH	r2, [r1],#16		; r2 = x[0]
@@ -621,8 +633,9 @@
 	STRH	r10,[r0, #94]		; y[6] = t[1]-t[6]
 	STRH	r8, [r0, #110]		; y[7] = t[0]-t[7]
 	LDMFD	r13!,{r1,PC}
+	ENDP
 
-idct8core_down_arm
+idct8core_down_arm PROC
 	; r0 =       ogg_int16_t *_y (destination)
 	; r1 = const ogg_int16_t *_x (source)
 	LDRSH	r2, [r1],#16		; r2 = x[0]
@@ -735,12 +748,13 @@
 	STRH	r10,[r0, #94]		; y[6] = t[1]-t[6]
 	STRH	r8, [r0, #110]		; y[7] = t[0]-t[7]
 	LDMFD	r13!,{r1,PC}
+	ENDP
 
  [ OC_ARM_ASM_MEDIA
 	EXPORT	oc_idct8x8_1_v6
 	EXPORT	oc_idct8x8_v6
 
-oc_idct8x8_1_v6
+oc_idct8x8_1_v6 PROC
 	; r0 = ogg_int16_t  *_y
 	; r1 = ogg_uint16_t  _dc
 	ORR	r2, r1, r1, LSL #16
@@ -762,8 +776,9 @@
 	STRD	r2, [r0], #8
 	STRD	r2, [r0], #8
 	MOV	PC, r14
+	ENDP
 
-oc_idct8x8_v6
+oc_idct8x8_v6 PROC
 	; r0 = ogg_int16_t *_y
 	; r1 = ogg_int16_t *_x
 	; r2 = int          _last_zzi
@@ -815,8 +830,9 @@
 	BL	idct8_8core_down_v6
 	ADD	r13,r13,#64*2
 	LDMFD	r13!,{r4-r11,PC}
+	ENDP
 
-oc_idct8x8_10_v6
+oc_idct8x8_10_v6 PROC
 	STMFD	r13!,{r4-r11,r14}
 	SUB	r13,r13,#64*2+4
 ; Row transforms
@@ -847,8 +863,9 @@
 	BL	idct4_4core_down_v6
 	ADD	r13,r13,#64*2+4
 	LDMFD	r13!,{r4-r11,PC}
+	ENDP
 
-oc_idct8x8_3_v6
+oc_idct8x8_3_v6 PROC
 	STMFD	r13!,{r4-r8,r14}
 	SUB	r13,r13,#64*2
 ; Row transforms
@@ -870,8 +887,9 @@
 	BL	idct2_2core_down_v6
 	ADD	r13,r13,#64*2
 	LDMFD	r13!,{r4-r8,PC}
+	ENDP
 
-idct2_1core_v6
+idct2_1core_v6 PROC
 	; r0 =       ogg_int16_t *_y (destination)
 	; r1 = const ogg_int16_t *_x (source)
 ; Stage 1:
@@ -912,6 +930,7 @@
 	SSUB16	r7, r12,r7		; r7 = t[0]-t[7]
 	STR	r7, [r0, #108]		; y[7<<3] = t[0]-t[7]
 	MOV	PC,r14
+	ENDP
  ]
 
 	ALIGN 8
@@ -931,7 +950,7 @@
 	DCD	46341 ; B505
 
  [ OC_ARM_ASM_MEDIA
-idct2_2core_down_v6
+idct2_2core_down_v6 PROC
 	; r0 =       ogg_int16_t *_y (destination)
 	; r1 = const ogg_int16_t *_x (source)
 ; Stage 1:
@@ -1000,13 +1019,14 @@
 	PKHTB	r3, r3, r7, ASR #20	; r3 = t[0]-t[7]+8>>4
 	STR	r3, [r0, #108]		; y[7<<3] = t[0]-t[7]+8>>4
 	MOV	PC,r14
+	ENDP
 
 ; In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
 ;  pay for increased branch mis-prediction to get here, but in practice it
 ;  doesn't seem to slow anything down to take it out, and it's less code this
 ;  way.
  [ 0
-oc_idct8x8_6_v6
+oc_idct8x8_6_v6 PROC
 	STMFD	r13!,{r4-r8,r10,r11,r14}
 	SUB	r13,r13,#64*2+4
 ; Row transforms
@@ -1035,8 +1055,9 @@
 	BL	idct3_3core_down_v6
 	ADD	r13,r13,#64*2+4
 	LDMFD	r13!,{r4-r8,r10,r11,PC}
+	ENDP
 
-idct1core_v6
+idct1core_v6 PROC
 	; r0 =       ogg_int16_t *_y (destination)
 	; r1 = const ogg_int16_t *_x (source)
 	LDRSH	r3, [r1], #16
@@ -1051,8 +1072,9 @@
 	STRH	r3, [r0, #62]
 	STRH	r3, [r0, #94]
 	MOV	PC,R14
+	ENDP
 
-idct3_2core_v6
+idct3_2core_v6 PROC
 	; r0 =       ogg_int16_t *_y (destination)
 	; r1 = const ogg_int16_t *_x (source)
 ; Stage 1:
@@ -1083,6 +1105,7 @@
 	SMULWT	r10,r11,r4		; r10= t[1,5]=OC_C4S4*t[1,4]>>16
 ; Stage 3:
 	B	idct4_3core_stage3_v6
+	ENDP
 
 ; Another copy so the LDRD offsets are less than +/- 255.
 	ALIGN 8
@@ -1095,7 +1118,7 @@
 OC_C2S6_3_v6
 	DCD	60547 ; EC83
 
-idct3_3core_down_v6
+idct3_3core_down_v6 PROC
 	; r0 =       ogg_int16_t *_y (destination)
 	; r1 = const ogg_int16_t *_x (source)
 ; Stage 1:
@@ -1127,9 +1150,10 @@
 	SMULWT	r10,r11,r4		; r10= t[1,5]=OC_C4S4*t[1,4]>>16
 ; Stage 3:
 	B	idct4_4core_down_stage3_v6
+	ENDP
  ]
 
-idct4_3core_v6
+idct4_3core_v6 PROC
 	; r0 =       ogg_int16_t *_y (destination)
 	; r1 = const ogg_int16_t *_x (source)
 ; Stage 1:
@@ -1196,6 +1220,7 @@
 	SSUB16	r7, r10,r7		; r7 = t[0]-t[7]
 	STR	r7, [r0, #108]		; y[7<<3] = t[0]-t[7]
 	MOV	PC,r14
+	ENDP
 
 ; Another copy so the LDRD offsets are less than +/- 255.
 	ALIGN 8
@@ -1212,7 +1237,7 @@
 OC_C3S5_4_v6
 	DCD	54491 ; D4DB
 
-idct4_4core_down_v6
+idct4_4core_down_v6 PROC
 	; r0 =       ogg_int16_t *_y (destination)
 	; r1 = const ogg_int16_t *_x (source)
 ; Stage 1:
@@ -1259,8 +1284,9 @@
 	PKHBT	r10,r5, r10,LSL #16	; r10= <t[1,5]|t[0,5]>
 	SSUB16	r2, r12,r2		; r2 = t[2]+8=t[0]-t[2]+8
 	B	idct8_8core_down_stage3_5_v6
+	ENDP
 
-idct8_8core_v6
+idct8_8core_v6 PROC
 	STMFD	r13!,{r0,r14}
 ; Stage 1:
 	;5-6 rotation by 3pi/16
@@ -1342,6 +1368,7 @@
 	SSUB16	r2, r8, r2		; r2 = t[2]=t[1]-t[2]
 	LDMFD	r13!,{r0,r14}
 	B	idct4_3core_stage3_5_v6
+	ENDP
 
 ; Another copy so the LDRD offsets are less than +/- 255.
 	ALIGN 8
@@ -1358,7 +1385,7 @@
 OC_C3S5_8_v6
 	DCD	54491 ; D4DB
 
-idct8_8core_down_v6
+idct8_8core_down_v6 PROC
 	STMFD	r13!,{r0,r14}
 ; Stage 1:
 	;5-6 rotation by 3pi/16
@@ -1487,6 +1514,7 @@
 	PKHTB	r10,r10,r7, ASR #20	; r10= t[0]-t[7]+8>>4
 	STR	r10,[r0, #108]		; y[7<<3] = t[0]-t[7]+8>>4
 	MOV	PC,r14
+	ENDP
  ]
 
  [ OC_ARM_ASM_NEON
@@ -1504,7 +1532,7 @@
 	DCW	25080 ; 30FC (C6S2)
 	DCW	12785 ; 31F1 (C7S1)
 
-oc_idct8x8_1_neon
+oc_idct8x8_1_neon PROC
 	; r0 = ogg_int16_t  *_y
 	; r1 = ogg_uint16_t  _dc
 	VDUP.S16	Q0, r1
@@ -1514,8 +1542,9 @@
 	VST1.64		{D0, D1, D2, D3}, [r0@128]!
 	VST1.64		{D0, D1, D2, D3}, [r0@128]
 	MOV	PC, r14
+	ENDP
 
-oc_idct8x8_neon
+oc_idct8x8_neon PROC
 	; r0 = ogg_int16_t *_y
 	; r1 = ogg_int16_t *_x
 	; r2 = int          _last_zzi
@@ -1606,8 +1635,9 @@
 	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
 	VSTMIA		r0, {D16-D31}
 	MOV	PC, r14
+	ENDP
 
-oc_idct8x8_stage123_neon
+oc_idct8x8_stage123_neon PROC
 ; Stages 1 & 2
 	VMULL.S16	Q4, D18,D1[3]
 	VMULL.S16	Q5, D19,D1[3]
@@ -1703,8 +1733,9 @@
 	VSUB.S16	Q10,Q1, Q2	; Q10 = t[2]''=t[1]-t[2]
 	VSUB.S16	Q5, Q6, Q5	; Q5  = t[5]''=t[6]'-t[5]'
 	MOV	PC, r14
+	ENDP
 
-oc_idct8x8_10_neon
+oc_idct8x8_10_neon PROC
 	ADR	r3, OC_IDCT_CONSTS_NEON
 	VLD1.64		{D0,D1},          [r3@128]
 	MOV	r2, r1
@@ -1871,6 +1902,7 @@
 	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
 	VSTMIA		r0, {D16-D31}
 	MOV	PC, r14
+	ENDP
  ]
 
 	END

Modified: trunk/theora/lib/arm/armloop.s
===================================================================
--- trunk/theora/lib/arm/armloop.s	2010-10-03 22:48:49 UTC (rev 17480)
+++ trunk/theora/lib/arm/armloop.s	2010-10-03 22:49:42 UTC (rev 17481)
@@ -25,7 +25,7 @@
 OC_FRAG_CODED_FLAG	*	1
 
 	; Vanilla ARM v4 version
-loop_filter_h_arm
+loop_filter_h_arm PROC
 	; r0 = unsigned char *_pix
 	; r1 = int            _ystride
 	; r2 = int           *_bv
@@ -58,8 +58,9 @@
 	BGT	lfh_arm_lp
 	SUB	r0, r0, r1, LSL #3
 	LDMFD	r13!,{r3-r6,PC}
+	ENDP
 
-loop_filter_v_arm
+loop_filter_v_arm PROC
 	; r0 = unsigned char *_pix
 	; r1 = int            _ystride
 	; r2 = int           *_bv
@@ -92,8 +93,9 @@
 	BGT	lfv_arm_lp
 	SUB	r0, r0, #8
 	LDMFD	r13!,{r3-r6,PC}
+	ENDP
 
-oc_loop_filter_frag_rows_arm
+oc_loop_filter_frag_rows_arm PROC
 	; r0 = _ref_frame_data
 	; r1 = _ystride
 	; r2 = _bv
@@ -158,12 +160,13 @@
 	CMP	r4, r5
 	BLT	oslffri_arm_lp1
 	LDMFD	r13!,{r0,r4-r11,PC}
+	ENDP
 
  [ OC_ARM_ASM_MEDIA
 	EXPORT	oc_loop_filter_init_v6
 	EXPORT	oc_loop_filter_frag_rows_v6
 
-oc_loop_filter_init_v6
+oc_loop_filter_init_v6 PROC
 	; r0 = _bv
 	; r1 = _flimit (=L from the spec)
 	MVN	r1, r1, LSL #1		; r1 = <0xFFFFFF|255-2*L>
@@ -172,6 +175,7 @@
 	PKHBT	r1, r1, r1, LSL #16	; r1 = <ll|ll|ll|ll>
 	STR	r1, [r0]
 	MOV	PC,r14
+	ENDP
 
 ; We could use the same strategy as the v filter below, but that would require
 ;  40 instructions to load the data and transpose it into columns and another
@@ -184,7 +188,7 @@
 ;  http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
 ; His is a lot less code, though, because it only does two rows at once instead
 ;  of four.
-loop_filter_h_v6
+loop_filter_h_v6 PROC
 	; r0 = unsigned char *_pix
 	; r1 = int            _ystride
 	; r2 = int            _ll
@@ -196,8 +200,9 @@
 	BL loop_filter_h_core_v6
 	SUB	r0, r0, r1, LSL #2
 	LDMFD	r13!,{r4-r11,PC}
+	ENDP
 
-loop_filter_h_core_v6
+loop_filter_h_core_v6 PROC
 	; r0 = unsigned char *_pix
 	; r1 = int            _ystride
 	; r2 = int            _ll
@@ -278,6 +283,7 @@
 	; Single issue
 	STRB	r8, [r0,#-1]
 	MOV	PC,r14
+	ENDP
 
 ; This uses the same strategy as the MMXEXT version for x86, except that UHADD8
 ;  computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
@@ -294,7 +300,7 @@
 ; It executes about 2/3 the number of instructions of David Conrad's approach,
 ;  but requires more code, because it does all eight columns at once, instead
 ;  of four at a time.
-loop_filter_v_v6
+loop_filter_v_v6 PROC
 	; r0 = unsigned char *_pix
 	; r1 = int            _ystride
 	; r2 = int            _ll
@@ -370,8 +376,9 @@
 	UQADD8	r9, r9, r5		; r9 = p6-lflim(R_i,L)
 	STRD	r8, [r0]		; [p6:p2] = [r9: r8]
 	LDMFD	r13!,{r4-r11,PC}
+	ENDP
 
-oc_loop_filter_frag_rows_v6
+oc_loop_filter_frag_rows_v6 PROC
 	; r0 = _ref_frame_data
 	; r1 = _ystride
 	; r2 = _bv
@@ -436,21 +443,23 @@
 	CMP	r4, r5
 	BLT	oslffri_v6_lp1
 	LDMFD	r13!,{r0,r4-r11,PC}
+	ENDP
  ]
 
  [ OC_ARM_ASM_NEON
 	EXPORT	oc_loop_filter_init_neon
 	EXPORT	oc_loop_filter_frag_rows_neon
 
-oc_loop_filter_init_neon
+oc_loop_filter_init_neon PROC
 	; r0 = _bv
 	; r1 = _flimit (=L from the spec)
 	MOV		r1, r1, LSL #1  ; r1 = 2*L
 	VDUP.S16	Q15, r1		; Q15= 2L in U16s
 	VST1.64		{D30,D31}, [r0@128]
 	MOV	PC,r14
+	ENDP
 
-loop_filter_h_neon
+loop_filter_h_neon PROC
 	; r0 = unsigned char *_pix
 	; r1 = int            _ystride
 	; r2 = int           *_bv
@@ -536,8 +545,9 @@
 	VST1.16	{D4[3]}, [r12], r1
 	VST1.16	{D2[3]}, [r12], r1
 	MOV	PC,r14
+	ENDP
 
-loop_filter_v_neon
+loop_filter_v_neon PROC
 	; r0 = unsigned char *_pix
 	; r1 = int            _ystride
 	; r2 = int           *_bv
@@ -593,8 +603,9 @@
 	VST1.64	{D2}, [r12@64], r1
 	VST1.64	{D4}, [r12@64], r1
 	MOV	PC,r14
+	ENDP
 
-oc_loop_filter_frag_rows_neon
+oc_loop_filter_frag_rows_neon PROC
 	; r0 = _ref_frame_data
 	; r1 = _ystride
 	; r2 = _bv
@@ -659,6 +670,7 @@
 	CMP	r4, r5
 	BLT	oslffri_neon_lp1
 	LDMFD	r13!,{r0,r4-r11,PC}
+	ENDP
  ]
 
 	END

Modified: trunk/theora/lib/decode.c
===================================================================
--- trunk/theora/lib/decode.c	2010-10-03 22:48:49 UTC (rev 17480)
+++ trunk/theora/lib/decode.c	2010-10-03 22:49:42 UTC (rev 17481)
@@ -600,6 +600,7 @@
 static void oc_dec_coded_flags_unpack(oc_dec_ctx *_dec){
   const oc_sb_map   *sb_maps;
   const oc_sb_flags *sb_flags;
+  signed char       *mb_modes;
   oc_fragment       *frags;
   unsigned           nsbs;
   unsigned           sbi;
@@ -622,6 +623,7 @@
   else flag=0;
   sb_maps=(const oc_sb_map *)_dec->state.sb_maps;
   sb_flags=_dec->state.sb_flags;
+  mb_modes=_dec->state.mb_modes;
   frags=_dec->state.frags;
   sbi=nsbs=run_count=0;
   coded_fragis=_dec->state.coded_fragis;
@@ -632,7 +634,9 @@
     for(;sbi<nsbs;sbi++){
       int quadi;
       for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
+        int quad_coded;
         int bi;
+        quad_coded=0;
         for(bi=0;bi<4;bi++){
           ptrdiff_t fragi;
           fragi=sb_maps[sbi][quadi][bi];
@@ -650,10 +654,13 @@
             }
             if(coded)coded_fragis[ncoded_fragis++]=fragi;
             else *(uncoded_fragis-++nuncoded_fragis)=fragi;
+            quad_coded|=coded;
             frags[fragi].coded=coded;
             frags[fragi].refi=OC_FRAME_NONE;
           }
         }
+        /*Remember if there's a coded luma block in this macro block.*/
+        if(!pli)mb_modes[sbi<<2|quadi]=quad_coded;
       }
     }
     _dec->state.ncoded_fragis[pli]=ncoded_fragis-prev_ncoded_fragis;
@@ -694,9 +701,7 @@
 
 /*Unpacks the list of macro block modes for INTER frames.*/
 static void oc_dec_mb_modes_unpack(oc_dec_ctx *_dec){
-  const oc_mb_map     *mb_maps;
   signed char         *mb_modes;
-  const oc_fragment   *frags;
   const unsigned char *alphabet;
   unsigned char        scheme0_alphabet[8];
   const ogg_int16_t   *mode_tree;
@@ -723,22 +728,14 @@
   else alphabet=OC_MODE_ALPHABETS[mode_scheme-1];
   mode_tree=mode_scheme==7?OC_CLC_MODE_TREE:OC_VLC_MODE_TREE;
   mb_modes=_dec->state.mb_modes;
-  mb_maps=(const oc_mb_map *)_dec->state.mb_maps;
   nmbs=_dec->state.nmbs;
-  frags=_dec->state.frags;
   for(mbi=0;mbi<nmbs;mbi++){
-    if(mb_modes[mbi]!=OC_MODE_INVALID){
-      /*Check for a coded luma block in this macro block.*/
-      if(frags[mb_maps[mbi][0][0]].coded
-       ||frags[mb_maps[mbi][0][1]].coded
-       ||frags[mb_maps[mbi][0][2]].coded
-       ||frags[mb_maps[mbi][0][3]].coded){
-        /*We found one, decode a mode.*/
-        mb_modes[mbi]=alphabet[oc_huff_token_decode(&_dec->opb,mode_tree)];
-      }
-      /*There were none: INTER_NOMV is forced.*/
-      else mb_modes[mbi]=OC_MODE_INTER_NOMV;
+    if(mb_modes[mbi]>0){
+      /*We have a coded luma block; decode a mode.*/
+      mb_modes[mbi]=alphabet[oc_huff_token_decode(&_dec->opb,mode_tree)];
     }
+    /*For other valid macro blocks, INTER_NOMV is forced, but we rely on the
+       fact that OC_MODE_INTER_NOMV is already 0.*/
   }
 }
 
@@ -839,80 +836,65 @@
     if(mb_mode!=OC_MODE_INVALID){
       oc_mv     mbmv;
       ptrdiff_t fragi;
-      int       coded[13];
-      int       codedi;
-      int       ncoded;
       int       mapi;
       int       mapii;
       int       refi;
-      /*Search for at least one coded fragment.*/
-      ncoded=mapii=0;
-      do{
-        mapi=map_idxs[mapii];
-        fragi=mb_maps[mbi][mapi>>2][mapi&3];
-        if(frags[fragi].coded)coded[ncoded++]=mapi;
-      }
-      while(++mapii<map_nidxs);
-      if(ncoded<=0)continue;
-      refi=OC_FRAME_FOR_MODE(mb_mode);
-      switch(mb_mode){
-        case OC_MODE_INTER_MV_FOUR:{
-          oc_mv       lbmvs[4];
-          int         bi;
-          /*Mark the tail of the list, so we don't accidentally go past it.*/
-          coded[ncoded]=-1;
-          for(bi=codedi=0;bi<4;bi++){
-            if(coded[codedi]==bi){
-              codedi++;
-              fragi=mb_maps[mbi][0][bi];
-              frags[fragi].refi=refi;
-              frags[fragi].mb_mode=mb_mode;
-              lbmvs[bi]=oc_mv_unpack(&_dec->opb,mv_comp_tree);
-              frag_mvs[fragi]=lbmvs[bi];
-            }
-            else lbmvs[bi]=0;
+      if(mb_mode==OC_MODE_INTER_MV_FOUR){
+        oc_mv lbmvs[4];
+        int   bi;
+        prior_mv=last_mv;
+        for(bi=0;bi<4;bi++){
+          fragi=mb_maps[mbi][0][bi];
+          if(frags[fragi].coded){
+            frags[fragi].refi=OC_FRAME_PREV;
+            frags[fragi].mb_mode=OC_MODE_INTER_MV_FOUR;
+            lbmvs[bi]=last_mv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
+            frag_mvs[fragi]=lbmvs[bi];
           }
-          if(codedi>0){
-            prior_mv=last_mv;
-            last_mv=lbmvs[coded[codedi-1]];
+          else lbmvs[bi]=0;
+        }
+        (*set_chroma_mvs)(cbmvs,lbmvs);
+        for(mapii=4;mapii<map_nidxs;mapii++){
+          mapi=map_idxs[mapii];
+          bi=mapi&3;
+          fragi=mb_maps[mbi][mapi>>2][bi];
+          if(frags[fragi].coded){
+            frags[fragi].refi=OC_FRAME_PREV;
+            frags[fragi].mb_mode=OC_MODE_INTER_MV_FOUR;
+            frag_mvs[fragi]=cbmvs[bi];
           }
-          if(codedi<ncoded){
-            (*set_chroma_mvs)(cbmvs,lbmvs);
-            for(;codedi<ncoded;codedi++){
-              mapi=coded[codedi];
-              bi=mapi&3;
-              fragi=mb_maps[mbi][mapi>>2][bi];
-              frags[fragi].refi=refi;
-              frags[fragi].mb_mode=mb_mode;
-              frag_mvs[fragi]=cbmvs[bi];
-            }
-          }
-        }break;
-        case OC_MODE_INTER_MV:{
-          prior_mv=last_mv;
-          last_mv=mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
-        }break;
-        case OC_MODE_INTER_MV_LAST:mbmv=last_mv;break;
-        case OC_MODE_INTER_MV_LAST2:{
-          mbmv=prior_mv;
-          prior_mv=last_mv;
-          last_mv=mbmv;
-        }break;
-        case OC_MODE_GOLDEN_MV:{
-          mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
-        }break;
-        default:mbmv=0;break;
+        }
       }
-      /*4MV mode fills in the fragments itself.
-        For all other modes we can use this common code.*/
-      if(mb_mode!=OC_MODE_INTER_MV_FOUR){
-        for(codedi=0;codedi<ncoded;codedi++){
-          mapi=coded[codedi];
+      else{
+        switch(mb_mode){
+          case OC_MODE_INTER_MV:{
+            prior_mv=last_mv;
+            last_mv=mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
+          }break;
+          case OC_MODE_INTER_MV_LAST:mbmv=last_mv;break;
+          case OC_MODE_INTER_MV_LAST2:{
+            mbmv=prior_mv;
+            prior_mv=last_mv;
+            last_mv=mbmv;
+          }break;
+          case OC_MODE_GOLDEN_MV:{
+            mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
+          }break;
+          default:mbmv=0;break;
+        }
+        /*Fill in the MVs for the fragments.*/
+        refi=OC_FRAME_FOR_MODE(mb_mode);
+        mapii=0;
+        do{
+          mapi=map_idxs[mapii];
           fragi=mb_maps[mbi][mapi>>2][mapi&3];
-          frags[fragi].refi=refi;
-          frags[fragi].mb_mode=mb_mode;
-          frag_mvs[fragi]=mbmv;
+          if(frags[fragi].coded){
+            frags[fragi].refi=refi;
+            frags[fragi].mb_mode=mb_mode;
+            frag_mvs[fragi]=mbmv;
+          }
         }
+        while(++mapii<map_nidxs);
       }
     }
   }



More information about the commits mailing list