[xiph-commits] r17481 - in trunk/theora/lib: . arm
tterribe at svn.xiph.org
tterribe at svn.xiph.org
Sun Oct 3 15:49:42 PDT 2010
Author: tterribe
Date: 2010-10-03 15:49:42 -0700 (Sun, 03 Oct 2010)
New Revision: 17481
Modified:
trunk/theora/lib/arm/armbits.s
trunk/theora/lib/arm/armfrag.s
trunk/theora/lib/arm/armidct.s
trunk/theora/lib/arm/armloop.s
trunk/theora/lib/decode.c
Log:
Add PROC/ENDP markings to the ARM asm (currently ignored by the GNU toolchain).
Also slightly simplify the MB mode and MV decoding.
The new code uses slightly less cache and fewer lookups.
Modified: trunk/theora/lib/arm/armbits.s
===================================================================
--- trunk/theora/lib/arm/armbits.s 2010-10-03 22:48:49 UTC (rev 17480)
+++ trunk/theora/lib/arm/armbits.s 2010-10-03 22:49:42 UTC (rev 17481)
@@ -21,32 +21,33 @@
EXPORT oc_pack_read1_arm
EXPORT oc_huff_token_decode_arm
-oc_pack_read_arm
+oc_pack_read1_arm PROC
; r0 = oc_pack_buf *_b
- ; r1 = int _bits
ADD r12,r0,#8
LDMIA r12,{r2,r3} ; r2 = window
; Stall... ; r3 = available
; Stall...
- SUBS r3,r3,r1 ; r3 = available-_bits, available<_bits => LT
- BLT oc_pack_read_refill
- RSB r0,r1,#32 ; r0 = 32-_bits
- MOV r0,r2,LSR r0 ; r0 = window>>32-_bits
- MOV r2,r2,LSL r1 ; r2 = window<<=_bits
+ SUBS r3,r3,#1 ; r3 = available-1, available<1 => LT
+ BLT oc_pack_read1_refill
+ MOV r0,r2,LSR #31 ; r0 = window>>31
+ MOV r2,r2,LSL #1 ; r2 = window<<=1
STMIA r12,{r2,r3} ; window = r2
; available = r3
MOV PC,r14
+ ENDP
-oc_pack_read1_arm
+oc_pack_read_arm PROC
; r0 = oc_pack_buf *_b
+ ; r1 = int _bits
ADD r12,r0,#8
LDMIA r12,{r2,r3} ; r2 = window
; Stall... ; r3 = available
; Stall...
- SUBS r3,r3,#1 ; r3 = available-1, available<1 => LT
- BLT oc_pack_read1_refill
- MOV r0,r2,LSR #31 ; r0 = window>>31
- MOV r2,r2,LSL #1 ; r2 = window<<=1
+ SUBS r3,r3,r1 ; r3 = available-_bits, available<_bits => LT
+ BLT oc_pack_read_refill
+ RSB r0,r1,#32 ; r0 = 32-_bits
+ MOV r0,r2,LSR r0 ; r0 = window>>32-_bits
+ MOV r2,r2,LSL r1 ; r2 = window<<=_bits
STMIA r12,{r2,r3} ; window = r2
; available = r3
MOV PC,r14
@@ -117,10 +118,11 @@
STMIA r12,{r2,r3} ; window = r2
; available = r3
LDMFD r13!,{r10,r11,PC}
+ ENDP
-oc_huff_token_decode_arm
+oc_huff_token_decode_arm PROC
; r0 = oc_pack_buf *_b
; r1 = const ogg_int16_t *_tree
STMFD r13!,{r4,r5,r10,r14}
@@ -223,5 +225,6 @@
; available = r5
AND r0,r14,#255 ; r0 = node&255
LDMFD r13!,{r4,r5,r10,pc}
+ ENDP
END
Modified: trunk/theora/lib/arm/armfrag.s
===================================================================
--- trunk/theora/lib/arm/armfrag.s 2010-10-03 22:48:49 UTC (rev 17480)
+++ trunk/theora/lib/arm/armfrag.s 2010-10-03 22:49:42 UTC (rev 17481)
@@ -24,7 +24,7 @@
EXPORT oc_frag_recon_inter_arm
EXPORT oc_frag_recon_inter2_arm
-oc_frag_copy_list_arm
+oc_frag_copy_list_arm PROC
; r0 = _dst_frame
; r1 = _src_frame
; r2 = _ystride
@@ -132,8 +132,9 @@
SUBS r14,r14,#1
BGT ofrintra_lp_arm
LDMFD r13!,{r4,r5,PC}
+ ENDP
-oc_frag_recon_inter_arm
+oc_frag_recon_inter_arm PROC
; r0 = unsigned char *dst
; r1 = const unsigned char *src
; r2 = int ystride
@@ -194,8 +195,9 @@
SUBS r9, r9, #1
BGT ofrinter_lp_arm
LDMFD r13!,{r5,r9-r11,PC}
+ ENDP
-oc_frag_recon_inter2_arm
+oc_frag_recon_inter2_arm PROC
; r0 = unsigned char *dst
; r1 = const unsigned char *src1
; r2 = const unsigned char *src2
@@ -274,11 +276,12 @@
SUBS r14,r14,#1
BGT ofrinter2_lp_arm
LDMFD r13!,{r4-r8,PC}
+ ENDP
[ OC_ARM_ASM_EDSP
EXPORT oc_frag_copy_list_edsp
-oc_frag_copy_list_edsp
+oc_frag_copy_list_edsp PROC
; r0 = _dst_frame
; r1 = _src_frame
; r2 = _ystride
@@ -320,6 +323,7 @@
BGE ofcl_edsp_lp
ofcl_edsp_end
LDMFD r13!,{r4-r11,PC}
+ ENDP
]
[ OC_ARM_ASM_MEDIA
@@ -327,7 +331,7 @@
EXPORT oc_frag_recon_inter_v6
EXPORT oc_frag_recon_inter2_v6
-oc_frag_recon_intra_v6
+oc_frag_recon_intra_v6 PROC
; r0 = unsigned char *_dst
; r1 = int _ystride
; r2 = const ogg_int16_t _residue[64]
@@ -356,8 +360,9 @@
STRD r2, [r0], r1
BGT ofrintra_v6_lp
LDMFD r13!,{r4-r6,PC}
+ ENDP
-oc_frag_recon_inter_v6
+oc_frag_recon_inter_v6 PROC
; r0 = unsigned char *_dst
; r1 = const unsigned char *_src
; r2 = int _ystride
@@ -395,8 +400,9 @@
STRD r4, [r0], r2
BGT ofrinter_v6_lp
LDMFD r13!,{r4-r7,PC}
+ ENDP
-oc_frag_recon_inter2_v6
+oc_frag_recon_inter2_v6 PROC
; r0 = unsigned char *_dst
; r1 = const unsigned char *_src1
; r2 = const unsigned char *_src2
@@ -436,6 +442,7 @@
STRD r8, [r0], r3
BGT ofrinter2_v6_lp
LDMFD r13!,{r4-r9,PC}
+ ENDP
]
[ OC_ARM_ASM_NEON
@@ -444,7 +451,7 @@
EXPORT oc_frag_recon_inter_neon
EXPORT oc_frag_recon_inter2_neon
-oc_frag_copy_list_neon
+oc_frag_copy_list_neon PROC
; r0 = _dst_frame
; r1 = _src_frame
; r2 = _ystride
@@ -497,8 +504,9 @@
BGT ofcl_neon_lp
ofcl_neon_end
LDMFD r13!,{r4-r7,PC}
+ ENDP
-oc_frag_recon_intra_neon
+oc_frag_recon_intra_neon PROC
; r0 = unsigned char *_dst
; r1 = int _ystride
; r2 = const ogg_int16_t _residue[64]
@@ -530,8 +538,9 @@
VST1.64 {D22},[r0@64], r1
VST1.64 {D23},[r0@64], r1
MOV PC,R14
+ ENDP
-oc_frag_recon_inter_neon
+oc_frag_recon_inter_neon PROC
; r0 = unsigned char *_dst
; r1 = const unsigned char *_src
; r2 = int _ystride
@@ -578,8 +587,9 @@
VST1.64 {D22},[r0@64], r2
VST1.64 {D23},[r0@64], r2
MOV PC,R14
+ ENDP
-oc_frag_recon_inter2_neon
+oc_frag_recon_inter2_neon PROC
; r0 = unsigned char *_dst
; r1 = const unsigned char *_src1
; r2 = const unsigned char *_src2
@@ -640,6 +650,7 @@
VST1.64 {D22},[r0@64], r3
VST1.64 {D23},[r0@64], r3
MOV PC,R14
+ ENDP
]
END
Modified: trunk/theora/lib/arm/armidct.s
===================================================================
--- trunk/theora/lib/arm/armidct.s 2010-10-03 22:48:49 UTC (rev 17480)
+++ trunk/theora/lib/arm/armidct.s 2010-10-03 22:49:42 UTC (rev 17481)
@@ -21,7 +21,7 @@
EXPORT oc_idct8x8_1_arm
EXPORT oc_idct8x8_arm
-oc_idct8x8_1_arm
+oc_idct8x8_1_arm PROC
; r0 = ogg_int16_t *_y
; r1 = ogg_uint16_t _dc
ORR r1, r1, r1, LSL #16
@@ -37,8 +37,9 @@
STMIA r0!,{r1,r2,r3,r12}
STMIA r0!,{r1,r2,r3,r12}
MOV PC, r14
+ ENDP
-oc_idct8x8_arm
+oc_idct8x8_arm PROC
; r0 = ogg_int16_t *_y
; r1 = ogg_int16_t *_x
; r2 = int _last_zzi
@@ -92,8 +93,9 @@
BL idct8core_down_arm
ADD r13,r13,#64*2
LDMFD r13!,{r4-r11,PC}
+ ENDP
-oc_idct8x8_10_arm
+oc_idct8x8_10_arm PROC
STMFD r13!,{r4-r11,r14}
SUB r13,r13,#64*2
; Row transforms
@@ -128,8 +130,9 @@
BL idct4core_down_arm
ADD r13,r13,#64*2
LDMFD r13!,{r4-r11,PC}
+ ENDP
-oc_idct8x8_6_arm
+oc_idct8x8_6_arm PROC
STMFD r13!,{r4-r7,r9-r11,r14}
SUB r13,r13,#64*2
; Row transforms
@@ -161,8 +164,9 @@
BL idct3core_down_arm
ADD r13,r13,#64*2
LDMFD r13!,{r4-r7,r9-r11,PC}
+ ENDP
-oc_idct8x8_3_arm
+oc_idct8x8_3_arm PROC
STMFD r13!,{r4-r7,r9-r11,r14}
SUB r13,r13,#64*2
; Row transforms
@@ -189,8 +193,9 @@
BL idct2core_down_arm
ADD r13,r13,#64*2
LDMFD r13!,{r4-r7,r9-r11,PC}
+ ENDP
-idct1core_arm
+idct1core_arm PROC
; r0 = ogg_int16_t *_y (destination)
; r1 = const ogg_int16_t *_x (source)
LDRSH r3, [r1], #16
@@ -208,8 +213,9 @@
STRH r3, [r0, #94]
STRH r3, [r0, #110]
MOV PC,R14
+ ENDP
-idct2core_arm
+idct2core_arm PROC
; r0 = ogg_int16_t *_y (destination)
; r1 = const ogg_int16_t *_x (source)
LDRSH r9, [r1], #16 ; r9 = x[0]
@@ -244,8 +250,9 @@
STRH r12,[r0, #94] ; y[6] = t[0]-t[6]
STRH r11,[r0, #110] ; y[7] = t[0]-t[7]
MOV PC,r14
+ ENDP
-idct2core_down_arm
+idct2core_down_arm PROC
; r0 = ogg_int16_t *_y (destination)
; r1 = const ogg_int16_t *_x (source)
LDRSH r9, [r1], #16 ; r9 = x[0]
@@ -292,8 +299,9 @@
STRH r12,[r0, #94] ; y[6] = t[0]-t[6]
STRH r11,[r0, #110] ; y[7] = t[0]-t[7]
MOV PC,r14
+ ENDP
-idct3core_arm
+idct3core_arm PROC
LDRSH r9, [r1], #16 ; r9 = x[0]
LDR r12,OC_C4S4 ; r12= OC_C4S4
LDRSH r3, [r1, #-12] ; r3 = x[2]
@@ -337,8 +345,9 @@
STRH r5, [r0, #94] ; y[6] = t[1]-t2[6]
STRH r11,[r0, #110] ; y[7] = t2[0]-t[7]
MOV PC,R14
+ ENDP
-idct3core_down_arm
+idct3core_down_arm PROC
LDRSH r9, [r1], #16 ; r9 = x[0]
LDR r12,OC_C4S4 ; r12= OC_C4S4
LDRSH r3, [r1, #-12] ; r3 = x[2]
@@ -394,8 +403,9 @@
STRH r10,[r0, #94] ; y[6] = t[1]-t2[6]
STRH r3, [r0, #110] ; y[7] = t2[0]-t[7]
MOV PC,R14
+ ENDP
-idct4core_arm
+idct4core_arm PROC
; r0 = ogg_int16_t *_y (destination)
; r1 = const ogg_int16_t *_x (source)
LDRSH r9, [r1], #16 ; r9 = x[0]
@@ -451,8 +461,9 @@
STRH r6, [r0, #94] ; y[6] = t[1]-t2[6]
STRH r11, [r0, #110] ; y[7] = t2[0]-t[7]
MOV PC,r14
+ ENDP
-idct4core_down_arm
+idct4core_down_arm PROC
; r0 = ogg_int16_t *_y (destination)
; r1 = const ogg_int16_t *_x (source)
LDRSH r9, [r1], #16 ; r9 = x[0]
@@ -520,8 +531,9 @@
STRH r6, [r0, #94] ; y[6] = t[1]-t2[6]
STRH r11,[r0, #110] ; y[7] = t2[0]-t[7]
MOV PC,r14
+ ENDP
-idct8core_arm
+idct8core_arm PROC
; r0 = ogg_int16_t *_y (destination)
; r1 = const ogg_int16_t *_x (source)
LDRSH r2, [r1],#16 ; r2 = x[0]
@@ -621,8 +633,9 @@
STRH r10,[r0, #94] ; y[6] = t[1]-t[6]
STRH r8, [r0, #110] ; y[7] = t[0]-t[7]
LDMFD r13!,{r1,PC}
+ ENDP
-idct8core_down_arm
+idct8core_down_arm PROC
; r0 = ogg_int16_t *_y (destination)
; r1 = const ogg_int16_t *_x (source)
LDRSH r2, [r1],#16 ; r2 = x[0]
@@ -735,12 +748,13 @@
STRH r10,[r0, #94] ; y[6] = t[1]-t[6]
STRH r8, [r0, #110] ; y[7] = t[0]-t[7]
LDMFD r13!,{r1,PC}
+ ENDP
[ OC_ARM_ASM_MEDIA
EXPORT oc_idct8x8_1_v6
EXPORT oc_idct8x8_v6
-oc_idct8x8_1_v6
+oc_idct8x8_1_v6 PROC
; r0 = ogg_int16_t *_y
; r1 = ogg_uint16_t _dc
ORR r2, r1, r1, LSL #16
@@ -762,8 +776,9 @@
STRD r2, [r0], #8
STRD r2, [r0], #8
MOV PC, r14
+ ENDP
-oc_idct8x8_v6
+oc_idct8x8_v6 PROC
; r0 = ogg_int16_t *_y
; r1 = ogg_int16_t *_x
; r2 = int _last_zzi
@@ -815,8 +830,9 @@
BL idct8_8core_down_v6
ADD r13,r13,#64*2
LDMFD r13!,{r4-r11,PC}
+ ENDP
-oc_idct8x8_10_v6
+oc_idct8x8_10_v6 PROC
STMFD r13!,{r4-r11,r14}
SUB r13,r13,#64*2+4
; Row transforms
@@ -847,8 +863,9 @@
BL idct4_4core_down_v6
ADD r13,r13,#64*2+4
LDMFD r13!,{r4-r11,PC}
+ ENDP
-oc_idct8x8_3_v6
+oc_idct8x8_3_v6 PROC
STMFD r13!,{r4-r8,r14}
SUB r13,r13,#64*2
; Row transforms
@@ -870,8 +887,9 @@
BL idct2_2core_down_v6
ADD r13,r13,#64*2
LDMFD r13!,{r4-r8,PC}
+ ENDP
-idct2_1core_v6
+idct2_1core_v6 PROC
; r0 = ogg_int16_t *_y (destination)
; r1 = const ogg_int16_t *_x (source)
; Stage 1:
@@ -912,6 +930,7 @@
SSUB16 r7, r12,r7 ; r7 = t[0]-t[7]
STR r7, [r0, #108] ; y[7<<3] = t[0]-t[7]
MOV PC,r14
+ ENDP
]
ALIGN 8
@@ -931,7 +950,7 @@
DCD 46341 ; B505
[ OC_ARM_ASM_MEDIA
-idct2_2core_down_v6
+idct2_2core_down_v6 PROC
; r0 = ogg_int16_t *_y (destination)
; r1 = const ogg_int16_t *_x (source)
; Stage 1:
@@ -1000,13 +1019,14 @@
PKHTB r3, r3, r7, ASR #20 ; r3 = t[0]-t[7]+8>>4
STR r3, [r0, #108] ; y[7<<3] = t[0]-t[7]+8>>4
MOV PC,r14
+ ENDP
; In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
; pay for increased branch mis-prediction to get here, but in practice it
; doesn't seem to slow anything down to take it out, and it's less code this
; way.
[ 0
-oc_idct8x8_6_v6
+oc_idct8x8_6_v6 PROC
STMFD r13!,{r4-r8,r10,r11,r14}
SUB r13,r13,#64*2+4
; Row transforms
@@ -1035,8 +1055,9 @@
BL idct3_3core_down_v6
ADD r13,r13,#64*2+4
LDMFD r13!,{r4-r8,r10,r11,PC}
+ ENDP
-idct1core_v6
+idct1core_v6 PROC
; r0 = ogg_int16_t *_y (destination)
; r1 = const ogg_int16_t *_x (source)
LDRSH r3, [r1], #16
@@ -1051,8 +1072,9 @@
STRH r3, [r0, #62]
STRH r3, [r0, #94]
MOV PC,R14
+ ENDP
-idct3_2core_v6
+idct3_2core_v6 PROC
; r0 = ogg_int16_t *_y (destination)
; r1 = const ogg_int16_t *_x (source)
; Stage 1:
@@ -1083,6 +1105,7 @@
SMULWT r10,r11,r4 ; r10= t[1,5]=OC_C4S4*t[1,4]>>16
; Stage 3:
B idct4_3core_stage3_v6
+ ENDP
; Another copy so the LDRD offsets are less than +/- 255.
ALIGN 8
@@ -1095,7 +1118,7 @@
OC_C2S6_3_v6
DCD 60547 ; EC83
-idct3_3core_down_v6
+idct3_3core_down_v6 PROC
; r0 = ogg_int16_t *_y (destination)
; r1 = const ogg_int16_t *_x (source)
; Stage 1:
@@ -1127,9 +1150,10 @@
SMULWT r10,r11,r4 ; r10= t[1,5]=OC_C4S4*t[1,4]>>16
; Stage 3:
B idct4_4core_down_stage3_v6
+ ENDP
]
-idct4_3core_v6
+idct4_3core_v6 PROC
; r0 = ogg_int16_t *_y (destination)
; r1 = const ogg_int16_t *_x (source)
; Stage 1:
@@ -1196,6 +1220,7 @@
SSUB16 r7, r10,r7 ; r7 = t[0]-t[7]
STR r7, [r0, #108] ; y[7<<3] = t[0]-t[7]
MOV PC,r14
+ ENDP
; Another copy so the LDRD offsets are less than +/- 255.
ALIGN 8
@@ -1212,7 +1237,7 @@
OC_C3S5_4_v6
DCD 54491 ; D4DB
-idct4_4core_down_v6
+idct4_4core_down_v6 PROC
; r0 = ogg_int16_t *_y (destination)
; r1 = const ogg_int16_t *_x (source)
; Stage 1:
@@ -1259,8 +1284,9 @@
PKHBT r10,r5, r10,LSL #16 ; r10= <t[1,5]|t[0,5]>
SSUB16 r2, r12,r2 ; r2 = t[2]+8=t[0]-t[2]+8
B idct8_8core_down_stage3_5_v6
+ ENDP
-idct8_8core_v6
+idct8_8core_v6 PROC
STMFD r13!,{r0,r14}
; Stage 1:
;5-6 rotation by 3pi/16
@@ -1342,6 +1368,7 @@
SSUB16 r2, r8, r2 ; r2 = t[2]=t[1]-t[2]
LDMFD r13!,{r0,r14}
B idct4_3core_stage3_5_v6
+ ENDP
; Another copy so the LDRD offsets are less than +/- 255.
ALIGN 8
@@ -1358,7 +1385,7 @@
OC_C3S5_8_v6
DCD 54491 ; D4DB
-idct8_8core_down_v6
+idct8_8core_down_v6 PROC
STMFD r13!,{r0,r14}
; Stage 1:
;5-6 rotation by 3pi/16
@@ -1487,6 +1514,7 @@
PKHTB r10,r10,r7, ASR #20 ; r10= t[0]-t[7]+8>>4
STR r10,[r0, #108] ; y[7<<3] = t[0]-t[7]+8>>4
MOV PC,r14
+ ENDP
]
[ OC_ARM_ASM_NEON
@@ -1504,7 +1532,7 @@
DCW 25080 ; 30FC (C6S2)
DCW 12785 ; 31F1 (C7S1)
-oc_idct8x8_1_neon
+oc_idct8x8_1_neon PROC
; r0 = ogg_int16_t *_y
; r1 = ogg_uint16_t _dc
VDUP.S16 Q0, r1
@@ -1514,8 +1542,9 @@
VST1.64 {D0, D1, D2, D3}, [r0@128]!
VST1.64 {D0, D1, D2, D3}, [r0@128]
MOV PC, r14
+ ENDP
-oc_idct8x8_neon
+oc_idct8x8_neon PROC
; r0 = ogg_int16_t *_y
; r1 = ogg_int16_t *_x
; r2 = int _last_zzi
@@ -1606,8 +1635,9 @@
VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
VSTMIA r0, {D16-D31}
MOV PC, r14
+ ENDP
-oc_idct8x8_stage123_neon
+oc_idct8x8_stage123_neon PROC
; Stages 1 & 2
VMULL.S16 Q4, D18,D1[3]
VMULL.S16 Q5, D19,D1[3]
@@ -1703,8 +1733,9 @@
VSUB.S16 Q10,Q1, Q2 ; Q10 = t[2]''=t[1]-t[2]
VSUB.S16 Q5, Q6, Q5 ; Q5 = t[5]''=t[6]'-t[5]'
MOV PC, r14
+ ENDP
-oc_idct8x8_10_neon
+oc_idct8x8_10_neon PROC
ADR r3, OC_IDCT_CONSTS_NEON
VLD1.64 {D0,D1}, [r3@128]
MOV r2, r1
@@ -1871,6 +1902,7 @@
VRSHR.S16 Q15,Q15,#4 ; Q15 = y[7]+8>>4
VSTMIA r0, {D16-D31}
MOV PC, r14
+ ENDP
]
END
Modified: trunk/theora/lib/arm/armloop.s
===================================================================
--- trunk/theora/lib/arm/armloop.s 2010-10-03 22:48:49 UTC (rev 17480)
+++ trunk/theora/lib/arm/armloop.s 2010-10-03 22:49:42 UTC (rev 17481)
@@ -25,7 +25,7 @@
OC_FRAG_CODED_FLAG * 1
; Vanilla ARM v4 version
-loop_filter_h_arm
+loop_filter_h_arm PROC
; r0 = unsigned char *_pix
; r1 = int _ystride
; r2 = int *_bv
@@ -58,8 +58,9 @@
BGT lfh_arm_lp
SUB r0, r0, r1, LSL #3
LDMFD r13!,{r3-r6,PC}
+ ENDP
-loop_filter_v_arm
+loop_filter_v_arm PROC
; r0 = unsigned char *_pix
; r1 = int _ystride
; r2 = int *_bv
@@ -92,8 +93,9 @@
BGT lfv_arm_lp
SUB r0, r0, #8
LDMFD r13!,{r3-r6,PC}
+ ENDP
-oc_loop_filter_frag_rows_arm
+oc_loop_filter_frag_rows_arm PROC
; r0 = _ref_frame_data
; r1 = _ystride
; r2 = _bv
@@ -158,12 +160,13 @@
CMP r4, r5
BLT oslffri_arm_lp1
LDMFD r13!,{r0,r4-r11,PC}
+ ENDP
[ OC_ARM_ASM_MEDIA
EXPORT oc_loop_filter_init_v6
EXPORT oc_loop_filter_frag_rows_v6
-oc_loop_filter_init_v6
+oc_loop_filter_init_v6 PROC
; r0 = _bv
; r1 = _flimit (=L from the spec)
MVN r1, r1, LSL #1 ; r1 = <0xFFFFFF|255-2*L>
@@ -172,6 +175,7 @@
PKHBT r1, r1, r1, LSL #16 ; r1 = <ll|ll|ll|ll>
STR r1, [r0]
MOV PC,r14
+ ENDP
; We could use the same strategy as the v filter below, but that would require
; 40 instructions to load the data and transpose it into columns and another
@@ -184,7 +188,7 @@
; http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
; His is a lot less code, though, because it only does two rows at once instead
; of four.
-loop_filter_h_v6
+loop_filter_h_v6 PROC
; r0 = unsigned char *_pix
; r1 = int _ystride
; r2 = int _ll
@@ -196,8 +200,9 @@
BL loop_filter_h_core_v6
SUB r0, r0, r1, LSL #2
LDMFD r13!,{r4-r11,PC}
+ ENDP
-loop_filter_h_core_v6
+loop_filter_h_core_v6 PROC
; r0 = unsigned char *_pix
; r1 = int _ystride
; r2 = int _ll
@@ -278,6 +283,7 @@
; Single issue
STRB r8, [r0,#-1]
MOV PC,r14
+ ENDP
; This uses the same strategy as the MMXEXT version for x86, except that UHADD8
; computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
@@ -294,7 +300,7 @@
; It executes about 2/3 the number of instructions of David Conrad's approach,
; but requires more code, because it does all eight columns at once, instead
; of four at a time.
-loop_filter_v_v6
+loop_filter_v_v6 PROC
; r0 = unsigned char *_pix
; r1 = int _ystride
; r2 = int _ll
@@ -370,8 +376,9 @@
UQADD8 r9, r9, r5 ; r9 = p6-lflim(R_i,L)
STRD r8, [r0] ; [p6:p2] = [r9: r8]
LDMFD r13!,{r4-r11,PC}
+ ENDP
-oc_loop_filter_frag_rows_v6
+oc_loop_filter_frag_rows_v6 PROC
; r0 = _ref_frame_data
; r1 = _ystride
; r2 = _bv
@@ -436,21 +443,23 @@
CMP r4, r5
BLT oslffri_v6_lp1
LDMFD r13!,{r0,r4-r11,PC}
+ ENDP
]
[ OC_ARM_ASM_NEON
EXPORT oc_loop_filter_init_neon
EXPORT oc_loop_filter_frag_rows_neon
-oc_loop_filter_init_neon
+oc_loop_filter_init_neon PROC
; r0 = _bv
; r1 = _flimit (=L from the spec)
MOV r1, r1, LSL #1 ; r1 = 2*L
VDUP.S16 Q15, r1 ; Q15= 2L in U16s
VST1.64 {D30,D31}, [r0@128]
MOV PC,r14
+ ENDP
-loop_filter_h_neon
+loop_filter_h_neon PROC
; r0 = unsigned char *_pix
; r1 = int _ystride
; r2 = int *_bv
@@ -536,8 +545,9 @@
VST1.16 {D4[3]}, [r12], r1
VST1.16 {D2[3]}, [r12], r1
MOV PC,r14
+ ENDP
-loop_filter_v_neon
+loop_filter_v_neon PROC
; r0 = unsigned char *_pix
; r1 = int _ystride
; r2 = int *_bv
@@ -593,8 +603,9 @@
VST1.64 {D2}, [r12@64], r1
VST1.64 {D4}, [r12@64], r1
MOV PC,r14
+ ENDP
-oc_loop_filter_frag_rows_neon
+oc_loop_filter_frag_rows_neon PROC
; r0 = _ref_frame_data
; r1 = _ystride
; r2 = _bv
@@ -659,6 +670,7 @@
CMP r4, r5
BLT oslffri_neon_lp1
LDMFD r13!,{r0,r4-r11,PC}
+ ENDP
]
END
Modified: trunk/theora/lib/decode.c
===================================================================
--- trunk/theora/lib/decode.c 2010-10-03 22:48:49 UTC (rev 17480)
+++ trunk/theora/lib/decode.c 2010-10-03 22:49:42 UTC (rev 17481)
@@ -600,6 +600,7 @@
static void oc_dec_coded_flags_unpack(oc_dec_ctx *_dec){
const oc_sb_map *sb_maps;
const oc_sb_flags *sb_flags;
+ signed char *mb_modes;
oc_fragment *frags;
unsigned nsbs;
unsigned sbi;
@@ -622,6 +623,7 @@
else flag=0;
sb_maps=(const oc_sb_map *)_dec->state.sb_maps;
sb_flags=_dec->state.sb_flags;
+ mb_modes=_dec->state.mb_modes;
frags=_dec->state.frags;
sbi=nsbs=run_count=0;
coded_fragis=_dec->state.coded_fragis;
@@ -632,7 +634,9 @@
for(;sbi<nsbs;sbi++){
int quadi;
for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
+ int quad_coded;
int bi;
+ quad_coded=0;
for(bi=0;bi<4;bi++){
ptrdiff_t fragi;
fragi=sb_maps[sbi][quadi][bi];
@@ -650,10 +654,13 @@
}
if(coded)coded_fragis[ncoded_fragis++]=fragi;
else *(uncoded_fragis-++nuncoded_fragis)=fragi;
+ quad_coded|=coded;
frags[fragi].coded=coded;
frags[fragi].refi=OC_FRAME_NONE;
}
}
+ /*Remember if there's a coded luma block in this macro block.*/
+ if(!pli)mb_modes[sbi<<2|quadi]=quad_coded;
}
}
_dec->state.ncoded_fragis[pli]=ncoded_fragis-prev_ncoded_fragis;
@@ -694,9 +701,7 @@
/*Unpacks the list of macro block modes for INTER frames.*/
static void oc_dec_mb_modes_unpack(oc_dec_ctx *_dec){
- const oc_mb_map *mb_maps;
signed char *mb_modes;
- const oc_fragment *frags;
const unsigned char *alphabet;
unsigned char scheme0_alphabet[8];
const ogg_int16_t *mode_tree;
@@ -723,22 +728,14 @@
else alphabet=OC_MODE_ALPHABETS[mode_scheme-1];
mode_tree=mode_scheme==7?OC_CLC_MODE_TREE:OC_VLC_MODE_TREE;
mb_modes=_dec->state.mb_modes;
- mb_maps=(const oc_mb_map *)_dec->state.mb_maps;
nmbs=_dec->state.nmbs;
- frags=_dec->state.frags;
for(mbi=0;mbi<nmbs;mbi++){
- if(mb_modes[mbi]!=OC_MODE_INVALID){
- /*Check for a coded luma block in this macro block.*/
- if(frags[mb_maps[mbi][0][0]].coded
- ||frags[mb_maps[mbi][0][1]].coded
- ||frags[mb_maps[mbi][0][2]].coded
- ||frags[mb_maps[mbi][0][3]].coded){
- /*We found one, decode a mode.*/
- mb_modes[mbi]=alphabet[oc_huff_token_decode(&_dec->opb,mode_tree)];
- }
- /*There were none: INTER_NOMV is forced.*/
- else mb_modes[mbi]=OC_MODE_INTER_NOMV;
+ if(mb_modes[mbi]>0){
+ /*We have a coded luma block; decode a mode.*/
+ mb_modes[mbi]=alphabet[oc_huff_token_decode(&_dec->opb,mode_tree)];
}
+ /*For other valid macro blocks, INTER_NOMV is forced, but we rely on the
+ fact that OC_MODE_INTER_NOMV is already 0.*/
}
}
@@ -839,80 +836,65 @@
if(mb_mode!=OC_MODE_INVALID){
oc_mv mbmv;
ptrdiff_t fragi;
- int coded[13];
- int codedi;
- int ncoded;
int mapi;
int mapii;
int refi;
- /*Search for at least one coded fragment.*/
- ncoded=mapii=0;
- do{
- mapi=map_idxs[mapii];
- fragi=mb_maps[mbi][mapi>>2][mapi&3];
- if(frags[fragi].coded)coded[ncoded++]=mapi;
- }
- while(++mapii<map_nidxs);
- if(ncoded<=0)continue;
- refi=OC_FRAME_FOR_MODE(mb_mode);
- switch(mb_mode){
- case OC_MODE_INTER_MV_FOUR:{
- oc_mv lbmvs[4];
- int bi;
- /*Mark the tail of the list, so we don't accidentally go past it.*/
- coded[ncoded]=-1;
- for(bi=codedi=0;bi<4;bi++){
- if(coded[codedi]==bi){
- codedi++;
- fragi=mb_maps[mbi][0][bi];
- frags[fragi].refi=refi;
- frags[fragi].mb_mode=mb_mode;
- lbmvs[bi]=oc_mv_unpack(&_dec->opb,mv_comp_tree);
- frag_mvs[fragi]=lbmvs[bi];
- }
- else lbmvs[bi]=0;
+ if(mb_mode==OC_MODE_INTER_MV_FOUR){
+ oc_mv lbmvs[4];
+ int bi;
+ prior_mv=last_mv;
+ for(bi=0;bi<4;bi++){
+ fragi=mb_maps[mbi][0][bi];
+ if(frags[fragi].coded){
+ frags[fragi].refi=OC_FRAME_PREV;
+ frags[fragi].mb_mode=OC_MODE_INTER_MV_FOUR;
+ lbmvs[bi]=last_mv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
+ frag_mvs[fragi]=lbmvs[bi];
}
- if(codedi>0){
- prior_mv=last_mv;
- last_mv=lbmvs[coded[codedi-1]];
+ else lbmvs[bi]=0;
+ }
+ (*set_chroma_mvs)(cbmvs,lbmvs);
+ for(mapii=4;mapii<map_nidxs;mapii++){
+ mapi=map_idxs[mapii];
+ bi=mapi&3;
+ fragi=mb_maps[mbi][mapi>>2][bi];
+ if(frags[fragi].coded){
+ frags[fragi].refi=OC_FRAME_PREV;
+ frags[fragi].mb_mode=OC_MODE_INTER_MV_FOUR;
+ frag_mvs[fragi]=cbmvs[bi];
}
- if(codedi<ncoded){
- (*set_chroma_mvs)(cbmvs,lbmvs);
- for(;codedi<ncoded;codedi++){
- mapi=coded[codedi];
- bi=mapi&3;
- fragi=mb_maps[mbi][mapi>>2][bi];
- frags[fragi].refi=refi;
- frags[fragi].mb_mode=mb_mode;
- frag_mvs[fragi]=cbmvs[bi];
- }
- }
- }break;
- case OC_MODE_INTER_MV:{
- prior_mv=last_mv;
- last_mv=mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
- }break;
- case OC_MODE_INTER_MV_LAST:mbmv=last_mv;break;
- case OC_MODE_INTER_MV_LAST2:{
- mbmv=prior_mv;
- prior_mv=last_mv;
- last_mv=mbmv;
- }break;
- case OC_MODE_GOLDEN_MV:{
- mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
- }break;
- default:mbmv=0;break;
+ }
}
- /*4MV mode fills in the fragments itself.
- For all other modes we can use this common code.*/
- if(mb_mode!=OC_MODE_INTER_MV_FOUR){
- for(codedi=0;codedi<ncoded;codedi++){
- mapi=coded[codedi];
+ else{
+ switch(mb_mode){
+ case OC_MODE_INTER_MV:{
+ prior_mv=last_mv;
+ last_mv=mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
+ }break;
+ case OC_MODE_INTER_MV_LAST:mbmv=last_mv;break;
+ case OC_MODE_INTER_MV_LAST2:{
+ mbmv=prior_mv;
+ prior_mv=last_mv;
+ last_mv=mbmv;
+ }break;
+ case OC_MODE_GOLDEN_MV:{
+ mbmv=oc_mv_unpack(&_dec->opb,mv_comp_tree);
+ }break;
+ default:mbmv=0;break;
+ }
+ /*Fill in the MVs for the fragments.*/
+ refi=OC_FRAME_FOR_MODE(mb_mode);
+ mapii=0;
+ do{
+ mapi=map_idxs[mapii];
fragi=mb_maps[mbi][mapi>>2][mapi&3];
- frags[fragi].refi=refi;
- frags[fragi].mb_mode=mb_mode;
- frag_mvs[fragi]=mbmv;
+ if(frags[fragi].coded){
+ frags[fragi].refi=refi;
+ frags[fragi].mb_mode=mb_mode;
+ frag_mvs[fragi]=mbmv;
+ }
}
+ while(++mapii<map_nidxs);
}
}
}
More information about the commits
mailing list