[xiph-commits] r17428 - experimental/derf/theora-ptalarbvorm/lib/arm
tterribe at svn.xiph.org
tterribe at svn.xiph.org
Wed Sep 22 12:50:33 PDT 2010
Author: tterribe
Date: 2010-09-22 12:50:32 -0700 (Wed, 22 Sep 2010)
New Revision: 17428
Modified:
experimental/derf/theora-ptalarbvorm/lib/arm/armidct.s
experimental/derf/theora-ptalarbvorm/lib/arm/armint.h
experimental/derf/theora-ptalarbvorm/lib/arm/armstate.c
Log:
Add DC-only iDCT asm.
This was just a 1-line for loop, but gcc failed to vectorize it on ARM, and
it gets executed quite a lot, especially at low bitrates.
This saves about 3% on a Cortex A8.
Modified: experimental/derf/theora-ptalarbvorm/lib/arm/armidct.s
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armidct.s 2010-09-22 19:41:13 UTC (rev 17427)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armidct.s 2010-09-22 19:50:32 UTC (rev 17428)
@@ -18,8 +18,26 @@
GET armopts.s
+ EXPORT oc_idct8x8_1_arm
EXPORT oc_idct8x8_arm
+oc_idct8x8_1_arm
+ ; r0 = ogg_int16_t *_y
+ ; r1 = ogg_uint16_t _dc
+ ORR r1, r1, r1, LSL #16
+ MOV r2, r1
+ MOV r3, r1
+ MOV r12,r1
+ STMIA r0!,{r1,r2,r3,r12}
+ STMIA r0!,{r1,r2,r3,r12}
+ STMIA r0!,{r1,r2,r3,r12}
+ STMIA r0!,{r1,r2,r3,r12}
+ STMIA r0!,{r1,r2,r3,r12}
+ STMIA r0!,{r1,r2,r3,r12}
+ STMIA r0!,{r1,r2,r3,r12}
+ STMIA r0!,{r1,r2,r3,r12}
+ MOV PC, r14
+
oc_idct8x8_arm
; r0 = ogg_int16_t *_y
; r1 = ogg_int16_t *_x
@@ -719,8 +737,32 @@
LDMFD r13!,{r1,PC}
[ OC_ARM_ASM_MEDIA
+ EXPORT oc_idct8x8_1_v6
EXPORT oc_idct8x8_v6
+oc_idct8x8_1_v6
+ ; r0 = ogg_int16_t *_y
+ ; r1 = ogg_uint16_t _dc
+ ORR r2, r1, r1, LSL #16
+ ORR r3, r1, r1, LSL #16
+ STRD r2, [r0], #8
+ STRD r2, [r0], #8
+ STRD r2, [r0], #8
+ STRD r2, [r0], #8
+ STRD r2, [r0], #8
+ STRD r2, [r0], #8
+ STRD r2, [r0], #8
+ STRD r2, [r0], #8
+ STRD r2, [r0], #8
+ STRD r2, [r0], #8
+ STRD r2, [r0], #8
+ STRD r2, [r0], #8
+ STRD r2, [r0], #8
+ STRD r2, [r0], #8
+ STRD r2, [r0], #8
+ STRD r2, [r0], #8
+ MOV PC, r14
+
oc_idct8x8_v6
; r0 = ogg_int16_t *_y
; r1 = ogg_int16_t *_x
@@ -1448,6 +1490,7 @@
]
[ OC_ARM_ASM_NEON
+ EXPORT oc_idct8x8_1_neon
EXPORT oc_idct8x8_neon
ALIGN 16
@@ -1461,6 +1504,17 @@
DCW 25080 ; 30FC (C6S2)
DCW 12785 ; 31F1 (C7S1)
+oc_idct8x8_1_neon
+ ; r0 = ogg_int16_t *_y
+ ; r1 = ogg_uint16_t _dc
+ VDUP.S16 Q0, r1
+ VMOV Q1, Q0
+ VST1.64 {D0, D1, D2, D3}, [r0@128]!
+ VST1.64 {D0, D1, D2, D3}, [r0@128]!
+ VST1.64 {D0, D1, D2, D3}, [r0@128]!
+ VST1.64 {D0, D1, D2, D3}, [r0@128]
+ MOV PC, r14
+
oc_idct8x8_neon
; r0 = ogg_int16_t *_y
; r1 = ogg_int16_t *_x
Modified: experimental/derf/theora-ptalarbvorm/lib/arm/armint.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armint.h 2010-09-22 19:41:13 UTC (rev 17427)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armint.h 2010-09-22 19:50:32 UTC (rev 17428)
@@ -68,7 +68,10 @@
int _ystride,const ogg_int16_t *_residue);
void oc_frag_recon_inter2_arm(unsigned char *_dst,const unsigned char *_src1,
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_arm(ogg_int16_t _y[64],ogg_uint16_t _dc);
void oc_idct8x8_arm(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
void oc_loop_filter_frag_rows_arm(unsigned char *_ref_frame_data,
int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
@@ -86,7 +89,10 @@
int _ystride,const ogg_int16_t *_residue);
void oc_frag_recon_inter2_v6(unsigned char *_dst,const unsigned char *_src1,
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_v6(ogg_int16_t _y[64],ogg_uint16_t _dc);
void oc_idct8x8_v6(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
void oc_loop_filter_init_v6(signed char *_bv,int _flimit);
void oc_loop_filter_frag_rows_v6(unsigned char *_ref_frame_data,
int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
@@ -103,7 +109,10 @@
int _ystride,const ogg_int16_t *_residue);
void oc_frag_recon_inter2_neon(unsigned char *_dst,const unsigned char *_src1,
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_neon(ogg_int16_t _y[64],ogg_uint16_t _dc);
void oc_idct8x8_neon(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
void oc_loop_filter_init_neon(signed char *_bv,int _flimit);
void oc_loop_filter_frag_rows_neon(unsigned char *_ref_frame_data,
int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
Modified: experimental/derf/theora-ptalarbvorm/lib/arm/armstate.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armstate.c 2010-09-22 19:41:13 UTC (rev 17427)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armstate.c 2010-09-22 19:50:32 UTC (rev 17428)
@@ -50,6 +50,7 @@
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_arm;
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_arm;
_state->opt_vtable.idct8x8=oc_idct8x8_arm;
+ _state->opt_vtable.state_frag_recon=oc_state_frag_recon_arm;
/*Note: We _must_ set this function pointer, because the macro in armint.h
calls it with different arguments, so the C version will segfault.*/
_state->opt_vtable.state_loop_filter_frag_rows=
@@ -68,6 +69,7 @@
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_v6;
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_v6;
_state->opt_vtable.idct8x8=oc_idct8x8_v6;
+ _state->opt_vtable.state_frag_recon=oc_state_frag_recon_v6;
_state->opt_vtable.loop_filter_init=oc_loop_filter_init_v6;
_state->opt_vtable.state_loop_filter_frag_rows=
(oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_v6;
@@ -80,6 +82,7 @@
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_neon;
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_neon;
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_neon;
+ _state->opt_vtable.state_frag_recon=oc_state_frag_recon_neon;
_state->opt_vtable.loop_filter_init=oc_loop_filter_init_neon;
_state->opt_vtable.state_loop_filter_frag_rows=
(oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_neon;
@@ -92,4 +95,137 @@
# endif
}
+void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+ unsigned char *dst;
+ ptrdiff_t frag_buf_off;
+ int ystride;
+ int mb_mode;
+ /*Apply the inverse transform.*/
+ /*Special case only having a DC component.*/
+ if(_last_zzi<2){
+ ogg_uint16_t p;
+ /*We round this dequant product (and not any of the others) because there's
+ no iDCT rounding.*/
+ p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+ oc_idct8x8_1_arm(_dct_coeffs+64,p);
+ }
+ else{
+ /*First, dequantize the DC coefficient.*/
+ _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+ oc_idct8x8_arm(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+ }
+ /*Fill in the target buffer.*/
+ frag_buf_off=_state->frag_buf_offs[_fragi];
+ mb_mode=_state->frags[_fragi].mb_mode;
+ ystride=_state->ref_ystride[_pli];
+ dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
+ if(mb_mode==OC_MODE_INTRA){
+ oc_frag_recon_intra_arm(dst,ystride,_dct_coeffs+64);
+ }
+ else{
+ const unsigned char *ref;
+ int mvoffsets[2];
+ ref=
+ _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
+ +frag_buf_off;
+ if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+ _state->frag_mvs[_fragi])>1){
+ oc_frag_recon_inter2_arm(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+ _dct_coeffs+64);
+ }
+ else oc_frag_recon_inter_arm(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+ }
+}
+
+# if defined(OC_ARM_ASM_MEDIA)
+void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+ unsigned char *dst;
+ ptrdiff_t frag_buf_off;
+ int ystride;
+ int mb_mode;
+ /*Apply the inverse transform.*/
+ /*Special case only having a DC component.*/
+ if(_last_zzi<2){
+ ogg_uint16_t p;
+ /*We round this dequant product (and not any of the others) because there's
+ no iDCT rounding.*/
+ p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+ oc_idct8x8_1_v6(_dct_coeffs+64,p);
+ }
+ else{
+ /*First, dequantize the DC coefficient.*/
+ _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+ oc_idct8x8_v6(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+ }
+ /*Fill in the target buffer.*/
+ frag_buf_off=_state->frag_buf_offs[_fragi];
+ mb_mode=_state->frags[_fragi].mb_mode;
+ ystride=_state->ref_ystride[_pli];
+ dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
+ if(mb_mode==OC_MODE_INTRA){
+ oc_frag_recon_intra_v6(dst,ystride,_dct_coeffs+64);
+ }
+ else{
+ const unsigned char *ref;
+ int mvoffsets[2];
+ ref=
+ _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
+ +frag_buf_off;
+ if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+ _state->frag_mvs[_fragi])>1){
+ oc_frag_recon_inter2_v6(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+ _dct_coeffs+64);
+ }
+ else oc_frag_recon_inter_v6(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+ }
+}
+
+# if defined(OC_ARM_ASM_NEON)
+void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+ unsigned char *dst;
+ ptrdiff_t frag_buf_off;
+ int ystride;
+ int mb_mode;
+ /*Apply the inverse transform.*/
+ /*Special case only having a DC component.*/
+ if(_last_zzi<2){
+ ogg_uint16_t p;
+ /*We round this dequant product (and not any of the others) because there's
+ no iDCT rounding.*/
+ p=(ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
+ oc_idct8x8_1_neon(_dct_coeffs+64,p);
+ }
+ else{
+ /*First, dequantize the DC coefficient.*/
+ _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+ oc_idct8x8_neon(_dct_coeffs+64,_dct_coeffs,_last_zzi);
+ }
+ /*Fill in the target buffer.*/
+ frag_buf_off=_state->frag_buf_offs[_fragi];
+ mb_mode=_state->frags[_fragi].mb_mode;
+ ystride=_state->ref_ystride[_pli];
+ dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
+ if(mb_mode==OC_MODE_INTRA){
+ oc_frag_recon_intra_neon(dst,ystride,_dct_coeffs+64);
+ }
+ else{
+ const unsigned char *ref;
+ int mvoffsets[2];
+ ref=
+ _state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]
+ +frag_buf_off;
+ if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
+ _state->frag_mvs[_fragi])>1){
+ oc_frag_recon_inter2_neon(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
+ _dct_coeffs+64);
+ }
+ else oc_frag_recon_inter_neon(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+ }
+}
+# endif
+# endif
+
#endif
More information about the commits mailing list