[xiph-commits] r17428 - experimental/derf/theora-ptalarbvorm/lib/arm

tterribe at svn.xiph.org tterribe at svn.xiph.org
Wed Sep 22 12:50:33 PDT 2010


Author: tterribe
Date: 2010-09-22 12:50:32 -0700 (Wed, 22 Sep 2010)
New Revision: 17428

Modified:
   experimental/derf/theora-ptalarbvorm/lib/arm/armidct.s
   experimental/derf/theora-ptalarbvorm/lib/arm/armint.h
   experimental/derf/theora-ptalarbvorm/lib/arm/armstate.c
Log:
Add DC-only iDCT asm.

This was just a 1-line for loop, but gcc failed to vectorize it on ARM, and
 it gets executed quite a lot, especially at low bitrates.
This saves about 3% on a Cortex A8.


Modified: experimental/derf/theora-ptalarbvorm/lib/arm/armidct.s
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armidct.s	2010-09-22 19:41:13 UTC (rev 17427)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armidct.s	2010-09-22 19:50:32 UTC (rev 17428)
@@ -18,8 +18,26 @@
 
 	GET	armopts.s
 
+	EXPORT	oc_idct8x8_1_arm
 	EXPORT	oc_idct8x8_arm
 
+oc_idct8x8_1_arm
+	; r0 = ogg_int16_t  *_y   (destination; 64 entries = 128 bytes are written)
+	; r1 = ogg_uint16_t  _dc  (the 16-bit value stored to every entry of _y)
+	ORR	r1, r1, r1, LSL #16	; r1 = _dc | _dc<<16 (assumes r1's top half is 0 on entry -- TODO confirm)
+	MOV	r2, r1
+	MOV	r3, r1
+	MOV	r12,r1	; r1, r2, r3 and r12 now all hold the same packed pair.
+	STMIA	r0!,{r1,r2,r3,r12}	; Each STMIA stores 4 words (16 bytes) and advances r0 past them.
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}
+	STMIA	r0!,{r1,r2,r3,r12}	; 8 stores x 16 bytes = 128 bytes in total.
+	MOV	PC, r14	; Return to the caller (r14 is the link register).
+
 oc_idct8x8_arm
 	; r0 = ogg_int16_t *_y
 	; r1 = ogg_int16_t *_x
@@ -719,8 +737,32 @@
 	LDMFD	r13!,{r1,PC}
 
  [ OC_ARM_ASM_MEDIA
+	EXPORT	oc_idct8x8_1_v6
 	EXPORT	oc_idct8x8_v6
 
+oc_idct8x8_1_v6
+	; r0 = ogg_int16_t  *_y   (destination; 64 entries = 128 bytes are written)
+	; r1 = ogg_uint16_t  _dc  (the 16-bit value stored to every entry of _y)
+	ORR	r2, r1, r1, LSL #16	; r2 = _dc | _dc<<16 (assumes r1's top half is 0 on entry -- TODO confirm)
+	ORR	r3, r1, r1, LSL #16	; r3 = the same value; STRD r2 stores the r2/r3 register pair.
+	STRD	r2, [r0], #8	; Store 8 bytes at r0, then post-increment r0 by 8.
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8
+	STRD	r2, [r0], #8	; 16 stores x 8 bytes = 128 bytes in total.
+	MOV	PC, r14	; Return to the caller (r14 is the link register).
+
 oc_idct8x8_v6
 	; r0 = ogg_int16_t *_y
 	; r1 = ogg_int16_t *_x
@@ -1448,6 +1490,7 @@
  ]
 
  [ OC_ARM_ASM_NEON
+	EXPORT	oc_idct8x8_1_neon
 	EXPORT	oc_idct8x8_neon
 
 	ALIGN 16
@@ -1461,6 +1504,17 @@
 	DCW	25080 ; 30FC (C6S2)
 	DCW	12785 ; 31F1 (C7S1)
 
+oc_idct8x8_1_neon
+	; r0 = ogg_int16_t  *_y   (destination; 128 bytes are written; @128 asserts 128-bit alignment)
+	; r1 = ogg_uint16_t  _dc  (the 16-bit value stored to every entry of _y)
+	VDUP.S16	Q0, r1	; Q0 = 8 copies of the low 16 bits of r1.
+	VMOV		Q1, Q0	; Q1 = Q0, so D0-D3 together hold 16 copies.
+	VST1.64		{D0, D1, D2, D3}, [r0@128]!	; Store 32 bytes, then advance r0 past them.
+	VST1.64		{D0, D1, D2, D3}, [r0@128]!
+	VST1.64		{D0, D1, D2, D3}, [r0@128]!
+	VST1.64		{D0, D1, D2, D3}, [r0@128]	; Final store (4 x 32 = 128 bytes); no writeback.
+	MOV	PC, r14	; Return to the caller (r14 is the link register).
+
 oc_idct8x8_neon
 	; r0 = ogg_int16_t *_y
 	; r1 = ogg_int16_t *_x

Modified: experimental/derf/theora-ptalarbvorm/lib/arm/armint.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armint.h	2010-09-22 19:41:13 UTC (rev 17427)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armint.h	2010-09-22 19:50:32 UTC (rev 17428)
@@ -68,7 +68,10 @@
  int _ystride,const ogg_int16_t *_residue);
 void oc_frag_recon_inter2_arm(unsigned char *_dst,const unsigned char *_src1,
  const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_arm(ogg_int16_t _y[64],ogg_uint16_t _dc);
 void oc_idct8x8_arm(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
 void oc_loop_filter_frag_rows_arm(unsigned char *_ref_frame_data,
  int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
  ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
@@ -86,7 +89,10 @@
  int _ystride,const ogg_int16_t *_residue);
 void oc_frag_recon_inter2_v6(unsigned char *_dst,const unsigned char *_src1,
  const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_v6(ogg_int16_t _y[64],ogg_uint16_t _dc);
 void oc_idct8x8_v6(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
 void oc_loop_filter_init_v6(signed char *_bv,int _flimit);
 void oc_loop_filter_frag_rows_v6(unsigned char *_ref_frame_data,
  int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
@@ -103,7 +109,10 @@
  int _ystride,const ogg_int16_t *_residue);
 void oc_frag_recon_inter2_neon(unsigned char *_dst,const unsigned char *_src1,
  const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_1_neon(ogg_int16_t _y[64],ogg_uint16_t _dc);
 void oc_idct8x8_neon(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
 void oc_loop_filter_init_neon(signed char *_bv,int _flimit);
 void oc_loop_filter_frag_rows_neon(unsigned char *_ref_frame_data,
  int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,

Modified: experimental/derf/theora-ptalarbvorm/lib/arm/armstate.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armstate.c	2010-09-22 19:41:13 UTC (rev 17427)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armstate.c	2010-09-22 19:50:32 UTC (rev 17428)
@@ -50,6 +50,7 @@
   _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_arm;
   _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_arm;
   _state->opt_vtable.idct8x8=oc_idct8x8_arm;
+  _state->opt_vtable.state_frag_recon=oc_state_frag_recon_arm;
   /*Note: We _must_ set this function pointer, because the macro in armint.h
      calls it with different arguments, so the C version will segfault.*/
   _state->opt_vtable.state_loop_filter_frag_rows=
@@ -68,6 +69,7 @@
     _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_v6;
     _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_v6;
     _state->opt_vtable.idct8x8=oc_idct8x8_v6;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_v6;
     _state->opt_vtable.loop_filter_init=oc_loop_filter_init_v6;
     _state->opt_vtable.state_loop_filter_frag_rows=
      (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_v6;
@@ -80,6 +82,7 @@
     _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_neon;
     _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_neon;
     _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_neon;
+    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_neon;
     _state->opt_vtable.loop_filter_init=oc_loop_filter_init_neon;
     _state->opt_vtable.state_loop_filter_frag_rows=
      (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_neon;
@@ -92,4 +95,137 @@
 # endif
 }
 
+void oc_state_frag_recon_arm(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  const unsigned char *pred;
+  unsigned char       *recon;
+  ogg_int16_t         *residue;
+  ptrdiff_t            buf_off;
+  int                  stride;
+  int                  mode;
+  int                  mvoffs[2];
+  /*The inverse transform writes its output to the second half of _dct_coeffs.*/
+  residue=_dct_coeffs+64;
+  if(_last_zzi>=2){
+    /*General case: dequantize the DC coefficient, then run the full iDCT.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_arm(residue,_dct_coeffs,_last_zzi);
+  }
+  else{
+    /*DC-only case: this dequant product is rounded (the one above is not)
+       because there's no iDCT rounding.*/
+    oc_idct8x8_1_arm(residue,
+     (ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5));
+  }
+  /*Look up where this fragment lives in the frame being reconstructed.*/
+  buf_off=_state->frag_buf_offs[_fragi];
+  mode=_state->frags[_fragi].mb_mode;
+  stride=_state->ref_ystride[_pli];
+  recon=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+buf_off;
+  if(mode==OC_MODE_INTRA){
+    oc_frag_recon_intra_arm(recon,stride,residue);
+    return;
+  }
+  /*Otherwise, predict from the reference frame selected by the coding mode.*/
+  pred=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mode)]]
+   +buf_off;
+  if(oc_state_get_mv_offsets(_state,mvoffs,_pli,_state->frag_mvs[_fragi])<=1){
+    oc_frag_recon_inter_arm(recon,pred+mvoffs[0],stride,residue);
+  }
+  else{
+    oc_frag_recon_inter2_arm(recon,pred+mvoffs[0],pred+mvoffs[1],stride,
+     residue);
+  }
+}
+
+# if defined(OC_ARM_ASM_MEDIA)
+void oc_state_frag_recon_v6(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  const unsigned char *pred;
+  unsigned char       *recon;
+  ogg_int16_t         *residue;
+  ptrdiff_t            buf_off;
+  int                  stride;
+  int                  mode;
+  int                  mvoffs[2];
+  /*The inverse transform writes its output to the second half of _dct_coeffs.*/
+  residue=_dct_coeffs+64;
+  if(_last_zzi>=2){
+    /*General case: dequantize the DC coefficient, then run the full iDCT.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_v6(residue,_dct_coeffs,_last_zzi);
+  }
+  else{
+    /*DC-only case: this dequant product is rounded (the one above is not)
+       because there's no iDCT rounding.*/
+    oc_idct8x8_1_v6(residue,
+     (ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5));
+  }
+  /*Look up where this fragment lives in the frame being reconstructed.*/
+  buf_off=_state->frag_buf_offs[_fragi];
+  mode=_state->frags[_fragi].mb_mode;
+  stride=_state->ref_ystride[_pli];
+  recon=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+buf_off;
+  if(mode==OC_MODE_INTRA){
+    oc_frag_recon_intra_v6(recon,stride,residue);
+    return;
+  }
+  /*Otherwise, predict from the reference frame selected by the coding mode.*/
+  pred=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mode)]]
+   +buf_off;
+  if(oc_state_get_mv_offsets(_state,mvoffs,_pli,_state->frag_mvs[_fragi])<=1){
+    oc_frag_recon_inter_v6(recon,pred+mvoffs[0],stride,residue);
+  }
+  else{
+    oc_frag_recon_inter2_v6(recon,pred+mvoffs[0],pred+mvoffs[1],stride,
+     residue);
+  }
+}
+
+# if defined(OC_ARM_ASM_NEON)
+void oc_state_frag_recon_neon(const oc_theora_state *_state,ptrdiff_t _fragi,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
+  const unsigned char *pred;
+  unsigned char       *recon;
+  ogg_int16_t         *residue;
+  ptrdiff_t            buf_off;
+  int                  stride;
+  int                  mode;
+  int                  mvoffs[2];
+  /*The inverse transform writes its output to the second half of _dct_coeffs.*/
+  residue=_dct_coeffs+64;
+  if(_last_zzi>=2){
+    /*General case: dequantize the DC coefficient, then run the full iDCT.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_neon(residue,_dct_coeffs,_last_zzi);
+  }
+  else{
+    /*DC-only case: this dequant product is rounded (the one above is not)
+       because there's no iDCT rounding.*/
+    oc_idct8x8_1_neon(residue,
+     (ogg_uint16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5));
+  }
+  /*Look up where this fragment lives in the frame being reconstructed.*/
+  buf_off=_state->frag_buf_offs[_fragi];
+  mode=_state->frags[_fragi].mb_mode;
+  stride=_state->ref_ystride[_pli];
+  recon=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+buf_off;
+  if(mode==OC_MODE_INTRA){
+    oc_frag_recon_intra_neon(recon,stride,residue);
+    return;
+  }
+  /*Otherwise, predict from the reference frame selected by the coding mode.*/
+  pred=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_FOR_MODE(mode)]]
+   +buf_off;
+  if(oc_state_get_mv_offsets(_state,mvoffs,_pli,_state->frag_mvs[_fragi])<=1){
+    oc_frag_recon_inter_neon(recon,pred+mvoffs[0],stride,residue);
+  }
+  else{
+    oc_frag_recon_inter2_neon(recon,pred+mvoffs[0],pred+mvoffs[1],stride,
+     residue);
+  }
+}
+#  endif
+# endif
+
 #endif



More information about the commits mailing list