[xiph-commits] r17749 - in trunk/theora/lib: . arm

Sun Dec 12 22:27:49 PST 2010

Author: tterribe
Date: 2010-12-12 22:27:49 -0800 (Sun, 12 Dec 2010)
New Revision: 17749

Added:
   trunk/theora/lib/arm/armenc.c
   trunk/theora/lib/arm/armenc.h
   trunk/theora/lib/arm/armenquant.s
Modified:
   trunk/theora/lib/Makefile.am
   trunk/theora/lib/encint.h
Log:
Add a NEON version of oc_enc_quantize().


Modified: trunk/theora/lib/Makefile.am
===================================================================

--- trunk/theora/lib/Makefile.am	2010-12-12 09:40:50 UTC (rev 17748)
+++ trunk/theora/lib/Makefile.am	2010-12-13 06:27:49 UTC (rev 17749)
@@ -3,14 +3,16 @@
 
 EXTRA_DIST = \
 	encoder_disabled.c \
+	arm/arm2gnu.pl \
+	arm/armopts.s.in \
 	arm/armcpu.c \
 	arm/armbits.h \
 	arm/armbits.s \
 	arm/armfrag.s \
 	arm/armidct.s \
 	arm/armint.h \
-	arm/armopts.s.in \
-	arm/arm2gnu.pl \
+	arm/armenc.h \
+	arm/armenquant.s \
 	c64x/c64xint.h \
 	c64x/c64xdec.h \
 	x86/mmxfrag.c \
@@ -61,7 +63,9 @@
 
 encoder_shared_x86_64_sources =
 
-encoder_uniq_arm_sources =
+encoder_uniq_arm_sources = \
+	armenquant-gnu.S \
+	arm/armenc.c
 
 if CPU_arm
 BUILT_SOURCES = \
@@ -268,12 +272,14 @@
 	armfrag-gnu.S \
 	armidct-gnu.S \
 	armloop-gnu.S \
-	armopts-gnu.S
+	armopts-gnu.S \
+	armenquant-gnu.S
 
 # automake doesn't do dependency tracking for asm files, that I can tell
 armfrag-gnu.S: armopts-gnu.S
 armidct-gnu.S: armopts-gnu.S
 armloop-gnu.S: armopts-gnu.S
+armenquant-gnu.S: armopts-gnu.S
 
 # convert ARM asm to GNU as format
 %-gnu.S: $(srcdir)/arm/%.s

Copied: trunk/theora/lib/arm/armenc.c (from rev 17748, trunk/theora/lib/arm/armstate.c)
===================================================================
--- trunk/theora/lib/arm/armenc.c	                        (rev 0)
+++ trunk/theora/lib/arm/armenc.c	2010-12-13 06:27:49 UTC (rev 17749)
@@ -0,0 +1,54 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#include "armenc.h"
+
+#if defined(OC_ARM_ASM)
+
+void oc_enc_accel_init_arm(oc_enc_ctx *_enc){
+  ogg_uint32_t cpu_flags;
+  cpu_flags=_enc->state.cpu_flags;
+  oc_enc_accel_init_c(_enc);
+# if defined(OC_ENC_USE_VTABLE)
+  /*TODO: Add ARMv4 functions here.*/
+# endif
+# if defined(OC_ARM_ASM_EDSP)
+  if(cpu_flags&OC_CPU_ARM_EDSP){
+#  if defined(OC_STATE_USE_VTABLE)
+    /*TODO: Add EDSP functions here.*/
+#  endif
+  }
+#  if defined(OC_ARM_ASM_MEDIA)
+  if(cpu_flags&OC_CPU_ARM_MEDIA){
+#   if defined(OC_STATE_USE_VTABLE)
+    /*TODO: Add Media functions here.*/
+#   endif
+  }
+#   if defined(OC_ARM_ASM_NEON)
+  if(cpu_flags&OC_CPU_ARM_NEON){
+#    if defined(OC_STATE_USE_VTABLE)
+    _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_neon;
+    _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_neon;
+    _enc->opt_vtable.quantize=oc_enc_quantize_neon;
+#    endif
+    _enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t);
+    _enc->opt_data.enquant_table_alignment=16;
+  }
+#   endif
+#  endif
+# endif
+}
+#endif

Copied: trunk/theora/lib/arm/armenc.h (from rev 17748, trunk/theora/lib/arm/armint.h)
===================================================================
--- trunk/theora/lib/arm/armenc.h	                        (rev 0)
+++ trunk/theora/lib/arm/armenc.h	2010-12-13 06:27:49 UTC (rev 17749)
@@ -0,0 +1,44 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armenc_H)
+# define _arm_armenc_H (1)
+# include "armint.h"
+
+# if defined(OC_ARM_ASM)
+#  define oc_enc_accel_init oc_enc_accel_init_arm
+#  define OC_ENC_USE_VTABLE (1)
+# endif
+
+# include "../encint.h"
+
+# if defined(OC_ARM_ASM)
+void oc_enc_accel_init_arm(oc_enc_ctx *_enc);
+
+#  if defined(OC_ARM_ASM_EDSP)
+#   if defined(OC_ARM_ASM_MEDIA)
+#    if defined(OC_ARM_ASM_NEON)
+void oc_enc_enquant_table_init_neon(void *_enquant,
+ const ogg_uint16_t _dequant[64]);
+void oc_enc_enquant_table_fixup_neon(void *_enquant[3][3][2],int _nqis);
+int oc_enc_quantize_neon(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
+ const ogg_uint16_t _dequant[64],const void *_enquant);
+#    endif
+#   endif
+#  endif
+# endif
+
+#endif

Added: trunk/theora/lib/arm/armenquant.s
===================================================================
--- trunk/theora/lib/arm/armenquant.s	                        (rev 0)
+++ trunk/theora/lib/arm/armenquant.s	2010-12-13 06:27:49 UTC (rev 17749)
@@ -0,0 +1,164 @@
+;********************************************************************
+;*                                                                  *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+;*                                                                  *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;*                                                                  *
+;********************************************************************
+;
+; function:
+;   last mod: $Id: mmxstate.c 17247 2010-05-28 05:35:32Z tterribe $
+;
+;********************************************************************
+
+	AREA	|.text|, CODE, READONLY
+
+	GET	armopts.s
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_enc_enquant_table_init_neon
+	EXPORT	oc_enc_enquant_table_fixup_neon
+	EXPORT	oc_enc_quantize_neon
+
+oc_enc_enquant_table_init_neon PROC
+	; r0 = void               *_enquant
+	; r1 = const ogg_uint16_t  _dequant[64]
+	STMFD r13!,{r0,r14}
+	; Initialize the table using the C routine
+	BLX	oc_enc_enquant_table_init_c
+	LDR	r0, [r13],#4
+	MOV	r1, #2
+	; Now partially de-interleave it, so that the first row is all
+	;  multipliers, the second row is all shift factors, etc.
+	; Also, negate the shifts for use by VSHL.
+oeeti_neon_lp
+	SUBS	r1, r1, #1
+	VLDMIA		r0, {D16-D31}
+	VUZP.16		Q8, Q9
+	VNEG.S16	Q9, Q9
+	VUZP.16		Q10,Q11
+	VNEG.S16	Q11,Q11
+	VUZP.16		Q12,Q13
+	VNEG.S16	Q13,Q13
+	VUZP.16		Q14,Q15
+	VNEG.S16	Q15,Q15
+	VSTMIA		r0!,{D16-D31}
+	BNE	oeeti_neon_lp
+	LDR	PC, [r13],#4
+	ENDP
+
+oc_enc_enquant_table_fixup_neon PROC
+	; r0 = void *_enquant[3][3][2]
+	; r1 = int   _nqis
+	STR	r14, [r13,#-4]!
+oeetf_neon_lp1
+	SUBS	r1, r1, #1
+	BEQ	oeetf_neon_end1
+	MOV	r14,#3
+oeetf_neon_lp2
+	LDR	r2, [r0]
+	SUBS	r14,r14,#1
+	LDRH	r3, [r2]
+	LDRH	r12,[r2,#32]
+	LDR	r2, [r0,#8]
+	STRH	r3, [r2]
+	STRH	r12,[r2,#32]
+	LDR	r2, [r0,#4]
+	LDRH	r3, [r2]
+	LDRH	r12,[r2,#32]
+	LDR	r2, [r0,#12]
+	ADD	r0, r0, #24
+	STRH	r3, [r2]
+	STRH	r12,[r2,#32]
+	BNE	oeetf_neon_lp2
+	SUB	r0, r0, #64
+	B	oeetf_neon_lp1
+oeetf_neon_end1
+	LDR	PC, [r13],#4
+	ENDP
+
+oc_enc_quantize_neon PROC
+	; r0 = ogg_int16_t        _qdct[64]
+	; r1 = const ogg_int16_t  _dct[64]
+	; r2 = const ogg_int16_t  _dequant[64]
+	; r3 = const void        *_enquant
+	STMFD	r13!,{r4,r5,r14}
+	; The loop counter goes in the high half of r14
+	MOV	r14,#0xFFFCFFFF
+oeq_neon_lp
+	; Load the next two rows of the data and the quant matrices.
+	VLD1.64		{D16,D17,D18,D19},[r1 at 128]!
+	VLD1.64		{D20,D21,D22,D23},[r2 at 128]!
+	; Add in the signed rounding bias from the quantizers.
+	; Note that the VHADD relies on the fact that the quantizers are all
+	;  even (they're in fact multiples of four) in order to round correctly
+	;  on the entries being negated.
+	VSHR.S16	Q0, Q8, #15
+	VSHR.S16	Q1, Q9, #15
+	VLD1.64		{D24,D25,D26,D27},[r3 at 128]!
+	VHADD.S16	Q10,Q0, Q10
+	VHADD.S16	Q11,Q1, Q11
+	VLD1.64		{D28,D29,D30,D31},[r3 at 128]!
+	ADDS	r14,r14,#1<<16
+	VEOR.S16	Q10,Q0, Q10
+	VEOR.S16	Q11,Q1, Q11
+	VADD.S16	Q8, Q8, Q10
+	VADD.S16	Q9, Q9, Q11
+	; Perform the actual division and save the result.
+	VQDMULH.S16	Q12,Q8, Q12
+	VQDMULH.S16	Q14,Q9, Q14
+	VADD.S16	Q8, Q8, Q8
+	VADD.S16	Q9, Q9, Q9
+	VADD.S16	Q8, Q8, Q12
+	VADD.S16	Q9, Q9, Q14
+	VSHL.S16	Q8, Q13
+	VSHL.S16	Q9, Q15
+	VSUB.S16	Q8, Q8, Q0
+	VSUB.S16	Q9, Q9, Q1
+	VST1.64		{D16,D17,D18,D19},[r0 at 128]!
+	; Now pull out a bitfield marking the non-zero coefficients.
+	; Sadly, NEON has no PMOVMSKB; emulating it requires 7 instructions.
+	VQMOVN.S16	D16,Q8
+	VQMOVN.S16	D17,Q9
+	VCEQ.S8		Q8, #0
+	VNEG.S8		Q8, Q8          ; D16=.......3.......2.......1.......0
+	                                ;     .......7.......6.......5.......4
+	                                ; D17=.......B.......A.......9.......8
+	                                ;     .......F.......E.......D.......C
+	VZIP.8		D16,D17         ; D16=.......9.......1.......8.......0
+	                                ;     .......B.......3.......A.......2
+	                                ; D17=.......D.......5.......C.......4
+	                                ;     .......F.......7.......E.......6
+	VSHL.U8		D17,D17,#4      ; D17=...D.......5.......C.......4....
+	                                ;     ...F.......7.......E.......6....
+	VORR		D16,D16,D17     ; D16=...D...9...5...1...C...8...4...0
+	                                ;     ...F...B...7...3...E...A...6...2
+	; Shift over the bitfields from previous iterations and
+	;  finish compacting the bitfield from the last iteration.
+	ORR	r4, r5, LSL #2          ; r4 =.F.D.B.9.7.5.3.1.E.C.A.8.6.4.2.0
+	ORR	r4, r4, LSR #15         ; r4 =.F.D.B.9.7.5.3.1FEDCBA9876543210
+	PKHTB	r14,r14,r12,ASR #16     ; r14=i|A
+	PKHBT	r12,r4, r12,LSL #16     ; r12=B|C
+	VMOV		r4, r5, D16
+	BLT	oeq_neon_lp
+	; Start with the low half while the NEON register transfers.
+	PKHBT	r0, r14,r12             ; r0 =B|A
+	MVN	r0, r0
+	CLZ	r0, r0
+	RSB	r0, r0, #31
+	; Stall 8-10 more cycles waiting for the last transfer.
+	ORR	r4, r5, LSL #2          ; r4 =.F.D.B.9.7.5.3.1.E.C.A.8.6.4.2.0
+	ORR	r4, r4, LSR #15         ; r4 =.F.D.B.9.7.5.3.1FEDCBA9876543210
+	PKHBT	r1, r12,r4, LSL #16     ; r1 = D|C
+	MVNS	r1, r1
+	CLZNE	r1, r1
+	RSBNE	r0, r1, #63
+	LDMFD	r13!,{r4,r5,PC}
+	ENDP
+ ]
+
+	END

Modified: trunk/theora/lib/encint.h
===================================================================
--- trunk/theora/lib/encint.h	2010-12-12 09:40:50 UTC (rev 17748)
+++ trunk/theora/lib/encint.h	2010-12-13 06:27:49 UTC (rev 17749)
@@ -51,6 +51,9 @@
 #   include "x86/x86enc.h"
 #  endif
 # endif
+# if defined(OC_ARM_ASM)
+#  include "arm/armenc.h"
+# endif
 
 # if !defined(oc_enc_accel_init)
 #  define oc_enc_accel_init oc_enc_accel_init_c