[xiph-commits] r17410 - in experimental/derf/theora-ptalarbvorm: . lib lib/arm lib/c64x lib/x86 lib/x86_vc m4
tterribe at svn.xiph.org
tterribe at svn.xiph.org
Tue Sep 21 14:53:48 PDT 2010
Author: tterribe
Date: 2010-09-21 14:53:48 -0700 (Tue, 21 Sep 2010)
New Revision: 17410
Added:
experimental/derf/theora-ptalarbvorm/lib/arm/
experimental/derf/theora-ptalarbvorm/lib/arm/arm2gnu.pl
experimental/derf/theora-ptalarbvorm/lib/arm/armbits.h
experimental/derf/theora-ptalarbvorm/lib/arm/armbits.s
experimental/derf/theora-ptalarbvorm/lib/arm/armcpu.c
experimental/derf/theora-ptalarbvorm/lib/arm/armcpu.h
experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s
experimental/derf/theora-ptalarbvorm/lib/arm/armidct.s
experimental/derf/theora-ptalarbvorm/lib/arm/armint.h
experimental/derf/theora-ptalarbvorm/lib/arm/armloop.s
experimental/derf/theora-ptalarbvorm/lib/arm/armopts.s.in
experimental/derf/theora-ptalarbvorm/lib/arm/armstate.c
experimental/derf/theora-ptalarbvorm/lib/x86/x86cpu.c
experimental/derf/theora-ptalarbvorm/lib/x86/x86cpu.h
experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86cpu.c
experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86cpu.h
experimental/derf/theora-ptalarbvorm/m4/as-gcc-inline-assembly.m4
Removed:
experimental/derf/theora-ptalarbvorm/lib/x86/cpu.c
experimental/derf/theora-ptalarbvorm/lib/x86/cpu.h
experimental/derf/theora-ptalarbvorm/lib/x86/mmxfrag.h
experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.c
experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.h
experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxfrag.h
Modified:
experimental/derf/theora-ptalarbvorm/AUTHORS
experimental/derf/theora-ptalarbvorm/autogen.sh
experimental/derf/theora-ptalarbvorm/configure.ac
experimental/derf/theora-ptalarbvorm/lib/Makefile.am
experimental/derf/theora-ptalarbvorm/lib/analyze.c
experimental/derf/theora-ptalarbvorm/lib/bitpack.c
experimental/derf/theora-ptalarbvorm/lib/bitpack.h
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.c
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xfrag.c
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xidct.c
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xint.h
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xstate.c
experimental/derf/theora-ptalarbvorm/lib/decint.h
experimental/derf/theora-ptalarbvorm/lib/decode.c
experimental/derf/theora-ptalarbvorm/lib/encint.h
experimental/derf/theora-ptalarbvorm/lib/fragment.c
experimental/derf/theora-ptalarbvorm/lib/huffdec.c
experimental/derf/theora-ptalarbvorm/lib/huffdec.h
experimental/derf/theora-ptalarbvorm/lib/idct.c
experimental/derf/theora-ptalarbvorm/lib/internal.h
experimental/derf/theora-ptalarbvorm/lib/state.c
experimental/derf/theora-ptalarbvorm/lib/state.h
experimental/derf/theora-ptalarbvorm/lib/tokenize.c
experimental/derf/theora-ptalarbvorm/lib/x86/mmxfrag.c
experimental/derf/theora-ptalarbvorm/lib/x86/mmxidct.c
experimental/derf/theora-ptalarbvorm/lib/x86/mmxstate.c
experimental/derf/theora-ptalarbvorm/lib/x86/sse2idct.c
experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h
experimental/derf/theora-ptalarbvorm/lib/x86/x86state.c
experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxfrag.c
experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxidct.c
experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxstate.c
experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86enc.c
experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86enc.h
experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86int.h
experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86state.c
Log:
Initial port of Robin Watts's Theorarm optimizations.
This includes the assembly for motion compensation, the iDCT, and the loop
filter, with minor fixes, additions, and speed improvements.
David Schleef and Ralph Giles contributed to the autoconf and automake support.
Cristian Adam contributed to testing the CPU detection on Windows.
New assembly was written for the libtheora bitpacker, as well as new ARMv6
versions of the iDCT and loop filter, and a new NEON version of the iDCT.
The iDCT was also modified to zero out its input coefficients afterwards, so
they can be used for the next block, avoiding the need to clear many
coefficients which were already zero (as an alternative to Robin's
oc_memzero16_64, which always cleared all the coefficients).
CPU detection for Linux and Windows was written to allow execution of a single
binary on multiple ARM CPUs; iOS detection still needs to be written.
We currently do not try to detect the _minimum_ CPU the code is being compiled
for, and thus will include some code for processors older than is strictly
needed, though it is certainly possible to fix this.
A number of other updates were applied to the optimizations for other
architectures to adapt to interface changes that helped ARM, and some
long-standing bugs were fixed (such as an off-by-one in the iDCT for every
architecture that was harmless, but hurt performance).
The x86 Windows assembly had bitrotted a bit, and has been restored closer to
something that should work, but is completely untested.
What has _not_ yet been ported over is assembly for some of the less-frequently
used routines (zero filling, border padding, and coded-block-flag reading), the
libogg2 bitpacker, and any of the post-processing, as well as most of the C
code changes — in particular the MV handling and the use of pointer arithmetic
instead of indexing in loops.
Some or all of this may follow in subsequent commits.
The current code, however, already benchmarks faster than the original Theorarm
version on ARMv6 or NEON-capable hardware.
Some of that is due to the new routines written for these CPUs, while the rest
is due to other general decoder improvements since the 1.1 release Theorarm was
based on.
Modified: experimental/derf/theora-ptalarbvorm/AUTHORS
===================================================================
--- experimental/derf/theora-ptalarbvorm/AUTHORS 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/AUTHORS 2010-09-21 21:53:48 UTC (rev 17410)
@@ -48,5 +48,7 @@
Rodolphe Ortalo
- Bug fixes
+Robin Watts
+ - ARM code optimisations
and other Xiph.org contributors
Modified: experimental/derf/theora-ptalarbvorm/autogen.sh
===================================================================
--- experimental/derf/theora-ptalarbvorm/autogen.sh 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/autogen.sh 2010-09-21 21:53:48 UTC (rev 17410)
@@ -127,4 +127,4 @@
autoconf || exit 1
cd $olddir
-$srcdir/configure --enable-maintainer-mode "$@" && /bin/echo
+$srcdir/configure "$@" && /bin/echo
Modified: experimental/derf/theora-ptalarbvorm/configure.ac
===================================================================
--- experimental/derf/theora-ptalarbvorm/configure.ac 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/configure.ac 2010-09-21 21:53:48 UTC (rev 17410)
@@ -12,7 +12,6 @@
AM_CONFIG_HEADER([config.h])
AC_CONFIG_SRCDIR([lib/fdct.c])
AM_INIT_AUTOMAKE
-AM_MAINTAINER_MODE
dnl we use doc_DATA in doc/Makefile.am which requires autoconf >= 2.60
dnl to define docdir for us.
@@ -55,6 +54,8 @@
dnl Check for programs
dnl --------------------------------------------------
+AM_PROG_AS
+
dnl save $CFLAGS since AC_PROG_CC likes to insert "-g -O2"
dnl if $CFLAGS is blank
cflags_save="$CFLAGS"
@@ -195,6 +196,8 @@
cpu_x86_64=no
cpu_x86_32=no
+cpu_arm=no
+cpu_c64x=no
AC_ARG_ENABLE(asm,
AS_HELP_STRING([--disable-asm], [Disable assembly optimizations]),
[ ac_enable_asm=$enableval ], [ ac_enable_asm=yes] )
@@ -203,7 +206,7 @@
cpu_optimization="no optimization for your platform, please send a patch"
case $target_cpu in
i[[3456]]86)
- cpu_x86_32=yes
+ cpu_x86_32=yes
cpu_optimization="32 bit x86"
AC_DEFINE([OC_X86_ASM], [], [make use of x86 asm optimization])
if test "x$target_vendor" = "xapple"; then
@@ -216,6 +219,99 @@
AC_DEFINE([OC_X86_ASM], [], [make use of x86 asm optimization])
AC_DEFINE([OC_X86_64_ASM], [], [make use of x86_64 asm optimization])
;;
+ arm*)
+ cpu_arm=yes
+ cpu_optimization="ARM"
+ AC_DEFINE([OC_ARM_ASM], [], [make use of arm asm optimization])
+ AC_ARG_ENABLE(asflag-probe,
+ AS_HELP_STRING([--disable-asflag-probe], [Disable instructions not supported by the default ASFLAGS (ARM only).]),
+ [ ac_enable_asflag_probe=$enableval ], [ ac_enable_asflag_probe=yes] )
+
+ dnl our ARM assembly requires perl to run the arm2gnu reformatter
+ AC_CHECK_PROG([HAVE_PERL], perl, yes, no)
+ if test "x$HAVE_PERL" = "xno"; then
+ AC_MSG_WARN([*** ARM assembly requires perl -- disabling optimizations])
+ cpu_arm=no
+ cpu_optimization="(missing perl dependency for ARM)"
+ fi
+
+ dnl AC_TRY_ASSEMBLE uses CFLAGS instead of CCASFLAGS
+ save_CFLAGS="$CFLAGS"
+ ARM_CCASFLAGS=
+ dnl Test for instruction set support with the default CCASFLAGS.
+ AS_ASM_ARM_NEON([HAVE_ARM_ASM_NEON=1],[HAVE_ARM_ASM_NEON=0])
+ AS_ASM_ARM_MEDIA([HAVE_ARM_ASM_MEDIA=1],[HAVE_ARM_ASM_MEDIA=0])
+ AS_ASM_ARM_EDSP([HAVE_ARM_ASM_EDSP=1],[HAVE_ARM_ASM_EDSP=0])
+ dnl gas will not assemble instructions unless the architecture explicitly
+ dnl supports it (unlike on x86).
+ dnl Try to speculatively add ASFLAGS to enable usage of these instructions
+ dnl at assembly time (actual support is detected at runtime).
+ dnl If the user has already specified -march or -mcpu flags, this may give
+ dnl some spurious warnings (use --disable-asflag-probe to avoid this if
+ dnl you don't want run-time support for instructions not available on the
+ dnl architecture you specified).
+ dnl Order here is important.
+ if test "x${ac_enable_asflag_probe}" = xyes; then
+ if test x$HAVE_ARM_ASM_NEON != x1 ; then
+ dnl Try to set some flags to enable NEON instructions.
+ AC_MSG_NOTICE([trying custom CCASFLAGS to enable NEON instructions...])
+ ARM_CCASFLAGS="-mfpu=neon -march=armv7-a"
+ CFLAGS="$save_CFLAGS $ARM_CCASFLAGS"
+ AS_ASM_ARM_NEON([HAVE_ARM_ASM_NEON=1],[HAVE_ARM_ASM_NEON=0])
+ if test x$HAVE_ARM_ASM_NEON != x1 ; then
+ ARM_CCASFLAGS=
+ CFLAGS="$save_CFLAGS"
+ fi
+ fi
+ if test x$HAVE_ARM_ASM_MEDIA != x1 ; then
+ dnl Try to set some flags to enable ARMv6 media instructions.
+ AC_MSG_NOTICE([trying custom CCASFLAGS to enable ARMv6 media instructions...])
+ ARM_CCASFLAGS="-march=armv6j"
+ CFLAGS="$save_CFLAGS $ARM_CCASFLAGS"
+ AS_ASM_ARM_MEDIA([HAVE_ARM_ASM_MEDIA=1],[HAVE_ARM_ASM_MEDIA=0])
+ if test x$HAVE_ARM_ASM_MEDIA != x1 ; then
+ ARM_CCASFLAGS=
+ CFLAGS="$save_CFLAGS"
+ fi
+ fi
+ if test x$HAVE_ARM_ASM_EDSP != x1 ; then
+ dnl Try to set some flags to enable EDSP instructions.
+ AC_MSG_NOTICE([trying custom CCASFLAGS to enable EDSP compilation...])
+ ARM_CCASFLAGS="-march=armv5e"
+ CFLAGS="$save_CFLAGS $ARM_CCASFLAGS"
+ AS_ASM_ARM_EDSP([HAVE_ARM_ASM_EDSP=1],[HAVE_ARM_ASM_EDSP=0])
+ if test x$HAVE_ARM_ASM_MEDIA != x1 ; then
+ ARM_CCASFLAGS=
+ CFLAGS="$save_CFLAGS"
+ fi
+ fi
+ fi
+
+ dnl Only enable if we passed the perl test above
+ if test x$cpu_arm = xyes; then
+ if test x$HAVE_ARM_ASM_EDSP = x1 ; then
+ AC_DEFINE(OC_ARM_ASM_EDSP, 1,
+ [Define if assembler supports EDSP instructions])
+ cpu_optimization="$cpu_optimization (EDSP)"
+ fi
+ AC_SUBST(HAVE_ARM_ASM_EDSP)
+ if test x$HAVE_ARM_ASM_MEDIA = x1 ; then
+ AC_DEFINE(OC_ARM_ASM_MEDIA, 1,
+ [Define if assembler supports ARMv6 media instructions])
+ cpu_optimization="$cpu_optimization (Media)"
+ fi
+ AC_SUBST(HAVE_ARM_ASM_MEDIA)
+ if test x$HAVE_ARM_ASM_NEON = x1 ; then
+ AC_DEFINE(OC_ARM_ASM_NEON, 1,
+ [Define if compiler supports NEON instructions])
+ cpu_optimization="$cpu_optimization (NEON)"
+ fi
+ AC_SUBST(HAVE_ARM_ASM_NEON)
+ fi
+
+ CFLAGS="$save_CFLAGS"
+ CCASFLAGS="$CCASFLAGS $ARM_CCASFLAGS"
+ ;;
tic6x)
cpu_c64x=yes
cpu_optimization="TI C64x+"
@@ -227,6 +323,7 @@
fi
AM_CONDITIONAL([CPU_x86_64], [test x$cpu_x86_64 = xyes])
AM_CONDITIONAL([CPU_x86_32], [test x$cpu_x86_32 = xyes])
+AM_CONDITIONAL([CPU_arm], [test x$cpu_arm = xyes])
AM_CONDITIONAL([CPU_c64x], [test x$cpu_c64x = xyes])
# Test whenever ld supports -version-script
@@ -477,6 +574,7 @@
AC_OUTPUT([
Makefile
lib/Makefile
+ lib/arm/armopts.s
include/Makefile include/theora/Makefile
examples/Makefile
doc/Makefile doc/Doxyfile doc/spec/Makefile
Modified: experimental/derf/theora-ptalarbvorm/lib/Makefile.am
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/Makefile.am 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/Makefile.am 2010-09-21 21:53:48 UTC (rev 17410)
@@ -3,7 +3,17 @@
EXTRA_DIST = \
encoder_disabled.c \
- x86/cpu.c \
+ arm/armcpu.c \
+ arm/armbits.h \
+ arm/armbits.s \
+ arm/armfrag.s \
+ arm/armidct.s \
+ arm/armint.h \
+ arm/armopts.s.in \
+ arm/arm2gnu.pl \
+ c64x/c64xint.h \
+ c64x/c64xdec.h \
+ x86/x86cpu.c \
x86/mmxencfrag.c \
x86/mmxfdct.c \
x86/sse2encfrag.c \
@@ -13,16 +23,13 @@
x86/x86enc.h \
x86/x86enquant.c \
x86/mmxfrag.c \
- x86/mmxfrag.h \
x86/mmxidct.c \
x86/mmxloop.h \
x86/mmxstate.c \
x86/sse2idct.c \
x86/x86int.h \
x86/x86state.c \
- x86_vc \
- c64x/c64xint.h \
- c64x/c64xdec.h
+ x86_vc
lib_LTLIBRARIES = libtheoradec.la libtheoraenc.la libtheora.la
@@ -44,7 +51,7 @@
x86/sse2fdct.c
encoder_shared_x86_sources = \
- x86/cpu.c \
+ x86/x86cpu.c \
x86/mmxfrag.c \
x86/mmxidct.c \
x86/mmxstate.c \
@@ -53,6 +60,26 @@
encoder_shared_x86_64_sources =
+encoder_uniq_arm_sources =
+
+if CPU_arm
+BUILT_SOURCES = \
+ armbits-gnu.S \
+ armfrag-gnu.S \
+ armidct-gnu.S \
+ armloop-gnu.S \
+ armopts-gnu.S
+endif
+
+encoder_shared_arm_sources = \
+ armbits-gnu.S \
+ armfrag-gnu.S \
+ armidct-gnu.S \
+ armloop-gnu.S \
+ armopts-gnu.S \
+ arm/armcpu.c \
+ arm/armstate.c
+
if CPU_x86_64
encoder_uniq_arch_sources = \
$(encoder_uniq_x86_sources) \
@@ -65,10 +92,15 @@
encoder_uniq_arch_sources = $(encoder_uniq_x86_sources)
encoder_shared_arch_sources = $(encoder_shared_x86_sources)
else
+if CPU_arm
+encoder_uniq_arch_sources = $(encoder_uniq_arm_sources)
+encoder_shared_arch_sources = $(encoder_shared_arm_sources)
+else
encoder_uniq_arch_sources =
encoder_shared_arch_sources =
endif
endif
+endif
encoder_uniq_sources = \
analyze.c \
@@ -100,13 +132,24 @@
endif
decoder_x86_sources = \
- x86/cpu.c \
+ x86/x86cpu.c \
x86/mmxidct.c \
x86/mmxfrag.c \
x86/mmxstate.c \
x86/sse2idct.c \
x86/x86state.c
+decoder_arm_sources = \
+ arm/armcpu.c \
+ arm/armstate.c
+
+nodist_decoder_arm_sources = \
+ armbits-gnu.S \
+ armfrag-gnu.S \
+ armidct-gnu.S \
+ armloop-gnu.S \
+ armopts-gnu.S
+
decoder_c64x_sources = \
c64x/c64xdec.c \
c64x/c64xfrag.c \
@@ -115,17 +158,26 @@
if CPU_x86_64
decoder_arch_sources = $(decoder_x86_sources)
+nodist_decoder_arch_sources =
else
if CPU_x86_32
decoder_arch_sources = $(decoder_x86_sources)
+nodist_decoder_arch_sources =
else
+if CPU_arm
+decoder_arch_sources = $(decoder_arm_sources)
+nodist_decoder_arch_sources = $(nodist_decoder_arm_sources)
+else
if CPU_c64x
decoder_arch_sources = $(decoder_c64x_sources)
+nodist_decoder_arch_sources =
else
decoder_arch_sources =
+nodist_decoder_arch_sources =
endif
endif
endif
+endif
decoder_sources = \
apiwrapper.c \
@@ -160,8 +212,10 @@
ocintrin.h \
quant.h \
state.h \
- x86/cpu.h \
- x86/mmxfrag.h \
+ arm/armcpu.h \
+ c64x/c64xdec.h \
+ c64x/c64xint.h \
+ x86/x86cpu.h \
x86/mmxloop.h \
x86/sse2trans.h \
x86/x86enc.h \
@@ -170,6 +224,8 @@
libtheoradec_la_SOURCES = \
$(decoder_sources) \
Version_script-dec theoradec.exp
+nodist_libtheoradec_la_SOURCES = \
+ $(nodist_decoder_arch_sources)
libtheoradec_la_LDFLAGS = \
-version-info @THDEC_LIB_CURRENT@:@THDEC_LIB_REVISION@:@THDEC_LIB_AGE@ \
@THEORADEC_LDFLAGS@ @CAIRO_LIBS@ \
@@ -188,6 +244,8 @@
$(decoder_sources) \
$(encoder_uniq_sources) \
Version_script theora.exp
+nodist_libtheora_la_SOURCES = \
+ $(nodist_decoder_arch_sources)
libtheora_la_LDFLAGS = \
-version-info @TH_LIB_CURRENT@:@TH_LIB_REVISION@:@TH_LIB_AGE@ \
@THEORA_LDFLAGS@ @CAIRO_LIBS@ $(OGG_LIBS) \
@@ -202,3 +260,19 @@
# contstruct various symbol export list files
.def.exp : defexp.awk
awk -f defexp.awk $< > $@
+
+CLEANFILES = \
+ armbits-gnu.S \
+ armfrag-gnu.S \
+ armidct-gnu.S \
+ armloop-gnu.S \
+ armopts-gnu.S
+
+# automake doesn't do dependency tracking for asm files, that I can tell
+armfrag-gnu.S: armopts-gnu.S
+armidct-gnu.S: armopts-gnu.S
+armloop-gnu.S: armopts-gnu.S
+
+# convert ARM asm to GNU as format
+%-gnu.S: $(srcdir)/arm/%.s
+ $(srcdir)/arm/arm2gnu.pl < $< > $@
Modified: experimental/derf/theora-ptalarbvorm/lib/analyze.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/analyze.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/analyze.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -24,9 +24,6 @@
-typedef struct oc_fr_state oc_fr_state;
-typedef struct oc_qii_state oc_qii_state;
-typedef struct oc_enc_pipeline_state oc_enc_pipeline_state;
typedef struct oc_rd_metric oc_rd_metric;
typedef struct oc_mode_choice oc_mode_choice;
@@ -223,43 +220,6 @@
-/*State to track coded block flags and their bit cost.
- We use opportunity cost to measure the bits required to code or skip the next
- block, using the cheaper of the cost to code it fully or partially, so long
- as both are possible.*/
-struct oc_fr_state{
- /*The number of bits required for the coded block flags so far this frame.*/
- ptrdiff_t bits;
- /*The length of the current run for the partial super block flag, not
- including the current super block.*/
- unsigned sb_partial_count:16;
- /*The length of the current run for the full super block flag, not
- including the current super block.*/
- unsigned sb_full_count:16;
- /*The length of the coded block flag run when the current super block
- started.*/
- unsigned b_coded_count_prev:6;
- /*The coded block flag when the current super block started.*/
- signed int b_coded_prev:2;
- /*The length of the current coded block flag run.*/
- unsigned b_coded_count:6;
- /*The current coded block flag.*/
- signed int b_coded:2;
- /*The number of blocks processed in the current super block.*/
- unsigned b_count:5;
- /*Whether or not it is cheaper to code the current super block partially,
- even if it could still be coded fully.*/
- unsigned sb_prefer_partial:1;
- /*Whether the last super block was coded partially.*/
- signed int sb_partial:2;
- /*The number of bits required for the flags for the current super block.*/
- unsigned sb_bits:6;
- /*Whether the last non-partial super block was coded fully.*/
- signed int sb_full:2;
-};
-
-
-
static void oc_fr_state_init(oc_fr_state *_fr){
_fr->bits=0;
_fr->sb_partial_count=0;
@@ -492,16 +452,6 @@
-struct oc_qii_state{
- ptrdiff_t bits;
- unsigned qi01_count:14;
- signed int qi01:2;
- unsigned qi12_count:14;
- signed int qi12:2;
-};
-
-
-
static void oc_qii_state_init(oc_qii_state *_qs){
_qs->bits=0;
_qs->qi01_count=0;
@@ -555,41 +505,11 @@
-/*Temporary encoder state for the analysis pipeline.*/
-struct oc_enc_pipeline_state{
- int bounding_values[256];
- oc_fr_state fr[3];
- oc_qii_state qs[3];
- /*Skip SSD storage for the current MCU in each plane.*/
- unsigned *skip_ssd[3];
- /*Coded/uncoded fragment lists for each plane for the current MCU.*/
- ptrdiff_t *coded_fragis[3];
- ptrdiff_t *uncoded_fragis[3];
- ptrdiff_t ncoded_fragis[3];
- ptrdiff_t nuncoded_fragis[3];
- /*The starting fragment for the current MCU in each plane.*/
- ptrdiff_t froffset[3];
- /*The starting row for the current MCU in each plane.*/
- int fragy0[3];
- /*The ending row for the current MCU in each plane.*/
- int fragy_end[3];
- /*The starting superblock for the current MCU in each plane.*/
- unsigned sbi0[3];
- /*The ending superblock for the current MCU in each plane.*/
- unsigned sbi_end[3];
- /*The number of tokens for zzi=1 for each color plane.*/
- int ndct_tokens1[3];
- /*The outstanding eob_run count for zzi=1 for each color plane.*/
- int eob_run1[3];
- /*Whether or not the loop filter is enabled.*/
- int loop_filter;
-};
-
-
static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){
ptrdiff_t *coded_fragis;
unsigned mcu_nvsbs;
ptrdiff_t mcu_nfrags;
+ int flimit;
int hdec;
int vdec;
int pli;
@@ -649,8 +569,9 @@
_pipe->eob_run1[pli]=0;
}
/*Initialize the bounding value array for the loop filter.*/
- _pipe->loop_filter=!oc_state_loop_filter_init(&_enc->state,
- _pipe->bounding_values);
+ flimit=_enc->state.loop_filter_limits[_enc->state.qis[0]];
+ _pipe->loop_filter=flimit!=0;
+ if(flimit!=0)oc_loop_filter_init(&_enc->state,_pipe->bounding_values,flimit);
}
/*Sets the current MCU stripe to super block row _sby.
@@ -692,10 +613,15 @@
int refi;
/*Copy over all the uncoded fragments from this plane and advance the uncoded
fragment list.*/
- _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
- oc_state_frag_copy_list(&_enc->state,_pipe->uncoded_fragis[_pli],
- _pipe->nuncoded_fragis[_pli],OC_FRAME_SELF,OC_FRAME_PREV,_pli);
- _pipe->nuncoded_fragis[_pli]=0;
+ if(_pipe->nuncoded_fragis[_pli]>0){
+ _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
+ oc_frag_copy_list(&_enc->state,
+ _enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_SELF]],
+ _enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]],
+ _enc->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
+ _pipe->nuncoded_fragis[_pli],_enc->state.frag_buf_offs);
+ _pipe->nuncoded_fragis[_pli]=0;
+ }
/*Perform DC prediction.*/
oc_enc_pred_dc_frag_rows(_enc,_pli,
_pipe->fragy0[_pli],_pipe->fragy_end[_pli]);
@@ -741,8 +667,8 @@
oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,
unsigned _rd_scale,unsigned _rd_iscale,oc_rd_metric *_mo,
oc_fr_state *_fr,oc_token_checkpoint **_stack){
- OC_ALIGN16(ogg_int16_t dct[64]);
- OC_ALIGN16(ogg_int16_t data[64]);
+ ogg_int16_t *dct;
+ ogg_int16_t *data;
oc_qii_state qs;
const ogg_uint16_t *dequant;
ogg_uint16_t dequant_dc;
@@ -773,6 +699,8 @@
+frag_offs;
borderi=frags[_fragi].borderi;
qii=frags[_fragi].qii;
+ data=_enc->pipe.dct_data;
+ dct=data+64;
if(qii&~3){
#if !defined(OC_COLLECT_METRICS)
if(_enc->sp_level>=OC_SP_LEVEL_EARLY_SKIP){
@@ -872,7 +800,7 @@
}
else{
data[0]=dc*dequant_dc;
- oc_idct8x8(&_enc->state,data,nonzero+1);
+ oc_idct8x8(&_enc->state,data,data,nonzero+1);
}
if(nqis>1){
oc_qii_state_advance(&qs,_pipe->qs+_pli,qii);
@@ -1675,7 +1603,6 @@
/*Analysis stage for an INTRA frame.*/
void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
- oc_enc_pipeline_state pipe;
ogg_int64_t activity_sum;
ogg_int64_t luma_sum;
unsigned activity_avg;
@@ -1698,7 +1625,7 @@
int pli;
_enc->state.frame_type=OC_INTRA_FRAME;
oc_enc_tokenize_start(_enc);
- oc_enc_pipeline_init(_enc,&pipe);
+ oc_enc_pipeline_init(_enc,&_enc->pipe);
oc_enc_mode_rd_init(_enc);
activity_sum=luma_sum=0;
activity_avg=_enc->activity_avg;
@@ -1725,10 +1652,10 @@
ptrdiff_t cfroffset;
unsigned sbi;
unsigned sbi_end;
- notdone=oc_enc_pipeline_set_stripe(_enc,&pipe,stripe_sby);
- sbi_end=pipe.sbi_end[0];
- cfroffset=pipe.froffset[1];
- for(sbi=pipe.sbi0[0];sbi<sbi_end;sbi++){
+ notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
+ sbi_end=_enc->pipe.sbi_end[0];
+ cfroffset=_enc->pipe.froffset[1];
+ for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
int quadi;
/*Mode addressing is through Y plane, always 4 MB per SB.*/
for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
@@ -1763,10 +1690,10 @@
oc_mcenc_search(_enc,mbi);
}
if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
- oc_analyze_intra_mb_luma(_enc,pipe.qs+0,mbi,rd_scale);
+ oc_analyze_intra_mb_luma(_enc,_enc->pipe.qs+0,mbi,rd_scale);
}
mb_modes[mbi]=OC_MODE_INTRA;
- oc_enc_mb_transform_quantize_intra_luma(_enc,&pipe,
+ oc_enc_mb_transform_quantize_intra_luma(_enc,&_enc->pipe,
mbi,rd_scale,rd_iscale);
/*Propagate final MB mode and MVs to the chroma blocks.*/
for(mapii=4;mapii<nmap_idxs;mapii++){
@@ -1786,12 +1713,12 @@
}
}
}
- oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,0,notstart,notdone);
+ oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
/*Code chroma planes.*/
for(pli=1;pli<3;pli++){
- oc_enc_sb_transform_quantize_intra_chroma(_enc,&pipe,
- pli,pipe.sbi0[pli],pipe.sbi_end[pli]);
- oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,pli,notstart,notdone);
+ oc_enc_sb_transform_quantize_intra_chroma(_enc,&_enc->pipe,
+ pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
+ oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
}
notstart=1;
}
@@ -2316,7 +2243,6 @@
int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
oc_set_chroma_mvs_func set_chroma_mvs;
- oc_enc_pipeline_state pipe;
oc_qii_state intra_luma_qs;
oc_mv last_mv;
oc_mv prior_mv;
@@ -2356,7 +2282,7 @@
_enc->state.frame_type=OC_INTER_FRAME;
oc_mode_scheme_chooser_reset(&_enc->chooser);
oc_enc_tokenize_start(_enc);
- oc_enc_pipeline_init(_enc,&pipe);
+ oc_enc_pipeline_init(_enc,&_enc->pipe);
oc_enc_mode_rd_init(_enc);
if(_allow_keyframe)oc_qii_state_init(&intra_luma_qs);
_enc->mv_bits[0]=_enc->mv_bits[1]=0;
@@ -2391,10 +2317,10 @@
mcu_nvsbs=_enc->mcu_nvsbs;
for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
ptrdiff_t cfroffset;
- notdone=oc_enc_pipeline_set_stripe(_enc,&pipe,stripe_sby);
- sbi_end=pipe.sbi_end[0];
- cfroffset=pipe.froffset[1];
- for(sbi=pipe.sbi0[0];sbi<sbi_end;sbi++){
+ notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
+ sbi_end=_enc->pipe.sbi_end[0];
+ cfroffset=_enc->pipe.froffset[1];
+ for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
int quadi;
/*Mode addressing is through Y plane, always 4 MB per SB.*/
for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
@@ -2448,7 +2374,7 @@
/*Estimate the cost of coding this MB in a keyframe.*/
if(_allow_keyframe){
oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
- pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP,rd_scale);
+ _enc->pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP,rd_scale);
intrabits+=modes[OC_MODE_INTRA].rate;
for(bi=0;bi<4;bi++){
oc_qii_state_advance(&intra_luma_qs,&intra_luma_qs,
@@ -2456,24 +2382,28 @@
}
}
/*Estimate the cost in a delta frame for various modes.*/
- oc_skip_cost(_enc,&pipe,mbi,rd_scale,skip_ssd);
+ oc_skip_cost(_enc,&_enc->pipe,mbi,rd_scale,skip_ssd);
if(sp_level<OC_SP_LEVEL_NOMC){
oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
- OC_MODE_INTER_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+ skip_ssd,rd_scale);
oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
- pipe.fr+0,pipe.qs+0,intra_satd,skip_ssd,rd_scale);
+ _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV],
- pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi,
- OC_MODE_INTER_MV_LAST,last_mv,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ OC_MODE_INTER_MV_LAST,last_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+ skip_ssd,rd_scale);
oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
- OC_MODE_INTER_MV_LAST2,prior_mv,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ OC_MODE_INTER_MV_LAST2,prior_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+ skip_ssd,rd_scale);
oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
- OC_MODE_GOLDEN_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+ skip_ssd,rd_scale);
mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD],
- pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
/*The explicit MV modes (2,6,7) have not yet gone through halfpel
refinement.
We choose the explicit MV mode that's already furthest ahead on
@@ -2483,7 +2413,8 @@
inter_mv_pref=_enc->lambda*3;
if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
- embs[mbi].block_mv,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ embs[mbi].block_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+ skip_ssd,rd_scale);
}
else{
modes[OC_MODE_INTER_MV_FOUR].cost=UINT_MAX;
@@ -2495,7 +2426,8 @@
embs[mbi].refined|=0x80;
}
oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
- embs[mbi].ref_mv,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ embs[mbi].ref_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+ skip_ssd,rd_scale);
}
else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref<
modes[OC_MODE_INTER_MV].cost){
@@ -2505,7 +2437,7 @@
}
mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD],
- pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
}
if(!(embs[mbi].refined&0x04)){
oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV);
@@ -2513,7 +2445,7 @@
}
mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV],
- pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
/*Finally, pick the mode with the cheapest estimated R-D cost.*/
mb_mode=OC_MODE_INTER_NOMV;
if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
@@ -2544,11 +2476,13 @@
}
else{
oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
- OC_MODE_INTER_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+ skip_ssd,rd_scale);
oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
- pipe.fr+0,pipe.qs+0,intra_satd,skip_ssd,rd_scale);
+ _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
- OC_MODE_GOLDEN_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+ skip_ssd,rd_scale);
mb_mode=OC_MODE_INTER_NOMV;
if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
mb_mode=OC_MODE_INTRA;
@@ -2589,7 +2523,7 @@
fragi=sb_maps[mbi>>2][mbi&3][bi];
frags[fragi].qii=modes[mb_mode].qii[bi];
}
- if(oc_enc_mb_transform_quantize_inter_luma(_enc,&pipe,mbi,
+ if(oc_enc_mb_transform_quantize_inter_luma(_enc,&_enc->pipe,mbi,
modes[mb_mode].overhead>>OC_BIT_SCALE,rd_scale,rd_iscale)>0){
int orig_mb_mode;
orig_mb_mode=mb_mode;
@@ -2693,16 +2627,16 @@
mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
}
}
- oc_fr_state_flush_sb(pipe.fr+0);
- sb_flags[sbi].coded_fully=pipe.fr[0].sb_full;
- sb_flags[sbi].coded_partially=pipe.fr[0].sb_partial;
+ oc_fr_state_flush_sb(_enc->pipe.fr+0);
+ sb_flags[sbi].coded_fully=_enc->pipe.fr[0].sb_full;
+ sb_flags[sbi].coded_partially=_enc->pipe.fr[0].sb_partial;
}
- oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,0,notstart,notdone);
+ oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
/*Code chroma planes.*/
for(pli=1;pli<3;pli++){
- oc_enc_sb_transform_quantize_inter_chroma(_enc,&pipe,
- pli,pipe.sbi0[pli],pipe.sbi_end[pli]);
- oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,pli,notstart,notdone);
+ oc_enc_sb_transform_quantize_inter_chroma(_enc,&_enc->pipe,
+ pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
+ oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
}
notstart=1;
}
@@ -2724,7 +2658,7 @@
inaccuracy is small.
We don't need to add the luma plane coding flag costs, because they are
already included in the MB rate estimates.*/
- for(pli=1;pli<3;pli++)interbits+=pipe.fr[pli].bits<<OC_BIT_SCALE;
+ for(pli=1;pli<3;pli++)interbits+=_enc->pipe.fr[pli].bits<<OC_BIT_SCALE;
if(interbits>intrabits)return 1;
}
_enc->ncoded_mbis=ncoded_mbis;
Copied: experimental/derf/theora-ptalarbvorm/lib/arm/arm2gnu.pl (from rev 17378, branches/theorarm-merge-branch/arm2gnu.pl)
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/arm2gnu.pl (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/arm2gnu.pl 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,271 @@
+#!/usr/bin/perl
+
+my $bigend; # little/big endian
+
+eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}'
+ if $running_under_some_shell;
+
+while ($ARGV[0] =~ /^-/) {
+ $_ = shift;
+ last if /^--/;
+ if (/^-n/) {
+ $nflag++;
+ next;
+ }
+ die "I don't recognize this switch: $_\\n";
+}
+$printit++ unless $nflag;
+
+$\ = "\n"; # automatically add newline on print
+$n=0;
+
+$thumb = 0; # ARM mode by default, not Thumb.
+
+LINE:
+while (<>) {
+
+ # For ADRLs we need to add a new line after the substituted one.
+ $addPadding = 0;
+
+ # First, we do not dare to touch *anything* inside double quotes, do we?
+ # Second, if you want a dollar character in the string,
+ # insert two of them -- that's how ARM C and assembler treat strings.
+ s/^([A-Za-z_]\w*)[ \t]+DCB[ \t]*\"/$1: .ascii \"/ && do { s/\$\$/\$/g; next };
+ s/\bDCB\b[ \t]*\"/.ascii \"/ && do { s/\$\$/\$/g; next };
+ s/^(\S+)\s+RN\s+(\S+)/$1 .req r$2/ && do { s/\$\$/\$/g; next };
+ # If there's nothing on a line but a comment, don't try to apply any further
+ # substitutions (this is a cheap hack to avoid mucking up the license header)
+ s/^([ \t]*);/$1@/ && do { s/\$\$/\$/g; next };
+ # If substituted -- leave immediately !
+
+ s/@/,:/;
+ s/;/@/;
+ while ( /@.*'/ ) {
+ s/(@.*)'/$1/g;
+ }
+ s/\{FALSE\}/0/g;
+ s/\{TRUE\}/1/g;
+ s/\{(\w\w\w\w+)\}/$1/g;
+ s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/;
+ s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/;
+ s/\bIMPORT\b/.extern/;
+ s/\bEXPORT\b/.global/;
+ s/^(\s+)\[/$1IF/;
+ s/^(\s+)\|/$1ELSE/;
+ s/^(\s+)\]/$1ENDIF/;
+ s/IF *:DEF:/ .ifdef/;
+ s/IF *:LNOT: *:DEF:/ .ifndef/;
+ s/ELSE/ .else/;
+ s/ENDIF/ .endif/;
+
+ if( /\bIF\b/ ) {
+ s/\bIF\b/ .if/;
+ s/=/==/;
+ }
+ if ( $n == 2) {
+ s/\$/\\/g;
+ }
+ if ($n == 1) {
+ s/\$//g;
+ s/label//g;
+ $n = 2;
+ }
+ if ( /MACRO/ ) {
+ s/MACRO *\n/.macro/;
+ $n=1;
+ }
+ if ( /\bMEND\b/ ) {
+ s/\bMEND\b/.endm/;
+ $n=0;
+ }
+
+ # ".rdata" doesn't work in 'as' version 2.13.2, as it is ".rodata" there.
+ #
+ if ( /\bAREA\b/ ) {
+ s/^(.+)CODE(.+)READONLY(.*)/ .text/;
+ s/^(.+)DATA(.+)READONLY(.*)/ .section .rdata\n .align 2/;
+ s/^(.+)\|\|\.data\|\|(.+)/ .data\n .align 2/;
+ s/^(.+)\|\|\.bss\|\|(.+)/ .bss/;
+ }
+
+ s/\|\|\.constdata\$(\d+)\|\|/.L_CONST$1/; # ||.constdata$3||
+ s/\|\|\.bss\$(\d+)\|\|/.L_BSS$1/; # ||.bss$2||
+ s/\|\|\.data\$(\d+)\|\|/.L_DATA$1/; # ||.data$2||
+ s/\|\|([a-zA-Z0-9_]+)\@([a-zA-Z0-9_]+)\|\|/@ $&/;
+ s/^(\s+)\%(\s)/ .space $1/;
+
+ s/\|(.+)\.(\d+)\|/\.$1_$2/; # |L80.123| -> .L80_123
+ s/\bCODE32\b/.code 32/ && do {$thumb = 0};
+ s/\bCODE16\b/.code 16/ && do {$thumb = 1};
+ if (/\bPROC\b/)
+ {
+ print " .thumb_func" if ($thumb);
+ s/\bPROC\b/@ $&/;
+ }
+ s/^(\s*)(S|Q|SH|U|UQ|UH)ASX\b/$1$2ADDSUBX/;
+ s/^(\s*)(S|Q|SH|U|UQ|UH)SAX\b/$1$2SUBADDX/;
+ s/\bENDP\b/@ $&/;
+ s/\bSUBT\b/@ $&/;
+ s/\bDATA\b/@ $&/; # DATA directive is deprecated -- Asm guide, p.7-25
+ s/\bKEEP\b/@ $&/;
+ s/\bEXPORTAS\b/@ $&/;
+ s/\|\|(.)+\bEQU\b/@ $&/;
+ s/\|\|([\w\$]+)\|\|/$1/;
+ s/\bENTRY\b/@ $&/;
+ s/\bASSERT\b/@ $&/;
+ s/\bGBLL\b/@ $&/;
+ s/\bGBLA\b/@ $&/;
+ s/^\W+OPT\b/@ $&/;
+ s/:OR:/|/g;
+ s/:SHL:/<</g;
+ s/:SHR:/>>/g;
+ s/:AND:/&/g;
+ s/:LAND:/&&/g;
+ s/CPSR/cpsr/;
+ s/SPSR/spsr/;
+ s/ALIGN$/.balign 4/;
+ s/ALIGN\s+([0-9x]+)$/.balign $1/;
+ s/psr_cxsf/psr_all/;
+ s/LTORG/.ltorg/;
+ s/^([A-Za-z_]\w*)[ \t]+EQU/ .set $1,/;
+ s/^([A-Za-z_]\w*)[ \t]+SETL/ .set $1,/;
+ s/^([A-Za-z_]\w*)[ \t]+SETA/ .set $1,/;
+ s/^([A-Za-z_]\w*)[ \t]+\*/ .set $1,/;
+
+ # {PC} + 0xdeadfeed --> . + 0xdeadfeed
+ s/\{PC\} \+/ \. +/;
+
+ # Single hex constant on the line !
+ #
+ # >>> NOTE <<<
+ # Double-precision floats in gcc are always mixed-endian, which means
+ # bytes in two words are little-endian, but words are big-endian.
+ # So, 0x0000deadfeed0000 would be stored as 0x0000dead at low address
+ # and 0xfeed0000 at high address.
+ #
+ s/\bDCFD\b[ \t]+0x([a-fA-F0-9]{8})([a-fA-F0-9]{8})/.long 0x$1, 0x$2/;
+ # Only decimal constants on the line, no hex !
+ s/\bDCFD\b[ \t]+([0-9\.\-]+)/.double $1/;
+
+ # Single hex constant on the line !
+# s/\bDCFS\b[ \t]+0x([a-f0-9]{8})([a-f0-9]{8})/.long 0x$1, 0x$2/;
+ # Only decimal constants on the line, no hex !
+# s/\bDCFS\b[ \t]+([0-9\.\-]+)/.double $1/;
+ s/\bDCFS[ \t]+0x/.word 0x/;
+ s/\bDCFS\b/.float/;
+
+ s/^([A-Za-z_]\w*)[ \t]+DCD/$1 .word/;
+ s/\bDCD\b/.word/;
+ s/^([A-Za-z_]\w*)[ \t]+DCW/$1 .short/;
+ s/\bDCW\b/.short/;
+ s/^([A-Za-z_]\w*)[ \t]+DCB/$1 .byte/;
+ s/\bDCB\b/.byte/;
+ s/^([A-Za-z_]\w*)[ \t]+\%/.comm $1,/;
+ s/^[A-Za-z_\.]\w+/$&:/;
+ s/^(\d+)/$1:/;
+ s/\%(\d+)/$1b_or_f/;
+ s/\%[Bb](\d+)/$1b/;
+ s/\%[Ff](\d+)/$1f/;
+ s/\%[Ff][Tt](\d+)/$1f/;
+ s/&([\dA-Fa-f]+)/0x$1/;
+ if ( /\b2_[01]+\b/ ) {
+ s/\b2_([01]+)\b/conv$1&&&&/g;
+ while ( /[01][01][01][01]&&&&/ ) {
+ s/0000&&&&/&&&&0/g;
+ s/0001&&&&/&&&&1/g;
+ s/0010&&&&/&&&&2/g;
+ s/0011&&&&/&&&&3/g;
+ s/0100&&&&/&&&&4/g;
+ s/0101&&&&/&&&&5/g;
+ s/0110&&&&/&&&&6/g;
+ s/0111&&&&/&&&&7/g;
+ s/1000&&&&/&&&&8/g;
+ s/1001&&&&/&&&&9/g;
+ s/1010&&&&/&&&&A/g;
+ s/1011&&&&/&&&&B/g;
+ s/1100&&&&/&&&&C/g;
+ s/1101&&&&/&&&&D/g;
+ s/1110&&&&/&&&&E/g;
+ s/1111&&&&/&&&&F/g;
+ }
+ s/000&&&&/&&&&0/g;
+ s/001&&&&/&&&&1/g;
+ s/010&&&&/&&&&2/g;
+ s/011&&&&/&&&&3/g;
+ s/100&&&&/&&&&4/g;
+ s/101&&&&/&&&&5/g;
+ s/110&&&&/&&&&6/g;
+ s/111&&&&/&&&&7/g;
+ s/00&&&&/&&&&0/g;
+ s/01&&&&/&&&&1/g;
+ s/10&&&&/&&&&2/g;
+ s/11&&&&/&&&&3/g;
+ s/0&&&&/&&&&0/g;
+ s/1&&&&/&&&&1/g;
+ s/conv&&&&/0x/g;
+ }
+
+ if ( /commandline/)
+ {
+ if( /-bigend/)
+ {
+ $bigend=1;
+ }
+ }
+
+ if ( /\bDCDU\b/ )
+ {
+ my $cmd=$_;
+ my $value;
+ my $w1;
+ my $w2;
+ my $w3;
+ my $w4;
+
+ s/\s+DCDU\b/@ $&/;
+
+ $cmd =~ /\bDCDU\b\s+0x(\d+)/;
+ $value = $1;
+ $value =~ /(\w\w)(\w\w)(\w\w)(\w\w)/;
+ $w1 = $1;
+ $w2 = $2;
+ $w3 = $3;
+ $w4 = $4;
+
+ if( $bigend ne "")
+ {
+ # big endian
+
+ print " .byte 0x".$w1;
+ print " .byte 0x".$w2;
+ print " .byte 0x".$w3;
+ print " .byte 0x".$w4;
+ }
+ else
+ {
+ # little endian
+
+ print " .byte 0x".$w4;
+ print " .byte 0x".$w3;
+ print " .byte 0x".$w2;
+ print " .byte 0x".$w1;
+ }
+
+ }
+
+
+ if ( /\badrl\b/i )
+ {
+ s/\badrl\s+(\w+)\s*,\s*(\w+)/ldr $1,=$2/i;
+ $addPadding = 1;
+ }
+ s/\bEND\b/@ END/;
+} continue {
+ printf ("%s", $_) if $printit;
+ if ($addPadding != 0)
+ {
+ printf (" mov r0,r0\n");
+ $addPadding = 0;
+ }
+}
+
Added: experimental/derf/theora-ptalarbvorm/lib/arm/armbits.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armbits.h (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armbits.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,32 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armbits_H)
+# define _arm_armbits_H (1)
+# include "../bitpack.h"
+# include "armcpu.h"
+
+# if defined(OC_ARM_ASM)
+# define oc_pack_read oc_pack_read_arm
+# define oc_pack_read1 oc_pack_read1_arm
+# define oc_huff_token_decode oc_huff_token_decode_arm
+# endif
+
+long oc_pack_read_arm(oc_pack_buf *_b,int _bits);
+int oc_pack_read1_arm(oc_pack_buf *_b);
+int oc_huff_token_decode_arm(oc_pack_buf *_b,const ogg_int16_t *_tree);
+
+#endif
Added: experimental/derf/theora-ptalarbvorm/lib/arm/armbits.s
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armbits.s (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armbits.s 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,227 @@
+;********************************************************************
+;* *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+;* *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;* *
+;********************************************************************
+;
+; function:
+; last mod: $Id: armbits.c 17344 2010-07-21 01:42:18Z tterribe $
+;
+;********************************************************************
+
+ AREA |.text|, CODE, READONLY
+
+ EXPORT oc_pack_read_arm
+ EXPORT oc_pack_read1_arm
+ EXPORT oc_huff_token_decode_arm
+
+oc_pack_read_arm
+ ; r0 = oc_pack_buf *_b
+ ; r1 = int _bits
+ ADD r12,r0,#8
+ LDMIA r12,{r2,r3} ; r2 = window
+ ; Stall... ; r3 = available
+ ; Stall...
+ SUBS r3,r3,r1 ; r3 = available-_bits, available<_bits => LT
+ BLT oc_pack_read_refill
+ RSB r0,r1,#32 ; r0 = 32-_bits
+ MOV r0,r2,LSR r0 ; r0 = window>>32-_bits
+ MOV r2,r2,LSL r1 ; r2 = window<<=_bits
+ STMIA r12,{r2,r3} ; window = r2
+ ; available = r3
+ MOV PC,r14
+
+oc_pack_read1_arm
+ ; r0 = oc_pack_buf *_b
+ ADD r12,r0,#8
+ LDMIA r12,{r2,r3} ; r2 = window
+ ; Stall... ; r3 = available
+ ; Stall...
+ SUBS r3,r3,#1 ; r3 = available-1, available<1 => LT
+ BLT oc_pack_read1_refill
+ MOV r0,r2,LSR #31 ; r0 = window>>31
+ MOV r2,r2,LSL #1 ; r2 = window<<=1
+ STMIA r12,{r2,r3} ; window = r2
+ ; available = r3
+ MOV PC,r14
+
+; We need to refill window.
+oc_pack_read1_refill
+ MOV r1,#1
+oc_pack_read_refill
+ STMFD r13!,{r10,r11,r14}
+ LDMIA r0,{r10,r11} ; r10 = stop
+ ; r11 = ptr
+ RSB r0,r1,#32 ; r0 = 32-_bits
+ RSB r3,r3,r0 ; r3 = 32-available
+; We can use unsigned compares for both the pointers and for available
+; (allowing us to chain condition codes) because available will never be
+; larger than 32 (or we wouldn't be here), and thus 32-available will never be
+; negative.
+ CMP r10,r11 ; ptr<stop => HI
+ CMPHI r3,#7 ; available<=24 => HI
+ LDRHIB r14,[r11],#1 ; r14 = *ptr++
+ SUBHI r3,#8 ; available += 8
+ ; (HI) Stall...
+ ORRHI r2,r14,LSL r3 ; r2 = window|=r14<<32-available
+ CMPHI r10,r11 ; ptr<stop => HI
+ CMPHI r3,#7 ; available<=24 => HI
+ LDRHIB r14,[r11],#1 ; r14 = *ptr++
+ SUBHI r3,#8 ; available += 8
+ ; (HI) Stall...
+ ORRHI r2,r14,LSL r3 ; r2 = window|=r14<<32-available
+ CMPHI r10,r11 ; ptr<stop => HI
+ CMPHI r3,#7 ; available<=24 => HI
+ LDRHIB r14,[r11],#1 ; r14 = *ptr++
+ SUBHI r3,#8 ; available += 8
+ ; (HI) Stall...
+ ORRHI r2,r14,LSL r3 ; r2 = window|=r14<<32-available
+ CMPHI r10,r11 ; ptr<stop => HI
+ CMPHI r3,#7 ; available<=24 => HI
+ LDRHIB r14,[r11],#1 ; r14 = *ptr++
+ SUBHI r3,#8 ; available += 8
+ ; (HI) Stall...
+ ORRHI r2,r14,LSL r3 ; r2 = window|=r14<<32-available
+ SUBS r3,r0,r3 ; r3 = available-=_bits, available<_bits => LT
+ BLT oc_pack_read_refill_last
+ MOV r0,r2,LSR r0 ; r0 = window>>32-_bits
+ MOV r2,r2,LSL r1 ; r2 = window<<=_bits
+ STR r11,[r12,#-4] ; ptr = r11
+ STMIA r12,{r2,r3} ; window = r2
+ ; available = r3
+ LDMFD r13!,{r10,r11,PC}
+
+; Either we wanted to read more than 24 bits and didn't have enough room to
+; stuff the last byte into the window, or we hit the end of the packet.
+oc_pack_read_refill_last
+ CMP r11,r10 ; ptr<stop => LO
+; If we didn't hit the end of the packet, then pull enough of the next byte to
+; to fill up the window.
+ LDRLOB r14,[r11] ; (LO) r14 = *ptr
+; Otherwise, set the EOF flag and pretend we have lots of available bits.
+ MOVHS r14,#1 ; (HS) r14 = 1
+ ADDLO r10,r3,r1 ; (LO) r10 = available
+ STRHS r14,[r12,#8] ; (HS) eof = 1
+ ANDLO r10,r10,#7 ; (LO) r10 = available&7
+ MOVHS r3,#1<<30 ; (HS) available = OC_LOTS_OF_BITS
+ ORRLO r2,r14,LSL r10 ; (LO) r2 = window|=*ptr>>(available&7)
+ MOV r0,r2,LSR r0 ; r0 = window>>32-_bits
+ MOV r2,r2,LSL r1 ; r2 = window<<=_bits
+ STR r11,[r12,#-4] ; ptr = r11
+ STMIA r12,{r2,r3} ; window = r2
+ ; available = r3
+ LDMFD r13!,{r10,r11,PC}
+
+
+
+oc_huff_token_decode_arm
+ ; r0 = oc_pack_buf *_b
+ ; r1 = const ogg_int16_t *_tree
+ STMFD r13!,{r4,r5,r10,r14}
+ LDRSH r10,[r1] ; r10 = n=_tree[0]
+ LDMIA r0,{r2-r5} ; r2 = stop
+ ; Stall... ; r3 = ptr
+ ; Stall... ; r4 = window
+ ; r5 = available
+ CMP r10,r5 ; n>available => GT
+ BGT oc_huff_token_decode_refill0
+ RSB r14,r10,#32 ; r14 = 32-n
+ MOV r14,r4,LSR r14 ; r14 = bits=window>>32-n
+ ADD r14,r1,r14,LSL #1 ; r14 = _tree+bits
+ LDRSH r12,[r14,#2] ; r12 = node=_tree[1+bits]
+ ; Stall...
+ ; Stall...
+ RSBS r14,r12,#0 ; r14 = -node, node>0 => MI
+ BMI oc_huff_token_decode_continue
+ MOV r10,r14,LSR #8 ; r10 = n=node>>8
+ MOV r4,r4,LSL r10 ; r4 = window<<=n
+ SUB r5,r10 ; r5 = available-=n
+ STMIB r0,{r3-r5} ; ptr = r3
+ ; window = r4
+ ; available = r5
+ AND r0,r14,#255 ; r0 = node&255
+ LDMFD r13!,{r4,r5,r10,pc}
+
+; The first tree node wasn't enough to reach a leaf, read another
+oc_huff_token_decode_continue
+ ADD r12,r1,r12,LSL #1 ; r12 = _tree+node
+ MOV r4,r4,LSL r10 ; r4 = window<<=n
+ SUB r5,r5,r10 ; r5 = available-=n
+ LDRSH r10,[r12],#2 ; r10 = n=_tree[node]
+ ; Stall... ; r12 = _tree+node+1
+ ; Stall...
+ CMP r10,r5 ; n>available => GT
+ BGT oc_huff_token_decode_refill
+ RSB r14,r10,#32 ; r14 = 32-n
+ MOV r14,r4,LSR r14 ; r14 = bits=window>>32-n
+ ADD r12,r12,r14 ;
+ LDRSH r12,[r12,r14] ; r12 = node=_tree[node+1+bits]
+ ; Stall...
+ ; Stall...
+ RSBS r14,r12,#0 ; r14 = -node, node>0 => MI
+ BMI oc_huff_token_decode_continue
+ MOV r10,r14,LSR #8 ; r10 = n=node>>8
+ MOV r4,r4,LSL r10 ; r4 = window<<=n
+ SUB r5,r10 ; r5 = available-=n
+ STMIB r0,{r3-r5} ; ptr = r3
+ ; window = r4
+ ; available = r5
+ AND r0,r14,#255 ; r0 = node&255
+ LDMFD r13!,{r4,r5,r10,pc}
+
+oc_huff_token_decode_refill0
+ ADD r12,r1,#2 ; r12 = _tree+1
+oc_huff_token_decode_refill
+; We can't possibly need more than 15 bits, so available must be <= 15.
+; Therefore we can load at least two bytes without checking it.
+ CMP r2,r3 ; ptr<stop => HI
+ LDRHIB r14,[r3],#1 ; r14 = *ptr++
+ RSBHI r5,r5,#24 ; (HI) available = 32-(available+=8)
+ RSBLS r5,r5,#32 ; (LS) r5 = 32-available
+ ORRHI r4,r14,LSL r5 ; r4 = window|=r14<<32-available
+ CMPHI r2,r3 ; ptr<stop => HI
+ LDRHIB r14,[r3],#1 ; r14 = *ptr++
+ SUBHI r5,#8 ; available += 8
+ ; (HI) Stall...
+ ORRHI r4,r14,LSL r5 ; r4 = window|=r14<<32-available
+; We can use unsigned compares for both the pointers and for available
+; (allowing us to chain condition codes) because available will never be
+; larger than 32 (or we wouldn't be here), and thus 32-available will never be
+; negative.
+ CMPHI r2,r3 ; ptr<stop => HI
+ CMPHI r5,#7 ; available<=24 => HI
+ LDRHIB r14,[r3],#1 ; r14 = *ptr++
+ SUBHI r5,#8 ; available += 8
+ ; (HI) Stall...
+ ORRHI r4,r14,LSL r5 ; r4 = window|=r14<<32-available
+ CMP r2,r3 ; ptr<stop => HI
+ MOVLS r5,#-1<<30 ; (LS) available = OC_LOTS_OF_BITS+32
+ CMPHI r5,#7 ; (HI) available<=24 => HI
+ LDRHIB r14,[r3],#1 ; (HI) r14 = *ptr++
+ SUBHI r5,#8 ; (HI) available += 8
+ ; (HI) Stall...
+ ORRHI r4,r14,LSL r5 ; (HI) r4 = window|=r14<<32-available
+ RSB r14,r10,#32 ; r14 = 32-n
+ MOV r14,r4,LSR r14 ; r14 = bits=window>>32-n
+ ADD r12,r12,r14 ;
+ LDRSH r12,[r12,r14] ; r12 = node=_tree[node+1+bits]
+ RSB r5,r5,#32 ; r5 = available
+ ; Stall...
+ RSBS r14,r12,#0 ; r14 = -node, node>0 => MI
+ BMI oc_huff_token_decode_continue
+ MOV r10,r14,LSR #8 ; r10 = n=node>>8
+ MOV r4,r4,LSL r10 ; r4 = window<<=n
+ SUB r5,r10 ; r5 = available-=n
+ STMIB r0,{r3-r5} ; ptr = r3
+ ; window = r4
+ ; available = r5
+ AND r0,r14,#255 ; r0 = node&255
+ LDMFD r13!,{r4,r5,r10,pc}
+
+ END
Added: experimental/derf/theora-ptalarbvorm/lib/arm/armcpu.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armcpu.c (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armcpu.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,116 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ CPU capability detection for ARM processors.
+
+ function:
+ last mod: $Id: cpu.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+
+#include "armcpu.h"
+
+#if !defined(OC_ARM_ASM)|| \
+ !defined(OC_ARM_ASM_EDSP)&&!defined(OC_ARM_ASM_ARMV6)&& \
+ !defined(OC_ARM_ASM_NEON)
+ogg_uint32_t oc_cpu_flags_get(void){
+ return 0;
+}
+
+#elif defined(_MSC_VER)
+/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
+# define WIN32_LEAN_AND_MEAN
+# define WIN32_EXTRA_LEAN
+# include <windows.h>
+
+ogg_uint32_t oc_cpu_flags_get(void){
+ ogg_uint32_t flags;
+ flags=0;
+ /*MSVC has no inline __asm support for ARM, but it does let you __emit
+ instructions via their assembled hex code.
+ All of these instructions should be essentially nops.*/
+# if defined(OC_ARM_ASM_EDSP)
+ __try{
+ /*PLD [r13]*/
+ __emit(0xF5DDF000);
+ flags|=OC_CPU_ARM_EDSP;
+ }
+ __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+ /*Ignore exception.*/
+ }
+# if defined(OC_ARM_ASM_MEDIA)
+ __try{
+ /*SHADD8 r3,r3,r3*/
+ __emit(0xE6333F93);
+ flags|=OC_CPU_ARM_MEDIA;
+ }
+ __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+ /*Ignore exception.*/
+ }
+# if defined(OC_ARM_ASM_NEON)
+ __try{
+ /*VORR q0,q0,q0*/
+ __emit(0xF2200150);
+ flags|=OC_CPU_ARM_NEON;
+ }
+ __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+ /*Ignore exception.*/
+ }
+# endif
+# endif
+# endif
+ return flags;
+}
+
+#elif defined(__linux__)
+# include <stdio.h>
+# include <stdlib.h>
+# include <string.h>
+
+ogg_uint32_t oc_cpu_flags_get(void){
+ ogg_uint32_t flags;
+ FILE *fin;
+ flags=0;
+ /*Reading /proc/self/auxv would be easier, but that doesn't work reliably on
+ Android.
+ This also means that detection will fail in Scratchbox.*/
+ fin=fopen("/proc/cpuinfo","r");
+ if(fin!=NULL){
+ /*512 should be enough for anybody (it's even enough for all the flags that
+ x86 has accumulated... so far).*/
+ char buf[512];
+ while(fgets(buf,511,fin)!=NULL){
+ if(memcmp(buf,"Features",8)==0){
+ char *p;
+ p=strstr(buf," edsp");
+ if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_EDSP;
+ p=strstr(buf," neon");
+ if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_NEON;
+ }
+ if(memcmp(buf,"CPU architecture:",17)==0){
+ int version;
+ version=atoi(buf+17);
+ if(version>=6)flags|=OC_CPU_ARM_MEDIA;
+ }
+ }
+ fclose(fin);
+ }
+ return flags;
+}
+
+#else
+/*The feature registers which can tell us what the processor supports are
+ accessible in priveleged modes only, so we can't have a general user-space
+ detection method like on x86.*/
+# error "Configured to use ARM asm but no CPU detection method available for " \
+ "your platform. Reconfigure with --disable-asm (or send patches)."
+#endif
Added: experimental/derf/theora-ptalarbvorm/lib/arm/armcpu.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armcpu.h (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armcpu.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,29 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+ function:
+ last mod: $Id: cpu.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_arm_armcpu_H)
+# define _arm_armcpu_H (1)
+#include "../internal.h"
+
+/*"Parallel instructions" from ARM v6 and above.*/
+#define OC_CPU_ARM_MEDIA (1<<24)
+/*Flags chosen to match arch/arm/include/asm/hwcap.h in the Linux kernel.*/
+#define OC_CPU_ARM_EDSP (1<<7)
+#define OC_CPU_ARM_NEON (1<<12)
+
+ogg_uint32_t oc_cpu_flags_get(void);
+
+#endif
Copied: experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s (from rev 17378, branches/theorarm-merge-branch/lib/arm/ARMfrag.s)
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,633 @@
+;********************************************************************
+;* *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+;* *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;* *
+;********************************************************************
+; Original implementation:
+; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: cpu.h 17344 2010-07-21 01:42:18Z tterribe $
+;********************************************************************
+
+ AREA |.text|, CODE, READONLY
+
+ GET armopts.s
+
+; Vanilla ARM v4 versions
+ EXPORT oc_frag_copy_list_arm
+ EXPORT oc_frag_recon_intra_arm
+ EXPORT oc_frag_recon_inter_arm
+ EXPORT oc_frag_recon_inter2_arm
+
+oc_frag_copy_list_arm
+ ; r0 = _dst_frame
+ ; r1 = _src_frame
+ ; r2 = _ystride
+ ; r3 = _fragis
+ ; <> = _nfragis
+ ; <> = _frag_buf_offs
+ LDR r12,[r13] ; r12 = _nfragis
+ STMFD r13!,{r4-r6,r11,r14}
+ SUBS r12, r12, #1
+ LDR r4,[r3],#4 ; r4 = _fragis[fragii]
+ LDRGE r14,[r13,#4*6] ; r14 = _frag_buf_offs
+ BLT ofcl_arm_end
+ SUB r2, r2, #4
+ofcl_arm_lp
+ LDR r11,[r14,r4,LSL #2] ; r11 = _frag_buf_offs[_fragis[fragii]]
+ SUBS r12, r12, #1
+ ; Stall (on XScale)
+ ADD r4, r1, r11 ; r4 = _src_frame+frag_buf_off
+ LDR r6, [r4], #4
+ ADD r11,r0, r11 ; r11 = _dst_frame+frag_buf_off
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4]
+ LDRGE r4,[r3],#4 ; r4 = _fragis[fragii]
+ STR r6, [r11],#4
+ STR r5, [r11]
+ BGE ofcl_arm_lp
+ofcl_arm_end
+ LDMFD r13!,{r4-r6,r11,PC}
+oc_frag_recon_intra_arm
+ ; r0 = unsigned char *_dst
+ ; r1 = int _ystride
+ ; r2 = const ogg_int16_t _residue[64]
+ STMFD r13!,{r4,r5,r14}
+ MOV r14,#8
+ MOV r5, #255
+ SUB r1, r1, #7
+ofrintra_lp_arm
+ LDRSH r3, [r2], #2
+ LDRSH r4, [r2], #2
+ LDRSH r12,[r2], #2
+ ADDS r3, r3, #128
+ CMPGT r5, r3
+ EORLT r3, r5, r3, ASR #32
+ STRB r3, [r0], #1
+ ADDS r4, r4, #128
+ CMPGT r5, r4
+ EORLT r4, r5, r4, ASR #32
+ LDRSH r3, [r2], #2
+ STRB r4, [r0], #1
+ ADDS r12,r12,#128
+ CMPGT r5, r12
+ EORLT r12,r5, r12,ASR #32
+ LDRSH r4, [r2], #2
+ STRB r12,[r0], #1
+ ADDS r3, r3, #128
+ CMPGT r5, r3
+ EORLT r3, r5, r3, ASR #32
+ LDRSH r12,[r2], #2
+ STRB r3, [r0], #1
+ ADDS r4, r4, #128
+ CMPGT r5, r4
+ EORLT r4, r5, r4, ASR #32
+ LDRSH r3, [r2], #2
+ STRB r4, [r0], #1
+ ADDS r12,r12,#128
+ CMPGT r5, r12
+ EORLT r12,r5, r12,ASR #32
+ LDRSH r4, [r2], #2
+ STRB r12,[r0], #1
+ ADDS r3, r3, #128
+ CMPGT r5, r3
+ EORLT r3, r5, r3, ASR #32
+ STRB r3, [r0], #1
+ ADDS r4, r4, #128
+ CMPGT r5, r4
+ EORLT r4, r5, r4, ASR #32
+ STRB r4, [r0], r1
+ SUBS r14,r14,#1
+ BGT ofrintra_lp_arm
+ LDMFD r13!,{r4,r5,PC}
+
+oc_frag_recon_inter_arm
+ ; r0 = unsigned char *dst
+ ; r1 = const unsigned char *src
+ ; r2 = int ystride
+ ; r3 = const ogg_int16_t residue[64]
+ STMFD r13!,{r5,r9-r11,r14}
+ MOV r9, #8
+ MOV r5, #255
+ SUB r2, r2, #7
+ofrinter_lp_arm
+ LDRSH r12,[r3], #2
+ LDRB r14,[r1], #1
+ LDRSH r11,[r3], #2
+ LDRB r10,[r1], #1
+ ADDS r12,r12,r14
+ CMPGT r5, r12
+ EORLT r12,r5, r12,ASR #32
+ STRB r12,[r0], #1
+ ADDS r11,r11,r10
+ CMPGT r5, r11
+ LDRSH r12,[r3], #2
+ LDRB r14,[r1], #1
+ EORLT r11,r5, r11,ASR #32
+ STRB r11,[r0], #1
+ ADDS r12,r12,r14
+ CMPGT r5, r12
+ LDRSH r11,[r3], #2
+ LDRB r10,[r1], #1
+ EORLT r12,r5, r12,ASR #32
+ STRB r12,[r0], #1
+ ADDS r11,r11,r10
+ CMPGT r5, r11
+ LDRSH r12,[r3], #2
+ LDRB r14,[r1], #1
+ EORLT r11,r5, r11,ASR #32
+ STRB r11,[r0], #1
+ ADDS r12,r12,r14
+ CMPGT r5, r12
+ LDRSH r11,[r3], #2
+ LDRB r10,[r1], #1
+ EORLT r12,r5, r12,ASR #32
+ STRB r12,[r0], #1
+ ADDS r11,r11,r10
+ CMPGT r5, r11
+ LDRSH r12,[r3], #2
+ LDRB r14,[r1], #1
+ EORLT r11,r5, r11,ASR #32
+ STRB r11,[r0], #1
+ ADDS r12,r12,r14
+ CMPGT r5, r12
+ LDRSH r11,[r3], #2
+ LDRB r10,[r1], r2
+ EORLT r12,r5, r12,ASR #32
+ STRB r12,[r0], #1
+ ADDS r11,r11,r10
+ CMPGT r5, r11
+ EORLT r11,r5, r11,ASR #32
+ STRB r11,[r0], r2
+ SUBS r9, r9, #1
+ BGT ofrinter_lp_arm
+ LDMFD r13!,{r5,r9-r11,PC}
+
+oc_frag_recon_inter2_arm
+ ; r0 = unsigned char *dst
+ ; r1 = const unsigned char *src1
+ ; r2 = const unsigned char *src2
+ ; r3 = int ystride
+ LDR r12,[r13]
+ ; r12= const ogg_int16_t residue[64]
+ STMFD r13!,{r4-r8,r14}
+ MOV r14,#8
+ MOV r8, #255
+ SUB r3, r3, #7
+ofrinter2_lp_arm
+ LDRB r5, [r1], #1
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ LDRB r7, [r1], #1
+ ADD r5, r5, r6
+ ADDS r5, r4, r5, LSR #1
+ CMPGT r8, r5
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ EORLT r5, r8, r5, ASR #32
+ STRB r5, [r0], #1
+ ADD r7, r7, r6
+ ADDS r7, r4, r7, LSR #1
+ CMPGT r8, r7
+ LDRB r5, [r1], #1
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ EORLT r7, r8, r7, ASR #32
+ STRB r7, [r0], #1
+ ADD r5, r5, r6
+ ADDS r5, r4, r5, LSR #1
+ CMPGT r8, r5
+ LDRB r7, [r1], #1
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ EORLT r5, r8, r5, ASR #32
+ STRB r5, [r0], #1
+ ADD r7, r7, r6
+ ADDS r7, r4, r7, LSR #1
+ CMPGT r8, r7
+ LDRB r5, [r1], #1
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ EORLT r7, r8, r7, ASR #32
+ STRB r7, [r0], #1
+ ADD r5, r5, r6
+ ADDS r5, r4, r5, LSR #1
+ CMPGT r8, r5
+ LDRB r7, [r1], #1
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ EORLT r5, r8, r5, ASR #32
+ STRB r5, [r0], #1
+ ADD r7, r7, r6
+ ADDS r7, r4, r7, LSR #1
+ CMPGT r8, r7
+ LDRB r5, [r1], #1
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ EORLT r7, r8, r7, ASR #32
+ STRB r7, [r0], #1
+ ADD r5, r5, r6
+ ADDS r5, r4, r5, LSR #1
+ CMPGT r8, r5
+ LDRB r7, [r1], r3
+ LDRB r6, [r2], r3
+ LDRSH r4, [r12],#2
+ EORLT r5, r8, r5, ASR #32
+ STRB r5, [r0], #1
+ ADD r7, r7, r6
+ ADDS r7, r4, r7, LSR #1
+ CMPGT r8, r7
+ EORLT r7, r8, r7, ASR #32
+ STRB r7, [r0], r3
+ SUBS r14,r14,#1
+ BGT ofrinter2_lp_arm
+ LDMFD r13!,{r4-r8,PC}
+
+ [ OC_ARM_ASM_EDSP
+ EXPORT oc_frag_copy_list_edsp
+
+oc_frag_copy_list_edsp
+ ; r0 = _dst_frame
+ ; r1 = _src_frame
+ ; r2 = _ystride
+ ; r3 = _fragis
+ ; <> = _nfragis
+ ; <> = _frag_buf_offs
+ LDR r12,[r13] ; r12 = _nfragis
+ STMFD r13!,{r4-r11,r14}
+ SUBS r12, r12, #1
+ LDRGE r5, [r3],#4 ; r5 = _fragis[fragii]
+ LDRGE r14,[r13,#4*10] ; r14 = _frag_buf_offs
+ BLT ofcl_edsp_end
+ofcl_edsp_lp
+ MOV r4, r1
+ LDR r5, [r14,r5, LSL #2] ; r5 = _frag_buf_offs[_fragis[fragii]]
+ SUBS r12, r12, #1
+ ; Stall (on XScale)
+ LDRD r6, [r4, r5]! ; r4 = _src_frame+frag_buf_off
+ LDRD r8, [r4, r2]!
+ ; Stall
+ STRD r6, [r5, r0]! ; r5 = _dst_frame+frag_buf_off
+ STRD r8, [r5, r2]!
+ ; Stall
+ LDRD r6, [r4, r2]! ; On Xscale at least, doing 3 consecutive
+ LDRD r8, [r4, r2]! ; loads causes a stall, but that's no worse
+ LDRD r10,[r4, r2]! ; than us only doing 2, and having to do
+ ; another pair of LDRD/STRD later on.
+ ; Stall
+ STRD r6, [r5, r2]!
+ STRD r8, [r5, r2]!
+ STRD r10,[r5, r2]!
+ LDRD r6, [r4, r2]!
+ LDRD r8, [r4, r2]!
+ LDRD r10,[r4, r2]!
+ STRD r6, [r5, r2]!
+ STRD r8, [r5, r2]!
+ STRD r10,[r5, r2]!
+ LDRGE r5, [r3],#4 ; r5 = _fragis[fragii]
+ BGE ofcl_edsp_lp
+ofcl_edsp_end
+ LDMFD r13!,{r4-r11,PC}
+ ]
+
+ [ OC_ARM_ASM_MEDIA
+ EXPORT oc_frag_recon_intra_v6
+ EXPORT oc_frag_recon_inter_v6
+ EXPORT oc_frag_recon_inter2_v6
+
+oc_frag_recon_intra_v6
+ ; r0 = unsigned char *_dst
+ ; r1 = int _ystride
+ ; r2 = const ogg_int16_t _residue[64]
+ STMFD r13!,{r4-r6,r14}
+ MOV r14,#8
+ MOV r12,r2
+ LDR r6, =0x00800080
+ofrintra_v6_lp
+ LDRD r2, [r12],#8 ; r2 = 11110000 r3 = 33332222
+ LDRD r4, [r12],#8 ; r4 = 55554444 r5 = 77776666
+ SUBS r14,r14,#1
+ QADD16 r2, r2, r6
+ QADD16 r3, r3, r6
+ QADD16 r4, r4, r6
+ QADD16 r5, r5, r6
+ USAT16 r2, #8, r2 ; r2 = __11__00
+ USAT16 r3, #8, r3 ; r3 = __33__22
+ USAT16 r4, #8, r4 ; r4 = __55__44
+ USAT16 r5, #8, r5 ; r5 = __77__66
+ ORR r2, r2, r2, LSR #8 ; r2 = __111100
+ ORR r3, r3, r3, LSR #8 ; r3 = __333322
+ ORR r4, r4, r4, LSR #8 ; r4 = __555544
+ ORR r5, r5, r5, LSR #8 ; r5 = __777766
+ PKHBT r2, r2, r3, LSL #16 ; r2 = 33221100
+ PKHBT r3, r4, r5, LSL #16 ; r3 = 77665544
+ STRD r2, [r0], r1
+ BGT ofrintra_v6_lp
+ LDMFD r13!,{r4-r6,PC}
+
+oc_frag_recon_inter_v6
+ ; r0 = unsigned char *_dst
+ ; r1 = const unsigned char *_src
+ ; r2 = int _ystride
+ ; r3 = const ogg_int16_t _residue[64]
+ STMFD r13!,{r4-r7,r14}
+ MOV r14,#8
+ofrinter_v6_lp
+ LDRD r6, [r3], #8 ; r6 = 11110000 r7 = 33332222
+ SUBS r14,r14,#1
+ [ OC_ARM_CAN_UNALIGN_LDRD
+ LDRD r4, [r1], r2 ; Unaligned ; r4 = 33221100 r5 = 77665544
+ |
+ LDR r5, [r1, #4]
+ LDR r4, [r1], r2
+ ]
+ PKHBT r12,r6, r7, LSL #16 ; r12= 22220000
+ PKHTB r7, r7, r6, ASR #16 ; r7 = 33331111
+ UXTB16 r6,r4 ; r6 = __22__00
+ UXTB16 r4,r4, ROR #8 ; r4 = __33__11
+ QADD16 r12,r12,r6 ; r12= xx22xx00
+ QADD16 r4, r7, r4 ; r4 = xx33xx11
+ LDRD r6, [r3], #8 ; r6 = 55554444 r7 = 77776666
+ USAT16 r4, #8, r4 ; r4 = __33__11
+ USAT16 r12,#8,r12 ; r12= __22__00
+ ORR r4, r12,r4, LSL #8 ; r4 = 33221100
+ PKHBT r12,r6, r7, LSL #16 ; r12= 66664444
+ PKHTB r7, r7, r6, ASR #16 ; r7 = 77775555
+ UXTB16 r6,r5 ; r6 = __66__44
+ UXTB16 r5,r5, ROR #8 ; r5 = __77__55
+ QADD16 r12,r12,r6 ; r12= xx66xx44
+ QADD16 r5, r7, r5 ; r5 = xx77xx55
+ USAT16 r12,#8, r12 ; r12= __66__44
+ USAT16 r5, #8, r5 ; r5 = __77__55
+ ORR r5, r12,r5, LSL #8 ; r5 = 77665544
+ STRD r4, [r0], r2
+ BGT ofrinter_v6_lp
+ LDMFD r13!,{r4-r7,PC}
+
+oc_frag_recon_inter2_v6
+ ; r0 = unsigned char *_dst
+ ; r1 = const unsigned char *_src1
+ ; r2 = const unsigned char *_src2
+ ; r3 = int _ystride
+ ; Bi-predicted 8x8 fragment reconstruction using ARMv6 media (SIMD)
+ ; instructions: dst = clamp255(((src1+src2)>>1) + residue), one row per
+ ; loop iteration. r4-r9 are saved/restored; r12 and r14 are clobbered.
+ ; NOTE(review): the row loads from _src1/_src2 are unaligned LDRs; this
+ ; relies on ARMv6 unaligned-access support being enabled -- confirm.
+ LDR r12,[r13]
+ ; r12= const ogg_int16_t _residue[64]
+ STMFD r13!,{r4-r9,r14}
+ MOV r14,#8 ; Loop counter: 8 rows.
+ofrinter2_v6_lp
+ LDRD r6, [r12,#8] ; r6 = 55554444 r7 = 77776666
+ SUBS r14,r14,#1
+ LDR r4, [r1, #4] ; Unaligned ; r4 = src1[1] = 77665544
+ LDR r5, [r2, #4] ; Unaligned ; r5 = src2[1] = 77665544
+ PKHBT r8, r6, r7, LSL #16 ; r8 = 66664444
+ PKHTB r9, r7, r6, ASR #16 ; r9 = 77775555
+ UHADD8 r4, r4, r5 ; r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
+ UXTB16 r5, r4 ; r5 = __66__44
+ UXTB16 r4, r4, ROR #8 ; r4 = __77__55
+ QADD16 r8, r8, r5 ; r8 = xx66xx44
+ QADD16 r9, r9, r4 ; r9 = xx77xx55
+ LDRD r6,[r12],#16 ; r6 = 33332222 r7 = 11110000
+ USAT16 r8, #8, r8 ; r8 = __66__44
+ LDR r4, [r1], r3 ; Unaligned ; r4 = src1[0] = 33221100
+ USAT16 r9, #8, r9 ; r9 = __77__55
+ LDR r5, [r2], r3 ; Unaligned ; r5 = src2[0] = 33221100
+ ORR r9, r8, r9, LSL #8 ; r9 = 77665544
+ PKHBT r8, r6, r7, LSL #16 ; r8 = 22220000
+ UHADD8 r4, r4, r5 ; r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
+ PKHTB r7, r7, r6, ASR #16 ; r7 = 33331111
+ UXTB16 r5, r4 ; r5 = __22__00
+ UXTB16 r4, r4, ROR #8 ; r4 = __33__11
+ QADD16 r8, r8, r5 ; r8 = xx22xx00
+ QADD16 r7, r7, r4 ; r7 = xx33xx11
+ USAT16 r8, #8, r8 ; r8 = __22__00
+ USAT16 r7, #8, r7 ; r7 = __33__11
+ ORR r8, r8, r7, LSL #8 ; r8 = 33221100
+ STRD r8, [r0], r3 ; Store the whole reconstructed row at once.
+ BGT ofrinter2_v6_lp
+ LDMFD r13!,{r4-r9,PC}
+ ]
+
+ [ OC_ARM_ASM_NEON
+ EXPORT oc_frag_copy_list_neon
+ EXPORT oc_frag_recon_intra_neon
+ EXPORT oc_frag_recon_inter_neon
+ EXPORT oc_frag_recon_inter2_neon
+
+oc_frag_copy_list_neon
+ ; r0 = _dst_frame
+ ; r1 = _src_frame
+ ; r2 = _ystride
+ ; r3 = _fragis
+ ; <> = _nfragis
+ ; <> = _frag_buf_offs
+ ; Copies each listed 8x8 fragment from _src_frame to _dst_frame with
+ ; eight 64-bit NEON loads/stores per fragment. The next fragment's
+ ; buffer offset is fetched during the current fragment's stores to hide
+ ; load latency. Uses D0-D7; r4-r6 are saved/restored.
+ LDR r12,[r13] ; r12 = _nfragis
+ STMFD r13!,{r4-r6,r14}
+ SUBS r12, r12, #1
+ LDRGE r6, [r3],#4 ; r6 = _fragis[fragii]
+ LDRGE r14,[r13,#4*5] ; r14 = _frag_buf_offs
+ BLT ofcl_neon_end ; Nothing to copy.
+ ; Stall (2 on Xscale)
+ LDR r6, [r14,r6, LSL #2] ; r6 = _frag_buf_offs[_fragis[fragii]]
+ ; Stall (on XScale)
+ofcl_neon_lp
+ ADD r4, r1, r6 ; r4 = source fragment address.
+ VLD1.64 {D0}, [r4 at 64], r2
+ VLD1.64 {D1}, [r4 at 64], r2
+ VLD1.64 {D2}, [r4 at 64], r2
+ VLD1.64 {D3}, [r4 at 64], r2
+ ADD r5, r6, r0 ; r5 = destination fragment address.
+ VLD1.64 {D4}, [r4], r2 ; NOTE(review): no 64-bit alignment hint here, unlike the other loads -- confirm intentional.
+ SUBS r12, r12, #1
+ VLD1.64 {D5}, [r4 at 64], r2
+ VLD1.64 {D6}, [r4 at 64], r2
+ VLD1.64 {D7}, [r4 at 64], r2
+ VST1.64 {D0}, [r5 at 64], r2
+ LDRGE r6, [r3],#4 ; r6 = _fragis[fragii]
+ VST1.64 {D1}, [r5 at 64], r2
+ VST1.64 {D2}, [r5 at 64], r2
+ VST1.64 {D3}, [r5 at 64], r2
+ VST1.64 {D4}, [r5 at 64], r2
+ LDRGE r6, [r14,r6, LSL #2] ; r6 = _frag_buf_offs[_fragis[fragii]]
+ VST1.64 {D5}, [r5 at 64], r2
+ VST1.64 {D6}, [r5 at 64], r2
+ VST1.64 {D7}, [r5 at 64], r2
+ BGE ofcl_neon_lp
+ofcl_neon_end
+ LDMFD r13!,{r4-r6,PC}
+
+oc_frag_recon_intra_neon
+ ; r0 = unsigned char *_dst
+ ; r1 = int _ystride
+ ; r2 = const ogg_int16_t _residue[64]
+ ; Intra fragment reconstruction: dst = clamp255(residue + 128) for all
+ ; 64 coefficients. The whole residue block is loaded into Q8-Q15, the
+ ; bias is added with saturation, and VQMOVUN narrows to unsigned bytes.
+ ; Narrowing is interleaved with the stores to hide latency. Uses
+ ; Q0/Q8-Q15; no core registers beyond r3 are clobbered.
+ MOV r3, #128
+ VDUP.S16 Q0, r3 ; Q0 = eight lanes of the +128 intra bias.
+ VLDMIA r2, {D16-D31} ; D16= 3333222211110000 etc ; 9(8) cycles
+ VQADD.S16 Q8, Q8, Q0
+ VQADD.S16 Q9, Q9, Q0
+ VQADD.S16 Q10,Q10,Q0
+ VQADD.S16 Q11,Q11,Q0
+ VQADD.S16 Q12,Q12,Q0
+ VQADD.S16 Q13,Q13,Q0
+ VQADD.S16 Q14,Q14,Q0
+ VQADD.S16 Q15,Q15,Q0
+ VQMOVUN.S16 D16,Q8 ; D16= 7766554433221100 ; 1 cycle
+ VQMOVUN.S16 D17,Q9 ; D17= FFEEDDCCBBAA9988 ; 1 cycle
+ VQMOVUN.S16 D18,Q10 ; D18= NNMMLLKKJJIIHHGG ; 1 cycle
+ VST1.64 {D16},[r0 at 64], r1
+ VQMOVUN.S16 D19,Q11 ; D19= VVUUTTSSRRQQPPOO ; 1 cycle
+ VST1.64 {D17},[r0 at 64], r1
+ VQMOVUN.S16 D20,Q12 ; D20= ddccbbaaZZYYXXWW ; 1 cycle
+ VST1.64 {D18},[r0 at 64], r1
+ VQMOVUN.S16 D21,Q13 ; D21= llkkjjiihhggffee ; 1 cycle
+ VST1.64 {D19},[r0 at 64], r1
+ VQMOVUN.S16 D22,Q14 ; D22= ttssrrqqppoonnmm ; 1 cycle
+ VST1.64 {D20},[r0 at 64], r1
+ VQMOVUN.S16 D23,Q15 ; D23= !!@@zzyyxxwwvvuu ; 1 cycle
+ VST1.64 {D21},[r0 at 64], r1
+ VST1.64 {D22},[r0 at 64], r1
+ VST1.64 {D23},[r0 at 64], r1
+ MOV PC,R14
+
+oc_frag_recon_inter_neon
+ ; r0 = unsigned char *_dst
+ ; r1 = const unsigned char *_src
+ ; r2 = int _ystride
+ ; r3 = const ogg_int16_t _residue[64]
+ ; Inter fragment reconstruction: dst = clamp255(src + residue) for an
+ ; 8x8 block. Predictor rows are widened to 16 bits with VMOVL.U8, added
+ ; to the residue with saturation, then narrowed back to bytes. Loads,
+ ; adds, and stores are interleaved to hide latency. Uses Q0-Q3/Q8-Q15.
+ VLDMIA r3, {D16-D31} ; D16= 3333222211110000 etc ; 9(8) cycles
+ VLD1.64 {D0}, [r1], r2
+ VLD1.64 {D2}, [r1], r2
+ VMOVL.U8 Q0, D0 ; Q0 = __77__66__55__44__33__22__11__00
+ VLD1.64 {D4}, [r1], r2
+ VMOVL.U8 Q1, D2 ; etc
+ VLD1.64 {D6}, [r1], r2
+ VMOVL.U8 Q2, D4
+ VMOVL.U8 Q3, D6
+ VQADD.S16 Q8, Q8, Q0
+ VLD1.64 {D0}, [r1], r2 ; Start loading the lower four rows.
+ VQADD.S16 Q9, Q9, Q1
+ VLD1.64 {D2}, [r1], r2
+ VQADD.S16 Q10,Q10,Q2
+ VLD1.64 {D4}, [r1], r2
+ VQADD.S16 Q11,Q11,Q3
+ VLD1.64 {D6}, [r1], r2
+ VMOVL.U8 Q0, D0
+ VMOVL.U8 Q1, D2
+ VMOVL.U8 Q2, D4
+ VMOVL.U8 Q3, D6
+ VQADD.S16 Q12,Q12,Q0
+ VQADD.S16 Q13,Q13,Q1
+ VQADD.S16 Q14,Q14,Q2
+ VQADD.S16 Q15,Q15,Q3
+ VQMOVUN.S16 D16,Q8 ; Narrow each row back to unsigned bytes.
+ VQMOVUN.S16 D17,Q9
+ VQMOVUN.S16 D18,Q10
+ VST1.64 {D16},[r0 at 64], r2
+ VQMOVUN.S16 D19,Q11
+ VST1.64 {D17},[r0 at 64], r2
+ VQMOVUN.S16 D20,Q12
+ VST1.64 {D18},[r0 at 64], r2
+ VQMOVUN.S16 D21,Q13
+ VST1.64 {D19},[r0 at 64], r2
+ VQMOVUN.S16 D22,Q14
+ VST1.64 {D20},[r0 at 64], r2
+ VQMOVUN.S16 D23,Q15
+ VST1.64 {D21},[r0 at 64], r2
+ VST1.64 {D22},[r0 at 64], r2
+ VST1.64 {D23},[r0 at 64], r2
+ MOV PC,R14
+
+oc_frag_recon_inter2_neon
+ ; r0 = unsigned char *_dst
+ ; r1 = const unsigned char *_src1
+ ; r2 = const unsigned char *_src2
+ ; r3 = int _ystride
+ ; Bi-predicted fragment reconstruction:
+ ; dst = clamp255(((src1+src2)>>1) + residue) for an 8x8 block.
+ ; VHADD.U8 averages two predictor rows at a time; the result is widened,
+ ; added to the residue with saturation, and narrowed back to bytes, with
+ ; loads and arithmetic interleaved to hide latency. Uses Q0-Q3/Q8-Q15
+ ; and r12.
+ LDR r12,[r13]
+ ; r12= const ogg_int16_t _residue[64]
+ VLDMIA r12,{D16-D31}
+ VLD1.64 {D0}, [r1], r3
+ VLD1.64 {D4}, [r2], r3
+ VLD1.64 {D1}, [r1], r3
+ VLD1.64 {D5}, [r2], r3
+ VHADD.U8 Q2, Q0, Q2 ; Q2 = FFEEDDCCBBAA99887766554433221100
+ VLD1.64 {D2}, [r1], r3
+ VLD1.64 {D6}, [r2], r3
+ VMOVL.U8 Q0, D4 ; Q0 = __77__66__55__44__33__22__11__00
+ VLD1.64 {D3}, [r1], r3
+ VMOVL.U8 Q2, D5 ; etc
+ VLD1.64 {D7}, [r2], r3
+ VHADD.U8 Q3, Q1, Q3
+ VQADD.S16 Q8, Q8, Q0
+ VQADD.S16 Q9, Q9, Q2
+ VLD1.64 {D0}, [r1], r3 ; Start loading the lower four rows.
+ VMOVL.U8 Q1, D6
+ VLD1.64 {D4}, [r2], r3
+ VMOVL.U8 Q3, D7
+ VLD1.64 {D1}, [r1], r3
+ VQADD.S16 Q10,Q10,Q1
+ VLD1.64 {D5}, [r2], r3
+ VQADD.S16 Q11,Q11,Q3
+ VLD1.64 {D2}, [r1], r3
+ VHADD.U8 Q2, Q0, Q2
+ VLD1.64 {D6}, [r2], r3
+ VLD1.64 {D3}, [r1], r3
+ VMOVL.U8 Q0, D4
+ VLD1.64 {D7}, [r2], r3
+ VMOVL.U8 Q2, D5
+ VHADD.U8 Q3, Q1, Q3
+ VQADD.S16 Q12,Q12,Q0
+ VQADD.S16 Q13,Q13,Q2
+ VMOVL.U8 Q1, D6
+ VMOVL.U8 Q3, D7
+ VQADD.S16 Q14,Q14,Q1
+ VQADD.S16 Q15,Q15,Q3
+ VQMOVUN.S16 D16,Q8 ; Narrow each row back to unsigned bytes.
+ VQMOVUN.S16 D17,Q9
+ VQMOVUN.S16 D18,Q10
+ VST1.64 {D16},[r0 at 64], r3
+ VQMOVUN.S16 D19,Q11
+ VST1.64 {D17},[r0 at 64], r3
+ VQMOVUN.S16 D20,Q12
+ VST1.64 {D18},[r0 at 64], r3
+ VQMOVUN.S16 D21,Q13
+ VST1.64 {D19},[r0 at 64], r3
+ VQMOVUN.S16 D22,Q14
+ VST1.64 {D20},[r0 at 64], r3
+ VQMOVUN.S16 D23,Q15
+ VST1.64 {D21},[r0 at 64], r3
+ VST1.64 {D22},[r0 at 64], r3
+ VST1.64 {D23},[r0 at 64], r3
+ MOV PC,R14
+ ]
+
+ END
Copied: experimental/derf/theora-ptalarbvorm/lib/arm/armidct.s (from rev 17378, branches/theorarm-merge-branch/lib/arm/ARMidct.s)
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armidct.s (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armidct.s 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,1822 @@
+;********************************************************************
+;* *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+;* *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;* *
+;********************************************************************
+; Original implementation:
+; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armidct.s 17344 2010-07-21 01:42:18Z tterribe $
+;********************************************************************
+
+ AREA |.text|, CODE, READONLY
+
+ GET armopts.s
+
+ EXPORT oc_idct8x8_arm
+
+oc_idct8x8_arm
+ ; r0 = ogg_int16_t *_y
+ ; r1 = ogg_int16_t *_x
+ ; r2 = int _last_zzi
+ ; Dispatch on _last_zzi (presumably the last nonzero zig-zag coefficient
+ ; index -- confirm against caller): small counts take cheaper partial
+ ; transforms; larger counts fall through to the full transform below.
+ CMP r2, #3
+ BLE oc_idct8x8_3_arm
+ CMP r2, #6
+ BLE oc_idct8x8_6_arm
+ CMP r2, #10
+ BLE oc_idct8x8_10_arm
+oc_idct8x8_slow_arm
+ ; Full 8x8 inverse DCT: eight row transforms into a 64-coefficient
+ ; temporary on the stack, then eight column transforms (with the final
+ ; rounding/downshift) into _y. If _y != _x, the input block is zeroed
+ ; so it is ready for the next fragment (decoder only).
+ STMFD r13!,{r4-r11,r14}
+ SUB r13,r13,#64*2 ; Temp storage for the transposed rows.
+; Row transforms
+ STR r0, [r13,#-4]!
+ ADD r0, r13, #4 ; Write to temp storage.
+ BL idct8core_arm
+ BL idct8core_arm
+ BL idct8core_arm
+ BL idct8core_arm
+ BL idct8core_arm
+ BL idct8core_arm
+ BL idct8core_arm
+ BL idct8core_arm
+ LDR r0, [r13], #4 ; Write to the final destination.
+ ; Clear input data for next block (decoder only).
+ SUB r2, r1, #8*16 ; r2 = original _x (r1 was advanced by the rows).
+ CMP r0, r2 ; In-place transform? Then do not clear.
+ MOV r1, r13 ; And read from temp storage.
+ BEQ oc_idct8x8_slow_arm_cols
+ MOV r4, #0
+ MOV r5, #0
+ MOV r6, #0
+ MOV r7, #0
+ STMIA r2!,{r4,r5,r6,r7}
+ STMIA r2!,{r4,r5,r6,r7}
+ STMIA r2!,{r4,r5,r6,r7}
+ STMIA r2!,{r4,r5,r6,r7}
+ STMIA r2!,{r4,r5,r6,r7}
+ STMIA r2!,{r4,r5,r6,r7}
+ STMIA r2!,{r4,r5,r6,r7}
+ STMIA r2!,{r4,r5,r6,r7}
+oc_idct8x8_slow_arm_cols
+; Column transforms
+ BL idct8core_down_arm
+ BL idct8core_down_arm
+ BL idct8core_down_arm
+ BL idct8core_down_arm
+ BL idct8core_down_arm
+ BL idct8core_down_arm
+ BL idct8core_down_arm
+ BL idct8core_down_arm
+ ADD r13,r13,#64*2
+ LDMFD r13!,{r4-r11,PC}
+
+oc_idct8x8_10_arm
+ ; Partial iDCT for at most 10 nonzero coefficients: only the first four
+ ; rows can contain data (4, 3, 2 and 1 coefficients respectively), so
+ ; only those rows are transformed; the column pass then only needs the
+ ; first four entries of each column.
+ STMFD r13!,{r4-r11,r14}
+ SUB r13,r13,#64*2 ; Temp storage for the transposed rows.
+; Row transforms
+ MOV r2, r0
+ MOV r0, r13 ; Write to temp storage.
+ BL idct4core_arm
+ BL idct3core_arm
+ BL idct2core_arm
+ BL idct1core_arm
+ ; Clear input data for next block (decoder only).
+ SUB r0, r1, #4*16 ; r0 = original _x (r1 was advanced by the rows).
+ CMP r0, r2 ; In-place transform? Then do not clear.
+ MOV r1, r13 ; Read from temp storage.
+ BEQ oc_idct8x8_10_arm_cols
+ MOV r4, #0
+ ; Zero only the words that can hold the 10 coefficients.
+ STR r4, [r0]
+ STR r4, [r0,#4]
+ STR r4, [r0,#16]
+ STR r4, [r0,#20]
+ STR r4, [r0,#32]
+ STR r4, [r0,#48]
+ MOV r0, r2 ; Write to the final destination
+oc_idct8x8_10_arm_cols
+; Column transforms
+ BL idct4core_down_arm
+ BL idct4core_down_arm
+ BL idct4core_down_arm
+ BL idct4core_down_arm
+ BL idct4core_down_arm
+ BL idct4core_down_arm
+ BL idct4core_down_arm
+ BL idct4core_down_arm
+ ADD r13,r13,#64*2
+ LDMFD r13!,{r4-r11,PC}
+
+oc_idct8x8_6_arm
+ ; Partial iDCT for at most 6 nonzero coefficients: only the first three
+ ; rows can contain data (3, 2 and 1 coefficients respectively).
+ STMFD r13!,{r4-r7,r9-r11,r14}
+ SUB r13,r13,#64*2 ; Temp storage for the transposed rows.
+; Row transforms
+ MOV r2, r0
+ MOV r0, r13 ; Write to temp storage.
+ BL idct3core_arm
+ BL idct2core_arm
+ BL idct1core_arm
+ ; Clear input data for next block (decoder only).
+ SUB r0, r1, #3*16 ; r0 = original _x (r1 was advanced by the rows).
+ CMP r0, r2 ; In-place transform? Then do not clear.
+ MOV r1, r13 ; Read from temp storage.
+ BEQ oc_idct8x8_6_arm_cols
+ MOV r4, #0
+ ; Zero only the words that can hold the 6 coefficients.
+ STR r4, [r0]
+ STR r4, [r0,#4]
+ STR r4, [r0,#16]
+ STR r4, [r0,#32]
+ MOV r0, r2 ; Write to the final destination
+oc_idct8x8_6_arm_cols
+; Column transforms
+ BL idct3core_down_arm
+ BL idct3core_down_arm
+ BL idct3core_down_arm
+ BL idct3core_down_arm
+ BL idct3core_down_arm
+ BL idct3core_down_arm
+ BL idct3core_down_arm
+ BL idct3core_down_arm
+ ADD r13,r13,#64*2
+ LDMFD r13!,{r4-r7,r9-r11,PC}
+
+oc_idct8x8_3_arm
+ ; Partial iDCT for at most 3 nonzero coefficients: only the first two
+ ; rows can contain data (2 and 1 coefficients respectively).
+ STMFD r13!,{r4-r7,r9-r11,r14}
+ SUB r13,r13,#64*2 ; Temp storage for the transposed rows.
+; Row transforms
+ MOV r2, r0
+ MOV r0, r13 ; Write to temp storage.
+ BL idct2core_arm
+ BL idct1core_arm
+ ; Clear input data for next block (decoder only).
+ SUB r0, r1, #2*16 ; r0 = original _x (r1 was advanced by the rows).
+ CMP r0, r2 ; In-place transform? Then do not clear.
+ MOV r1, r13 ; Read from temp storage.
+ MOVNE r4, #0
+ STRNE r4, [r0]
+ STRNE r4, [r0,#16]
+ MOVNE r0, r2 ; Write to the final destination
+; Column transforms
+ BL idct2core_down_arm
+ BL idct2core_down_arm
+ BL idct2core_down_arm
+ BL idct2core_down_arm
+ BL idct2core_down_arm
+ BL idct2core_down_arm
+ BL idct2core_down_arm
+ BL idct2core_down_arm
+ ADD r13,r13,#64*2
+ LDMFD r13!,{r4-r7,r9-r11,PC}
+
+idct1core_arm
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; 1-coefficient transform: the row is just the DC term scaled by
+ ; OC_C4S4, replicated to all eight outputs. Advances r1 by one row (16
+ ; bytes) and r0 by one element (2 bytes) for the next core call.
+ LDRSH r3, [r1], #16
+ MOV r12,#0x05
+ ORR r12,r12,#0xB500 ; r12= 0xB505 = OC_C4S4.
+ MUL r3, r12, r3
+ ; Stall ?
+ MOV r3, r3, ASR #16
+ STRH r3, [r0], #2
+ STRH r3, [r0, #14]
+ STRH r3, [r0, #30]
+ STRH r3, [r0, #46]
+ STRH r3, [r0, #62]
+ STRH r3, [r0, #78]
+ STRH r3, [r0, #94]
+ STRH r3, [r0, #110]
+ MOV PC,R14
+
+idct2core_arm
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; 2-coefficient row transform (x[0], x[1] nonzero), no rounding or
+ ; downshift. Clobbers r3, r9-r12. Advances r1/r0 for the next core.
+ LDRSH r9, [r1], #16 ; r9 = x[0]
+ LDR r12,OC_C4S4
+ LDRSH r11,[r1, #-14] ; r11= x[1]
+ LDR r3, OC_C7S1
+ MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
+ LDR r10,OC_C1S7
+ MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1]
+ MOV r9, r9, ASR #16 ; r9 = t[0]
+ MUL r11,r10,r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
+ MOV r3, r3, ASR #16 ; r3 = t[4]
+ MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4]
+ MOV r11,r11,ASR #16 ; r11= t[7]
+ MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
+ MOV r10,r10,ASR #16 ; r10= t[5]
+ ADD r12,r9,r12,ASR #16 ; r12= t[0]+t[6]
+ ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5]
+ SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5]
+ ADD r3, r3, r9 ; r3 = t[0]+t[4]
+ ADD r11,r11,r9 ; r11= t[0]+t[7]
+ STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
+ STRH r12,[r0, #14] ; y[1] = t[0]+t[6]
+ STRH r10,[r0, #30] ; y[2] = t[0]+t[5]
+ STRH r3, [r0, #46] ; y[3] = t[0]+t[4]
+ RSB r3, r3, r9, LSL #1 ; r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4]
+ RSB r10,r10,r9, LSL #1 ; r10= t[0]*2-(t[0]+t[5])=t[0]-t[5]
+ RSB r12,r12,r9, LSL #1 ; r12= t[0]*2-(t[0]+t[6])=t[0]-t[6]
+ RSB r11,r11,r9, LSL #1 ; r11= t[0]*2-(t[0]+t[7])=t[0]-t[7]
+ STRH r3, [r0, #62] ; y[4] = t[0]-t[4]
+ STRH r10,[r0, #78] ; y[5] = t[0]-t[5]
+ STRH r12,[r0, #94] ; y[6] = t[0]-t[6]
+ STRH r11,[r0, #110] ; y[7] = t[0]-t[7]
+ MOV PC,r14
+
+idct2core_down_arm
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; Column-pass variant of idct2core_arm: adds the rounding bias of 8 and
+ ; downshifts the outputs by 4. Clobbers r3-r7, r9-r12.
+ LDRSH r9, [r1], #16 ; r9 = x[0]
+ LDR r12,OC_C4S4
+ LDRSH r11,[r1, #-14] ; r11= x[1]
+ LDR r3, OC_C7S1
+ MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
+ LDR r10,OC_C1S7
+ MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1]
+ MOV r9, r9, ASR #16 ; r9 = t[0]
+ MUL r11,r10,r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
+ ADD r9, r9, #8 ; r9 = t[0]+8
+ MOV r3, r3, ASR #16 ; r3 = t[4]
+ MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4]
+ MOV r11,r11,ASR #16 ; r11= t[7]
+ MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
+ MOV r10,r10,ASR #16 ; r10= t[5]
+ ADD r12,r9,r12,ASR #16 ; r12= t[0]+t[6]+8
+ ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8
+ SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8
+ ADD r3, r3, r9 ; r3 = t[0]+t[4]+8
+ ADD r11,r11,r9 ; r11= t[0]+t[7]+8
+ ; TODO: This is wrong.
+ ; The C code truncates to 16 bits by storing to RAM and doing the
+ ; shifts later; we've got an extra 4 bits here.
+ MOV r4, r11,ASR #4
+ MOV r5, r12,ASR #4
+ MOV r6, r10,ASR #4
+ MOV r7, r3, ASR #4
+ RSB r3, r3, r9, LSL #1 ;r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8
+ RSB r10,r10,r9, LSL #1 ;r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8
+ RSB r12,r12,r9, LSL #1 ;r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8
+ RSB r11,r11,r9, LSL #1 ;r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8
+ MOV r3, r3, ASR #4
+ MOV r10,r10,ASR #4
+ MOV r12,r12,ASR #4
+ MOV r11,r11,ASR #4
+ STRH r4, [r0], #2 ; y[0] = t[0]+t[7]
+ STRH r5, [r0, #14] ; y[1] = t[0]+t[6]
+ STRH r6, [r0, #30] ; y[2] = t[0]+t[5]
+ STRH r7, [r0, #46] ; y[3] = t[0]+t[4]
+ STRH r3, [r0, #62] ; y[4] = t[0]-t[4]
+ STRH r10,[r0, #78] ; y[5] = t[0]-t[5]
+ STRH r12,[r0, #94] ; y[6] = t[0]-t[6]
+ STRH r11,[r0, #110] ; y[7] = t[0]-t[7]
+ MOV PC,r14
+
+idct3core_arm
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; 3-coefficient row transform (x[0..2] nonzero), no rounding or
+ ; downshift. Clobbers r3-r6, r9-r12.
+ LDRSH r9, [r1], #16 ; r9 = x[0]
+ LDR r12,OC_C4S4 ; r12= OC_C4S4
+ LDRSH r3, [r1, #-12] ; r3 = x[2]
+ LDR r10,OC_C6S2 ; r10= OC_C6S2
+ MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
+ LDR r4, OC_C2S6 ; r4 = OC_C2S6
+ MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2]
+ LDRSH r11,[r1, #-14] ; r11= x[1]
+ MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2]
+ LDR r4, OC_C7S1 ; r4 = OC_C7S1
+ LDR r5, OC_C1S7 ; r5 = OC_C1S7
+ MOV r9, r9, ASR #16 ; r9 = t[0]
+ MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1]
+ ADD r3, r9, r3, ASR #16 ; r3 = t[0]+t[3]
+ MUL r11,r5, r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
+ MOV r4, r4, ASR #16 ; r4 = t[4]
+ MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4]
+ MOV r11,r11,ASR #16 ; r11= t[7]
+ MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
+ ADD r10,r9, r10,ASR #16 ; r10= t[1] = t[0]+t[2]
+ RSB r6, r10,r9, LSL #1 ; r6 = t[2] = t[0]-t[2]
+ ; r3 = t2[0] = t[0]+t[3]
+ RSB r9, r3, r9, LSL #1 ; r9 = t2[3] = t[0]-t[3]
+ MOV r12,r12,ASR #16 ; r12= t[6]
+ ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5]
+ RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5]
+ ADD r11,r3, r11 ; r11= t2[0]+t[7]
+ ADD r5, r10,r5 ; r5 = t[1]+t2[6]
+ ADD r12,r6, r12 ; r12= t[2]+t2[5]
+ ADD r4, r9, r4 ; r4 = t2[3]+t[4]
+ STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
+ STRH r5, [r0, #14] ; y[1] = t[1]+t2[6]
+ STRH r12,[r0, #30] ; y[2] = t[2]+t2[5]
+ STRH r4, [r0, #46] ; y[3] = t2[3]+t[4]
+ RSB r11,r11,r3, LSL #1 ; r11= t2[0] - t[7]
+ RSB r5, r5, r10,LSL #1 ; r5 = t[1] - t2[6]
+ RSB r12,r12,r6, LSL #1 ; r12= t[2] - t2[5]
+ RSB r4, r4, r9, LSL #1 ; r4 = t2[3] - t[4]
+ STRH r4, [r0, #62] ; y[4] = t2[3]-t[4]
+ STRH r12,[r0, #78] ; y[5] = t[2]-t2[5]
+ STRH r5, [r0, #94] ; y[6] = t[1]-t2[6]
+ STRH r11,[r0, #110] ; y[7] = t2[0]-t[7]
+ MOV PC,R14
+
+idct3core_down_arm
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; Column-pass variant of idct3core_arm: adds the rounding bias of 8 and
+ ; downshifts the outputs by 4. Clobbers r3-r6, r9-r12.
+ LDRSH r9, [r1], #16 ; r9 = x[0]
+ LDR r12,OC_C4S4 ; r12= OC_C4S4
+ LDRSH r3, [r1, #-12] ; r3 = x[2]
+ LDR r10,OC_C6S2 ; r10= OC_C6S2
+ MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
+ LDR r4, OC_C2S6 ; r4 = OC_C2S6
+ MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2]
+ LDRSH r11,[r1, #-14] ; r11= x[1]
+ MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2]
+ LDR r4, OC_C7S1 ; r4 = OC_C7S1
+ LDR r5, OC_C1S7 ; r5 = OC_C1S7
+ MOV r9, r9, ASR #16 ; r9 = t[0]
+ MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1]
+ ADD r9, r9, #8 ; r9 = t[0]+8
+ MUL r11,r5, r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
+ ADD r3, r9, r3, ASR #16 ; r3 = t[0]+t[3]+8
+ MOV r4, r4, ASR #16 ; r4 = t[4]
+ MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4]
+ MOV r11,r11,ASR #16 ; r11= t[7]
+ MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
+ ADD r10,r9, r10,ASR #16 ; r10= t[1]+8 = t[0]+t[2]+8
+ RSB r6, r10,r9, LSL #1 ; r6 = t[2]+8 = t[0]-t[2]+8
+ ; r3 = t2[0]+8 = t[0]+t[3]+8
+ RSB r9, r3, r9, LSL #1 ; r9 = t2[3]+8 = t[0]-t[3]+8
+ MOV r12,r12,ASR #16 ; r12= t[6]
+ ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5]
+ RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5]
+ ADD r11,r3, r11 ; r11= t2[0]+t[7] +8
+ ADD r5, r10,r5 ; r5 = t[1] +t2[6]+8
+ ADD r12,r6, r12 ; r12= t[2] +t2[5]+8
+ ADD r4, r9, r4 ; r4 = t2[3]+t[4] +8
+ RSB r3, r11,r3, LSL #1 ; r3 = t2[0] - t[7] + 8
+ RSB r10,r5, r10,LSL #1 ; r10= t[1] - t2[6] + 8
+ RSB r6, r12,r6, LSL #1 ; r6 = t[2] - t2[5] + 8
+ RSB r9, r4, r9, LSL #1 ; r9 = t2[3] - t[4] + 8
+ ; TODO: This is wrong.
+ ; The C code truncates to 16 bits by storing to RAM and doing the
+ ; shifts later; we've got an extra 4 bits here.
+ MOV r11,r11,ASR #4
+ MOV r5, r5, ASR #4
+ MOV r12,r12,ASR #4
+ MOV r4, r4, ASR #4
+ MOV r9, r9, ASR #4
+ MOV r6, r6, ASR #4
+ MOV r10,r10,ASR #4
+ MOV r3, r3, ASR #4
+ STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
+ STRH r5, [r0, #14] ; y[1] = t[1]+t2[6]
+ STRH r12,[r0, #30] ; y[2] = t[2]+t2[5]
+ STRH r4, [r0, #46] ; y[3] = t2[3]+t[4]
+ STRH r9, [r0, #62] ; y[4] = t2[3]-t[4]
+ STRH r6, [r0, #78] ; y[5] = t[2]-t2[5]
+ STRH r10,[r0, #94] ; y[6] = t[1]-t2[6]
+ STRH r3, [r0, #110] ; y[7] = t2[0]-t[7]
+ MOV PC,R14
+
+idct4core_arm
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; 4-coefficient row transform (x[0..3] nonzero), no rounding or
+ ; downshift. Clobbers r3-r12.
+ LDRSH r9, [r1], #16 ; r9 = x[0]
+ LDR r10,OC_C4S4 ; r10= OC_C4S4
+ LDRSH r12,[r1, #-12] ; r12= x[2]
+ LDR r4, OC_C6S2 ; r4 = OC_C6S2
+ MUL r9, r10,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
+ LDR r5, OC_C2S6 ; r5 = OC_C2S6
+ MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2]
+ LDRSH r3, [r1, #-14] ; r3 = x[1]
+ MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2]
+ LDR r6, OC_C7S1 ; r6 = OC_C7S1
+ LDR r12,OC_C1S7 ; r12= OC_C1S7
+ LDRSH r11,[r1, #-10] ; r11= x[3]
+ MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1]
+ LDR r7, OC_C5S3 ; r7 = OC_C5S3
+ MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1]
+ LDR r8, OC_C3S5 ; r8 = OC_C3S5
+ MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3]
+ MOV r9, r9, ASR #16 ; r9 = t[0]
+ MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3]
+ MOV r6, r6, ASR #16 ; r6 = t[4]
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+ SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+ RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5]
+ MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+ MOV r3, r3, ASR #16 ; r3 = t[7]
+ ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6]
+ RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6]
+ MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+ ADD r4, r9, r4, ASR #16 ; r4 = t[1] = t[0] + t[2]
+ RSB r10,r4, r9, LSL #1 ; r10= t[2] = t[0] - t[2]
+ ADD r5, r9, r5, ASR #16 ; r5 = t[0] = t[0] + t[3]
+ RSB r9, r5, r9, LSL #1 ; r9 = t[3] = t[0] - t[3]
+ MOV r3, r3, ASR #16 ; r3 = t2[6]
+ ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5]
+ RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5]
+ ADD r11,r5, r11 ; r11= t[0]+t2[7]
+ ADD r6, r4, r6 ; r6 = t[1]+t3[6]
+ ADD r3, r10,r3 ; r3 = t[2]+t3[5]
+ ADD r7, r9, r7 ; r7 = t[3]+t2[4]
+ STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
+ STRH r6, [r0, #14] ; y[1] = t[1]+t2[6]
+ STRH r3, [r0, #30] ; y[2] = t[2]+t2[5]
+ STRH r7, [r0, #46] ; y[3] = t2[3]+t[4]
+ RSB r11,r11,r5, LSL #1 ; r11= t[0]-t2[7]
+ RSB r6, r6, r4, LSL #1 ; r6 = t[1]-t3[6]
+ RSB r3, r3, r10,LSL #1 ; r3 = t[2]-t3[5]
+ RSB r7, r7, r9, LSL #1 ; r7 = t[3]-t2[4]
+ STRH r7, [r0, #62] ; y[4] = t2[3]-t[4]
+ STRH r3, [r0, #78] ; y[5] = t[2]-t2[5]
+ STRH r6, [r0, #94] ; y[6] = t[1]-t2[6]
+ STRH r11, [r0, #110] ; y[7] = t2[0]-t[7]
+ MOV PC,r14
+
+idct4core_down_arm
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; Column-pass variant of idct4core_arm: adds the rounding bias of 8 and
+ ; downshifts the outputs by 4. Clobbers r3-r12.
+ LDRSH r9, [r1], #16 ; r9 = x[0]
+ LDR r10,OC_C4S4 ; r10= OC_C4S4
+ LDRSH r12,[r1, #-12] ; r12= x[2]
+ LDR r4, OC_C6S2 ; r4 = OC_C6S2
+ MUL r9, r10,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
+ LDR r5, OC_C2S6 ; r5 = OC_C2S6
+ MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2]
+ LDRSH r3, [r1, #-14] ; r3 = x[1]
+ MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2]
+ LDR r6, OC_C7S1 ; r6 = OC_C7S1
+ LDR r12,OC_C1S7 ; r12= OC_C1S7
+ LDRSH r11,[r1, #-10] ; r11= x[3]
+ MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1]
+ LDR r7, OC_C5S3 ; r7 = OC_C5S3
+ MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1]
+ LDR r8, OC_C3S5 ; r8 = OC_C3S5
+ MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3]
+ MOV r9, r9, ASR #16 ; r9 = t[0]
+ MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3]
+ MOV r6, r6, ASR #16 ; r6 = t[4]
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+ SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+ RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5]
+ MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+ MOV r3, r3, ASR #16 ; r3 = t[7]
+ ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6]
+ RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6]
+ ADD r9, r9, #8 ; r9 = t[0]+8
+ MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+ ADD r4, r9, r4, ASR #16 ; r4 = t[1] = t[0] + t[2] + 8
+ RSB r10,r4, r9, LSL #1 ; r10= t[2] = t[0] - t[2] + 8
+ ADD r5, r9, r5, ASR #16 ; r5 = t[0] = t[0] + t[3] + 8
+ RSB r9, r5, r9, LSL #1 ; r9 = t[3] = t[0] - t[3] + 8
+ MOV r3, r3, ASR #16 ; r3 = t2[6]
+ ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5]
+ RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5]
+ ADD r5, r5, r11 ; r5 = t[0]+t2[7]+8
+ ADD r4, r4, r6 ; r4 = t[1]+t3[6]+8
+ ADD r10,r10,r3 ; r10= t[2]+t3[5]+8
+ ADD r9, r9, r7 ; r9 = t[3]+t2[4]+8
+ SUB r11,r5, r11,LSL #1 ; r11= t[0]-t2[7]+8
+ SUB r6, r4, r6, LSL #1 ; r6 = t[1]-t3[6]+8
+ SUB r3, r10,r3, LSL #1 ; r3 = t[2]-t3[5]+8
+ SUB r7, r9, r7, LSL #1 ; r7 = t[3]-t2[4]+8
+ ; TODO: This is wrong.
+ ; The C code truncates to 16 bits by storing to RAM and doing the
+ ; shifts later; we've got an extra 4 bits here.
+ MOV r11,r11,ASR #4
+ MOV r6, r6, ASR #4
+ MOV r3, r3, ASR #4
+ MOV r7, r7, ASR #4
+ MOV r9, r9, ASR #4
+ MOV r10,r10,ASR #4
+ MOV r4, r4, ASR #4
+ MOV r5, r5, ASR #4
+ STRH r5,[r0], #2 ; y[0] = t[0]+t[7]
+ STRH r4, [r0, #14] ; y[1] = t[1]+t2[6]
+ STRH r10,[r0, #30] ; y[2] = t[2]+t2[5]
+ STRH r9, [r0, #46] ; y[3] = t2[3]+t[4]
+ STRH r7, [r0, #62] ; y[4] = t2[3]-t[4]
+ STRH r3, [r0, #78] ; y[5] = t[2]-t2[5]
+ STRH r6, [r0, #94] ; y[6] = t[1]-t2[6]
+ STRH r11,[r0, #110] ; y[7] = t2[0]-t[7]
+ MOV PC,r14
+
+idct8core_arm
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; Full 8-coefficient row transform, no rounding or downshift. The
+ ; advanced source pointer is saved across the body (r1 is reused as a
+ ; scratch register) and restored on exit, so consecutive BLs walk
+ ; successive rows. Clobbers r2-r12.
+ LDRSH r2, [r1],#16 ; r2 = x[0]
+ STMFD r13!,{r1,r14} ; Save the advanced source pointer and LR.
+ LDRSH r6, [r1, #-8] ; r6 = x[4]
+ LDR r12,OC_C4S4 ; r12= C4S4
+ LDRSH r4, [r1, #-12] ; r4 = x[2]
+ ADD r2, r2, r6 ; r2 = x[0] + x[4]
+ SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4]
+ ; For spec compliance, these sums must be truncated to 16-bit precision
+ ; _before_ the multiply (not after).
+ ; Sadly, ARMv4 provides no simple way to do that.
+ MOV r2, r2, LSL #16
+ MOV r6, r6, LSL #16
+ MOV r2, r2, ASR #16
+ MOV r6, r6, ASR #16
+ MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+ LDRSH r8, [r1, #-4] ; r8 = x[6]
+ LDR r7, OC_C6S2 ; r7 = OC_C6S2
+ MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+ LDR r14,OC_C2S6 ; r14= OC_C2S6
+ MUL r3, r4, r7 ; r3 = OC_C6S2*x[2]
+ LDR r5, OC_C7S1 ; r5 = OC_C7S1
+ MUL r4, r14,r4 ; r4 = OC_C2S6*x[2]
+ MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16
+ MUL r14,r8, r14 ; r14= OC_C2S6*x[6]
+ MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16
+ MUL r8, r7, r8 ; r8 = OC_C6S2*x[6]
+ LDR r7, OC_C1S7 ; r7 = OC_C1S7
+ SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+ LDRSH r14,[r1, #-14] ; r14= x[1]
+ ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+ LDRSH r8, [r1, #-2] ; r8 = x[7]
+ MUL r9, r5, r14 ; r9 = OC_C7S1*x[1]
+ LDRSH r10,[r1, #-6] ; r10= x[5]
+ MUL r14,r7, r14 ; r14= OC_C1S7*x[1]
+ MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16
+ MUL r7, r8, r7 ; r7 = OC_C1S7*x[7]
+ MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16
+ MUL r8, r5, r8 ; r8 = OC_C7S1*x[7]
+ LDRSH r1, [r1, #-10] ; r1 = x[3] (r1 is now scratch).
+ LDR r5, OC_C3S5 ; r5 = OC_C3S5
+ LDR r11,OC_C5S3 ; r11= OC_C5S3
+ ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+ MUL r14,r5, r10 ; r14= OC_C3S5*x[5]
+ SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+ MUL r10,r11,r10 ; r10= OC_C5S3*x[5]
+ MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16
+ MUL r11,r1, r11 ; r11= OC_C5S3*x[3]
+ MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16
+ MUL r1, r5, r1 ; r1 = OC_C3S5*x[3]
+ SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+ ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+ ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+ ; r10=t[6] r12=C4S4 r14=t[5]
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+ ; Stage 2
+ ; 4-5 butterfly
+ ADD r9, r9, r14 ; r9 = t2[4] = t[4]+t[5]
+ SUB r14,r9, r14, LSL #1 ; r14= t[4]-t[5]
+ MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+ ; 7-6 butterfly
+ ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6]
+ SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6]
+ MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+ ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+ ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+ ; Stage 3
+ ; 0-3 butterfly
+ ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3]
+ SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3]
+ ; 1-2 butterfly
+ ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2]
+ SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2]
+ ; 6-5 butterfly
+ MOV r14,r14,ASR #16 ; r14= t2[5]
+ ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5]
+ SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5]
+ ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+ ; r10=t3[6] r14=t3[5]
+ ; Stage 4
+ ADD r2, r2, r8 ; r2 = t[0] + t[7]
+ ADD r6, r6, r10 ; r6 = t[1] + t[6]
+ ADD r3, r3, r14 ; r3 = t[2] + t[5]
+ ADD r4, r4, r9 ; r4 = t[3] + t[4]
+ SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7]
+ SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6]
+ SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5]
+ SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4]
+ STRH r2, [r0], #2 ; y[0] = t[0]+t[7]
+ STRH r6, [r0, #14] ; y[1] = t[1]+t[6]
+ STRH r3, [r0, #30] ; y[2] = t[2]+t[5]
+ STRH r4, [r0, #46] ; y[3] = t[3]+t[4]
+ STRH r9, [r0, #62] ; y[4] = t[3]-t[4]
+ STRH r14,[r0, #78] ; y[5] = t[2]-t[5]
+ STRH r10,[r0, #94] ; y[6] = t[1]-t[6]
+ STRH r8, [r0, #110] ; y[7] = t[0]-t[7]
+ LDMFD r13!,{r1,PC} ; Restore the advanced source pointer.
+
+idct8core_down_arm
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; Column-pass variant of idct8core_arm: adds the rounding bias of 8
+ ; (folded into t[0] and t[1] before Stage 3) and downshifts the outputs
+ ; by 4. Clobbers r2-r12; r1 is saved/restored advanced by one row.
+ LDRSH r2, [r1],#16 ; r2 = x[0]
+ STMFD r13!,{r1,r14} ; Save the advanced source pointer and LR.
+ LDRSH r6, [r1, #-8] ; r6 = x[4]
+ LDR r12,OC_C4S4 ; r12= C4S4
+ LDRSH r4, [r1, #-12] ; r4 = x[2]
+ ADD r2, r2, r6 ; r2 = x[0] + x[4]
+ SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4]
+ ; For spec compliance, these sums must be truncated to 16-bit precision
+ ; _before_ the multiply (not after).
+ ; Sadly, ARMv4 provides no simple way to do that.
+ MOV r2, r2, LSL #16
+ MOV r6, r6, LSL #16
+ MOV r2, r2, ASR #16
+ MOV r6, r6, ASR #16
+ MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+ LDRSH r8, [r1, #-4] ; r8 = x[6]
+ LDR r7, OC_C6S2 ; r7 = OC_C6S2
+ MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+ LDR r14,OC_C2S6 ; r14= OC_C2S6
+ MUL r3, r4, r7 ; r3 = OC_C6S2*x[2]
+ LDR r5, OC_C7S1 ; r5 = OC_C7S1
+ MUL r4, r14,r4 ; r4 = OC_C2S6*x[2]
+ MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16
+ MUL r14,r8, r14 ; r14= OC_C2S6*x[6]
+ MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16
+ MUL r8, r7, r8 ; r8 = OC_C6S2*x[6]
+ LDR r7, OC_C1S7 ; r7 = OC_C1S7
+ SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+ LDRSH r14,[r1, #-14] ; r14= x[1]
+ ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+ LDRSH r8, [r1, #-2] ; r8 = x[7]
+ MUL r9, r5, r14 ; r9 = OC_C7S1*x[1]
+ LDRSH r10,[r1, #-6] ; r10= x[5]
+ MUL r14,r7, r14 ; r14= OC_C1S7*x[1]
+ MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16
+ MUL r7, r8, r7 ; r7 = OC_C1S7*x[7]
+ MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16
+ MUL r8, r5, r8 ; r8 = OC_C7S1*x[7]
+ LDRSH r1, [r1, #-10] ; r1 = x[3] (r1 is now scratch).
+ LDR r5, OC_C3S5 ; r5 = OC_C3S5
+ LDR r11,OC_C5S3 ; r11= OC_C5S3
+ ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+ MUL r14,r5, r10 ; r14= OC_C3S5*x[5]
+ SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+ MUL r10,r11,r10 ; r10= OC_C5S3*x[5]
+ MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16
+ MUL r11,r1, r11 ; r11= OC_C5S3*x[3]
+ MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16
+ MUL r1, r5, r1 ; r1 = OC_C3S5*x[3]
+ SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+ ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+ ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+ ; r10=t[6] r12=C4S4 r14=t[5]
+ ; Stage 2
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+ ; 4-5 butterfly
+ ADD r9, r9, r14 ; r9 = t2[4] = t[4]+t[5]
+ SUB r14,r9, r14, LSL #1 ; r14= t[4]-t[5]
+ MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+ ; 7-6 butterfly
+ ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6]
+ SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6]
+ MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+ ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+ ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+ ; Stage 3
+ ADD r2, r2, #8<<16 ; r2 = t[0]+8<<16
+ ADD r6, r6, #8<<16 ; r6 = t[1]+8<<16
+ ; 0-3 butterfly
+ ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3] + 8
+ SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3] + 8
+ ; 1-2 butterfly
+ ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2] + 8
+ SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2] + 8
+ ; 6-5 butterfly
+ MOV r14,r14,ASR #16 ; r14= t2[5]
+ ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5]
+ SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5]
+ ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+ ; r10=t3[6] r14=t3[5]
+ ; Stage 4
+ ADD r2, r2, r8 ; r2 = t[0] + t[7] + 8
+ ADD r6, r6, r10 ; r6 = t[1] + t[6] + 8
+ ADD r3, r3, r14 ; r3 = t[2] + t[5] + 8
+ ADD r4, r4, r9 ; r4 = t[3] + t[4] + 8
+ SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7] + 8
+ SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6] + 8
+ SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5] + 8
+ SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4] + 8
+ ; TODO: This is wrong.
+ ; The C code truncates to 16 bits by storing to RAM and doing the
+ ; shifts later; we've got an extra 4 bits here.
+ MOV r2, r2, ASR #4
+ MOV r6, r6, ASR #4
+ MOV r3, r3, ASR #4
+ MOV r4, r4, ASR #4
+ MOV r8, r8, ASR #4
+ MOV r10,r10,ASR #4
+ MOV r14,r14,ASR #4
+ MOV r9, r9, ASR #4
+ STRH r2, [r0], #2 ; y[0] = t[0]+t[7]
+ STRH r6, [r0, #14] ; y[1] = t[1]+t[6]
+ STRH r3, [r0, #30] ; y[2] = t[2]+t[5]
+ STRH r4, [r0, #46] ; y[3] = t[3]+t[4]
+ STRH r9, [r0, #62] ; y[4] = t[3]-t[4]
+ STRH r14,[r0, #78] ; y[5] = t[2]-t[5]
+ STRH r10,[r0, #94] ; y[6] = t[1]-t[6]
+ STRH r8, [r0, #110] ; y[7] = t[0]-t[7]
+ LDMFD r13!,{r1,PC} ; Restore the advanced source pointer.
+
+ [ OC_ARM_ASM_MEDIA
+ EXPORT oc_idct8x8_v6
+
+oc_idct8x8_v6
+ ; r0 = ogg_int16_t *_y
+ ; r1 = ogg_int16_t *_x
+ ; r2 = int _last_zzi
+ ; ARMv6 (media instruction) dispatcher, mirroring oc_idct8x8_arm.
+ ; Note: the 6-coefficient special case is disabled here; 4..10
+ ; coefficients all take the 10-coefficient path.
+ CMP r2, #3
+ BLE oc_idct8x8_3_v6
+ ;CMP r2, #6
+ ;BLE oc_idct8x8_6_v6
+ CMP r2, #10
+ BLE oc_idct8x8_10_v6
+oc_idct8x8_slow_v6
+ ; Full 8x8 iDCT using ARMv6 media instructions; each core call handles
+ ; two rows (or two columns) at once, so only four calls per pass.
+ STMFD r13!,{r4-r11,r14}
+ SUB r13,r13,#64*2 ; Temp storage for the transposed rows.
+; Row transforms
+ STR r0, [r13,#-4]!
+ ADD r0, r13, #4 ; Write to temp storage.
+ BL idct8_8core_v6
+ BL idct8_8core_v6
+ BL idct8_8core_v6
+ BL idct8_8core_v6
+ LDR r0, [r13], #4 ; Write to the final destination.
+ ; Clear input data for next block (decoder only).
+ SUB r2, r1, #8*16 ; r2 = original _x (r1 was advanced by the rows).
+ CMP r0, r2 ; In-place transform? Then do not clear.
+ MOV r1, r13 ; And read from temp storage.
+ BEQ oc_idct8x8_slow_v6_cols
+ MOV r4, #0
+ MOV r5, #0
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+oc_idct8x8_slow_v6_cols
+; Column transforms
+ BL idct8_8core_down_v6
+ BL idct8_8core_down_v6
+ BL idct8_8core_down_v6
+ BL idct8_8core_down_v6
+ ADD r13,r13,#64*2
+ LDMFD r13!,{r4-r11,PC}
+
+; oc_idct8x8_10_v6: IDCT specialized for _last_zzi <= 10 (only the upper-left
+; 4x4 corner of the input can be non-zero). Two row passes (4-point then
+; 2-point) feed four 4-point column passes.
+oc_idct8x8_10_v6
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2+4
+; Row transforms
+	MOV	r2, r13
+	STR	r0, [r13,#-4]!
+	AND	r0, r2, #4		; Align the stack.
+	ADD	r0, r0, r2		; Write to temp storage.
+	BL	idct4_3core_v6
+	BL	idct2_1core_v6
+	LDR	r0, [r13], #4		; Write to the final destination.
+	; Clear input data for next block (decoder only).
+	SUB	r2, r1, #4*16
+	CMP	r0, r2			; Skip the clear if the IDCT is in-place.
+	AND	r1, r13,#4		; Align the stack.
+	BEQ	oc_idct8x8_10_v6_cols
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r2]		; Zero the 4x4 corner actually used.
+	STRD	r4, [r2,#16]
+	STR	r4, [r2,#32]
+	STR	r4, [r2,#48]
+oc_idct8x8_10_v6_cols
+; Column transforms
+	ADD	r1, r1, r13		; And read from temp storage.
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r11,PC}
+
+; oc_idct8x8_3_v6: IDCT specialized for _last_zzi <= 3 (only the upper-left
+; 2x2 corner of the input can be non-zero). One 2-point row pass feeds four
+; 2-point column passes.
+oc_idct8x8_3_v6
+	STMFD	r13!,{r4-r8,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	MOV	r8, r0
+	MOV	r0, r13			; Write to temp storage.
+	BL	idct2_1core_v6
+	; Clear input data for next block (decoder only).
+	SUB	r0, r1, #2*16
+	CMP	r0, r8			; Skip the clear if the IDCT is in-place.
+	MOV	r1, r13			; Read from temp storage.
+	MOVNE	r4, #0
+	STRNE	r4, [r0]		; Zero the 2x2 corner actually used.
+	STRNE	r4, [r0,#16]
+	MOVNE	r0, r8			; Write to the final destination.
+; Column transforms
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r8,PC}
+
+; idct2_1core_v6: 8-point IDCT of two rows where row 0 has two coefficients
+; and row 1 has only a DC term; no rounding (row pass).
+idct2_1core_v6
+	; r0 = ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDR	r2, [r1], #16		; r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	LDRSH	r6, [r1], #16		; r6 = x[1,0]
+	SMULWB	r12,r3, r2		; r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	LDRD	r4, OC_C7S1		; r4 = OC_C7S1; r5 = OC_C1S7
+	SMULWB	r6, r3, r6		; r6 = t[1,0]=OC_C4S4*x[1,0]>>16
+	SMULWT	r4, r4, r2		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+	SMULWT	r7, r5, r2		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+; Stage 2:
+	SMULWB	r5, r3, r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r12,r12,r6, LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWB	r6, r3, r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r7, r7, r3		; r7 = <0|t[0,7]> (OC_C4S4's high half is 0)
+; Stage 3:
+	PKHBT	r5, r6, r5, LSL #16	; r5 = <t[0,5]|t[0,6]>
+	PKHBT	r4, r4, r3		; r4 = <0|t[0,4]>
+	SASX	r5, r5, r5		; r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]>
+; Stage 4:
+	PKHTB	r6, r3, r5, ASR #16	; r6 = <0|t[0,6]>
+	PKHBT	r5, r5, r3		; r5 = <0|t[0,5]>
+	SADD16	r3, r12,r7		; r3 = t[0]+t[7]
+	STR	r3, [r0], #4		; y[0<<3] = t[0]+t[7]
+	SADD16	r3, r12,r6		; r3 = t[0]+t[6]
+	STR	r3, [r0, #12]		; y[1<<3] = t[0]+t[6]
+	SADD16	r3, r12,r5		; r3 = t[0]+t[5]
+	STR	r3, [r0, #28]		; y[2<<3] = t[0]+t[5]
+	SADD16	r3, r12,r4		; r3 = t[0]+t[4]
+	STR	r3, [r0, #44]		; y[3<<3] = t[0]+t[4]
+	SSUB16	r4, r12,r4		; r4 = t[0]-t[4]
+	STR	r4, [r0, #60]		; y[4<<3] = t[0]-t[4]
+	SSUB16	r5, r12,r5		; r5 = t[0]-t[5]
+	STR	r5, [r0, #76]		; y[5<<3] = t[0]-t[5]
+	SSUB16	r6, r12,r6		; r6 = t[0]-t[6]
+	STR	r6, [r0, #92]		; y[6<<3] = t[0]-t[6]
+	SSUB16	r7, r12,r7		; r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		; y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+ ]
+
+; 16.16 fixed-point DCT cosine constants: OC_CnSm = 65536*cos(n*pi/16)
+; (= 65536*sin(m*pi/16)). Aligned so LDRD can fetch adjacent pairs.
+	ALIGN 8
+OC_C7S1
+	DCD	12785 ; 31F1
+OC_C1S7
+	DCD	64277 ; FB15
+OC_C6S2
+	DCD	25080 ; 61F8
+OC_C2S6
+	DCD	60547 ; EC83
+OC_C5S3
+	DCD	36410 ; 8E3A
+OC_C3S5
+	DCD	54491 ; D4DB
+OC_C4S4
+	DCD	46341 ; B505
+
+ [ OC_ARM_ASM_MEDIA
+; idct2_2core_down_v6: 8-point IDCT of two columns with two input
+; coefficients each, including the final +8 bias and >>4 scaling (column
+; pass). Two 16-bit columns are processed in parallel per 32-bit register.
+idct2_2core_down_v6
+	; r0 = ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDR	r2, [r1], #16		; r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	MOV	r7 ,#8			; r7  = 8 (rounding bias)
+	LDR	r6, [r1], #16		; r6 = <x[1,1]|x[1,0]>
+	SMLAWB	r12,r3, r2, r7		; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8
+	LDRD	r4, OC_C7S1		; r4 = OC_C7S1; r5 = OC_C1S7
+	SMLAWB	r7, r3, r6, r7		; r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8
+	SMULWT	r5, r5, r2		; r5 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r12,r12,r7, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT	r4, r4, r2		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+; Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition.
+	PKHBT	r7, r5, r5, LSL #16	; r7 = <t[0,7]|t[0,7]>
+; Stage 2:
+	SMULWB	r6, r3, r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r4, LSL #16	; r4 = <t[0,4]|t[0,4]>
+	SMULWT	r2, r3, r7		; r2 = t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r3, r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r2, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r2, r3, r4		; r2 = t[1,5]=OC_C4S4*t[1,4]>>16
+	PKHBT	r2, r5, r2, LSL #16	; r2 = <t[1,5]|t[0,5]>
+; Stage 3:
+	SSUB16	r5, r6, r2		; r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]>
+	SADD16	r6, r6, r2		; r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]>
+; Stage 4:
+	SADD16	r2, r12,r7		; r2 = t[0]+t[7]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[7]+8>>4
+	STR	r3, [r0], #4		; y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r2, r12,r6		; r2 = t[0]+t[6]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[6]+8>>4
+	STR	r3, [r0, #12]		; y[1<<3] = t[0]+t[6]+8>>4
+	SADD16	r2, r12,r5		; r2 = t[0]+t[5]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[5]+8>>4
+	STR	r3, [r0, #28]		; y[2<<3] = t[0]+t[5]+8>>4
+	SADD16	r2, r12,r4		; r2 = t[0]+t[4]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[4]+8>>4
+	STR	r3, [r0, #44]		; y[3<<3] = t[0]+t[4]+8>>4
+	SSUB16	r4, r12,r4		; r4 = t[0]-t[4]+8
+	MOV	r3, r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r3, r3, r4, ASR #20	; r3 = t[0]-t[4]+8>>4
+	STR	r3, [r0, #60]		; y[4<<3] = t[0]-t[4]+8>>4
+	SSUB16	r5, r12,r5		; r5 = t[0]-t[5]+8
+	MOV	r3, r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r3, r3, r5, ASR #20	; r3 = t[0]-t[5]+8>>4
+	STR	r3, [r0, #76]		; y[5<<3] = t[0]-t[5]+8>>4
+	SSUB16	r6, r12,r6		; r6 = t[0]-t[6]+8
+	MOV	r3, r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r3, r3, r6, ASR #20	; r3 = t[0]-t[6]+8>>4
+	STR	r3, [r0, #92]		; y[6<<3] = t[0]-t[6]+8>>4
+	SSUB16	r7, r12,r7		; r7 = t[0]-t[7]+8
+	MOV	r3, r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r3, r3, r7, ASR #20	; r3 = t[0]-t[7]+8>>4
+	STR	r3, [r0, #108]		; y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+
+; In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
+; pay for increased branch mis-prediction to get here, but in practice it
+; doesn't seem to slow anything down to take it out, and it's less code this
+; way.
+ [ 0
+; oc_idct8x8_6_v6: (disabled) IDCT specialized for _last_zzi <= 6 (only the
+; upper-left 3x3 corner of the input can be non-zero).
+oc_idct8x8_6_v6
+	STMFD	r13!,{r4-r8,r10,r11,r14}
+	SUB	r13,r13,#64*2+4
+; Row transforms
+	MOV	r8, r0
+	AND	r0, r13,#4		; Align the stack.
+	ADD	r0, r0, r13		; Write to temp storage.
+	BL	idct3_2core_v6
+	BL	idct1core_v6
+	; Clear input data for next block (decoder only).
+	SUB	r0, r1, #3*16
+	CMP	r0, r8			; Skip the clear if the IDCT is in-place.
+	AND	r1, r13,#4		; Align the stack.
+	BEQ	oc_idct8x8_6_v6_cols
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r0]		; Zero the 3x3 corner actually used.
+	STR	r4, [r0,#16]
+	STR	r4, [r0,#32]
+	MOV	r0, r8			; Write to the final destination.
+oc_idct8x8_6_v6_cols
+; Column transforms
+	ADD	r1, r1, r13		; And read from temp storage.
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r8,r10,r11,PC}
+
+; idct1core_v6: 8-point IDCT of one row with a single (DC) coefficient:
+; every output is OC_C4S4*x[0]>>16 (0xB505 built inline instead of loaded).
+idct1core_v6
+	; r0 = ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r3, [r1], #16
+	MOV	r12,#0x05
+	ORR	r12,r12,#0xB500		; r12= OC_C4S4 = 0xB505
+	MUL	r3, r12, r3
+	; Stall ?
+	MOV	r3, r3, ASR #16
+	; Don't need to actually store the odd lines; they won't be read.
+	STRH	r3, [r0], #2
+	STRH	r3, [r0, #30]
+	STRH	r3, [r0, #62]
+	STRH	r3, [r0, #94]
+	MOV	PC,R14
+
+; idct3_2core_v6: 8-point IDCT of two rows where row 0 has three coefficients
+; and row 1 has only a DC term; no rounding (row pass). Tail-calls into the
+; shared stage 3/4 of idct4_3core_v6.
+idct3_2core_v6
+	; r0 = ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r4, [r1], #16		; r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]>
+	LDRD	r10,OC_C6S2_3_v6	; r10= OC_C6S2; r11= OC_C2S6
+	; Stall
+	SMULWB	r3, r11,r5		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r2, r10,r5		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r5, [r1], #16		; r5 = <x[1,1]|x[1,0]>
+	SMULWB	r12,r11,r4		; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)
+	LDRD	r6, OC_C7S1_3_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	SMULWB	r10,r11,r5		; r10= (t[1,0]=OC_C4S4*x[1,0]>>16)
+	PKHBT	r12,r12,r10,LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWT	r10,r7, r5		; r10= t[1,7]=OC_C1S7*x[1,1]>>16
+	PKHBT	r2, r2, r11		; r2 = <0|t[0,2]>
+	SMULWT	r7, r7, r4		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r3, r3, r11		; r3 = <0|t[0,3]>
+	SMULWT	r5, r6, r5		; r5 = t[1,4]=OC_C7S1*x[1,1]>>16
+	PKHBT	r7, r7, r10,LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+; Stage 2:
+	SMULWB	r6, r11,r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r5, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		; r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		; r10= t[1,5]=OC_C4S4*t[1,4]>>16
+; Stage 3:
+	B	idct4_3core_stage3_v6
+
+; Another copy so the LDRD offsets are less than +/- 255.
+	ALIGN 8
+OC_C7S1_3_v6
+	DCD	12785 ; 31F1
+OC_C1S7_3_v6
+	DCD	64277 ; FB15
+OC_C6S2_3_v6
+	DCD	25080 ; 61F8
+OC_C2S6_3_v6
+	DCD	60547 ; EC83
+
+; idct3_3core_down_v6: 8-point IDCT of two columns with three input
+; coefficients each, with the +8 rounding bias folded in; tail-calls into the
+; shared stage 3/4 of idct4_4core_down_v6 (which applies the >>4).
+idct3_3core_down_v6
+	; r0 = ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r10,[r1], #16		; r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]>
+	LDRD	r6, OC_C6S2_3_v6	; r6 = OC_C6S2; r7 = OC_C2S6
+	LDR	r4, [r1], #16		; r4 = <x[1,1]|x[1,0]>
+	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	MOV	r7,#8			; r7 = 8 (rounding bias)
+	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMLAWB	r12,r11,r10,r7		; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+; Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition.
+	PKHBT	r3, r3, r3, LSL #16	; r3 = <t[0,3]|t[0,3]>
+	SMLAWB	r5, r11,r4, r7		; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	PKHBT	r2, r2, r2, LSL #16	; r2 = <t[0,2]|t[0,2]>
+	LDRD	r6, OC_C7S1_3_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT	r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT	r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+	SMULWB	r6, r11,r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		; r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		; r10= t[1,5]=OC_C4S4*t[1,4]>>16
+; Stage 3:
+	B	idct4_4core_down_stage3_v6
+ ]
+
+; idct4_3core_v6: 8-point IDCT of two rows where row 0 has four coefficients
+; and row 1 has three; no rounding (row pass). The stage 3/4 labels are shared
+; with idct3_2core_v6 and idct8_8core_v6.
+idct4_3core_v6
+	; r0 = ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r10,[r1], #16		; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, OC_C5S3_4_v6	; r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, [r1], #16		; r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]>
+	SMULWT	r9, r3, r11		; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	SMULWT	r8, r2, r11		; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+	PKHBT	r9, r9, r2		; r9 = <0|t[0,6]>
+	LDRD	r6, OC_C6S2_4_v6	; r6 = OC_C6S2; r7 = OC_C2S6
+	PKHBT	r8, r8, r2		; r8 = <0|-t[0,5]>
+	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		; r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	SMULWB	r5, r6, r5		; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r10		; r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	PKHBT	r2, r2, r5, LSL #16	; r2 = <t[1,2]|t[0,2]>
+	SMULWB	r5, r11,r4		; r5 = t[1,0]=OC_C4S4*x[1,0]>>16
+	LDRD	r6, OC_C7S1_4_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWT	r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT	r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+	SSUB16	r6, r7, r9		; r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		; r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		; r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		; r5 = t[4]-t[5] (r8 holds -t[5])
+	SMULWB	r6, r11,r6		; r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		; r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		; r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		; r5 = t[0,5]=OC_C4S4*r5B>>16
+; Stage 3:
+idct4_3core_stage3_v6
+	SADD16	r11,r12,r2		; r11= t[1]=t[0]+t[2]
+	PKHBT	r10,r5, r10,LSL #16	; r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		; r2 = t[2]=t[0]-t[2]
+idct4_3core_stage3_5_v6
+	SSUB16	r5, r6, r10		; r5 = t[5]'=t[6]-t[5]
+	SADD16	r6, r6, r10		; r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		; r10= t[0]'=t[0]+t[3]
+	SSUB16	r3, r12,r3		; r3 = t[3]=t[0]-t[3]
+; Stage 4:
+	SADD16	r12,r10,r7		; r12= t[0]+t[7]
+	STR	r12,[r0], #4		; y[0<<3] = t[0]+t[7]
+	SADD16	r12,r11,r6		; r12= t[1]+t[6]
+	STR	r12,[r0, #12]		; y[1<<3] = t[1]+t[6]
+	SADD16	r12,r2, r5		; r12= t[2]+t[5]
+	STR	r12,[r0, #28]		; y[2<<3] = t[2]+t[5]
+	SADD16	r12,r3, r4		; r12= t[3]+t[4]
+	STR	r12,[r0, #44]		; y[3<<3] = t[3]+t[4]
+	SSUB16	r4, r3, r4		; r4 = t[3]-t[4]
+	STR	r4, [r0, #60]		; y[4<<3] = t[3]-t[4]
+	SSUB16	r5, r2, r5		; r5 = t[2]-t[5]
+	STR	r5, [r0, #76]		; y[5<<3] = t[2]-t[5]
+	SSUB16	r6, r11,r6		; r6 = t[1]-t[6]
+	STR	r6, [r0, #92]		; y[6<<3] = t[1]-t[6]
+	SSUB16	r7, r10,r7		; r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		; y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+
+; Another copy so the LDRD offsets are less than +/- 255.
+	ALIGN 8
+OC_C7S1_4_v6
+	DCD	12785 ; 31F1
+OC_C1S7_4_v6
+	DCD	64277 ; FB15
+OC_C6S2_4_v6
+	DCD	25080 ; 61F8
+OC_C2S6_4_v6
+	DCD	60547 ; EC83
+OC_C5S3_4_v6
+	DCD	36410 ; 8E3A
+OC_C3S5_4_v6
+	DCD	54491 ; D4DB
+
+; idct4_4core_down_v6: 8-point IDCT of two columns with four input
+; coefficients each, with the +8 rounding bias folded in; tail-calls into the
+; shared stage 3/4 of idct8_8core_down_v6 (which applies the >>4).
+idct4_4core_down_v6
+	; r0 = ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r10,[r1], #16		; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, OC_C5S3_4_v6	; r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, [r1], #16		; r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]>
+	SMULWT	r9, r3, r11		; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	LDRD	r6, OC_C6S2_4_v6	; r6 = OC_C6S2; r7 = OC_C2S6
+	SMULWT	r8, r2, r11		; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+; Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition.
+	PKHBT	r9, r9, r9, LSL #16	; r9 = <t[0,6]|t[0,6]>
+	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	PKHBT	r8, r8, r8, LSL #16	; r8 = <-t[0,5]|-t[0,5]>
+	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		; r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	MOV	r7,#8			; r7 = 8 (rounding bias)
+	SMULWB	r5, r6, r5		; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMLAWB	r12,r11,r10,r7		; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+	PKHBT	r2, r2, r5, LSL #16	; r2 = <t[1,2]|t[0,2]>
+	SMLAWB	r5, r11,r4 ,r7		; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	LDRD	r6, OC_C7S1_4_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT	r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT	r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+	SSUB16	r6, r7, r9		; r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		; r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		; r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		; r5 = t[4]-t[5] (r8 holds -t[5])
+	SMULWB	r6, r11,r6		; r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		; r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		; r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		; r5 = t[0,5]=OC_C4S4*r5B>>16
+; Stage 3:
+idct4_4core_down_stage3_v6
+	SADD16	r11,r12,r2		; r11= t[1]+8=t[0]+t[2]+8
+	PKHBT	r10,r5, r10,LSL #16	; r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		; r2 = t[2]+8=t[0]-t[2]+8
+	B	idct8_8core_down_stage3_5_v6
+
+; idct8_8core_v6: full 8-point IDCT of two rows with all eight coefficients;
+; no rounding (row pass). Tail-calls into the shared stage 3.5/4 of
+; idct4_3core_v6.
+idct8_8core_v6
+	STMFD	r13!,{r0,r14}
+; Stage 1:
+	;5-6 rotation by 3pi/16
+	LDRD	r10,OC_C5S3_4_v6	; r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		; r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		; r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		; r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		; r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		; r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		; r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		; r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		; r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	; r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	; r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		; r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		; r8 = OC_C5S3*x[1,3]>>16
+	;2-3 rotation by 6pi/16
+	LDRD	r10,OC_C6S2_4_v6	; r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	; r3 = <r8|r3>
+	LDR	r8, [r1,#12]		; r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		; r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		; r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		; r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		; r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		; r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		; r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	; r2 = <r2|r9>
+	SMLAWB	r3, r11,r0, r3		; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		; r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r7		; r12= OC_C2S6*x[1,6]>>16
+	;4-7 rotation by 7pi/16
+	LDRD	r10,OC_C7S1_8_v6	; r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <r9|r12>
+	LDR	r0, [r1],#16		; r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	; r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		; r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		; r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		; r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		; r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		; r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		; r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	; r8 = <r10|r8>
+	SMLAWT	r12,r11,r14,r12		; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	; r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		; r10= OC_C1S7*x[0,7]>>16
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		; r12= OC_C1S7*x[1,7]>>16
+	;0-1 butterfly
+	LDR	r11,OC_C4S4
+	PKHBT	r10,r10,r12,LSL #16	; r10= <r12|r10>
+	SADD16	r7, r0, r4		; r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		; r10= <t[1,4]|t[0,4]>
+	SSUB16	r4, r0, r4		; r4 = x[0]-x[4]
+	SMULWB	r8, r11,r7		; r8 = t[0,0]=OC_C4S4*r7B>>16
+	SMULWT	r12,r11,r7		; r12= t[1,0]=OC_C4S4*r7T>>16
+	SMULWB	r7, r11,r4		; r7 = t[0,1]=OC_C4S4*r4B>>16
+	PKHBT	r12,r8, r12,LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWT	r8, r11,r4		; r8 = t[1,1]=OC_C4S4*r4T>>16
+; Stage 2:
+	SADD16	r4, r10,r5		; r4 = t[4]'=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	; r8 = <t[1,1]|t[0,1]>
+	SSUB16	r5, r10,r5		; r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		; r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		; r7 = t[7]'=t[7]+t[6]
+	SMULWT	r5, r11,r5		; r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		; r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		; r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	; r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		; r6 = t[1,6]=OC_C4S4*r6T>>16
+; Stage 3:
+	SADD16	r11,r8, r2		; r11= t[1]'=t[1]+t[2]
+	PKHBT	r6, r9, r6, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		; r2 = t[2]=t[1]-t[2]
+	LDMFD	r13!,{r0,r14}
+	B	idct4_3core_stage3_5_v6
+
+; Another copy so the LDRD offsets are less than +/- 255.
+	ALIGN 8
+OC_C7S1_8_v6
+	DCD	12785 ; 31F1
+OC_C1S7_8_v6
+	DCD	64277 ; FB15
+OC_C6S2_8_v6
+	DCD	25080 ; 61F8
+OC_C2S6_8_v6
+	DCD	60547 ; EC83
+OC_C5S3_8_v6
+	DCD	36410 ; 8E3A
+OC_C3S5_8_v6
+	DCD	54491 ; D4DB
+
+; idct8_8core_down_v6: full 8-point IDCT of two columns with all eight
+; coefficients, including the final +8 bias and >>4 scaling (column pass).
+; The stage 3.5/4 label is shared with idct4_4core_down_v6.
+idct8_8core_down_v6
+	STMFD	r13!,{r0,r14}
+; Stage 1:
+	;5-6 rotation by 3pi/16
+	LDRD	r10,OC_C5S3_8_v6	; r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		; r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		; r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		; r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		; r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		; r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		; r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		; r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		; r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	; r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	; r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		; r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		; r8 = OC_C5S3*x[1,3]>>16
+	;2-3 rotation by 6pi/16
+	LDRD	r10,OC_C6S2_8_v6	; r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	; r3 = <r8|r3>
+	LDR	r8, [r1,#12]		; r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		; r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		; r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		; r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		; r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		; r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		; r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	; r2 = <r2|r9>
+	SMLAWB	r3, r11,r0, r3		; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		; r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r7		; r12= OC_C2S6*x[1,6]>>16
+	;4-7 rotation by 7pi/16
+	LDRD	r10,OC_C7S1_8_v6	; r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <r9|r12>
+	LDR	r0, [r1],#16		; r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	; r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		; r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		; r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		; r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		; r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		; r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		; r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	; r8 = <r10|r8>
+	SMLAWT	r12,r11,r14,r12		; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	; r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		; r10= OC_C1S7*x[0,7]>>16
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		; r12= OC_C1S7*x[1,7]>>16
+	;0-1 butterfly
+	LDR	r11,OC_C4S4
+	MOV	r14,#8			; r14= 8 (rounding bias)
+	PKHBT	r10,r10,r12,LSL #16	; r10= <r12|r10>
+	SADD16	r7, r0, r4		; r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		; r10= <t[1,4]|t[0,4]>
+	SMLAWB	r8, r11,r7, r14		; r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8
+	SSUB16	r4, r0, r4		; r4 = x[0]-x[4]
+	SMLAWT	r12,r11,r7, r14		; r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8
+	SMLAWB	r7, r11,r4, r14		; r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8
+	PKHBT	r12,r8, r12,LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMLAWT	r8, r11,r4, r14		; r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8
+; Stage 2:
+	SADD16	r4, r10,r5		; r4 = t[4]'=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	; r8 = <t[1,1]+8|t[0,1]+8>
+	SSUB16	r5, r10,r5		; r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		; r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		; r7 = t[7]'=t[7]+t[6]
+	SMULWT	r5, r11,r5		; r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		; r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		; r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	; r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		; r6 = t[1,6]=OC_C4S4*r6T>>16
+; Stage 3:
+	SADD16	r11,r8, r2		; r11= t[1]'+8=t[1]+t[2]+8
+	PKHBT	r6, r9, r6, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		; r2 = t[2]+8=t[1]-t[2]+8
+	LDMFD	r13!,{r0,r14}
+idct8_8core_down_stage3_5_v6
+	SSUB16	r5, r6, r10		; r5 = t[5]'=t[6]-t[5]
+	SADD16	r6, r6, r10		; r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		; r10= t[0]'+8=t[0]+t[3]+8
+	SSUB16	r3, r12,r3		; r3 = t[3]+8=t[0]-t[3]+8
+; Stage 4:
+	SADD16	r12,r10,r7		; r12= t[0]+t[7]+8
+	SSUB16	r7, r10,r7		; r7 = t[0]-t[7]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[0]+t[7]+8>>4
+	STR	r10,[r0], #4		; y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r12,r11,r6		; r12= t[1]+t[6]+8
+	SSUB16	r6, r11,r6		; r6 = t[1]-t[6]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[1]+t[6]+8>>4
+	STR	r10,[r0, #12]		; y[1<<3] = t[1]+t[6]+8>>4
+	SADD16	r12,r2, r5		; r12= t[2]+t[5]+8
+	SSUB16	r5, r2, r5		; r5 = t[2]-t[5]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[2]+t[5]+8>>4
+	STR	r10,[r0, #28]		; y[2<<3] = t[2]+t[5]+8>>4
+	SADD16	r12,r3, r4		; r12= t[3]+t[4]+8
+	SSUB16	r4, r3, r4		; r4 = t[3]-t[4]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[3]+t[4]+8>>4
+	STR	r10,[r0, #44]		; y[3<<3] = t[3]+t[4]+8>>4
+	MOV	r10,r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r10,r10,r4, ASR #20	; r10= t[3]-t[4]+8>>4
+	STR	r10,[r0, #60]		; y[4<<3] = t[3]-t[4]+8>>4
+	MOV	r10,r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r10,r10,r5, ASR #20	; r10= t[2]-t[5]+8>>4
+	STR	r10,[r0, #76]		; y[5<<3] = t[2]-t[5]+8>>4
+	MOV	r10,r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r10,r10,r6, ASR #20	; r10= t[1]-t[6]+8>>4
+	STR	r10,[r0, #92]		; y[6<<3] = t[1]-t[6]+8>>4
+	MOV	r10,r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r10,r10,r7, ASR #20	; r10= t[0]-t[7]+8>>4
+	STR	r10,[r0, #108]		; y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+ ]
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_idct8x8_neon
+
+; 16-bit DCT cosine constants for the NEON path (same values as the DCD
+; tables above), preceded by the rounding bias; loaded as one D-register pair.
+	ALIGN 16
+OC_IDCT_CONSTS_NEON
+	DCW	    8
+	DCW	64277 ; FB15 (C1S7)
+	DCW	60547 ; EC83 (C2S6)
+	DCW	54491 ; D4DB (C3S5)
+	DCW	46341 ; B505 (C4S4)
+	DCW	36410 ; 8E3A (C5S3)
+	DCW	25080 ; 61F8 (C6S2)
+	DCW	12785 ; 31F1 (C7S1)
+
+; oc_idct8x8_neon: 8x8 inverse DCT using NEON; dispatches sparse blocks
+; (_last_zzi <= 10) to oc_idct8x8_10_neon, otherwise does the full transform
+; entirely in registers (rows, transpose, columns).
+; NOTE(review): the "[r2 at 128]" forms below are mailing-list mangling of the
+; "@128" alignment qualifier in the original source.
+oc_idct8x8_neon
+	; r0 = ogg_int16_t *_y
+	; r1 = ogg_int16_t *_x
+	; r2 = int      _last_zzi
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_neon
+oc_idct8x8_slow_neon
+	VPUSH	{D8-D15}		; Save callee-saved VFP/NEON regs.
+	MOV	r2, r1
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	; Row transforms (input is pre-transposed)
+	VLD1.64	{D16,D17,D18,D19}, [r2 at 128]!
+	VLD1.64	{D20,D21,D22,D23}, [r2 at 128]!
+	VLD1.64	{D24,D25,D26,D27}, [r2 at 128]!
+	VSUB.S16	Q1, Q8, Q12	; Q1 = x[0]-x[4]
+	VLD1.64	{D28,D29,D30,D31}, [r2 at 128]
+	VADD.S16	Q8, Q8, Q12	; Q8 = x[0]+x[4]
+	VLD1.64	{D0,D1},           [r3 at 128]
+	MOV	r12, r14
+	BL	oc_idct8x8_stage123_neon
+; Stage 4
+	VSUB.S16	Q15,Q8, Q7	; Q15 = y[7]=t[0]'-t[7]'
+	VADD.S16	Q8, Q8, Q7	; Q8  = y[0]=t[0]'+t[7]'
+	VSUB.S16	Q14,Q9, Q3	; Q14 = y[6]=t[1]'-t[6]''
+	VADD.S16	Q9, Q9, Q3	; Q9  = y[1]=t[1]'+t[6]''
+	VSUB.S16	Q13,Q10,Q5	; Q13 = y[5]=t[2]'-t[5]''
+	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
+	VTRN.16	Q14,Q15
+	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
+	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
+	; 8x8 Transpose
+	VTRN.16	Q8, Q9
+	VTRN.16	Q10,Q11
+	VTRN.16	Q12,Q13
+	VTRN.32	Q8, Q10
+	VTRN.32	Q9, Q11
+	VTRN.32	Q12,Q14
+	VTRN.32	Q13,Q15
+	VSWP	D17,D24
+	VSUB.S16	Q1, Q8, Q12	; Q1 = x[0]-x[4]
+	VSWP	D19,D26
+	VADD.S16	Q8, Q8, Q12	; Q8 = x[0]+x[4]
+	VSWP	D21,D28
+	VSWP	D23,D30
+	; Column transforms
+	BL	oc_idct8x8_stage123_neon
+	CMP	r0,r1			; In-place? Then skip the input clear.
+	; We have to put the return address back in the LR, or the branch
+	; predictor will not recognize the function return and mis-predict the
+	; entire call stack.
+	MOV	r14, r12
+; Stage 4
+	VSUB.S16	Q15,Q8, Q7	; Q15 = y[7]=t[0]'-t[7]'
+	VADD.S16	Q8, Q8, Q7	; Q8  = y[0]=t[0]'+t[7]'
+	VSUB.S16	Q14,Q9, Q3	; Q14 = y[6]=t[1]'-t[6]''
+	VADD.S16	Q9, Q9, Q3	; Q9  = y[1]=t[1]'+t[6]''
+	VSUB.S16	Q13,Q10,Q5	; Q13 = y[5]=t[2]'-t[5]''
+	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
+	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
+	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
+	BEQ	oc_idct8x8_slow_neon_noclear
+	VMOV.I8	Q2,#0
+	VPOP	{D8-D15}
+	VMOV.I8	Q3,#0
+	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
+	VST1.64	{D4, D5, D6, D7}, [r1 at 128]!	; Clear input for next block.
+	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VST1.64	{D4, D5, D6, D7}, [r1 at 128]!
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VST1.64	{D4, D5, D6, D7}, [r1 at 128]!
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VST1.64	{D4, D5, D6, D7}, [r1 at 128]
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA	r0, {D16-D31}
+	MOV	PC, r14
+
+; Epilogue for the in-place case: round/scale and store without clearing the
+; input block.
+oc_idct8x8_slow_neon_noclear
+	VPOP	{D8-D15}
+	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
+	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA	r0, {D16-D31}
+	MOV	PC, r14
+
+; oc_idct8x8_stage123_neon: stages 1-3 of the 8-point IDCT on eight 8-element
+; vectors (Q8-Q15 in, with Q1=x[0]-x[4] and Q8=x[0]+x[4] precomputed and the
+; constants in D0/D1). The negative-as-s16 constants (C1S7, C2S6, C3S5, C5S3)
+; yield products of the form (C*x>>16)-x, corrected by the VADD/VSUB fixups.
+oc_idct8x8_stage123_neon
+; Stages 1 & 2
+	VMULL.S16	Q4, D18,D1[3]
+	VMULL.S16	Q5, D19,D1[3]
+	VMULL.S16	Q7, D30,D1[3]
+	VMULL.S16	Q6, D31,D1[3]
+	VMULL.S16	Q2, D30,D0[1]
+	VMULL.S16	Q3, D31,D0[1]
+	VSHRN.S32	D8, Q4, #16
+	VSHRN.S32	D9, Q5, #16	; Q4 = (OC_C7S1*x[1]>>16)
+	VSHRN.S32	D14,Q7, #16
+	VSHRN.S32	D15,Q6, #16	; Q7 = (OC_C7S1*x[7]>>16)
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q3, #16	; Q2 = (OC_C1S7*x[7]>>16)-x[7]
+	VSUB.S16	Q4, Q4, Q15
+	VADD.S16	Q7, Q7, Q9
+	VSUB.S16	Q4, Q4, Q2	; Q4 = t[4]
+	VMULL.S16	Q2, D18,D0[1]
+	VMULL.S16	Q9, D19,D0[1]
+	VMULL.S16	Q5, D26,D0[3]
+	VMULL.S16	Q3, D27,D0[3]
+	VMULL.S16	Q6, D22,D0[3]
+	VMULL.S16	Q12,D23,D0[3]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q9, #16	; Q2 = (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D11,Q3, #16	; Q5 = (OC_C3S5*x[5]>>16)-x[5]
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D13,Q12,#16	; Q6 = (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q7, Q7, Q2	; Q7 = t[7]
+	VSUB.S16	Q5, Q5, Q11
+	VADD.S16	Q6, Q6, Q11
+	VADD.S16	Q5, Q5, Q13
+	VADD.S16	Q6, Q6, Q13
+	VMULL.S16	Q9, D22,D1[1]
+	VMULL.S16	Q11,D23,D1[1]
+	VMULL.S16	Q15,D26,D1[1]
+	VMULL.S16	Q13,D27,D1[1]
+	VMULL.S16	Q2, D20,D1[2]
+	VMULL.S16	Q12,D21,D1[2]
+	VSHRN.S32	D18,Q9, #16
+	VSHRN.S32	D19,Q11,#16	; Q9 = (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q13,#16	; Q15= (OC_C5S3*x[5]>>16)-x[5]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q12,#16	; Q2 = (OC_C6S2*x[2]>>16)
+	VSUB.S16	Q5, Q5, Q9	; Q5 = t[5]
+	VADD.S16	Q6, Q6, Q15	; Q6 = t[6]
+	VSUB.S16	Q2, Q2, Q14
+	VMULL.S16	Q3, D28,D1[2]
+	VMULL.S16	Q11,D29,D1[2]
+	VMULL.S16	Q12,D28,D0[2]
+	VMULL.S16	Q9, D29,D0[2]
+	VMULL.S16	Q13,D20,D0[2]
+	VMULL.S16	Q15,D21,D0[2]
+	VSHRN.S32	D6, Q3, #16
+	VSHRN.S32	D7, Q11,#16	; Q3 = (OC_C6S2*x[6]>>16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q9, #16	; Q12= (OC_C2S6*x[6]>>16)-x[6]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q15,#16	; Q13= (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9, Q4, Q5	; Q9 = t[4]-t[5]
+	VSUB.S16	Q11,Q7, Q6	; Q11= t[7]-t[6]
+	VADD.S16	Q3, Q3, Q10
+	VADD.S16	Q4, Q4, Q5	; Q4 = t[4]'=t[4]+t[5]
+	VADD.S16	Q7, Q7, Q6	; Q7 = t[7]'=t[7]+t[6]
+	VSUB.S16	Q2, Q2, Q12	; Q2 = t[2]
+	VADD.S16	Q3, Q3, Q13	; Q3 = t[3]
+	VMULL.S16	Q12,D16,D1[0]
+	VMULL.S16	Q13,D17,D1[0]
+	VMULL.S16	Q14,D2, D1[0]
+	VMULL.S16	Q15,D3, D1[0]
+	VMULL.S16	Q5, D18,D1[0]
+	VMULL.S16	Q6, D22,D1[0]
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q13,#16
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q15,#16
+	VMULL.S16	Q13,D19,D1[0]
+	VMULL.S16	Q15,D23,D1[0]
+	VADD.S16	Q8, Q8, Q12	; Q8 = t[0]
+	VADD.S16	Q1, Q1, Q14	; Q1 = t[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D11,Q13,#16
+	VSHRN.S32	D13,Q15,#16
+	VADD.S16	Q5, Q5, Q9	; Q5 = t[5]'=OC_C4S4*(t[4]-t[5])>>16
+	VADD.S16	Q6, Q6, Q11	; Q6 = t[6]'=OC_C4S4*(t[7]-t[6])>>16
+; Stage 3
+	VSUB.S16	Q11,Q8, Q3	; Q11 = t[3]''=t[0]-t[3]
+	VADD.S16	Q8, Q8, Q3	; Q8  = t[0]''=t[0]+t[3]
+	VADD.S16	Q9, Q1, Q2	; Q9  = t[1]''=t[1]+t[2]
+	VADD.S16	Q3, Q6, Q5	; Q3  = t[6]''=t[6]'+t[5]'
+	VSUB.S16	Q10,Q1, Q2	; Q10 = t[2]''=t[1]-t[2]
+	VSUB.S16	Q5, Q6, Q5	; Q5  = t[5]''=t[6]'-t[5]'
+	MOV	PC, r14
+
+oc_idct8x8_10_neon
+	; r0 = ogg_int16_t *_y (output coefficients; may be the same buffer as _x)
+	; r1 = ogg_int16_t *_x (input coefficients; only the low-frequency corner
+	;       of the block is loaded below, so coefficients outside the first 10
+	;       in zig-zag order are presumably zero -- TODO confirm with callers)
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	VLD1.64	{D0,D1},	[r3 at 128]
+	MOV	r2, r1
+	; Row transforms (input is pre-transposed)
+; Stage 1
+	VLD1.64	{D16,D17,D18,D19},[r2 at 128]!
+	MOV	r12, #16	; Row stride, also used to step the clears below.
+	VMULL.S16	Q15,D16,D1[0]	; Q15= OC_C4S4*x[0]-(x[0]<<16)
+	VLD1.64	{D17},	[r2 at 64], r12
+	VMULL.S16	Q2,	D18,D0[1]	; Q2 = OC_C1S7*x[1]-(x[1]<<16)
+	VLD1.64	{D19},	[r2 at 64]
+	VMULL.S16	Q14,D17,D0[2]	; Q14= OC_C2S6*x[2]-(x[2]<<16)
+	VMULL.S16	Q3,	D19,D0[3]	; Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VMULL.S16	Q13,D19,D1[1]	; Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q12,D18,D1[3]	; Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1,	D17,D1[2]	; Q1 = OC_C6S2*x[2]
+	VSHRN.S32	D30,Q15,#16	; D30= t[0]-x[0]
+	VSHRN.S32	D4,	Q2,	#16	; D4 = t[7]-x[1]
+	VSHRN.S32	D31,Q14,#16	; D31= t[3]-x[2]
+	VSHRN.S32	D6,	Q3,	#16	; D6 = t[6]-x[3]
+	VSHRN.S32	D7,	Q13,#16	; D7 = -t[5]-x[3]
+	VSHRN.S32	D5,	Q12,#16	; D5 = t[4]
+	VSHRN.S32	D2,	Q1,	#16	; D2 = t[2]
+	VADD.S16	D4,	D4,	D18	; D4 = t[7]
+	VADD.S16	D6,	D6,	D19	; D6 = t[6]
+	VADD.S16	D7,	D7,	D19	; D7 = -t[5]
+	VADD.S16	Q15,Q15,Q8	; D30= t[0]
+					; D31= t[3]
+; Stages 2 & 3
+	VSUB.S16	Q12,Q2,	Q3	; D24= t[7]-t[6]
+					; D25= t[4]'=t[4]+t[5]
+	VADD.S16	Q13,Q2,	Q3	; D26= t[7]'=t[7]+t[6]
+					; D27= t[4]-t[5]
+	VMULL.S16	Q11,D24,D1[0]	; Q11= OC_C4S4*(t[7]-t[6])
+					;       -(t[7]-t[6]<<16)
+	VMULL.S16	Q14,D27,D1[0]	; Q14= OC_C4S4*(t[4]-t[5])
+					;       -(t[4]-t[5]<<16)
+	VADD.S16	D16,D30,D31	; D16= t[0]'=t[0]+t[3]
+	VSUB.S16	D17,D30,D2	; D17= t[2]'=t[0]-t[2]
+	VADD.S16	D18,D30,D2	; D18= t[1]'=t[0]+t[2]
+	VSHRN.S32	D22,Q11,#16	; D22= (OC_C4S4*(t[7]-t[6])>>16)
+					;       -(t[7]-t[6])
+	VSHRN.S32	D23,Q14,#16	; D23= (OC_C4S4*(t[4]-t[5])>>16)
+					;       -(t[4]-t[5])
+	VSUB.S16	D19,D30,D31	; D19= t[3]'=t[0]-t[3]
+	VADD.S16	D22,D22,D24	; D22= t[6]'=OC_C4S4*(t[7]-t[6])>>16
+	VADD.S16	D23,D23,D27	; D23= t[5]'=OC_C4S4*(t[4]-t[5])>>16
+	VSUB.S16	D27,D22,D23	; D27= t[5]''=t[6]'-t[5]'
+	VADD.S16	D24,D22,D23	; D24= t[6]''=t[6]'+t[5]'
+; Stage 4
+	VSUB.S16	Q11,Q8,	Q13	; D22= y[7]=t[0]'-t[7]'
+					; D23= y[5]=t[2]'-t[5]''
+	VSUB.S16	Q10,Q9,	Q12	; D20= y[6]=t[1]'-t[6]''
+					; D21= y[4]=t[3]'-t[4]'
+	VADD.S16	Q8,	Q8,	Q13	; D16= y[0]=t[0]'+t[7]'
+					; D17= y[2]=t[2]'+t[5]''
+	VADD.S16	Q9,	Q9,	Q12	; D18= y[1]=t[1]'+t[6]''
+					; D19= y[3]=t[3]'+t[4]'
+	; 8x4 transpose
+	VTRN.16	Q10,Q11	; Q10= c5c4a5a4 c7c6a7a6
+			; Q11= d5d4b5b4 d7d6b7b6
+	VTRN.16	Q8,	Q9	; Q8 = c3c2a3a2 c1c0a1a0
+			; Q9 = d3d2b3b2 d1d0b1b0
+	VSWP	D20,D21	; Q10= c7c6a7a6 c5c4a5a4
+	VSWP	D22,D23	; Q11= d7d6b7b6 d5d4b5b4
+	VUZP.32	Q9,	Q11	; Q9 = b7b6b5b4 b3b2b1b0
+			; Q11= d7d6d5d4 d3d2d1d0
+	VMULL.S16	Q15,D18,D0[1]
+	VMULL.S16	Q13,D22,D1[1]
+	VUZP.32	Q8,	Q10	; Q8 = a7a6a5a4 a3a2a1a0
+			; Q10= c7c6c5c4 c3c2c1c0
+	; Column transforms
+; Stages 1, 2, & 3
+	VMULL.S16	Q14,D19,D0[1]	; Q14:Q15= OC_C1S7*x[1]-(x[1]<<16)
+	VMULL.S16	Q12,D23,D1[1]	; Q12:Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q3,	D22,D0[3]
+	VMULL.S16	Q2,	D23,D0[3]	; Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q14,#16	; Q15= (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q12,#16	; Q13= (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D28,Q3,	#16
+	VSHRN.S32	D29,Q2,	#16	; Q14= (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q15,Q15,Q9	; Q15= t[7]
+	VADD.S16	Q13,Q13,Q11	; Q13= -t[5]
+	VADD.S16	Q14,Q14,Q11	; Q14= t[6]
+	VMULL.S16	Q12,D18,D1[3]
+	VMULL.S16	Q2,	D19,D1[3]	; Q2:Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1,	D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	; Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16)
+	VMULL.S16	Q3,	D20,D0[2]
+	VMULL.S16	Q9,	D21,D0[2]	; Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q2,	#16	; Q12= t[4]
+	VMULL.S16	Q2,	D20,D1[2]
+	VSHRN.S32	D2,	Q1,	#16
+	VSHRN.S32	D3,	Q11,#16	; Q1 = (OC_C4S4*x[0]>>16)-x[0]
+	VMULL.S16	Q11,D21,D1[2]	; Q2:Q11= OC_C6S2*x[2]
+	VSHRN.S32	D6,	Q3,	#16
+	VSHRN.S32	D7,	Q9,	#16	; Q3 = (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9,	Q15,Q14	; Q9 = t[7]-t[6]
+	VADD.S16	Q15,Q15,Q14	; Q15= t[7]'=t[7]+t[6]
+	VSHRN.S32	D4,	Q2,	#16
+	VSHRN.S32	D5,	Q11,#16	; Q2 = t[2]
+	VADD.S16	Q1,	Q1,	Q8	; Q1 = t[0]
+	VADD.S16	Q8,	Q12,Q13	; Q8 = t[4]-t[5]
+	VADD.S16	Q3,	Q3,	Q10	; Q3 = t[3]
+	VMULL.S16	Q10,D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	; Q11:Q10= OC_C4S4*(t[4]-t[5])
+					;           -(t[4]-t[5]<<16)
+	VSUB.S16	Q12,Q12,Q13	; Q12= t[4]'=t[4]+t[5]
+	VMULL.S16	Q14,D18,D1[0]
+	VMULL.S16	Q13,D19,D1[0]	; Q13:Q14= OC_C4S4*(t[7]-t[6])
+					;           -(t[7]-t[6]<<16)
+	VSHRN.S32	D20,Q10,#16
+	VSHRN.S32	D21,Q11,#16	; Q10= (OC_C4S4*(t[4]-t[5])>>16)
+					;       -(t[4]-t[5])
+	VADD.S16	Q11,Q1,	Q3	; Q11= t[0]'=t[0]+t[3]
+	VSUB.S16	Q3,	Q1,	Q3	; Q3 = t[3]'=t[0]-t[3]
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q13,#16	; Q14= (OC_C4S4*(t[7]-t[6])>>16)
+					;       -(t[7]-t[6])
+	VADD.S16	Q10,Q10,Q8	; Q10=t[5]'
+	VADD.S16	Q14,Q14,Q9	; Q14=t[6]'
+	VSUB.S16	Q13,Q14,Q10	; Q13=t[5]''=t[6]'-t[5]'
+	VADD.S16	Q14,Q14,Q10	; Q14=t[6]''=t[6]'+t[5]'
+	VADD.S16	Q10,Q1,	Q2	; Q10= t[1]'=t[0]+t[2]
+	VSUB.S16	Q2,	Q1,	Q2	; Q2 = t[2]'=t[0]-t[2]
+; Stage 4
+	CMP	r0, r1	; If _y==_x, we must not clear the input below.
+	VADD.S16	Q8,	Q11,Q15	; Q8 = y[0]=t[0]'+t[7]'
+	VADD.S16	Q9,	Q10,Q14	; Q9 = y[1]=t[1]'+t[6]''
+	VSUB.S16	Q15,Q11,Q15	; Q15 = y[7]=t[0]'-t[7]'
+	VSUB.S16	Q14,Q10,Q14	; Q14 = y[6]=t[1]'-t[6]''
+	VADD.S16	Q10,Q2,	Q13	; Q10 = y[2]=t[2]'+t[5]''
+	VADD.S16	Q11,Q3,	Q12	; Q11 = y[3]=t[3]'+t[4]'
+	VSUB.S16	Q12,Q3,	Q12	; Q12 = y[4]=t[3]'-t[4]'
+	VSUB.S16	Q13,Q2,	Q13	; Q13 = y[5]=t[2]'-t[5]''
+	BEQ	oc_idct8x8_10_neon_noclear
+	; Clear the input rows we read (interleaved with the final scaling).
+	VMOV.I8	D2, #0
+	VRSHR.S16	Q8,	Q8,	#4	; Q8 = y[0]+8>>4
+	VST1.64	{D2}, [r1 at 64], r12
+	VRSHR.S16	Q9,	Q9,	#4	; Q9 = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VST1.64	{D2}, [r1 at 64], r12
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VST1.64	{D2}, [r1 at 64], r12
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VST1.64	{D2}, [r1 at 64]
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA	r0, {D16-D31}	; Store the whole transformed block at _y.
+	MOV	PC, r14
+
+oc_idct8x8_10_neon_noclear
+	; Tail used when the input buffer is also the output buffer (r0==r1):
+	; apply the final rounding shift and store, without clearing the input.
+	VRSHR.S16	Q8,	Q8,	#4	; Q8 = y[0]+8>>4
+	VRSHR.S16	Q9,	Q9,	#4	; Q9 = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA	r0, {D16-D31}	; Store the whole transformed block at _y.
+	MOV	PC, r14
+ ]
+
+ END
Added: experimental/derf/theora-ptalarbvorm/lib/arm/armint.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armint.h (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armint.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,117 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armint_H)
+# define _arm_armint_H (1)
+# include "../internal.h"
+
+# if defined(OC_ARM_ASM)
+
+# if defined(__ARMEB__)
+# error "Big-endian configurations are not supported by the ARM asm. " \
+ "Reconfigure with --disable-asm or undefine OC_ARM_ASM."
+# endif
+
+# define oc_state_accel_init oc_state_accel_init_arm
+/*This function is implemented entirely in asm, so it's helpful to pull out all
+   of the things that depend on structure offsets.
+  We reuse the function pointer with the wrong prototype, though.*/
+/*The flattened argument list below must stay in sync with the
+   oc_loop_filter_frag_rows_arm_func typedef declared further down.*/
+# define oc_state_loop_filter_frag_rows(_state,_bv,_refi,_pli, \
+ _fragy0,_fragy_end) \
+ ((oc_loop_filter_frag_rows_arm_func) \
+ (_state)->opt_vtable.state_loop_filter_frag_rows)( \
+ (_state)->ref_frame_data[(_refi)],(_state)->ref_ystride[(_pli)], \
+ (_bv), \
+ (_state)->frags, \
+ (_state)->fplanes[(_pli)].froffset \
+ +(_fragy0)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \
+ (_state)->fplanes[(_pli)].froffset \
+ +(_fragy_end)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \
+ (_state)->fplanes[(_pli)].froffset, \
+ (_state)->fplanes[(_pli)].froffset+(_state)->fplanes[(_pli)].nfrags, \
+ (_state)->frag_buf_offs, \
+ (_state)->fplanes[(_pli)].nhfrags)
+/*For everything else the default vtable macros are fine.*/
+# define OC_STATE_USE_VTABLE (1)
+# endif
+
+# include "../state.h"
+# include "armcpu.h"
+
+# if defined(OC_ARM_ASM)
+typedef void (*oc_loop_filter_frag_rows_arm_func)(
+ unsigned char *_ref_frame_data,int _ystride,signed char _bv[256],
+ const oc_fragment *_frags,ptrdiff_t _fragi0,ptrdiff_t _fragi0_end,
+ ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+void oc_state_accel_init_arm(oc_theora_state *_state);
+void oc_frag_copy_list_arm(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_arm(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_arm(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_arm(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_arm(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_loop_filter_frag_rows_arm(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+# if defined(OC_ARM_ASM_EDSP)
+void oc_frag_copy_list_edsp(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+
+# if defined(OC_ARM_ASM_MEDIA)
+void oc_frag_recon_intra_v6(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_v6(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_v6(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_v6(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_loop_filter_init_v6(signed char *_bv,int _flimit);
+void oc_loop_filter_frag_rows_v6(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+# if defined(OC_ARM_ASM_NEON)
+void oc_frag_copy_list_neon(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_neon(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_neon(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_neon(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_neon(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_loop_filter_init_neon(signed char *_bv,int _flimit);
+void oc_loop_filter_frag_rows_neon(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+# endif
+# endif
+# endif
+# endif
+
+#endif
Copied: experimental/derf/theora-ptalarbvorm/lib/arm/armloop.s (from rev 17378, branches/theorarm-merge-branch/lib/arm/ARMfilter.s)
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armloop.s (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armloop.s 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,662 @@
+;********************************************************************
+;* *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+;* *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;* *
+;********************************************************************
+; Original implementation:
+; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: cpu.h 17344 2010-07-21 01:42:18Z tterribe $
+;********************************************************************
+
+ AREA |.text|, CODE, READONLY
+
+ GET armopts.s
+
+ EXPORT oc_loop_filter_frag_rows_arm
+
+; Which bit this is depends on the order of packing within a bitfield.
+; Hopefully that doesn't change among any of the relevant compilers.
+OC_FRAG_CODED_FLAG * 1
+
+ ; Vanilla ARM v4 version
+loop_filter_h_arm
+	; Filters one 8-row vertical edge: reads pixels at offsets -2..1 of each
+	; row and rewrites the two pixels adjacent to the edge.
+	; r0 = unsigned char *_pix
+	; r1 = int _ystride
+	; r2 = int *_bv (points at the center of the bounding-value table; the
+	;      caller added 127 so it can be indexed by a signed filter value)
+	; preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8		; Loop counter: 8 rows.
+	MOV	r6, #255
+lfh_arm_lp
+	LDRB	r3, [r0, #-2]		; r3 = _pix[0]
+	LDRB	r12,[r0, #1]		; r12= _pix[3]
+	LDRB	r4, [r0, #-1]		; r4 = _pix[1]
+	LDRB	r5, [r0]		; r5 = _pix[2]
+	SUB	r3, r3, r12		; r3 = _pix[0]-_pix[3]
+	ADD	r3, r3, #4		; r3 = _pix[0]-_pix[3]+4
+	SUB	r12,r5, r4		; r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	; r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3		; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3		; r12= R
+	LDRSB	r12,[r2, r12]		; r12= lflim(R,L) from the table
+	; Stall (2 on Xscale)
+	ADDS	r4, r4, r12		; r4 = _pix[1]+lflim(R,L)
+	CMPGT	r6, r4			; Clamp the stored byte to [0,255]:
+	EORLT	r4, r6, r4, ASR #32	;  255 if >255, low byte 0 if <0.
+	SUBS	r5, r5, r12		; r5 = _pix[2]-lflim(R,L)
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32	; Clamp r5 the same way.
+	STRB	r4, [r0, #-1]
+	STRB	r5, [r0], r1		; Advance to the next row.
+	SUBS	r14,r14,#1
+	BGT	lfh_arm_lp
+	SUB	r0, r0, r1, LSL #3	; Restore _pix (undo 8 row strides).
+	LDMFD	r13!,{r3-r6,PC}
+
+loop_filter_v_arm
+	; Filters one 8-column horizontal edge: reads the pixels at row offsets
+	; -2..1 of each column and rewrites the two rows adjacent to the edge.
+	; r0 = unsigned char *_pix
+	; r1 = int _ystride
+	; r2 = int *_bv (points at the center of the bounding-value table; the
+	;      caller added 127 so it can be indexed by a signed filter value)
+	; preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8		; Loop counter: 8 columns.
+	MOV	r6, #255
+lfv_arm_lp
+	LDRB	r3, [r0, -r1, LSL #1]	; r3 = _pix[0]
+	LDRB	r12,[r0, r1]		; r12= _pix[3]
+	LDRB	r4, [r0, -r1]		; r4 = _pix[1]
+	LDRB	r5, [r0]		; r5 = _pix[2]
+	SUB	r3, r3, r12		; r3 = _pix[0]-_pix[3]
+	ADD	r3, r3, #4		; r3 = _pix[0]-_pix[3]+4
+	SUB	r12,r5, r4		; r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	; r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3		; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3		; r12= R
+	LDRSB	r12,[r2, r12]		; r12= lflim(R,L) from the table
+	; Stall (2 on Xscale)
+	ADDS	r4, r4, r12		; r4 = _pix[1]+lflim(R,L)
+	CMPGT	r6, r4			; Clamp the stored byte to [0,255]:
+	EORLT	r4, r6, r4, ASR #32	;  255 if >255, low byte 0 if <0.
+	SUBS	r5, r5, r12		; r5 = _pix[2]-lflim(R,L)
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32	; Clamp r5 the same way.
+	STRB	r4, [r0, -r1]
+	STRB	r5, [r0], #1		; Advance to the next column.
+	SUBS	r14,r14,#1
+	BGT	lfv_arm_lp
+	SUB	r0, r0, #8		; Restore _pix (undo 8 column steps).
+	LDMFD	r13!,{r3-r6,PC}
+
+oc_loop_filter_frag_rows_arm
+	; Applies the loop filter to the coded fragments in rows
+	; [_fragy0,_fragy_end) (the caller has flattened those to fragment
+	; indices); filters each coded fragment's left and top edges.
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	; (r4-r9 are passed on the stack and loaded below from the caller's
+	;  frame via r12.)
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	ADD	r2, r2, #127	; _bv += 127 (center the table so it can be
+				;  indexed by a signed filter value)
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_arm_end	;   bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_arm_end	;		  bail
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; _fragi_bot -= _nhfrags;
+oslffri_arm_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_arm_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_arm_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_arm	;   filter our left edge
+	CMP	r4, r6		; if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_arm	;   filter our top edge
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_arm	;   then filter the shared edge for it
+	CMP	r10,r7		; if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_arm
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		; r4 = fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+oslffri_arm_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_arm_uncoded
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+
+ [ OC_ARM_ASM_MEDIA
+ EXPORT oc_loop_filter_init_v6
+ EXPORT oc_loop_filter_frag_rows_v6
+
+oc_loop_filter_init_v6
+	; Stores the SIMD constant ll=255-2*L replicated into all four bytes of
+	; *(int *)_bv; the v6 filters use it with UQADD8/UQSUB8 to compute
+	; lflim() without branches.
+	; r0 = _bv
+	; r1 = _flimit (=L from the spec)
+	MVN	r1, r1, LSL #1		; r1 = <0xFFFFFF|255-2*L>
+	AND	r1, r1, #255		; r1 = ll=r1&0xFF
+	ORR	r1, r1, r1, LSL #8	; r1 = <ll|ll>
+	PKHBT	r1, r1, r1, LSL #16	; r1 = <ll|ll|ll|ll>
+	STR	r1, [r0]
+	MOV	PC,r14
+
+; We could use the same strategy as the v filter below, but that would require
+; 40 instructions to load the data and transpose it into columns and another
+; 32 to write out the results at the end, plus the 52 instructions to do the
+; filtering itself.
+; This is slightly less, and less code, even assuming we could have shared the
+; 52 instructions in the middle with the other function.
+; It executes slightly fewer instructions than the ARMv6 approach David Conrad
+; proposed for FFmpeg, but not by much:
+; http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
+; His is a lot less code, though, because it only does two rows at once instead
+; of four.
+loop_filter_h_v6
+	; Filters one 8-row vertical edge, four rows at a time via the shared
+	; core below.
+	; r0 = unsigned char *_pix
+	; r1 = int _ystride
+	; r2 = int _ll
+	; preserves r0-r3
+	STMFD	r13!,{r4-r11,r14}
+	LDR	r12,=0x10003		; r12= packed <1|3> multiplier for SMLAD
+	BL	loop_filter_h_core_v6	; Rows 0-3.
+	ADD	r0, r0, r1, LSL #2
+	BL	loop_filter_h_core_v6	; Rows 4-7.
+	SUB	r0, r0, r1, LSL #2
+	LDMFD	r13!,{r4-r11,PC}
+
+loop_filter_h_core_v6
+	; Filters four rows (named p, q, r, s below) of a vertical edge at once
+	; using the ARMv6 packed-SIMD instructions.
+	; r0 = unsigned char *_pix
+	; r1 = int _ystride
+	; r2 = int _ll
+	; r12= 0x10003
+	; Preserves r0-r3, r12; Clobbers r4-r11.
+	LDR	r4,[r0, #-2]!		; r4 = <p3|p2|p1|p0>
+	; Single issue
+	LDR	r5,[r0, r1]!		; r5 = <q3|q2|q1|q0>
+	UXTB16	r6, r4, ROR #16		; r6 = <p0|p2>
+	UXTB16	r4, r4, ROR #8		; r4 = <p3|p1>
+	UXTB16	r7, r5, ROR #16		; r7 = <q0|q2>
+	UXTB16	r5, r5, ROR #8		; r5 = <q3|q1>
+	PKHBT	r8, r4, r5, LSL #16	; r8 = <__|q1|__|p1>
+	PKHBT	r9, r6, r7, LSL #16	; r9 = <__|q2|__|p2>
+	SSUB16	r6, r4, r6		; r6 = <p3-p0|p1-p2>
+	SMLAD	r6, r6, r12,r12		; r6 = <????|(p3-p0)+3*(p1-p2)+3>
+	SSUB16	r7, r5, r7		; r7 = <q3-q0|q1-q2>
+	SMLAD	r7, r7, r12,r12		; r7 = <????|(q0-q3)+3*(q2-q1)+4>
+	LDR	r4,[r0, r1]!		; r4 = <r3|r2|r1|r0>
+	MOV	r6, r6, ASR #3		; r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
+	LDR	r5,[r0, r1]!		; r5 = <s3|s2|s1|s0>
+	PKHBT	r11,r6, r7, LSL #13	; r11= <??|-R_q|??|-R_p>
+	UXTB16	r6, r4, ROR #16		; r6 = <r0|r2>
+	UXTB16	r11,r11			; r11= <__|-R_q|__|-R_p>
+	UXTB16	r4, r4, ROR #8		; r4 = <r3|r1>
+	UXTB16	r7, r5, ROR #16		; r7 = <s0|s2>
+	PKHBT	r10,r6, r7, LSL #16	; r10= <__|s2|__|r2>
+	SSUB16	r6, r4, r6		; r6 = <r3-r0|r1-r2>
+	UXTB16	r5, r5, ROR #8		; r5 = <s3|s1>
+	SMLAD	r6, r6, r12,r12		; r6 = <????|(r3-r0)+3*(r2-r1)+3>
+	SSUB16	r7, r5, r7		; r7 = <s3-s0|s1-s2>
+	SMLAD	r7, r7, r12,r12		; r7 = <????|(s0-s3)+3*(s2-s1)+4>
+	ORR	r9, r9, r10, LSL #8	; r9 = <s2|q2|r2|p2>
+	MOV	r6, r6, ASR #3		; r6 = <??????|(r0-r3)+3*(r2-r1)+4>>3>
+	PKHBT	r10,r4, r5, LSL #16	; r10= <__|s1|__|r1>
+	PKHBT	r6, r6, r7, LSL #13	; r6 = <??|-R_s|??|-R_r>
+	ORR	r8, r8, r10, LSL #8	; r8 = <s1|q1|r1|p1>
+	UXTB16	r6, r6			; r6 = <__|-R_s|__|-R_r>
+	MOV	r10,#0
+	ORR	r6, r11,r6, LSL #8	; r6 = <-R_s|-R_q|-R_r|-R_p>
+	; Single issue
+	; There's no min, max or abs instruction.
+	; SSUB8 and SEL will work for abs, and we can do all the rest with
+	; unsigned saturated adds, which means the GE flags are still all
+	; set when we're done computing lflim(abs(R_i),L).
+	; This allows us to both add and subtract, and split the results by
+	; the original sign of R_i.
+	SSUB8	r7, r10,r6
+	; Single issue
+	SEL	r7, r7, r6		; r7 = abs(R_i)
+	; Single issue
+	UQADD8	r4, r7, r2		; r4 = 255-max(2*L-abs(R_i),0)
+	; Single issue
+	UQADD8	r7, r7, r4
+	; Single issue
+	UQSUB8	r7, r7, r4		; r7 = min(abs(R_i),max(2*L-abs(R_i),0))
+	; Single issue
+	UQSUB8	r4, r8, r7
+	UQADD8	r5, r9, r7
+	UQADD8	r8, r8, r7
+	UQSUB8	r9, r9, r7
+	SEL	r8, r8, r4		; r8 = p1+lflim(R_i,L)
+	SEL	r9, r9, r5		; r9 = p2-lflim(R_i,L)
+	; Scatter the filtered bytes back out, one row at a time.
+	MOV	r5, r9, LSR #24		; r5 = s2
+	STRB	r5, [r0,#2]!
+	MOV	r4, r8, LSR #24		; r4 = s1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #8		; r5 = r2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #8		; r4 = r1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #16		; r5 = q2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #16		; r4 = q1
+	STRB	r4, [r0,#-1]
+	; Single issue
+	STRB	r9, [r0,-r1]!
+	; Single issue
+	STRB	r8, [r0,#-1]
+	MOV	PC,r14
+
+; This uses the same strategy as the MMXEXT version for x86, except that UHADD8
+; computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
+; This works just as well, with the following procedure for computing the
+; filter value, f:
+; u = ~UHADD8(p1,~p2);
+; v = UHADD8(~p1,p2);
+; m = v-u;
+; a = m^UHADD8(m^p0,m^~p3);
+; f = UHADD8(UHADD8(a,u1),v1);
+; where f = 127+R, with R in [-127,128] defined as in the spec.
+; This is exactly the same amount of arithmetic as the version that uses PAVGB
+; as the basic operator.
+; It executes about 2/3 the number of instructions of David Conrad's approach,
+; but requires more code, because it does all eight columns at once, instead
+; of four at a time.
+loop_filter_v_v6
+	; Filters a horizontal edge across all eight columns at once.
+	; p0-p3 below are the four rows for columns 0-3 and p4-p7 the same
+	; rows for columns 4-7 (each pN is a packed group of 4 pixels).
+	; r0 = unsigned char *_pix
+	; r1 = int _ystride
+	; r2 = int _ll
+	; preserves r0-r11
+	STMFD	r13!,{r4-r11,r14}
+	LDRD	r6, [r0, -r1]!		; r7, r6 = <p5|p1>
+	LDRD	r4, [r0, -r1]		; r5, r4 = <p4|p0>
+	LDRD	r8, [r0, r1]!		; r9, r8 = <p6|p2>
+	MVN	r14,r6			; r14= ~p1
+	LDRD	r10,[r0, r1]		; r11,r10= <p7|p3>
+	; Filter the first four columns.
+	MVN	r12,r8			; r12= ~p2
+	UHADD8	r14,r14,r8		; r14= v1=~p1+p2>>1
+	UHADD8	r12,r12,r6		; r12= p1+~p2>>1
+	MVN	r10, r10		; r10=~p3
+	MVN	r12,r12			; r12= u1=~p1+p2+1>>1
+	SSUB8	r14,r14,r12		; r14= m1=v1-u1
+	; Single issue
+	EOR	r4, r4, r14		; r4 = m1^p0
+	EOR	r10,r10,r14		; r10= m1^~p3
+	UHADD8	r4, r4, r10		; r4 = (m1^p0)+(m1^~p3)>>1
+	; Single issue
+	EOR	r4, r4, r14		; r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
+	SADD8	r14,r14,r12		; r14= v1=m1+u1
+	UHADD8	r4, r4, r12		; r4 = a1+u1>>1
+	MVN	r12,r9			; r12= ~p6
+	UHADD8	r4, r4, r14		; r4 = f1=(a1+u1>>1)+v1>>1
+	; Filter the second four columns.
+	MVN	r14,r7			; r14= ~p5
+	UHADD8	r12,r12,r7		; r12= p5+~p6>>1
+	UHADD8	r14,r14,r9		; r14= v2=~p5+p6>>1
+	MVN	r12,r12			; r12= u2=~p5+p6+1>>1
+	MVN	r11,r11			; r11=~p7
+	SSUB8	r10,r14,r12		; r10= m2=v2-u2
+	; Single issue
+	EOR	r5, r5, r10		; r5 = m2^p4
+	EOR	r11,r11,r10		; r11= m2^~p7
+	UHADD8	r5, r5, r11		; r5 = (m2^p4)+(m2^~p7)>>1
+	; Single issue
+	EOR	r5, r5, r10		; r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
+	; Single issue
+	UHADD8	r5, r5, r12		; r5 = a2+u2>>1
+	LDR	r12,=0x7F7F7F7F		; r12 = {127}x4
+	UHADD8	r5, r5, r14		; r5 = f2=(a2+u2>>1)+v2>>1
+	; Now split f[i] by sign.
+	; There's no min or max instruction.
+	; We could use SSUB8 and SEL, but this is just as many instructions and
+	; dual issues more (for v7 without NEON).
+	UQSUB8	r10,r4, r12		; r10= R_i>0?R_i:0
+	UQSUB8	r4, r12,r4		; r4 = R_i<0?-R_i:0
+	UQADD8	r11,r10,r2		; r11= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r14,r4, r2		; r14= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r10,r10,r11
+	UQADD8	r4, r4, r14
+	UQSUB8	r10,r10,r11		; r10= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQSUB8	r4, r4, r14		; r4 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQSUB8	r11,r5, r12		; r11= R_i>0?R_i:0
+	UQADD8	r6, r6, r10
+	UQSUB8	r8, r8, r10
+	UQSUB8	r5, r12,r5		; r5 = R_i<0?-R_i:0
+	UQSUB8	r6, r6, r4		; r6 = p1+lflim(R_i,L)
+	UQADD8	r8, r8, r4		; r8 = p2-lflim(R_i,L)
+	UQADD8	r10,r11,r2		; r10= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r14,r5, r2		; r14= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r11,r11,r10
+	UQADD8	r5, r5, r14
+	UQSUB8	r11,r11,r10		; r11= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQSUB8	r5, r5, r14		; r5 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQADD8	r7, r7, r11
+	UQSUB8	r9, r9, r11
+	UQSUB8	r7, r7, r5		; r7 = p5+lflim(R_i,L)
+	STRD	r6, [r0, -r1]		; [p5:p1] = [r7: r6]
+	UQADD8	r9, r9, r5		; r9 = p6-lflim(R_i,L)
+	STRD	r8, [r0]		; [p6:p2] = [r9: r8]
+	LDMFD	r13!,{r4-r11,PC}
+
+oc_loop_filter_frag_rows_v6
+	; ARMv6 version of oc_loop_filter_frag_rows_arm; identical control
+	; flow, but passes the packed ll constant instead of a table pointer.
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	; (r4-r9 are passed on the stack and loaded below from the caller's
+	;  frame via r12.)
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	LDR	r2, [r2]	; ll = *(int *)_bv
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_v6_end	;   bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_v6_end	;		  bail
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; _fragi_bot -= _nhfrags;
+oslffri_v6_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_v6_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_v6_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_v6	;   filter our left edge
+	CMP	r4, r6		; if (fragi0>_fragi_top)
+	BLGT	loop_filter_v_v6	;   filter our top edge
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_v6	;   then filter the shared edge for it
+	CMP	r10,r7		; if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_v6
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		; r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+oslffri_v6_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_v6_uncoded
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		; r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+ ]
+
+ [ OC_ARM_ASM_NEON
+ EXPORT oc_loop_filter_init_neon
+ EXPORT oc_loop_filter_frag_rows_neon
+
+oc_loop_filter_init_neon
+	; Stores 2*_flimit broadcast across the eight U16 lanes of Q15 into the
+	; 16 bytes at _bv; the NEON filters reload it from there.
+	; r0 = _bv
+	; r1 = _flimit (=L from the spec)
+	MOV	r1, r1, LSL #1	; r1 = 2*L
+	VDUP.S16	Q15, r1	; Q15= 2L in U16s
+	VST1.64	{D30,D31}, [r0 at 128]
+	MOV	PC,r14
+
+loop_filter_h_neon
+	; Filters one 8-row vertical edge with NEON, all eight rows at once.
+	; r0 = unsigned char *_pix
+	; r1 = int _ystride
+	; r2 = int *_bv
+	; preserves r0-r3
+	; We assume Q15= 2*L in U16s
+	;                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, #2
+	; Doing a 2-element structure load saves doing two VTRN's below, at the
+	;  cost of using two more slower single-lane loads vs. the faster
+	;  all-lane loads.
+	; It's less code this way, though, and benches a hair faster, but it
+	;  leaves D2 and D4 swapped.
+	VLD2.16	{D0[],D2[]},  [r12], r1	; D0 = ____________1100	2,1
+					; D2 = ____________3322
+	VLD2.16	{D4[],D6[]},  [r12], r1	; D4 = ____________5544	2,1
+					; D6 = ____________7766
+	VLD2.16	{D0[1],D2[1]},[r12], r1	; D0 = ________99881100	3,1
+					; D2 = ________BBAA3322
+	VLD2.16	{D4[1],D6[1]},[r12], r1	; D4 = ________DDCC5544	3,1
+					; D6 = ________FFEE7766
+	VLD2.16	{D0[2],D2[2]},[r12], r1	; D0 = ____GGHH99881100	3,1
+					; D2 = ____JJIIBBAA3322
+	VLD2.16	{D4[2],D6[2]},[r12], r1	; D4 = ____KKLLDDCC5544	3,1
+					; D6 = ____NNMMFFEE7766
+	VLD2.16	{D0[3],D2[3]},[r12], r1	; D0 = PPOOGGHH99881100	3,1
+					; D2 = RRQQJJIIBBAA3322
+	VLD2.16	{D4[3],D6[3]},[r12], r1	; D4 = TTSSKKLLDDCC5544	3,1
+					; D6 = VVUUNNMMFFEE7766
+	VTRN.8	D0, D4	; D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511	1,1
+	VTRN.8	D2, D6	; D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733	1,1
+	VSUBL.U8	Q0, D0, D6	; Q0 = 00 - 33 in S16s		1,3
+	VSUBL.U8	Q8, D2, D4	; Q8 = 22 - 11 in S16s		1,3
+	; Stall
+	VADD.S16	Q0, Q0, Q8	;				1,3
+	SUB	r12, r0, #1
+	; Stall
+	VADD.S16	Q0, Q0, Q8	;				1,3
+	; Stall x2
+	VADD.S16	Q0, Q0, Q8	; Q0 = [0-3]+3*[2-1]		1,3
+	; Stall x2
+	VRSHR.S16	Q0, Q0, #3	; Q0 = f = ([0-3]+3*[2-1]+4)>>3	1,4
+	; We want to do
+	; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	;   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	; So we've reduced the left and right hand terms to be the same, except
+	; for a negation.
+	; Stall x3
+	VABS.S16	Q9, Q0		; Q9 = |f| in U16s		1,4
+	VSHR.S16	Q0, Q0, #15	; Q0 = -1 or 0 according to sign	1,3
+	; Stall x2
+	VQSUB.U16	Q10,Q15,Q9	; Q10= MAX(2L-|f|,0) in U16s	1,4
+	VMOVL.U8	Q1, D2		; Q1 = __UU__QQ__MM__II__EE__AA__66__22	2,3
+	; Stall x2
+	VMIN.U16	Q9, Q10,Q9	; Q9 = MIN(|f|,MAX(2L-|f|))	1,4
+	; Now we need to correct for the sign of f.
+	; For negative elements of Q0, we want to subtract the appropriate
+	; element of Q9. For positive elements we want to add them. No NEON
+	; instruction exists to do this, so we need to negate the negative
+	; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
+	; Stall x3
+	VADD.S16	Q9, Q9, Q0	;				1,3
+	; Stall x2
+	VEOR.S16	Q9, Q9, Q0	; Q9 = real value of f		1,3
+	; Bah. No VRSBW.U8
+	; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q2, Q9, D4	; Q2 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11	1,3
+	VSUB.S16	Q1, Q1, Q9	; Q1 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22	1,3
+	VQMOVUN.S16	D4, Q2		; D4 = TTPPLLHHDD995511		1,1
+	VQMOVUN.S16	D2, Q1		; D2 = UUQQMMIIEEAA6622		1,1
+	VTRN.8	D4, D2	; D4 = QQPPIIHHAA992211 D2 = UUTTMMLLEEDD6655	1,1
+	VST1.16	{D4[0]}, [r12], r1
+	VST1.16	{D2[0]}, [r12], r1
+	VST1.16	{D4[1]}, [r12], r1
+	VST1.16	{D2[1]}, [r12], r1
+	VST1.16	{D4[2]}, [r12], r1
+	VST1.16	{D2[2]}, [r12], r1
+	VST1.16	{D4[3]}, [r12], r1
+	VST1.16	{D2[3]}, [r12], r1
+	MOV	PC,r14
+
+loop_filter_v_neon
+	; Filters one 8-column horizontal edge with NEON: reads the rows at
+	; offsets -2..+1 around r0 and rewrites the middle two.
+	; r0 = unsigned char *_pix
+	; r1 = int _ystride
+	; r2 = int *_bv
+	; preserves r0-r3
+	; We assume Q15= 2*L in U16s
+	;                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, r1, LSL #1
+	VLD1.64	{D0}, [r12 at 64], r1	; D0 = SSOOKKGGCC884400	2,1
+	VLD1.64	{D2}, [r12 at 64], r1	; D2 = TTPPLLHHDD995511	2,1
+	VLD1.64	{D4}, [r12 at 64], r1	; D4 = UUQQMMIIEEAA6622	2,1
+	VLD1.64	{D6}, [r12 at 64], r1	; D6 = VVRRNNJJFFBB7733	2,1
+	VSUBL.U8	Q8, D4, D2	; Q8 = 22 - 11 in S16s		1,3
+	VSUBL.U8	Q0, D0, D6	; Q0 = 00 - 33 in S16s		1,3
+	; Stall
+	VADD.S16	Q0, Q0, Q8	;				1,3
+	SUB	r12, r0, r1
+	; Stall
+	VADD.S16	Q0, Q0, Q8	;				1,3
+	; Stall x2
+	VADD.S16	Q0, Q0, Q8	; Q0 = [0-3]+3*[2-1]		1,3
+	; Stall x2
+	VRSHR.S16	Q0, Q0, #3	; Q0 = f = ([0-3]+3*[2-1]+4)>>3	1,4
+	; We want to do
+	; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	;   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	; So we've reduced the left and right hand terms to be the same, except
+	; for a negation.
+	; Stall x3
+	VABS.S16	Q9, Q0		; Q9 = |f| in U16s		1,4
+	VSHR.S16	Q0, Q0, #15	; Q0 = -1 or 0 according to sign	1,3
+	; Stall x2
+	VQSUB.U16	Q10,Q15,Q9	; Q10= MAX(2L-|f|,0) in U16s	1,4
+	VMOVL.U8	Q2, D4		; Q2 = __UU__QQ__MM__II__EE__AA__66__22	2,3
+	; Stall x2
+	VMIN.U16	Q9, Q10,Q9	; Q9 = MIN(|f|,MAX(2L-|f|))	1,4
+	; Now we need to correct for the sign of f.
+	; For negative elements of Q0, we want to subtract the appropriate
+	; element of Q9. For positive elements we want to add them. No NEON
+	; instruction exists to do this, so we need to negate the negative
+	; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
+	; Stall x3
+	VADD.S16	Q9, Q9, Q0	;				1,3
+	; Stall x2
+	VEOR.S16	Q9, Q9, Q0	; Q9 = real value of f		1,3
+	; Bah. No VRSBW.U8
+	; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q1, Q9, D2	; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11	1,3
+	VSUB.S16	Q2, Q2, Q9	; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22	1,3
+	VQMOVUN.S16	D2, Q1		; D2 = TTPPLLHHDD995511		1,1
+	VQMOVUN.S16	D4, Q2		; D4 = UUQQMMIIEEAA6622		1,1
+	VST1.64	{D2}, [r12 at 64], r1
+	VST1.64	{D4}, [r12 at 64], r1
+	MOV	PC,r14
+
+oc_loop_filter_frag_rows_neon
+	; NEON version of oc_loop_filter_frag_rows_arm; identical control flow,
+	; but loads the 2*L limit into Q15 once for the whole call.
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	; (r4-r9 are passed on the stack and loaded below from the caller's
+	;  frame via r12.)
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_neon_end; bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_neon_end	;		  bail
+	VLD1.64	{D30,D31}, [r2 at 128]	; Q15= 2L in U16s
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; _fragi_bot -= _nhfrags;
+oslffri_neon_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_neon_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_neon_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_neon	;   filter our left edge
+	CMP	r4, r6		; if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_neon	;   filter our top edge
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_neon	;   then filter the shared edge for it
+	CMP	r10,r7		; if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_neon
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+oslffri_neon_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_neon_uncoded
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+ ]
+
+ END
Copied: experimental/derf/theora-ptalarbvorm/lib/arm/armopts.s.in (from rev 17378, branches/theorarm-merge-branch/lib/arm/ARMoptions.s)
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armopts.s.in (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armopts.s.in 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,39 @@
+;********************************************************************
+;* *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+;* *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;* *
+;********************************************************************
+; Original implementation:
+; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: cpu.h 17344 2010-07-21 01:42:18Z tterribe $
+;********************************************************************
+
+; Set the following to 1 if we have EDSP instructions
+; (LDRD/STRD, etc., ARMv5E and later).
+OC_ARM_ASM_EDSP * @HAVE_ARM_ASM_EDSP@
+
+; Set the following to 1 if we have ARMv6 media instructions.
+OC_ARM_ASM_MEDIA * @HAVE_ARM_ASM_MEDIA@
+
+; Set the following to 1 if we have NEON (some ARMv7)
+OC_ARM_ASM_NEON * @HAVE_ARM_ASM_NEON@
+
+; Set the following to 1 if LDR/STR can work on unaligned addresses
+; This is assumed to be true for ARMv6 and later code
+OC_ARM_CAN_UNALIGN * 0
+
+; Large unaligned loads and stores are often configured to cause an exception.
+; They cause an 8 cycle stall when they cross a 128-bit (load) or 64-bit (store)
+; boundary, so it's usually a bad idea to use them anyway if they can be
+; avoided.
+
+; Set the following to 1 if LDRD/STRD can work on unaligned addresses
+OC_ARM_CAN_UNALIGN_LDRD * 0
+
+ END
Added: experimental/derf/theora-ptalarbvorm/lib/arm/armstate.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armstate.c (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armstate.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,95 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: x86state.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#include "armint.h"
+
+#if defined(OC_ARM_ASM)
+
+# if defined(OC_ARM_ASM_NEON)
+/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
+ the destination.*/
+static const unsigned char OC_FZIG_ZAG_NEON[128]={
+ 0, 8, 1, 2, 9,16,24,17,
+ 10, 3, 4,11,18,25,32,40,
+ 33,26,19,12, 5, 6,13,20,
+ 27,34,41,48,56,49,42,35,
+ 28,21,14, 7,15,22,29,36,
+ 43,50,57,58,51,44,37,30,
+ 23,31,38,45,52,59,60,53,
+ 46,39,47,54,61,62,55,63,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64
+};
+# endif
+
+void oc_state_accel_init_arm(oc_theora_state *_state){
+ oc_state_accel_init_c(_state);
+ _state->cpu_flags=oc_cpu_flags_get();
+# if defined(OC_STATE_USE_VTABLE)
+ _state->opt_vtable.frag_copy_list=oc_frag_copy_list_arm;
+ _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_arm;
+ _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_arm;
+ _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_arm;
+ _state->opt_vtable.idct8x8=oc_idct8x8_arm;
+ /*Note: We _must_ set this function pointer, because the macro in armint.h
+ calls it with different arguments, so the C version will segfault.*/
+ _state->opt_vtable.state_loop_filter_frag_rows=
+ (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_arm;
+# endif
+# if defined(OC_ARM_ASM_EDSP)
+ if(_state->cpu_flags&OC_CPU_ARM_EDSP){
+# if defined(OC_STATE_USE_VTABLE)
+ _state->opt_vtable.frag_copy_list=oc_frag_copy_list_edsp;
+# endif
+ }
+# if defined(OC_ARM_ASM_MEDIA)
+ if(_state->cpu_flags&OC_CPU_ARM_MEDIA){
+# if defined(OC_STATE_USE_VTABLE)
+ _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_v6;
+ _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_v6;
+ _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_v6;
+ _state->opt_vtable.idct8x8=oc_idct8x8_v6;
+ _state->opt_vtable.loop_filter_init=oc_loop_filter_init_v6;
+ _state->opt_vtable.state_loop_filter_frag_rows=
+ (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_v6;
+# endif
+ }
+# if defined(OC_ARM_ASM_NEON)
+ if(_state->cpu_flags&OC_CPU_ARM_NEON){
+# if defined(OC_STATE_USE_VTABLE)
+ _state->opt_vtable.frag_copy_list=oc_frag_copy_list_neon;
+ _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_neon;
+ _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_neon;
+ _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_neon;
+ _state->opt_vtable.loop_filter_init=oc_loop_filter_init_neon;
+ _state->opt_vtable.state_loop_filter_frag_rows=
+ (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_neon;
+ _state->opt_vtable.idct8x8=oc_idct8x8_neon;
+# endif
+ _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_NEON;
+ }
+# endif
+# endif
+# endif
+}
+
+#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/bitpack.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/bitpack.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/bitpack.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -32,15 +32,18 @@
const unsigned char *stop;
oc_pb_window window;
int available;
+ unsigned shift;
stop=_b->stop;
ptr=_b->ptr;
window=_b->window;
available=_b->bits;
- while(available<=OC_PB_WINDOW_SIZE-8&&ptr<stop){
- available+=8;
- window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
+ shift=OC_PB_WINDOW_SIZE-available;
+ while(7<shift&&ptr<stop){
+ shift-=8;
+ window|=(oc_pb_window)*ptr++<<shift;
}
_b->ptr=ptr;
+ available=OC_PB_WINDOW_SIZE-shift;
if(_bits>available){
if(ptr>=stop){
_b->eof=1;
@@ -67,7 +70,7 @@
}
/*Here we assume that 0<=_bits&&_bits<=32.*/
-long oc_pack_read(oc_pack_buf *_b,int _bits){
+long oc_pack_read_c(oc_pack_buf *_b,int _bits){
oc_pb_window window;
int available;
long result;
@@ -87,7 +90,7 @@
return result;
}
-int oc_pack_read1(oc_pack_buf *_b){
+int oc_pack_read1_c(oc_pack_buf *_b){
oc_pb_window window;
int available;
int result;
Modified: experimental/derf/theora-ptalarbvorm/lib/bitpack.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/bitpack.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/bitpack.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -26,6 +26,21 @@
+/*Custom bitpacker implementations.*/
+# if defined(OC_ARM_ASM)
+# include "arm/armbits.h"
+# endif
+
+# if !defined(oc_pack_read)
+# define oc_pack_read oc_pack_read_c
+# endif
+# if !defined(oc_pack_read1)
+# define oc_pack_read1 oc_pack_read1_c
+# endif
+# if !defined(oc_huff_token_decode)
+# define oc_huff_token_decode oc_huff_token_decode_c
+# endif
+
# define OC_PB_WINDOW_SIZE ((int)sizeof(oc_pb_window)*CHAR_BIT)
/*This is meant to be a large, positive constant that can still be efficiently
loaded as an immediate (on platforms like ARM, for example).
@@ -46,8 +61,8 @@
int oc_pack_look1(oc_pack_buf *_b);
void oc_pack_adv1(oc_pack_buf *_b);
/*Here we assume 0<=_bits&&_bits<=32.*/
-long oc_pack_read(oc_pack_buf *_b,int _bits);
-int oc_pack_read1(oc_pack_buf *_b);
+long oc_pack_read_c(oc_pack_buf *_b,int _bits);
+int oc_pack_read1_c(oc_pack_buf *_b);
/* returns -1 for read beyond EOF, or the number of whole bytes available */
long oc_pack_bytes_left(oc_pack_buf *_b);
Modified: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -61,7 +61,7 @@
/*The TI compiler refuses to pipeline this if we put it in an if(coded)
block.
We can do the loads unconditionally, which helps move them earlier.
- We do the store unconditionally too, because if we use a condtional
+ We do the store unconditionally too, because if we use a conditional
store, the compiler propagates the condition back to the operations
the store depended on, presumably to reduce cache pressure by
eliminating dead loads.
Modified: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xfrag.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xfrag.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xfrag.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -43,6 +43,40 @@
#undef OC_ITER
}
+void oc_frag_copy_list_c64x(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+ ptrdiff_t fragii;
+ /*9 cycles per iteration.*/
+ for(fragii=0;fragii<_nfragis;fragii++){
+ const unsigned char *restrict src;
+ const unsigned char *restrict s2;
+ unsigned char *restrict dst;
+ unsigned char *restrict d2;
+ ptrdiff_t frag_buf_off;
+ frag_buf_off=_frag_buf_offs[_fragis[fragii]];
+ dst=_dst_frame+frag_buf_off;
+ src=_src_frame+frag_buf_off;
+ d2=dst+_ystride;
+ s2=src+_ystride;
+#define OC_ITER() \
+ do{ \
+ _amem8(dst)=_amem8_const(src); \
+ dst+=2*_ystride; \
+ src+=2*_ystride; \
+ _amem8(d2)=_amem8_const(s2); \
+ d2+=2*_ystride; \
+ s2+=2*_ystride; \
+ } \
+ while(0)
+ OC_ITER();
+ OC_ITER();
+ OC_ITER();
+ OC_ITER();
+#undef OC_ITER
+ }
+}
+
/*34 cycles.*/
void oc_frag_recon_intra_c64x(unsigned char *_dst,int _ystride,
const ogg_int16_t _residue[64]){
@@ -130,7 +164,7 @@
}
void oc_state_frag_recon_c64x(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
unsigned char *dst;
ptrdiff_t frag_buf_off;
int ystride;
@@ -138,25 +172,28 @@
/*Apply the inverse transform.*/
/*Special case only having a DC component.*/
if(_last_zzi<2){
- ogg_int16_t p;
+ int p;
+ long long ll;
int ci;
/*We round this dequant product (and not any of the others) because there's
no iDCT rounding.*/
- p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
- /*LOOP VECTORIZES.*/
- for(ci=0;ci<64;ci++)_dct_coeffs[ci]=p;
+ p=_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5;
+ ll=_itoll(_pack2(p,p),_pack2(p,p));
+ for(ci=0;ci<64;ci+=4)_amem8(_dct_coeffs+64+ci)=ll;
}
else{
/*First, dequantize the DC coefficient.*/
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
- oc_idct8x8_c64x(_dct_coeffs,_last_zzi);
+ oc_idct8x8_c64x(_dct_coeffs+64,_dct_coeffs,_last_zzi);
}
/*Fill in the target buffer.*/
frag_buf_off=_state->frag_buf_offs[_fragi];
mb_mode=_state->frags[_fragi].mb_mode;
ystride=_state->ref_ystride[_pli];
dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
- if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_c64x(dst,ystride,_dct_coeffs);
+ if(mb_mode==OC_MODE_INTRA){
+ oc_frag_recon_intra_c64x(dst,ystride,_dct_coeffs+64);
+ }
else{
const unsigned char *ref;
int mvoffsets[2];
@@ -166,54 +203,12 @@
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
_state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
oc_frag_recon_inter2_c64x(dst,ref+mvoffsets[0],ref+mvoffsets[1],
- ystride,_dct_coeffs);
+ ystride,_dct_coeffs+64);
}
- else oc_frag_recon_inter_c64x(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
+ else oc_frag_recon_inter_c64x(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
}
}
-void oc_state_frag_copy_list_c64x(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli){
- const ptrdiff_t *frag_buf_offs;
- const unsigned char *src_frame_data;
- unsigned char *dst_frame_data;
- ptrdiff_t fragii;
- int ystride;
- dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
- src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
- ystride=_state->ref_ystride[_pli];
- frag_buf_offs=_state->frag_buf_offs;
- /*9 cycles per iteration.*/
- for(fragii=0;fragii<_nfragis;fragii++){
- const unsigned char *restrict src;
- const unsigned char *restrict s2;
- unsigned char *restrict dst;
- unsigned char *restrict d2;
- ptrdiff_t frag_buf_off;
- frag_buf_off=frag_buf_offs[_fragis[fragii]];
- dst=dst_frame_data+frag_buf_off;
- src=src_frame_data+frag_buf_off;
- d2=dst+ystride;
- s2=src+ystride;
-#define OC_ITER() \
- do{ \
- _amem8(dst)=_amem8_const(src); \
- dst+=2*ystride; \
- src+=2*ystride; \
- _amem8(d2)=_amem8_const(s2); \
- d2+=2*ystride; \
- s2+=2*ystride; \
- } \
- while(0)
- OC_ITER();
- OC_ITER();
- OC_ITER();
- OC_ITER();
-#undef OC_ITER
- }
-}
-
/*46 cycles.*/
static void loop_filter_h(unsigned char *restrict _pix,int _ystride,int _ll){
int p0;
@@ -394,9 +389,16 @@
_amem8(_pix)=_itoll(p6,p2);
}
+void oc_loop_filter_init_c64x(signed char _bv[256],int _flimit){
+ int ll;
+ ll=_flimit<<1;
+ ll=_pack2(ll,ll);
+ ll=~_spacku4(ll,ll);
+ *((int *)_bv)=ll;
+}
-void oc_state_loop_filter_frag_rows_c64x(const oc_theora_state *_state,int *_bv,
- int _refi,int _pli,int _fragy0,int _fragy_end){
+void oc_state_loop_filter_frag_rows_c64x(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
const oc_fragment_plane *fplane;
const oc_fragment *frags;
const ptrdiff_t *frag_buf_offs;
@@ -413,14 +415,12 @@
fragi_top=fplane->froffset;
fragi_bot=fragi_top+fplane->nfrags;
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
- fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+ fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
ystride=_state->ref_ystride[_pli];
frags=_state->frags;
frag_buf_offs=_state->frag_buf_offs;
ref_frame_data=_state->ref_frame_data[_refi];
- ll=_state->loop_filter_limits[_state->qis[0]]<<1;
- ll=_pack2(ll,ll);
- ll=~_spacku4(ll,ll);
+ ll=*((int *)_bv);
/*The following loops are constructed somewhat non-intuitively on purpose.
The main idea is: if a block boundary has at least one coded fragment on
it, the filter is applied to it.
Modified: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xidct.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xidct.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xidct.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -296,8 +296,8 @@
} \
while(0)
-/*179 cycles.*/
-static void oc_idct8x8_slow_c64x(ogg_int16_t _y[64]){
+/*196 cycles.*/
+static void oc_idct8x8_slow_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
ogg_int16_t w[64];
int x0;
int x1;
@@ -318,7 +318,13 @@
int i;
/*Transform rows of x into columns of w.*/
for(i=0;i<8;i+=2){
- OC_IDCT8x2_LOAD8(_y+i*8);
+ OC_IDCT8x2_LOAD8(_x+i*8);
+ if(_x!=_y){
+ _amem8(_x+i*8)=0LL;
+ _amem8(_x+i*8+4)=0LL;
+ _amem8(_x+i*8+8)=0LL;
+ _amem8(_x+i*8+12)=0LL;
+ }
OC_IDCT8x2();
OC_IDCT8x2_STORET(w+i);
}
@@ -330,8 +336,8 @@
}
}
-/*107 cycles.*/
-static void oc_idct8x8_10_c64x(ogg_int16_t _y[64]){
+/*106 cycles.*/
+static void oc_idct8x8_10_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
ogg_int16_t w[64];
int t0;
int t1;
@@ -347,10 +353,16 @@
int x3;
int i;
/*Transform rows of x into columns of w.*/
- OC_IDCT8x2_LOAD4(_y);
+ OC_IDCT8x2_LOAD4(_x);
OC_IDCT8x2_4();
OC_IDCT8x2_STORET(w);
- OC_IDCT8x2_LOAD2(_y+16);
+ OC_IDCT8x2_LOAD2(_x+16);
+ if(_x!=_y){
+ _amem8(_x)=0LL;
+ _amem8(_x+8)=0LL;
+ _amem4(_x+16)=0;
+ _amem4(_x+24)=0;
+ }
OC_IDCT8x2_2();
OC_IDCT8x2_STORET(w+2);
/*Transform rows of w into columns of y.*/
@@ -361,8 +373,13 @@
}
}
-/*88 cycles.*/
-static void oc_idct8x8_3_c64x(ogg_int16_t _y[64]){
+#if 0
+/*This used to compile to something faster (88 cycles), but no longer, and I'm
+ not sure what changed to cause this.
+ In any case, it's barely an advantage over the 10-coefficient version, and is
+ now hardly worth the icache space.*/
+/*95 cycles.*/
+static inline void oc_idct8x8_3_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
ogg_int16_t w[64];
int t0;
int t1;
@@ -377,10 +394,14 @@
int i;
/*Transform rows of x into rows of w.*/
for(i=0;i<2;i+=2){
- OC_IDCT8x2_LOAD2(_y+i*8);
+ OC_IDCT8x2_LOAD2(_x+i*8);
OC_IDCT8x2_2();
OC_IDCT8x2_STORE(w+i*8);
}
+ if(_x!=_y){
+ _amem4(_x)=0;
+ _amem4(_x+8)=0;
+ }
/*Transform columns of w into columns of y.*/
for(i=0;i<8;i+=2){
OC_IDCT8x2_LOAD2T(w+i);
@@ -388,12 +409,13 @@
OC_IDCT8x2_ROUND_STORET(_y+i);
}
}
+#endif
/*Performs an inverse 8x8 Type-II DCT transform.
The input is assumed to be scaled by a factor of 4 relative to orthonormal
version of the transform.*/
-void oc_idct8x8_c64x(ogg_int16_t _y[64],int _last_zzi){
- if(_last_zzi<3)oc_idct8x8_3_c64x(_y);
- else if(_last_zzi<10)oc_idct8x8_10_c64x(_y);
- else oc_idct8x8_slow_c64x(_y);
+void oc_idct8x8_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
+ /*if(_last_zzi<=3)oc_idct8x8_3_c64x(_y,_x);
+ else*/ if(_last_zzi<=10)oc_idct8x8_10_c64x(_y,_x);
+ else oc_idct8x8_slow_c64x(_y,_x);
}
Modified: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xint.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xint.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xint.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -23,16 +23,21 @@
# define oc_state_accel_init oc_state_accel_init_c64x
# define oc_frag_copy(_state,_dst,_src,_ystride) \
oc_frag_copy_c64x(_dst,_src,_ystride)
+# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs) \
+ oc_frag_copy_list_c64x(_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs)
# define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
oc_frag_recon_intra_c64x(_dst,_dst_ystride,_residue)
# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
oc_frag_recon_inter_c64x(_dst,_src,_ystride,_residue)
# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
oc_frag_recon_inter2_c64x(_dst,_src1,_src2,_ystride,_residue)
-# define oc_idct8x8(_state,_y,_last_zzi) \
- oc_idct8x8_c64x(_y,_last_zzi)
+# define oc_idct8x8(_state,_y,_x,_last_zzi) \
+ oc_idct8x8_c64x(_y,_x,_last_zzi)
# define oc_state_frag_recon oc_state_frag_recon_c64x
-# define oc_state_frag_copy_list oc_state_frag_copy_list_c64x
+# define oc_loop_filter_init(_state,_bv,_flimit) \
+ oc_loop_filter_init_c64x(_bv,_flimit)
# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_c64x
# define oc_restore_fpu(_state) do{}while(0)
# endif
@@ -43,19 +48,20 @@
void oc_frag_copy_c64x(unsigned char *_dst,
const unsigned char *_src,int _ystride);
+void oc_frag_copy_list_c64x(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
void oc_frag_recon_intra_c64x(unsigned char *_dst,int _ystride,
const ogg_int16_t *_residue);
void oc_frag_recon_inter_c64x(unsigned char *_dst,
const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
void oc_frag_recon_inter2_c64x(unsigned char *_dst,const unsigned char *_src1,
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
-void oc_idct8x8_c64x(ogg_int16_t _y[64],int _last_zzi);
+void oc_idct8x8_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
void oc_state_frag_recon_c64x(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-void oc_state_frag_copy_list_c64x(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_c64x(signed char _bv[256],int _flimit);
void oc_state_loop_filter_frag_rows_c64x(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xstate.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xstate.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xstate.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -20,7 +20,7 @@
#if defined(OC_C64X_ASM)
void oc_state_accel_init_c64x(oc_theora_state *_state){
- _state->cpu_flags=0;
+ oc_state_accel_init_c(_state);
# if defined(OC_STATE_USE_VTABLE)
_state->opt_vtable.frag_copy=oc_frag_copy_c64x;
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c64x;
@@ -28,12 +28,12 @@
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c64x;
_state->opt_vtable.idct8x8=oc_idct8x8_c64x;
_state->opt_vtable.state_frag_recon=oc_state_frag_recon_c64x;
- _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_c64x;
+ _state->opt_vtable.frag_copy_list=oc_frag_copy_list_c64x;
+ _state->opt_vtable.loop_filter_init=oc_loop_filter_init_c64x;
_state->opt_vtable.state_loop_filter_frag_rows=
oc_state_loop_filter_frag_rows_c64x;
_state->opt_vtable.restore_fpu=oc_restore_fpu_c;
# endif
- _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG;
}
#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/decint.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/decint.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/decint.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -77,7 +77,24 @@
struct oc_dec_pipeline_state{
- int bounding_values[256];
+ /*Decoded DCT coefficients.
+ These are placed here instead of on the stack so that they can persist
+ between blocks, which makes clearing them back to zero much faster when
+ only a few non-zero coefficients were decoded.
+ It requires at least 65 elements because the zig-zag index array uses the
+ 65th element as a dumping ground for out-of-range indices to protect us
+ from buffer overflow.
+ We make it fully twice as large so that the second half can serve as the
+ reconstruction buffer, which saves passing another parameter to all the
+ acceleration functions.
+ It also solves problems with 16-byte alignment for NEON on ARM.
+ gcc (as of 4.2.1) only seems to be able to give stack variables 8-byte
+ alignment, and silently produces incorrect results if you ask for 16.
+ Finally, keeping it off the stack means there's less likely to be a data
+ hazard between the NEON co-processor and the regular ARM core, which avoids
+ unnecessary stalls.*/
+ OC_ALIGN16(ogg_int16_t dct_coeffs[128]);
+ OC_ALIGN16(signed char bounding_values[256]);
ptrdiff_t ti[3][64];
ptrdiff_t ebi[3][64];
ptrdiff_t eob_runs[3][64];
@@ -97,66 +114,66 @@
struct th_dec_ctx{
/*Shared encoder/decoder state.*/
- oc_theora_state state;
+ oc_theora_state state;
/*Whether or not packets are ready to be emitted.
This takes on negative values while there are remaining header packets to
be emitted, reaches 0 when the codec is ready for input, and goes to 1
when a frame has been processed and a data packet is ready.*/
- int packet_state;
+ int packet_state;
/*Buffer in which to assemble packets.*/
- oc_pack_buf opb;
+ oc_pack_buf opb;
/*Huffman decode trees.*/
- ogg_int16_t *huff_tables[TH_NHUFFMAN_TABLES];
+ ogg_int16_t *huff_tables[TH_NHUFFMAN_TABLES];
/*The index of the first token in each plane for each coefficient.*/
- ptrdiff_t ti0[3][64];
+ ptrdiff_t ti0[3][64];
/*The number of outstanding EOB runs at the start of each coefficient in each
plane.*/
- ptrdiff_t eob_runs[3][64];
+ ptrdiff_t eob_runs[3][64];
/*The DCT token lists.*/
- unsigned char *dct_tokens;
+ unsigned char *dct_tokens;
/*The extra bits associated with DCT tokens.*/
- unsigned char *extra_bits;
+ unsigned char *extra_bits;
/*The number of dct tokens unpacked so far.*/
- int dct_tokens_count;
+ int dct_tokens_count;
/*The out-of-loop post-processing level.*/
- int pp_level;
+ int pp_level;
/*The DC scale used for out-of-loop deblocking.*/
- int pp_dc_scale[64];
+ int pp_dc_scale[64];
/*The sharpen modifier used for out-of-loop deringing.*/
- int pp_sharp_mod[64];
+ int pp_sharp_mod[64];
/*The DC quantization index of each block.*/
- unsigned char *dc_qis;
+ unsigned char *dc_qis;
/*The variance of each block.*/
- int *variances;
+ int *variances;
/*The storage for the post-processed frame buffer.*/
- unsigned char *pp_frame_data;
+ unsigned char *pp_frame_data;
 /*Whether or not the post-processed frame buffer has space for chroma.*/
- int pp_frame_state;
+ int pp_frame_state;
/*The buffer used for the post-processed frame.
Note that this is _not_ guaranteed to have the same strides and offsets as
the reference frame buffers.*/
- th_ycbcr_buffer pp_frame_buf;
+ th_ycbcr_buffer pp_frame_buf;
/*The striped decode callback function.*/
- th_stripe_callback stripe_cb;
- oc_dec_pipeline_state pipe;
+ th_stripe_callback stripe_cb;
+ oc_dec_pipeline_state pipe;
# if defined(OC_DEC_USE_VTABLE)
/*Table for decoder acceleration functions.*/
- oc_dec_opt_vtable opt_vtable;
+ oc_dec_opt_vtable opt_vtable;
# endif
# if defined(HAVE_CAIRO)
/*Output metrics for debugging.*/
- int telemetry;
- int telemetry_mbmode;
- int telemetry_mv;
- int telemetry_qi;
- int telemetry_bits;
- int telemetry_frame_bytes;
- int telemetry_coding_bytes;
- int telemetry_mode_bytes;
- int telemetry_mv_bytes;
- int telemetry_qi_bytes;
- int telemetry_dc_bytes;
- unsigned char *telemetry_frame_data;
+ int telemetry;
+ int telemetry_mbmode;
+ int telemetry_mv;
+ int telemetry_qi;
+ int telemetry_bits;
+ int telemetry_frame_bytes;
+ int telemetry_coding_bytes;
+ int telemetry_mode_bytes;
+ int telemetry_mv_bytes;
+ int telemetry_qi_bytes;
+ int telemetry_dc_bytes;
+ unsigned char *telemetry_frame_data;
# endif
};
Modified: experimental/derf/theora-ptalarbvorm/lib/decode.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/decode.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/decode.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -726,11 +726,12 @@
frags=_dec->state.frags;
for(mbi=0;mbi<nmbs;mbi++){
if(mb_modes[mbi]!=OC_MODE_INVALID){
- int bi;
/*Check for a coded luma block in this macro block.*/
- for(bi=0;bi<4&&!frags[mb_maps[mbi][0][bi]].coded;bi++);
- /*We found one, decode a mode.*/
- if(bi<4){
+ if(frags[mb_maps[mbi][0][0]].coded
+ ||frags[mb_maps[mbi][0][1]].coded
+ ||frags[mb_maps[mbi][0][2]].coded
+ ||frags[mb_maps[mbi][0][3]].coded){
+ /*We found one, decode a mode.*/
mb_modes[mbi]=alphabet[oc_huff_token_decode(&_dec->opb,mode_tree)];
}
/*There were none: INTER_NOMV is forced.*/
@@ -1335,9 +1336,11 @@
oc_dec_pipeline_state *_pipe){
const ptrdiff_t *coded_fragis;
const ptrdiff_t *uncoded_fragis;
+ int flimit;
int pli;
int qii;
int qti;
+ int zzi;
/*If chroma is sub-sampled in the vertical direction, we have to decode two
super block rows of Y' for each super block row of Cb and Cr.*/
_pipe->mcu_nvfrags=4<<!(_dec->state.info.pixel_fmt&2);
@@ -1369,8 +1372,9 @@
/*Set the previous DC predictor to 0 for all color planes and frame types.*/
memset(_pipe->pred_last,0,sizeof(_pipe->pred_last));
/*Initialize the bounding value array for the loop filter.*/
- _pipe->loop_filter=!oc_state_loop_filter_init(&_dec->state,
- _pipe->bounding_values);
+ flimit=_dec->state.loop_filter_limits[_dec->state.qis[0]];
+ _pipe->loop_filter=flimit!=0;
+ if(flimit!=0)oc_loop_filter_init(&_dec->state,_pipe->bounding_values,flimit);
/*Initialize any buffers needed for post-processing.
We also save the current post-processing level, to guard against the user
changing it from a callback.*/
@@ -1383,6 +1387,8 @@
_dec->state.ref_frame_bufs[_dec->state.ref_frame_idx[OC_FRAME_SELF]],
sizeof(_dec->pp_frame_buf[0])*3);
}
+ /*Clear down the DCT coefficient buffer for the first block.*/
+ for(zzi=0;zzi<64;zzi++)_pipe->dct_coeffs[zzi]=0;
}
/*Undo the DC prediction in a single plane of an MCU (one or two super block
@@ -1532,16 +1538,11 @@
eob_runs=_pipe->eob_runs[_pli];
for(qti=0;qti<2;qti++)dc_quant[qti]=_pipe->dequant[_pli][0][qti][0];
for(fragii=0;fragii<ncoded_fragis;fragii++){
- /*This array is made one element larger because the zig-zag index array
- uses the final element as a dumping ground for out-of-range indices
- to protect us from buffer overflow.*/
- OC_ALIGN16(ogg_int16_t dct_coeffs[65]);
const ogg_uint16_t *ac_quant;
ptrdiff_t fragi;
int last_zzi;
int zzi;
fragi=coded_fragis[fragii];
- for(zzi=0;zzi<64;zzi++)dct_coeffs[zzi]=0;
qti=frags[fragi].mb_mode!=OC_MODE_INTRA;
ac_quant=_pipe->dequant[_pli][frags[fragi].qii][qti];
/*Decode the AC coefficients.*/
@@ -1578,18 +1579,19 @@
eob_runs[zzi]=eob;
ti[zzi]=lti;
zzi+=rlen;
- dct_coeffs[dct_fzig_zag[zzi]]=(ogg_int16_t)(coeff*(int)ac_quant[zzi]);
+ _pipe->dct_coeffs[dct_fzig_zag[zzi]]=
+ (ogg_int16_t)(coeff*(int)ac_quant[zzi]);
zzi+=!eob;
}
}
/*TODO: zzi should be exactly 64 here.
If it's not, we should report some kind of warning.*/
zzi=OC_MINI(zzi,64);
- dct_coeffs[0]=(ogg_int16_t)frags[fragi].dc;
+ _pipe->dct_coeffs[0]=(ogg_int16_t)frags[fragi].dc;
/*last_zzi is always initialized.
If your compiler thinks otherwise, it is dumb.*/
oc_state_frag_recon(&_dec->state,fragi,_pli,
- dct_coeffs,last_zzi,dc_quant[qti]);
+ _pipe->dct_coeffs,last_zzi,dc_quant[qti]);
}
_pipe->coded_fragis[_pli]+=ncoded_fragis;
/*Right now the reconstructed MCU has only the coded blocks in it.*/
@@ -1603,9 +1605,14 @@
code, and the hard case (high bitrate, high resolution) is handled
correctly.*/
/*Copy the uncoded blocks from the previous reference frame.*/
- _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
- oc_state_frag_copy_list(&_dec->state,_pipe->uncoded_fragis[_pli],
- _pipe->nuncoded_fragis[_pli],OC_FRAME_SELF,OC_FRAME_PREV,_pli);
+ if(_pipe->nuncoded_fragis[_pli]>0){
+ _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
+ oc_frag_copy_list(&_dec->state,
+ _dec->state.ref_frame_data[_dec->state.ref_frame_idx[OC_FRAME_SELF]],
+ _dec->state.ref_frame_data[_dec->state.ref_frame_idx[OC_FRAME_PREV]],
+ _dec->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
+ _pipe->nuncoded_fragis[_pli],_dec->state.frag_buf_offs);
+ }
}
/*Filter a horizontal block edge.*/
Modified: experimental/derf/theora-ptalarbvorm/lib/encint.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/encint.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/encint.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -31,6 +31,9 @@
typedef struct oc_enc_opt_data oc_enc_opt_data;
typedef struct oc_mb_enc_info oc_mb_enc_info;
typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser;
+typedef struct oc_fr_state oc_fr_state;
+typedef struct oc_qii_state oc_qii_state;
+typedef struct oc_enc_pipeline_state oc_enc_pipeline_state;
typedef struct oc_mode_rd oc_mode_rd;
typedef struct oc_iir_filter oc_iir_filter;
typedef struct oc_frame_metrics oc_frame_metrics;
@@ -384,6 +387,90 @@
+/*State to track coded block flags and their bit cost.
+ We use opportunity cost to measure the bits required to code or skip the next
+ block, using the cheaper of the cost to code it fully or partially, so long
+ as both are possible.*/
+struct oc_fr_state{
+ /*The number of bits required for the coded block flags so far this frame.*/
+ ptrdiff_t bits;
+ /*The length of the current run for the partial super block flag, not
+ including the current super block.*/
+ unsigned sb_partial_count:16;
+ /*The length of the current run for the full super block flag, not
+ including the current super block.*/
+ unsigned sb_full_count:16;
+ /*The length of the coded block flag run when the current super block
+ started.*/
+ unsigned b_coded_count_prev:6;
+ /*The coded block flag when the current super block started.*/
+ signed int b_coded_prev:2;
+ /*The length of the current coded block flag run.*/
+ unsigned b_coded_count:6;
+ /*The current coded block flag.*/
+ signed int b_coded:2;
+ /*The number of blocks processed in the current super block.*/
+ unsigned b_count:5;
+ /*Whether or not it is cheaper to code the current super block partially,
+ even if it could still be coded fully.*/
+ unsigned sb_prefer_partial:1;
+ /*Whether the last super block was coded partially.*/
+ signed int sb_partial:2;
+ /*The number of bits required for the flags for the current super block.*/
+ unsigned sb_bits:6;
+ /*Whether the last non-partial super block was coded fully.*/
+ signed int sb_full:2;
+};
+
+
+
+struct oc_qii_state{
+ ptrdiff_t bits;
+ unsigned qi01_count:14;
+ signed int qi01:2;
+ unsigned qi12_count:14;
+ signed int qi12:2;
+};
+
+
+
+/*Temporary encoder state for the analysis pipeline.*/
+struct oc_enc_pipeline_state{
+ /*DCT coefficient storage.
+ This is kept off the stack because a) gcc can't align things on the stack
+ reliably on ARM, and b) it avoids (unintentional) data hazards between
+ ARM and NEON code.*/
+ OC_ALIGN16(ogg_int16_t dct_data[128]);
+ OC_ALIGN16(signed char bounding_values[256]);
+ oc_fr_state fr[3];
+ oc_qii_state qs[3];
+ /*Skip SSD storage for the current MCU in each plane.*/
+ unsigned *skip_ssd[3];
+ /*Coded/uncoded fragment lists for each plane for the current MCU.*/
+ ptrdiff_t *coded_fragis[3];
+ ptrdiff_t *uncoded_fragis[3];
+ ptrdiff_t ncoded_fragis[3];
+ ptrdiff_t nuncoded_fragis[3];
+ /*The starting fragment for the current MCU in each plane.*/
+ ptrdiff_t froffset[3];
+ /*The starting row for the current MCU in each plane.*/
+ int fragy0[3];
+ /*The ending row for the current MCU in each plane.*/
+ int fragy_end[3];
+ /*The starting superblock for the current MCU in each plane.*/
+ unsigned sbi0[3];
+ /*The ending superblock for the current MCU in each plane.*/
+ unsigned sbi_end[3];
+ /*The number of tokens for zzi=1 for each color plane.*/
+ int ndct_tokens1[3];
+ /*The outstanding eob_run count for zzi=1 for each color plane.*/
+ int eob_run1[3];
+ /*Whether or not the loop filter is enabled.*/
+ int loop_filter;
+};
+
+
+
/*Statistics used to estimate R-D cost of a block in a given coding mode.
See modedec.h for more details.*/
struct oc_mode_rd{
@@ -565,6 +652,8 @@
size_t mv_bits[2];
/*The mode scheme chooser for estimating mode coding costs.*/
oc_mode_scheme_chooser chooser;
+ /*Temporary encoder state for the analysis pipeline.*/
+ oc_enc_pipeline_state pipe;
/*The number of vertical super blocks in an MCU.*/
int mcu_nvsbs;
/*The SSD error for skipping each fragment in the current MCU.*/
Modified: experimental/derf/theora-ptalarbvorm/lib/fragment.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/fragment.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/fragment.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -26,6 +26,26 @@
}
}
+/*Copies the fragments specified by the lists of fragment indices from one
+ frame to another.
+ _dst_frame: The reference frame to copy to.
+ _src_frame: The reference frame to copy from.
+ _ystride: The row stride of the reference frames.
+ _fragis: A pointer to a list of fragment indices.
+ _nfragis: The number of fragment indices to copy.
+ _frag_buf_offs: The offsets of fragments in the reference frames.*/
+void oc_frag_copy_list_c(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+ ptrdiff_t fragii;
+ for(fragii=0;fragii<_nfragis;fragii++){
+ ptrdiff_t frag_buf_off;
+ frag_buf_off=_frag_buf_offs[_fragis[fragii]];
+ oc_frag_copy_c(_dst_frame+frag_buf_off,
+ _src_frame+frag_buf_off,_ystride);
+ }
+}
+
void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride,
const ogg_int16_t _residue[64]){
int i;
Modified: experimental/derf/theora-ptalarbvorm/lib/huffdec.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/huffdec.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/huffdec.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -491,22 +491,22 @@
for(;;){
n=_tree[node];
if(n>available){
- int shift;
- shift=OC_PB_WINDOW_SIZE-8-available;
+ unsigned shift;
+ shift=OC_PB_WINDOW_SIZE-available;
do{
/*We don't bother setting eof because we won't check for it after we've
started decoding DCT tokens.*/
if(ptr>=stop){
- available=OC_LOTS_OF_BITS;
+ shift=-OC_LOTS_OF_BITS;
break;
}
- available+=8;
+ shift-=8;
window|=(oc_pb_window)*ptr++<<shift;
- shift-=8;
}
- while(shift>=0);
+ while(shift>=8);
/*Note: We never request more than 24 bits, so there's no need to fill in
the last partial byte here.*/
+ available=OC_PB_WINDOW_SIZE-shift;
}
bits=window>>OC_PB_WINDOW_SIZE-n;
node=_tree[node+1+bits];
Modified: experimental/derf/theora-ptalarbvorm/lib/huffdec.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/huffdec.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/huffdec.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -27,6 +27,6 @@
int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES],
const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]);
void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]);
-int oc_huff_token_decode(oc_pack_buf *_opb,const ogg_int16_t *_node);
+int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_node);
#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/idct.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/idct.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/idct.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -231,18 +231,18 @@
_y: The buffer to store the result in.
This may be the same as _x.
_x: The input coefficients.*/
-static void oc_idct8x8_3(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
- const ogg_int16_t *in;
- ogg_int16_t *end;
- ogg_int16_t *out;
- ogg_int16_t w[64];
+static void oc_idct8x8_3(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+ ogg_int16_t w[64];
+ int i;
/*Transform rows of x into columns of w.*/
idct8_2(w,_x);
idct8_1(w+1,_x+8);
/*Transform rows of w into columns of y.*/
- for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_2(out,in);
+ for(i=0;i<8;i++)idct8_2(_y+i,w+i*8);
/*Adjust for the scale factor.*/
- for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
+ for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
+ /*Clear input data for next block (decoder only).*/
+ if(_x!=_y)_x[0]=_x[1]=_x[8]=0;
}
/*Performs an inverse 8x8 Type-II DCT transform.
@@ -260,20 +260,20 @@
_y: The buffer to store the result in.
This may be the same as _x.
_x: The input coefficients.*/
-static void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
- const ogg_int16_t *in;
- ogg_int16_t *end;
- ogg_int16_t *out;
- ogg_int16_t w[64];
+static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+ ogg_int16_t w[64];
+ int i;
/*Transform rows of x into columns of w.*/
idct8_4(w,_x);
idct8_3(w+1,_x+8);
idct8_2(w+2,_x+16);
idct8_1(w+3,_x+24);
/*Transform rows of w into columns of y.*/
- for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_4(out,in);
+ for(i=0;i<8;i++)idct8_4(_y+i,w+i*8);
/*Adjust for the scale factor.*/
- for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
+ for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
+ /*Clear input data for next block (decoder only).*/
+ if(_x!=_y)_x[0]=_x[1]=_x[2]=_x[3]=_x[8]=_x[9]=_x[10]=_x[16]=_x[17]=_x[24]=0;
}
/*Performs an inverse 8x8 Type-II DCT transform.
@@ -282,23 +282,22 @@
_y: The buffer to store the result in.
This may be the same as _x.
_x: The input coefficients.*/
-static void oc_idct8x8_slow(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
- const ogg_int16_t *in;
- ogg_int16_t *end;
- ogg_int16_t *out;
- ogg_int16_t w[64];
+static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+ ogg_int16_t w[64];
+ int i;
/*Transform rows of x into columns of w.*/
- for(in=_x,out=w,end=out+8;out<end;in+=8,out++)idct8(out,in);
+ for(i=0;i<8;i++)idct8(w+i,_x+i*8);
/*Transform rows of w into columns of y.*/
- for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8(out,in);
+ for(i=0;i<8;i++)idct8(_y+i,w+i*8);
/*Adjust for the scale factor.*/
- for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
+ for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
+ if(_x!=_y)for(i=0;i<64;i++)_x[i]=0;
}
/*Performs an inverse 8x8 Type-II DCT transform.
The input is assumed to be scaled by a factor of 4 relative to orthonormal
version of the transform.*/
-void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi){
+void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
/*_last_zzi is subtly different from an actual count of the number of
coefficients we decoded for this block.
It contains the value of zzi BEFORE the final token in the block was
@@ -324,7 +323,7 @@
gets.
Needless to say we inherited this approach from VP3.*/
/*Then perform the iDCT.*/
- if(_last_zzi<3)oc_idct8x8_3(_y,_y);
- else if(_last_zzi<10)oc_idct8x8_10(_y,_y);
- else oc_idct8x8_slow(_y,_y);
+ if(_last_zzi<=3)oc_idct8x8_3(_y,_x);
+ else if(_last_zzi<=10)oc_idct8x8_10(_y,_x);
+ else oc_idct8x8_slow(_y,_x);
}
Modified: experimental/derf/theora-ptalarbvorm/lib/internal.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/internal.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/internal.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -38,14 +38,21 @@
# endif
# endif
-/*Some assembly constructs require aligned operands.*/
-# if defined(OC_X86_ASM)
+/*Some assembly constructs require aligned operands.
+ The following macros are _only_ intended for structure member declarations.
+ Although they will sometimes work on stack variables, gcc will often silently
+ ignore them.
+ A separate set of macros could be made for manual stack alignment, but we
+ don't actually require it anywhere.*/
+# if defined(OC_X86_ASM)||defined(OC_ARM_ASM)
# if defined(__GNUC__)
# define OC_ALIGN8(expr) expr __attribute__((aligned(8)))
# define OC_ALIGN16(expr) expr __attribute__((aligned(16)))
# elif defined(_MSC_VER)
# define OC_ALIGN8(expr) __declspec (align(8)) expr
# define OC_ALIGN16(expr) __declspec (align(16)) expr
+# else
+# error "Alignment macros required for this platform."
# endif
# endif
# if !defined(OC_ALIGN8)
Modified: experimental/derf/theora-ptalarbvorm/lib/state.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/state.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/state.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -663,12 +663,13 @@
_state->cpu_flags=0;
#if defined(OC_STATE_USE_VTABLE)
_state->opt_vtable.frag_copy=oc_frag_copy_c;
+ _state->opt_vtable.frag_copy_list=oc_frag_copy_list_c;
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c;
_state->opt_vtable.idct8x8=oc_idct8x8_c;
_state->opt_vtable.state_frag_recon=oc_state_frag_recon_c;
- _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_c;
+ _state->opt_vtable.loop_filter_init=oc_loop_filter_init_c;
_state->opt_vtable.state_loop_filter_frag_rows=
oc_state_loop_filter_frag_rows_c;
_state->opt_vtable.restore_fpu=oc_restore_fpu_c;
@@ -930,7 +931,7 @@
}
void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
unsigned char *dst;
ptrdiff_t frag_buf_off;
int ystride;
@@ -944,19 +945,21 @@
no iDCT rounding.*/
p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
/*LOOP VECTORIZES.*/
- for(ci=0;ci<64;ci++)_dct_coeffs[ci]=p;
+ for(ci=0;ci<64;ci++)_dct_coeffs[64+ci]=p;
}
else{
/*First, dequantize the DC coefficient.*/
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
- oc_idct8x8(_state,_dct_coeffs,_last_zzi);
+ oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
}
/*Fill in the target buffer.*/
frag_buf_off=_state->frag_buf_offs[_fragi];
mb_mode=_state->frags[_fragi].mb_mode;
ystride=_state->ref_ystride[_pli];
dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
- if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs);
+ if(mb_mode==OC_MODE_INTRA){
+ oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs+64);
+ }
else{
const unsigned char *ref;
int mvoffsets[2];
@@ -966,40 +969,15 @@
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
_state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
oc_frag_recon_inter2(_state,
- dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,_dct_coeffs);
+ dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,_dct_coeffs+64);
}
- else oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,_dct_coeffs);
+ else{
+ oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+ }
}
}
-/*Copies the fragments specified by the lists of fragment indices from one
- frame to another.
- _fragis: A pointer to a list of fragment indices.
- _nfragis: The number of fragment indices to copy.
- _dst_frame: The reference frame to copy to.
- _src_frame: The reference frame to copy from.
- _pli: The color plane the fragments lie in.*/
-void oc_state_frag_copy_list_c(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli){
- const ptrdiff_t *frag_buf_offs;
- const unsigned char *src_frame_data;
- unsigned char *dst_frame_data;
- ptrdiff_t fragii;
- int ystride;
- dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
- src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
- ystride=_state->ref_ystride[_pli];
- frag_buf_offs=_state->frag_buf_offs;
- for(fragii=0;fragii<_nfragis;fragii++){
- ptrdiff_t frag_buf_off;
- frag_buf_off=frag_buf_offs[_fragis[fragii]];
- oc_frag_copy(_state,dst_frame_data+frag_buf_off,
- src_frame_data+frag_buf_off,ystride);
- }
-}
-
-static void loop_filter_h(unsigned char *_pix,int _ystride,int *_bv){
+static void loop_filter_h(unsigned char *_pix,int _ystride,signed char *_bv){
int y;
_pix-=2;
for(y=0;y<8;y++){
@@ -1015,7 +993,7 @@
}
}
-static void loop_filter_v(unsigned char *_pix,int _ystride,int *_bv){
+static void loop_filter_v(unsigned char *_pix,int _ystride,signed char *_bv){
int x;
_pix-=_ystride*2;
for(x=0;x<8;x++){
@@ -1032,20 +1010,16 @@
/*Initialize the bounding values array used by the loop filter.
_bv: Storage for the array.
- Return: 0 on success, or a non-zero value if no filtering need be applied.*/
-int oc_state_loop_filter_init(oc_theora_state *_state,int _bv[256]){
- int flimit;
+ _flimit: The filter limit as defined in Section 7.10 of the spec.*/
+void oc_loop_filter_init_c(signed char _bv[256],int _flimit){
int i;
- flimit=_state->loop_filter_limits[_state->qis[0]];
- if(flimit==0)return 1;
memset(_bv,0,sizeof(_bv[0])*256);
- for(i=0;i<flimit;i++){
- if(127-i-flimit>=0)_bv[127-i-flimit]=i-flimit;
- _bv[127-i]=-i;
- _bv[127+i]=i;
- if(127+i+flimit<256)_bv[127+i+flimit]=flimit-i;
+ for(i=0;i<_flimit;i++){
+ if(127-i-_flimit>=0)_bv[127-i-_flimit]=(signed char)(i-_flimit);
+ _bv[127-i]=(signed char)(-i);
+ _bv[127+i]=(signed char)(i);
+ if(127+i+_flimit<256)_bv[127+i+_flimit]=(signed char)(_flimit-i);
}
- return 0;
}
/*Apply the loop filter to a given set of fragment rows in the given plane.
@@ -1056,8 +1030,8 @@
_pli: The color plane to filter.
_fragy0: The Y coordinate of the first fragment row to filter.
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
-void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,int *_bv,
- int _refi,int _pli,int _fragy0,int _fragy_end){
+void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,
+ signed char *_bv,int _refi,int _pli,int _fragy0,int _fragy_end){
const oc_fragment_plane *fplane;
const oc_fragment *frags;
const ptrdiff_t *frag_buf_offs;
@@ -1074,7 +1048,7 @@
fragi_top=fplane->froffset;
fragi_bot=fragi_top+fplane->nfrags;
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
- fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+ fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
ystride=_state->ref_ystride[_pli];
frags=_state->frags;
frag_buf_offs=_state->frag_buf_offs;
Modified: experimental/derf/theora-ptalarbvorm/lib/state.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/state.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/state.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -52,6 +52,9 @@
# include "x86/x86int.h"
# endif
# endif
+# if defined(OC_ARM_ASM)
+# include "arm/armint.h"
+# endif
# if defined(OC_C64X_ASM)
# include "c64x/c64xint.h"
# endif
@@ -64,6 +67,12 @@
# define oc_frag_copy(_state,_dst,_src,_ystride) \
((*(_state)->opt_vtable.frag_copy)(_dst,_src,_ystride))
# endif
+# if !defined(oc_frag_copy_list)
+# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs) \
+ ((*(_state)->opt_vtable.frag_copy_list)(_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs))
+# endif
# if !defined(oc_frag_recon_intra)
# define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
((*(_state)->opt_vtable.frag_recon_intra)(_dst,_dst_ystride,_residue))
@@ -78,8 +87,8 @@
_src1,_src2,_ystride,_residue))
# endif
# if !defined(oc_idct8x8)
-# define oc_idct8x8(_state,_y,_last_zzi) \
- ((*(_state)->opt_vtable.idct8x8)(_y,_last_zzi))
+# define oc_idct8x8(_state,_y,_x,_last_zzi) \
+ ((*(_state)->opt_vtable.idct8x8)(_y,_x,_last_zzi))
# endif
# if !defined(oc_state_frag_recon)
# define oc_state_frag_recon(_state,_fragi, \
@@ -87,11 +96,9 @@
((*(_state)->opt_vtable.state_frag_recon)(_state,_fragi, \
_pli,_dct_coeffs,_last_zzi,_dc_quant))
# endif
-# if !defined(oc_state_frag_copy_list)
-# define oc_state_frag_copy_list(_state,_fragis,_nfragis, \
- _dst_frame,_src_frame,_pli) \
- ((*(_state)->opt_vtable.state_frag_copy_list)(_state,_fragis,_nfragis, \
- _dst_frame,_src_frame,_pli))
+# if !defined(oc_loop_filter_init)
+# define oc_loop_filter_init(_state,_bv,_flimit) \
+ ((*(_state)->opt_vtable.loop_filter_init)(_bv,_flimit))
# endif
# if !defined(oc_state_loop_filter_frag_rows)
# define oc_state_loop_filter_frag_rows(_state, \
@@ -108,6 +115,12 @@
# define oc_frag_copy(_state,_dst,_src,_ystride) \
oc_frag_copy_c(_dst,_src,_ystride)
# endif
+# if !defined(oc_frag_copy_list)
+# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs) \
+ oc_frag_copy_list_c(_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs)
+# endif
# if !defined(oc_frag_recon_intra)
# define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
oc_frag_recon_intra_c(_dst,_dst_ystride,_residue)
@@ -121,13 +134,14 @@
oc_frag_recon_inter2_c(_dst,_src1,_src2,_ystride,_residue)
# endif
# if !defined(oc_idct8x8)
-# define oc_idct8x8(_state,_y,_last_zzi) oc_idct8x8_c(_y,_last_zzi)
+# define oc_idct8x8(_state,_y,_x,_last_zzi) oc_idct8x8_c(_y,_x,_last_zzi)
# endif
# if !defined(oc_state_frag_recon)
# define oc_state_frag_recon oc_state_frag_recon_c
# endif
-# if !defined(oc_state_frag_copy_list)
-# define oc_state_frag_copy_list oc_state_frag_copy_list_c
+# if !defined(oc_loop_filter_init)
+# define oc_loop_filter_init(_state,_bv,_flimit) \
+ oc_loop_filter_init_c(_bv,_flimit)
# endif
# if !defined(oc_state_loop_filter_frag_rows)
# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_c
@@ -314,25 +328,28 @@
};
+typedef void (*oc_state_loop_filter_frag_rows_func)(
+ const oc_theora_state *_state,signed char _bv[256],int _refi,int _pli,
+ int _fragy0,int _fragy_end);
/*The shared (encoder and decoder) functions that have accelerated variants.*/
struct oc_base_opt_vtable{
void (*frag_copy)(unsigned char *_dst,
const unsigned char *_src,int _ystride);
+ void (*frag_copy_list)(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
void (*frag_recon_intra)(unsigned char *_dst,int _ystride,
const ogg_int16_t _residue[64]);
void (*frag_recon_inter)(unsigned char *_dst,
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
void (*frag_recon_inter2)(unsigned char *_dst,const unsigned char *_src1,
const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
- void (*idct8x8)(ogg_int16_t _y[64],int _last_zzi);
+ void (*idct8x8)(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
void (*state_frag_recon)(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
- void (*state_frag_copy_list)(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
- void (*state_loop_filter_frag_rows)(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+ void (*loop_filter_init)(signed char _bv[256],int _flimit);
+ oc_state_loop_filter_frag_rows_func state_loop_filter_frag_rows;
void (*restore_fpu)(void);
};
@@ -463,7 +480,7 @@
int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
int _pli,int _dx,int _dy);
-int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv);
+void oc_loop_filter_init_c(signed char _bv[256],int _flimit);
void oc_state_loop_filter(oc_theora_state *_state,int _frame);
# if defined(OC_DUMP_IMAGES)
int oc_state_dump_frame(const oc_theora_state *_state,int _frame,
@@ -473,20 +490,20 @@
/*Default pure-C implementations of shared accelerated functions.*/
void oc_frag_copy_c(unsigned char *_dst,
const unsigned char *_src,int _src_ystride);
+void oc_frag_copy_list_c(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride,
const ogg_int16_t _residue[64]);
void oc_frag_recon_inter_c(unsigned char *_dst,
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
-void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi);
+void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-void oc_state_frag_copy_list_c(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
void oc_restore_fpu_c(void);
/*We need a way to call a few encoder functions without introducing a link-time
Modified: experimental/derf/theora-ptalarbvorm/lib/tokenize.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/tokenize.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/tokenize.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -754,6 +754,8 @@
int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin){
+ /*Note that gcc will not always respect this alignment.
+ In this case it doesn't matter terribly much.*/
OC_ALIGN16(ogg_int16_t coef[64]);
const unsigned char *dct_fzig_zag;
ogg_uint16_t *eob_run;
Deleted: experimental/derf/theora-ptalarbvorm/lib/x86/cpu.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/cpu.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/cpu.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -1,182 +0,0 @@
-/********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
- * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
- * *
- ********************************************************************
-
- CPU capability detection for x86 processors.
- Originally written by Rudolf Marek.
-
- function:
- last mod: $Id$
-
- ********************************************************************/
-
-#include "cpu.h"
-
-#if !defined(OC_X86_ASM)
-ogg_uint32_t oc_cpu_flags_get(void){
- return 0;
-}
-#else
-# if defined(__amd64__)||defined(__x86_64__)
-/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
- compiling with -fPIC.*/
-# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
- __asm__ __volatile__( \
- "cpuid\n\t" \
- :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
- :"a"(_op) \
- :"cc" \
- )
-# else
-/*On x86-32, not so much.*/
-# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
- __asm__ __volatile__( \
- "xchgl %%ebx,%[ebx]\n\t" \
- "cpuid\n\t" \
- "xchgl %%ebx,%[ebx]\n\t" \
- :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
- :"a"(_op) \
- :"cc" \
- )
-# endif
-
-static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
- ogg_uint32_t flags;
- /*If there isn't even MMX, give up.*/
- if(!(_edx&0x00800000))return 0;
- flags=OC_CPU_X86_MMX;
- if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
- if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
- if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
- if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3;
- if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
- if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
- return flags;
-}
-
-static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
- ogg_uint32_t flags;
- /*If there isn't even MMX, give up.*/
- if(!(_edx&0x00800000))return 0;
- flags=OC_CPU_X86_MMX;
- if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
- if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
- if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
- if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A;
- if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5;
- return flags;
-}
-
-ogg_uint32_t oc_cpu_flags_get(void){
- ogg_uint32_t flags;
- ogg_uint32_t eax;
- ogg_uint32_t ebx;
- ogg_uint32_t ecx;
- ogg_uint32_t edx;
-# if !defined(__amd64__)&&!defined(__x86_64__)
- /*Not all x86-32 chips support cpuid, so we have to check.*/
- __asm__ __volatile__(
- "pushfl\n\t"
- "pushfl\n\t"
- "popl %[a]\n\t"
- "movl %[a],%[b]\n\t"
- "xorl $0x200000,%[a]\n\t"
- "pushl %[a]\n\t"
- "popfl\n\t"
- "pushfl\n\t"
- "popl %[a]\n\t"
- "popfl\n\t"
- :[a]"=r"(eax),[b]"=r"(ebx)
- :
- :"cc"
- );
- /*No cpuid.*/
- if(eax==ebx)return 0;
-# endif
- cpuid(0,eax,ebx,ecx,edx);
- /* l e t n I e n i u n e G*/
- if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
- /* 6 8 x M T e n i u n e G*/
- ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
- int family;
- int model;
- /*Intel, Transmeta (tested with Crusoe TM5800):*/
- cpuid(1,eax,ebx,ecx,edx);
- flags=oc_parse_intel_flags(edx,ecx);
- family=(eax>>8)&0xF;
- model=(eax>>4)&0xF;
- /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
- unit, so don't use it.*/
- if(family==6&&(model==9||model==13||model==14)){
- flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
- }
- }
- /* D M A c i t n e h t u A*/
- else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
- /* C S N y b e d o e G*/
- ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
- /*AMD, Geode:*/
- cpuid(0x80000000,eax,ebx,ecx,edx);
- if(eax<0x80000001)flags=0;
- else{
- cpuid(0x80000001,eax,ebx,ecx,edx);
- flags=oc_parse_amd_flags(edx,ecx);
- }
- /*Also check for SSE.*/
- cpuid(1,eax,ebx,ecx,edx);
- flags|=oc_parse_intel_flags(edx,ecx);
- }
- /*Technically some VIA chips can be configured in the BIOS to return any
- string here the user wants.
- There is a special detection method that can be used to identify such
- processors, but in my opinion, if the user really wants to change it, they
- deserve what they get.*/
- /* s l u a H r u a t n e C*/
- else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
- /*VIA:*/
- /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
- chips (thanks to the engineers from Centaur Technology who provided it).
- These chips support Intel-like cpuid info.
- The C3-2 (Nehemiah) cores appear to, as well.*/
- cpuid(1,eax,ebx,ecx,edx);
- flags=oc_parse_intel_flags(edx,ecx);
- if(eax>=0x80000001){
- /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
- We need to check this even if the Intel test succeeds to pick up 3DNow!
- support on these processors.
- Unlike actual AMD processors, we cannot _rely_ on this info, since
- some cores (e.g., the 693 stepping of the Nehemiah) claim to support
- this function, yet return edx=0, despite the Intel test indicating
- MMX support.
- Therefore the features detected here are strictly added to those
- detected by the Intel test.*/
- /*TODO: How about earlier chips?*/
- cpuid(0x80000001,eax,ebx,ecx,edx);
- /*Note: As of the C7, this function returns Intel-style extended feature
- flags, not AMD-style.
- Currently, this only defines bits 11, 20, and 29 (0x20100800), which
- do not conflict with any of the AMD flags we inspect.
- For the remaining bits, Intel tells us, "Do not count on their value",
- but VIA assures us that they will all be zero (at least on the C7 and
- Isaiah chips).
- In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
- (0xC0C00000) for something else, we will have to add code to detect
- the model to decide when it is appropriate to inspect them.*/
- flags|=oc_parse_amd_flags(edx,ecx);
- }
- }
- else{
- /*Implement me.*/
- flags=0;
- }
- return flags;
-}
-#endif
Deleted: experimental/derf/theora-ptalarbvorm/lib/x86/cpu.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/cpu.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/cpu.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -1,36 +0,0 @@
-/********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
- * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
- * *
- ********************************************************************
- function:
- last mod: $Id$
-
- ********************************************************************/
-
-#if !defined(_x86_cpu_H)
-# define _x86_cpu_H (1)
-#include "../internal.h"
-
-#define OC_CPU_X86_MMX (1<<0)
-#define OC_CPU_X86_3DNOW (1<<1)
-#define OC_CPU_X86_3DNOWEXT (1<<2)
-#define OC_CPU_X86_MMXEXT (1<<3)
-#define OC_CPU_X86_SSE (1<<4)
-#define OC_CPU_X86_SSE2 (1<<5)
-#define OC_CPU_X86_PNI (1<<6)
-#define OC_CPU_X86_SSSE3 (1<<7)
-#define OC_CPU_X86_SSE4_1 (1<<8)
-#define OC_CPU_X86_SSE4_2 (1<<9)
-#define OC_CPU_X86_SSE4A (1<<10)
-#define OC_CPU_X86_SSE5 (1<<11)
-
-ogg_uint32_t oc_cpu_flags_get(void);
-
-#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86/mmxfrag.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/mmxfrag.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/mmxfrag.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -22,17 +22,92 @@
The iteration each instruction belongs to is marked in the comments as #i.*/
#include <stddef.h>
#include "x86int.h"
-#include "mmxfrag.h"
#if defined(OC_X86_ASM)
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
between rows.*/
+# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
+ do{ \
+ const unsigned char *src; \
+ unsigned char *dst; \
+ ptrdiff_t ystride3; \
+ src=(_src); \
+ dst=(_dst); \
+ __asm__ __volatile__( \
+ /*src+0*ystride*/ \
+ "movq (%[src]),%%mm0\n\t" \
+ /*src+1*ystride*/ \
+ "movq (%[src],%[ystride]),%%mm1\n\t" \
+ /*ystride3=ystride*3*/ \
+ "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+ /*src+2*ystride*/ \
+ "movq (%[src],%[ystride],2),%%mm2\n\t" \
+ /*src+3*ystride*/ \
+ "movq (%[src],%[ystride3]),%%mm3\n\t" \
+ /*dst+0*ystride*/ \
+ "movq %%mm0,(%[dst])\n\t" \
+ /*dst+1*ystride*/ \
+ "movq %%mm1,(%[dst],%[ystride])\n\t" \
+ /*Pointer to next 4.*/ \
+ "lea (%[src],%[ystride],4),%[src]\n\t" \
+ /*dst+2*ystride*/ \
+ "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+ /*dst+3*ystride*/ \
+ "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+ /*Pointer to next 4.*/ \
+ "lea (%[dst],%[ystride],4),%[dst]\n\t" \
+ /*src+0*ystride*/ \
+ "movq (%[src]),%%mm0\n\t" \
+ /*src+1*ystride*/ \
+ "movq (%[src],%[ystride]),%%mm1\n\t" \
+ /*src+2*ystride*/ \
+ "movq (%[src],%[ystride],2),%%mm2\n\t" \
+ /*src+3*ystride*/ \
+ "movq (%[src],%[ystride3]),%%mm3\n\t" \
+ /*dst+0*ystride*/ \
+ "movq %%mm0,(%[dst])\n\t" \
+ /*dst+1*ystride*/ \
+ "movq %%mm1,(%[dst],%[ystride])\n\t" \
+ /*dst+2*ystride*/ \
+ "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+ /*dst+3*ystride*/ \
+ "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+ :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
+ :[ystride]"r"((ptrdiff_t)(_ystride)) \
+ :"memory" \
+ ); \
+ } \
+ while(0)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+ between rows.*/
void oc_frag_copy_mmx(unsigned char *_dst,
const unsigned char *_src,int _ystride){
OC_FRAG_COPY_MMX(_dst,_src,_ystride);
}
+/*Copies the fragments specified by the lists of fragment indices from one
+ frame to another.
+ _dst_frame: The reference frame to copy to.
+ _src_frame: The reference frame to copy from.
+ _ystride: The row stride of the reference frames.
+ _fragis: A pointer to a list of fragment indices.
+ _nfragis: The number of fragment indices to copy.
+ _frag_buf_offs: The offsets of fragments in the reference frames.*/
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+ ptrdiff_t fragii;
+ for(fragii=0;fragii<_nfragis;fragii++){
+ ptrdiff_t frag_buf_off;
+ frag_buf_off=_frag_buf_offs[_fragis[fragii]];
+ OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
+ _src_frame+frag_buf_off,_ystride);
+ }
+}
+
+
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
const ogg_int16_t *_residue){
__asm__ __volatile__(
Deleted: experimental/derf/theora-ptalarbvorm/lib/x86/mmxfrag.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/mmxfrag.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/mmxfrag.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -1,64 +0,0 @@
-#if !defined(_x86_mmxfrag_H)
-# define _x86_mmxfrag_H (1)
-# include <stddef.h>
-# include "x86int.h"
-
-#if defined(OC_X86_ASM)
-
-/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
- between rows.*/
-#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
- do{ \
- const unsigned char *src; \
- unsigned char *dst; \
- ptrdiff_t ystride3; \
- src=(_src); \
- dst=(_dst); \
- __asm__ __volatile__( \
- /*src+0*ystride*/ \
- "movq (%[src]),%%mm0\n\t" \
- /*src+1*ystride*/ \
- "movq (%[src],%[ystride]),%%mm1\n\t" \
- /*ystride3=ystride*3*/ \
- "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
- /*src+2*ystride*/ \
- "movq (%[src],%[ystride],2),%%mm2\n\t" \
- /*src+3*ystride*/ \
- "movq (%[src],%[ystride3]),%%mm3\n\t" \
- /*dst+0*ystride*/ \
- "movq %%mm0,(%[dst])\n\t" \
- /*dst+1*ystride*/ \
- "movq %%mm1,(%[dst],%[ystride])\n\t" \
- /*Pointer to next 4.*/ \
- "lea (%[src],%[ystride],4),%[src]\n\t" \
- /*dst+2*ystride*/ \
- "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
- /*dst+3*ystride*/ \
- "movq %%mm3,(%[dst],%[ystride3])\n\t" \
- /*Pointer to next 4.*/ \
- "lea (%[dst],%[ystride],4),%[dst]\n\t" \
- /*src+0*ystride*/ \
- "movq (%[src]),%%mm0\n\t" \
- /*src+1*ystride*/ \
- "movq (%[src],%[ystride]),%%mm1\n\t" \
- /*src+2*ystride*/ \
- "movq (%[src],%[ystride],2),%%mm2\n\t" \
- /*src+3*ystride*/ \
- "movq (%[src],%[ystride3]),%%mm3\n\t" \
- /*dst+0*ystride*/ \
- "movq %%mm0,(%[dst])\n\t" \
- /*dst+1*ystride*/ \
- "movq %%mm1,(%[dst],%[ystride])\n\t" \
- /*dst+2*ystride*/ \
- "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
- /*dst+3*ystride*/ \
- "movq %%mm3,(%[dst],%[ystride3])\n\t" \
- :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
- :[ystride]"r"((ptrdiff_t)(_ystride)) \
- :"memory" \
- ); \
- } \
- while(0)
-
-# endif
-#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86/mmxidct.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/mmxidct.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/mmxidct.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -31,65 +31,65 @@
/*38 cycles*/
-#define OC_IDCT_BEGIN \
+#define OC_IDCT_BEGIN(_y,_x) \
"#OC_IDCT_BEGIN\n\t" \
- "movq "OC_I(3)",%%mm2\n\t" \
- "movq 0x30(%[c]),%%mm6\n\t" \
+ "movq "OC_I(3,_x)",%%mm2\n\t" \
+ "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
"movq %%mm2,%%mm4\n\t" \
- "movq "OC_J(5)",%%mm7\n\t" \
+ "movq "OC_J(5,_x)",%%mm7\n\t" \
"pmulhw %%mm6,%%mm4\n\t" \
- "movq 0x50(%[c]),%%mm1\n\t" \
+ "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
"pmulhw %%mm7,%%mm6\n\t" \
"movq %%mm1,%%mm5\n\t" \
"pmulhw %%mm2,%%mm1\n\t" \
- "movq "OC_I(1)",%%mm3\n\t" \
+ "movq "OC_I(1,_x)",%%mm3\n\t" \
"pmulhw %%mm7,%%mm5\n\t" \
- "movq 0x10(%[c]),%%mm0\n\t" \
+ "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
"paddw %%mm2,%%mm4\n\t" \
"paddw %%mm7,%%mm6\n\t" \
"paddw %%mm1,%%mm2\n\t" \
- "movq "OC_J(7)",%%mm1\n\t" \
+ "movq "OC_J(7,_x)",%%mm1\n\t" \
"paddw %%mm5,%%mm7\n\t" \
"movq %%mm0,%%mm5\n\t" \
"pmulhw %%mm3,%%mm0\n\t" \
"paddw %%mm7,%%mm4\n\t" \
"pmulhw %%mm1,%%mm5\n\t" \
- "movq 0x70(%[c]),%%mm7\n\t" \
+ "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \
"psubw %%mm2,%%mm6\n\t" \
"paddw %%mm3,%%mm0\n\t" \
"pmulhw %%mm7,%%mm3\n\t" \
- "movq "OC_I(2)",%%mm2\n\t" \
+ "movq "OC_I(2,_x)",%%mm2\n\t" \
"pmulhw %%mm1,%%mm7\n\t" \
"paddw %%mm1,%%mm5\n\t" \
"movq %%mm2,%%mm1\n\t" \
- "pmulhw 0x20(%[c]),%%mm2\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \
"psubw %%mm5,%%mm3\n\t" \
- "movq "OC_J(6)",%%mm5\n\t" \
+ "movq "OC_J(6,_x)",%%mm5\n\t" \
"paddw %%mm7,%%mm0\n\t" \
"movq %%mm5,%%mm7\n\t" \
"psubw %%mm4,%%mm0\n\t" \
- "pmulhw 0x20(%[c]),%%mm5\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
"paddw %%mm1,%%mm2\n\t" \
- "pmulhw 0x60(%[c]),%%mm1\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
"paddw %%mm4,%%mm4\n\t" \
"paddw %%mm0,%%mm4\n\t" \
"psubw %%mm6,%%mm3\n\t" \
"paddw %%mm7,%%mm5\n\t" \
"paddw %%mm6,%%mm6\n\t" \
- "pmulhw 0x60(%[c]),%%mm7\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
"paddw %%mm3,%%mm6\n\t" \
- "movq %%mm4,"OC_I(1)"\n\t" \
+ "movq %%mm4,"OC_I(1,_y)"\n\t" \
"psubw %%mm5,%%mm1\n\t" \
- "movq 0x40(%[c]),%%mm4\n\t" \
+ "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
"movq %%mm3,%%mm5\n\t" \
"pmulhw %%mm4,%%mm3\n\t" \
"paddw %%mm2,%%mm7\n\t" \
- "movq %%mm6,"OC_I(2)"\n\t" \
+ "movq %%mm6,"OC_I(2,_y)"\n\t" \
"movq %%mm0,%%mm2\n\t" \
- "movq "OC_I(0)",%%mm6\n\t" \
+ "movq "OC_I(0,_x)",%%mm6\n\t" \
"pmulhw %%mm4,%%mm0\n\t" \
"paddw %%mm3,%%mm5\n\t" \
- "movq "OC_J(4)",%%mm3\n\t" \
+ "movq "OC_J(4,_x)",%%mm3\n\t" \
"psubw %%mm1,%%mm5\n\t" \
"paddw %%mm0,%%mm2\n\t" \
"psubw %%mm3,%%mm6\n\t" \
@@ -103,18 +103,18 @@
"paddw %%mm0,%%mm6\n\t" \
"psubw %%mm2,%%mm6\n\t" \
"paddw %%mm2,%%mm2\n\t" \
- "movq "OC_I(1)",%%mm0\n\t" \
+ "movq "OC_I(1,_y)",%%mm0\n\t" \
"paddw %%mm6,%%mm2\n\t" \
"paddw %%mm3,%%mm4\n\t" \
"psubw %%mm1,%%mm2\n\t" \
"#end OC_IDCT_BEGIN\n\t" \
/*38+8=46 cycles.*/
-#define OC_ROW_IDCT \
+#define OC_ROW_IDCT(_y,_x) \
"#OC_ROW_IDCT\n" \
- OC_IDCT_BEGIN \
+ OC_IDCT_BEGIN(_y,_x) \
/*r3=D'*/ \
- "movq "OC_I(2)",%%mm3\n\t" \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
/*r4=E'=E-G*/ \
"psubw %%mm7,%%mm4\n\t" \
/*r1=H'+H'*/ \
@@ -139,7 +139,7 @@
"psubw %%mm0,%%mm7\n\t" \
"paddw %%mm0,%%mm0\n\t" \
/*Save R1.*/ \
- "movq %%mm1,"OC_I(1)"\n\t" \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
/*r0=R0=G.+C.*/ \
"paddw %%mm7,%%mm0\n\t" \
"#end OC_ROW_IDCT\n\t" \
@@ -172,11 +172,11 @@
Since r1 is free at entry, we calculate the Js first.*/
/*19 cycles.*/
-#define OC_TRANSPOSE \
+#define OC_TRANSPOSE(_y) \
"#OC_TRANSPOSE\n\t" \
"movq %%mm4,%%mm1\n\t" \
"punpcklwd %%mm5,%%mm4\n\t" \
- "movq %%mm0,"OC_I(0)"\n\t" \
+ "movq %%mm0,"OC_I(0,_y)"\n\t" \
"punpckhwd %%mm5,%%mm1\n\t" \
"movq %%mm6,%%mm0\n\t" \
"punpcklwd %%mm7,%%mm6\n\t" \
@@ -184,17 +184,17 @@
"punpckldq %%mm6,%%mm4\n\t" \
"punpckhdq %%mm6,%%mm5\n\t" \
"movq %%mm1,%%mm6\n\t" \
- "movq %%mm4,"OC_J(4)"\n\t" \
+ "movq %%mm4,"OC_J(4,_y)"\n\t" \
"punpckhwd %%mm7,%%mm0\n\t" \
- "movq %%mm5,"OC_J(5)"\n\t" \
+ "movq %%mm5,"OC_J(5,_y)"\n\t" \
"punpckhdq %%mm0,%%mm6\n\t" \
- "movq "OC_I(0)",%%mm4\n\t" \
+ "movq "OC_I(0,_y)",%%mm4\n\t" \
"punpckldq %%mm0,%%mm1\n\t" \
- "movq "OC_I(1)",%%mm5\n\t" \
+ "movq "OC_I(1,_y)",%%mm5\n\t" \
"movq %%mm4,%%mm0\n\t" \
- "movq %%mm6,"OC_J(7)"\n\t" \
+ "movq %%mm6,"OC_J(7,_y)"\n\t" \
"punpcklwd %%mm5,%%mm0\n\t" \
- "movq %%mm1,"OC_J(6)"\n\t" \
+ "movq %%mm1,"OC_J(6,_y)"\n\t" \
"punpckhwd %%mm5,%%mm4\n\t" \
"movq %%mm2,%%mm5\n\t" \
"punpcklwd %%mm3,%%mm2\n\t" \
@@ -202,20 +202,20 @@
"punpckldq %%mm2,%%mm0\n\t" \
"punpckhdq %%mm2,%%mm1\n\t" \
"movq %%mm4,%%mm2\n\t" \
- "movq %%mm0,"OC_I(0)"\n\t" \
+ "movq %%mm0,"OC_I(0,_y)"\n\t" \
"punpckhwd %%mm3,%%mm5\n\t" \
- "movq %%mm1,"OC_I(1)"\n\t" \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
"punpckhdq %%mm5,%%mm4\n\t" \
"punpckldq %%mm5,%%mm2\n\t" \
- "movq %%mm4,"OC_I(3)"\n\t" \
- "movq %%mm2,"OC_I(2)"\n\t" \
+ "movq %%mm4,"OC_I(3,_y)"\n\t" \
+ "movq %%mm2,"OC_I(2,_y)"\n\t" \
"#end OC_TRANSPOSE\n\t" \
/*38+19=57 cycles.*/
-#define OC_COLUMN_IDCT \
+#define OC_COLUMN_IDCT(_y) \
"#OC_COLUMN_IDCT\n" \
- OC_IDCT_BEGIN \
- "paddw 0x00(%[c]),%%mm2\n\t" \
+ OC_IDCT_BEGIN(_y,_y) \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
/*r1=H'+H'*/ \
"paddw %%mm1,%%mm1\n\t" \
/*r1=R1=A''+H'*/ \
@@ -227,18 +227,18 @@
/*r1=NR1*/ \
"psraw $4,%%mm1\n\t" \
/*r3=D'*/ \
- "movq "OC_I(2)",%%mm3\n\t" \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
/*r7=G+G*/ \
"paddw %%mm7,%%mm7\n\t" \
/*Store NR2 at I(2).*/ \
- "movq %%mm2,"OC_I(2)"\n\t" \
+ "movq %%mm2,"OC_I(2,_y)"\n\t" \
/*r7=G'=E+G*/ \
"paddw %%mm4,%%mm7\n\t" \
/*Store NR1 at I(1).*/ \
- "movq %%mm1,"OC_I(1)"\n\t" \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
/*r4=R4=E'-D'*/ \
"psubw %%mm3,%%mm4\n\t" \
- "paddw 0x00(%[c]),%%mm4\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
/*r3=D'+D'*/ \
"paddw %%mm3,%%mm3\n\t" \
/*r3=R3=E'+D'*/ \
@@ -249,7 +249,7 @@
"psubw %%mm5,%%mm6\n\t" \
/*r3=NR3*/ \
"psraw $4,%%mm3\n\t" \
- "paddw 0x00(%[c]),%%mm6\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
/*r5=B''+B''*/ \
"paddw %%mm5,%%mm5\n\t" \
/*r5=R5=F'+B''*/ \
@@ -257,14 +257,14 @@
/*r6=NR6*/ \
"psraw $4,%%mm6\n\t" \
/*Store NR4 at J(4).*/ \
- "movq %%mm4,"OC_J(4)"\n\t" \
+ "movq %%mm4,"OC_J(4,_y)"\n\t" \
/*r5=NR5*/ \
"psraw $4,%%mm5\n\t" \
/*Store NR3 at I(3).*/ \
- "movq %%mm3,"OC_I(3)"\n\t" \
+ "movq %%mm3,"OC_I(3,_y)"\n\t" \
/*r7=R7=G'-C'*/ \
"psubw %%mm0,%%mm7\n\t" \
- "paddw 0x00(%[c]),%%mm7\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
/*r0=C'+C'*/ \
"paddw %%mm0,%%mm0\n\t" \
/*r0=R0=G'+C'*/ \
@@ -272,113 +272,123 @@
/*r7=NR7*/ \
"psraw $4,%%mm7\n\t" \
/*Store NR6 at J(6).*/ \
- "movq %%mm6,"OC_J(6)"\n\t" \
+ "movq %%mm6,"OC_J(6,_y)"\n\t" \
/*r0=NR0*/ \
"psraw $4,%%mm0\n\t" \
/*Store NR5 at J(5).*/ \
- "movq %%mm5,"OC_J(5)"\n\t" \
+ "movq %%mm5,"OC_J(5,_y)"\n\t" \
/*Store NR7 at J(7).*/ \
- "movq %%mm7,"OC_J(7)"\n\t" \
+ "movq %%mm7,"OC_J(7,_y)"\n\t" \
/*Store NR0 at I(0).*/ \
- "movq %%mm0,"OC_I(0)"\n\t" \
+ "movq %%mm0,"OC_I(0,_y)"\n\t" \
"#end OC_COLUMN_IDCT\n\t" \
-#define OC_MID(_m,_i) OC_M2STR(_m+(_i)*8)"(%[c])"
-#define OC_C(_i) OC_MID(OC_COSINE_OFFSET,_i-1)
-#define OC_8 OC_MID(OC_EIGHT_OFFSET,0)
-
-static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64]){
+static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
/*This routine accepts an 8x8 matrix, but in partially transposed form.
Every 4x4 block is transposed.*/
__asm__ __volatile__(
-#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])"
- OC_ROW_IDCT
- OC_TRANSPOSE
+#define OC_I(_k,_y) OC_MEM_OFFS((_k*16),_y)
+#define OC_J(_k,_y) OC_MEM_OFFS(((_k-4)*16)+8,_y)
+ OC_ROW_IDCT(y,x)
+ OC_TRANSPOSE(y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) OC_M2STR((_k*16)+64)"(%[y])"
-#define OC_J(_k) OC_M2STR(((_k-4)*16)+72)"(%[y])"
- OC_ROW_IDCT
- OC_TRANSPOSE
+#define OC_I(_k,_y) OC_MEM_OFFS((_k*16)+64,_y)
+#define OC_J(_k,_y) OC_MEM_OFFS(((_k-4)*16)+72,_y)
+ OC_ROW_IDCT(y,x)
+ OC_TRANSPOSE(y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k) OC_I(_k)
- OC_COLUMN_IDCT
+#define OC_I(_k,_y) OC_MEM_OFFS((_k*16),_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT(y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])"
-#define OC_J(_k) OC_I(_k)
- OC_COLUMN_IDCT
+#define OC_I(_k,_y) OC_MEM_OFFS((_k*16)+8,_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT(y)
#undef OC_I
#undef OC_J
- :
- :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
+ :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
+ :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
+ [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
);
+ if(_x!=_y){
+ int i;
+ __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
+ for(i=0;i<4;i++){
+ __asm__ __volatile__(
+ "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
+ :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
+ );
+ }
+ }
}
/*25 cycles.*/
-#define OC_IDCT_BEGIN_10 \
+#define OC_IDCT_BEGIN_10(_y,_x) \
"#OC_IDCT_BEGIN_10\n\t" \
- "movq "OC_I(3)",%%mm2\n\t" \
+ "movq "OC_I(3,_x)",%%mm2\n\t" \
"nop\n\t" \
- "movq 0x30(%[c]),%%mm6\n\t" \
+ "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
"movq %%mm2,%%mm4\n\t" \
- "movq 0x50(%[c]),%%mm1\n\t" \
+ "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
"pmulhw %%mm6,%%mm4\n\t" \
- "movq "OC_I(1)",%%mm3\n\t" \
+ "movq "OC_I(1,_x)",%%mm3\n\t" \
"pmulhw %%mm2,%%mm1\n\t" \
- "movq 0x10(%[c]),%%mm0\n\t" \
+ "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
"paddw %%mm2,%%mm4\n\t" \
"pxor %%mm6,%%mm6\n\t" \
"paddw %%mm1,%%mm2\n\t" \
- "movq "OC_I(2)",%%mm5\n\t" \
+ "movq "OC_I(2,_x)",%%mm5\n\t" \
"pmulhw %%mm3,%%mm0\n\t" \
"movq %%mm5,%%mm1\n\t" \
"paddw %%mm3,%%mm0\n\t" \
- "pmulhw 0x70(%[c]),%%mm3\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
"psubw %%mm2,%%mm6\n\t" \
- "pmulhw 0x20(%[c]),%%mm5\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
"psubw %%mm4,%%mm0\n\t" \
- "movq "OC_I(2)",%%mm7\n\t" \
+ "movq "OC_I(2,_x)",%%mm7\n\t" \
"paddw %%mm4,%%mm4\n\t" \
"paddw %%mm5,%%mm7\n\t" \
"paddw %%mm0,%%mm4\n\t" \
- "pmulhw 0x60(%[c]),%%mm1\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
"psubw %%mm6,%%mm3\n\t" \
- "movq %%mm4,"OC_I(1)"\n\t" \
+ "movq %%mm4,"OC_I(1,_y)"\n\t" \
"paddw %%mm6,%%mm6\n\t" \
- "movq 0x40(%[c]),%%mm4\n\t" \
+ "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
"paddw %%mm3,%%mm6\n\t" \
"movq %%mm3,%%mm5\n\t" \
"pmulhw %%mm4,%%mm3\n\t" \
- "movq %%mm6,"OC_I(2)"\n\t" \
+ "movq %%mm6,"OC_I(2,_y)"\n\t" \
"movq %%mm0,%%mm2\n\t" \
- "movq "OC_I(0)",%%mm6\n\t" \
+ "movq "OC_I(0,_x)",%%mm6\n\t" \
"pmulhw %%mm4,%%mm0\n\t" \
"paddw %%mm3,%%mm5\n\t" \
"paddw %%mm0,%%mm2\n\t" \
"psubw %%mm1,%%mm5\n\t" \
"pmulhw %%mm4,%%mm6\n\t" \
- "paddw "OC_I(0)",%%mm6\n\t" \
+ "paddw "OC_I(0,_x)",%%mm6\n\t" \
"paddw %%mm1,%%mm1\n\t" \
"movq %%mm6,%%mm4\n\t" \
"paddw %%mm5,%%mm1\n\t" \
"psubw %%mm2,%%mm6\n\t" \
"paddw %%mm2,%%mm2\n\t" \
- "movq "OC_I(1)",%%mm0\n\t" \
+ "movq "OC_I(1,_y)",%%mm0\n\t" \
"paddw %%mm6,%%mm2\n\t" \
"psubw %%mm1,%%mm2\n\t" \
"nop\n\t" \
"#end OC_IDCT_BEGIN_10\n\t" \
/*25+8=33 cycles.*/
-#define OC_ROW_IDCT_10 \
+#define OC_ROW_IDCT_10(_y,_x) \
"#OC_ROW_IDCT_10\n\t" \
- OC_IDCT_BEGIN_10 \
+ OC_IDCT_BEGIN_10(_y,_x) \
/*r3=D'*/ \
- "movq "OC_I(2)",%%mm3\n\t" \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
/*r4=E'=E-G*/ \
"psubw %%mm7,%%mm4\n\t" \
/*r1=H'+H'*/ \
@@ -403,16 +413,16 @@
"psubw %%mm0,%%mm7\n\t" \
"paddw %%mm0,%%mm0\n\t" \
/*Save R1.*/ \
- "movq %%mm1,"OC_I(1)"\n\t" \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
/*r0=R0=G'+C'*/ \
"paddw %%mm7,%%mm0\n\t" \
"#end OC_ROW_IDCT_10\n\t" \
/*25+19=44 cycles'*/
-#define OC_COLUMN_IDCT_10 \
+#define OC_COLUMN_IDCT_10(_y) \
"#OC_COLUMN_IDCT_10\n\t" \
- OC_IDCT_BEGIN_10 \
- "paddw 0x00(%[c]),%%mm2\n\t" \
+ OC_IDCT_BEGIN_10(_y,_y) \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
/*r1=H'+H'*/ \
"paddw %%mm1,%%mm1\n\t" \
/*r1=R1=A''+H'*/ \
@@ -424,18 +434,18 @@
/*r1=NR1*/ \
"psraw $4,%%mm1\n\t" \
/*r3=D'*/ \
- "movq "OC_I(2)",%%mm3\n\t" \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
/*r7=G+G*/ \
"paddw %%mm7,%%mm7\n\t" \
/*Store NR2 at I(2).*/ \
- "movq %%mm2,"OC_I(2)"\n\t" \
+ "movq %%mm2,"OC_I(2,_y)"\n\t" \
/*r7=G'=E+G*/ \
"paddw %%mm4,%%mm7\n\t" \
/*Store NR1 at I(1).*/ \
- "movq %%mm1,"OC_I(1)"\n\t" \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
/*r4=R4=E'-D'*/ \
"psubw %%mm3,%%mm4\n\t" \
- "paddw 0x00(%[c]),%%mm4\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
/*r3=D'+D'*/ \
"paddw %%mm3,%%mm3\n\t" \
/*r3=R3=E'+D'*/ \
@@ -446,7 +456,7 @@
"psubw %%mm5,%%mm6\n\t" \
/*r3=NR3*/ \
"psraw $4,%%mm3\n\t" \
- "paddw 0x00(%[c]),%%mm6\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
/*r5=B''+B''*/ \
"paddw %%mm5,%%mm5\n\t" \
/*r5=R5=F'+B''*/ \
@@ -454,14 +464,14 @@
/*r6=NR6*/ \
"psraw $4,%%mm6\n\t" \
/*Store NR4 at J(4).*/ \
- "movq %%mm4,"OC_J(4)"\n\t" \
+ "movq %%mm4,"OC_J(4,_y)"\n\t" \
/*r5=NR5*/ \
"psraw $4,%%mm5\n\t" \
/*Store NR3 at I(3).*/ \
- "movq %%mm3,"OC_I(3)"\n\t" \
+ "movq %%mm3,"OC_I(3,_y)"\n\t" \
/*r7=R7=G'-C'*/ \
"psubw %%mm0,%%mm7\n\t" \
- "paddw 0x00(%[c]),%%mm7\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
/*r0=C'+C'*/ \
"paddw %%mm0,%%mm0\n\t" \
/*r0=R0=G'+C'*/ \
@@ -469,46 +479,57 @@
/*r7=NR7*/ \
"psraw $4,%%mm7\n\t" \
/*Store NR6 at J(6).*/ \
- "movq %%mm6,"OC_J(6)"\n\t" \
+ "movq %%mm6,"OC_J(6,_y)"\n\t" \
/*r0=NR0*/ \
"psraw $4,%%mm0\n\t" \
/*Store NR5 at J(5).*/ \
- "movq %%mm5,"OC_J(5)"\n\t" \
+ "movq %%mm5,"OC_J(5,_y)"\n\t" \
/*Store NR7 at J(7).*/ \
- "movq %%mm7,"OC_J(7)"\n\t" \
+ "movq %%mm7,"OC_J(7,_y)"\n\t" \
/*Store NR0 at I(0).*/ \
- "movq %%mm0,"OC_I(0)"\n\t" \
+ "movq %%mm0,"OC_I(0,_y)"\n\t" \
"#end OC_COLUMN_IDCT_10\n\t" \
-static void oc_idct8x8_10_mmx(ogg_int16_t _y[64]){
+static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
__asm__ __volatile__(
-#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])"
+#define OC_I(_k,_y) OC_MEM_OFFS((_k*16),_y)
+#define OC_J(_k,_y) OC_MEM_OFFS(((_k-4)*16)+8,_y)
/*Done with dequant, descramble, and partial transpose.
Now do the iDCT itself.*/
- OC_ROW_IDCT_10
- OC_TRANSPOSE
+ OC_ROW_IDCT_10(y,x)
+ OC_TRANSPOSE(y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k) OC_I(_k)
- OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) OC_MEM_OFFS((_k*16),_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT_10(y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])"
-#define OC_J(_k) OC_I(_k)
- OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) OC_MEM_OFFS((_k*16)+8,_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT_10(y)
#undef OC_I
#undef OC_J
- :
- :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
+ :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
+ :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
+ [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
);
+ if(_x!=_y){
+ __asm__ __volatile__(
+ "pxor %%mm0,%%mm0\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+ :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
+ );
+ }
}
/*Performs an inverse 8x8 Type-II DCT transform.
The input is assumed to be scaled by a factor of 4 relative to orthonormal
version of the transform.*/
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
/*_last_zzi is subtly different from an actual count of the number of
coefficients we decoded for this block.
It contains the value of zzi BEFORE the final token in the block was
@@ -534,8 +555,8 @@
gets.
Needless to say we inherited this approach from VP3.*/
/*Then perform the iDCT.*/
- if(_last_zzi<10)oc_idct8x8_10_mmx(_y);
- else oc_idct8x8_slow_mmx(_y);
+ if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x);
+ else oc_idct8x8_slow_mmx(_y,_x);
}
#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86/mmxstate.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/mmxstate.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/mmxstate.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -19,13 +19,12 @@
Originally written by Rudolf Marek.*/
#include <string.h>
#include "x86int.h"
-#include "mmxfrag.h"
#include "mmxloop.h"
#if defined(OC_X86_ASM)
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
unsigned char *dst;
ptrdiff_t frag_buf_off;
int ystride;
@@ -36,6 +35,7 @@
/*Note that this value must be unsigned, to keep the __asm__ block from
sign-extending it when it puts it in a register.*/
ogg_uint16_t p;
+ int i;
/*We round this dequant product (and not any of the others) because there's
no iDCT rounding.*/
p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
@@ -47,38 +47,30 @@
"punpcklwd %%mm0,%%mm0\n\t"
/*mm0=AAAA AAAA AAAA AAAA*/
"punpckldq %%mm0,%%mm0\n\t"
- "movq %%mm0,(%[y])\n\t"
- "movq %%mm0,8(%[y])\n\t"
- "movq %%mm0,16(%[y])\n\t"
- "movq %%mm0,24(%[y])\n\t"
- "movq %%mm0,32(%[y])\n\t"
- "movq %%mm0,40(%[y])\n\t"
- "movq %%mm0,48(%[y])\n\t"
- "movq %%mm0,56(%[y])\n\t"
- "movq %%mm0,64(%[y])\n\t"
- "movq %%mm0,72(%[y])\n\t"
- "movq %%mm0,80(%[y])\n\t"
- "movq %%mm0,88(%[y])\n\t"
- "movq %%mm0,96(%[y])\n\t"
- "movq %%mm0,104(%[y])\n\t"
- "movq %%mm0,112(%[y])\n\t"
- "movq %%mm0,120(%[y])\n\t"
:
- :[y]"r"(_dct_coeffs),[p]"r"((unsigned)p)
- :"memory"
+ :[p]"r"((unsigned)p)
);
+ for(i=0;i<4;i++){
+ __asm__ __volatile__(
+ "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t"
+ :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16)
+ );
+ }
}
else{
/*Dequantize the DC coefficient.*/
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
- oc_idct8x8(_state,_dct_coeffs,_last_zzi);
+ oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
}
/*Fill in the target buffer.*/
frag_buf_off=_state->frag_buf_offs[_fragi];
mb_mode=_state->frags[_fragi].mb_mode;
ystride=_state->ref_ystride[_pli];
dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
- if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
+ if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
else{
const unsigned char *ref;
int mvoffsets[2];
@@ -88,40 +80,17 @@
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
_state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
- _dct_coeffs);
+ _dct_coeffs+64);
}
- else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
+ else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
}
}
/*We copy these entire function to inline the actual MMX routines so that we
use only a single indirect call.*/
-/*Copies the fragments specified by the lists of fragment indices from one
- frame to another.
- _fragis: A pointer to a list of fragment indices.
- _nfragis: The number of fragment indices to copy.
- _dst_frame: The reference frame to copy to.
- _src_frame: The reference frame to copy from.
- _pli: The color plane the fragments lie in.*/
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli){
- const ptrdiff_t *frag_buf_offs;
- const unsigned char *src_frame_data;
- unsigned char *dst_frame_data;
- ptrdiff_t fragii;
- int ystride;
- dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
- src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
- ystride=_state->ref_ystride[_pli];
- frag_buf_offs=_state->frag_buf_offs;
- for(fragii=0;fragii<_nfragis;fragii++){
- ptrdiff_t frag_buf_off;
- frag_buf_off=frag_buf_offs[_fragis[fragii]];
- OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off,
- src_frame_data+frag_buf_off,ystride);
- }
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
+ memset(_bv,_flimit,8);
}
/*Apply the loop filter to a given set of fragment rows in the given plane.
@@ -133,7 +102,7 @@
_fragy0: The Y coordinate of the first fragment row to filter.
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
OC_ALIGN8(unsigned char ll[8]);
const oc_fragment_plane *fplane;
const oc_fragment *frags;
@@ -189,6 +158,10 @@
}
}
+void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){
+ memset(_bv,~(_flimit<<1),8);
+}
+
/*Apply the loop filter to a given set of fragment rows in the given plane.
The filter may be run on the bottom edge, affecting pixels in the next row of
fragments, so this row also needs to be available.
@@ -198,8 +171,7 @@
_fragy0: The Y coordinate of the first fragment row to filter.
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
- OC_ALIGN8(unsigned char ll[8]);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
const oc_fragment_plane *fplane;
const oc_fragment *frags;
const ptrdiff_t *frag_buf_offs;
@@ -210,13 +182,12 @@
ptrdiff_t fragi0_end;
int ystride;
int nhfrags;
- memset(ll,~(_state->loop_filter_limits[_state->qis[0]]<<1),sizeof(ll));
fplane=_state->fplanes+_pli;
nhfrags=fplane->nhfrags;
fragi_top=fplane->froffset;
fragi_bot=fragi_top+fplane->nfrags;
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
- fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+ fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
ystride=_state->ref_ystride[_pli];
frags=_state->frags;
frag_buf_offs=_state->frag_buf_offs;
@@ -236,16 +207,16 @@
unsigned char *ref;
ref=ref_frame_data+frag_buf_offs[fragi];
if(fragi>fragi0){
- OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,ll);
+ OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
}
if(fragi0>fragi_top){
- OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,ll);
+ OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
}
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
- OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,ll);
+ OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv);
}
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
- OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,ll);
+ OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,_bv);
}
}
fragi++;
Modified: experimental/derf/theora-ptalarbvorm/lib/x86/sse2idct.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/sse2idct.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/sse2idct.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -38,16 +38,16 @@
/*Performs the first three stages of the iDCT.
xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input
(accessed in that order).
- The remaining rows must be in %[y] at their corresponding locations.
+ The remaining rows must be in _x at their corresponding locations.
On output, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
contain rows 4 through 7.*/
-#define OC_IDCT_8x8_ABC \
+#define OC_IDCT_8x8_ABC(_x) \
"#OC_IDCT_8x8_ABC\n\t" \
/*Stage 1:*/ \
/*2-3 rotation by 6pi/16. \
xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
- "movdqa 0x20(%[c]),%%xmm1\n\t" \
- "movdqa 0x60(%[c]),%%xmm4\n\t" \
+ "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \
+ "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \
"movdqa %%xmm1,%%xmm0\n\t" \
"pmulhw %%xmm2,%%xmm1\n\t" \
"movdqa %%xmm4,%%xmm7\n\t" \
@@ -55,12 +55,12 @@
"pmulhw %%xmm2,%%xmm7\n\t" \
"pmulhw %%xmm6,%%xmm4\n\t" \
"paddw %%xmm6,%%xmm0\n\t" \
- "movdqa 0x30(%[c]),%%xmm6\n\t" \
+ "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \
"paddw %%xmm1,%%xmm2\n\t" \
"psubw %%xmm0,%%xmm7\n\t" \
"movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
"paddw %%xmm4,%%xmm2\n\t" \
- "movdqa 0x50(%[c]),%%xmm4\n\t" \
+ "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
/*5-6 rotation by 3pi/16. \
xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
@@ -73,13 +73,13 @@
"paddw %%xmm3,%%xmm4\n\t" \
"paddw %%xmm5,%%xmm3\n\t" \
"paddw %%xmm6,%%xmm3\n\t" \
- "movdqa 0x70(%[y]),%%xmm6\n\t" \
+ "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \
"paddw %%xmm5,%%xmm1\n\t" \
- "movdqa 0x10(%[y]),%%xmm5\n\t" \
+ "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \
"paddw %%xmm3,%%xmm2\n\t" \
- "movdqa 0x70(%[c]),%%xmm3\n\t" \
+ "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
"psubw %%xmm4,%%xmm1\n\t" \
- "movdqa 0x10(%[c]),%%xmm4\n\t" \
+ "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \
/*4-7 rotation by 7pi/16. \
xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
"movdqa %%xmm3,%%xmm0\n\t" \
@@ -89,12 +89,12 @@
"pmulhw %%xmm6,%%xmm4\n\t" \
"pmulhw %%xmm6,%%xmm0\n\t" \
"paddw %%xmm6,%%xmm4\n\t" \
- "movdqa 0x40(%[y]),%%xmm6\n\t" \
+ "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \
"paddw %%xmm5,%%xmm7\n\t" \
"psubw %%xmm4,%%xmm3\n\t" \
- "movdqa 0x40(%[c]),%%xmm4\n\t" \
+ "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
"paddw %%xmm7,%%xmm0\n\t" \
- "movdqa 0x00(%[y]),%%xmm7\n\t" \
+ "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \
/*0-1 butterfly. \
xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
"paddw %%xmm7,%%xmm6\n\t" \
@@ -172,15 +172,15 @@
2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
"psubw %%xmm3,%%xmm4\n\t" \
- "movdqa %%xmm4,0x40(%[y])\n\t" \
- "movdqa 0x00(%[c]),%%xmm4\n\t" \
+ "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
+ "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \
"psubw %%xmm0,%%xmm7\n\t" \
"psubw %%xmm1,%%xmm6\n\t" \
"psubw %%xmm2,%%xmm5\n\t" \
"paddw %%xmm4,%%xmm7\n\t" \
"paddw %%xmm4,%%xmm6\n\t" \
"paddw %%xmm4,%%xmm5\n\t" \
- "paddw 0x40(%[y]),%%xmm4\n\t" \
+ "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \
"paddw %%xmm0,%%xmm0\n\t" \
"paddw %%xmm1,%%xmm1\n\t" \
"paddw %%xmm2,%%xmm2\n\t" \
@@ -189,45 +189,61 @@
"paddw %%xmm6,%%xmm1\n\t" \
"psraw $4,%%xmm0\n\t" \
"paddw %%xmm5,%%xmm2\n\t" \
- "movdqa %%xmm0,0x00(%[y])\n\t" \
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \
"psraw $4,%%xmm1\n\t" \
"paddw %%xmm4,%%xmm3\n\t" \
- "movdqa %%xmm1,0x10(%[y])\n\t" \
+ "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \
"psraw $4,%%xmm2\n\t" \
- "movdqa %%xmm2,0x20(%[y])\n\t" \
+ "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \
"psraw $4,%%xmm3\n\t" \
- "movdqa %%xmm3,0x30(%[y])\n\t" \
+ "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \
"psraw $4,%%xmm4\n\t" \
- "movdqa %%xmm4,0x40(%[y])\n\t" \
+ "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
"psraw $4,%%xmm5\n\t" \
- "movdqa %%xmm5,0x50(%[y])\n\t" \
+ "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \
"psraw $4,%%xmm6\n\t" \
- "movdqa %%xmm6,0x60(%[y])\n\t" \
+ "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \
"psraw $4,%%xmm7\n\t" \
- "movdqa %%xmm7,0x70(%[y])\n\t" \
+ "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
-static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64]){
+static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
OC_ALIGN16(ogg_int16_t buf[16]);
/*This routine accepts an 8x8 matrix pre-transposed.*/
__asm__ __volatile__(
/*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
- "movdqa 0x20(%[y]),%%xmm2\n\t"
- "movdqa 0x60(%[y]),%%xmm6\n\t"
- "movdqa 0x30(%[y]),%%xmm3\n\t"
- "movdqa 0x50(%[y]),%%xmm5\n\t"
- OC_IDCT_8x8_ABC
+ "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t"
+ "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t"
+ "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t"
+ "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t"
+ OC_IDCT_8x8_ABC(x)
OC_IDCT_8x8_D
OC_TRANSPOSE_8x8
/*Clear out rows 0, 1, 4, and 7 for the first stage of the iDCT.*/
- "movdqa %%xmm7,0x70(%[y])\n\t"
- "movdqa %%xmm4,0x40(%[y])\n\t"
- "movdqa %%xmm1,0x10(%[y])\n\t"
- "movdqa %%xmm0,0x00(%[y])\n\t"
- OC_IDCT_8x8_ABC
+ "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t"
+ "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t"
+ "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t"
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t"
+ OC_IDCT_8x8_ABC(y)
OC_IDCT_8x8_D_STORE
- :[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16))
- :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
+ :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
+ [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
+ :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
+ [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
);
+ if(_x!=_y){
+ int i;
+ __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::);
+ /*Clear input data for next block (decoder only).*/
+ for(i=0;i<2;i++){
+ __asm__ __volatile__(
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+ :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
+ );
+ }
+ }
}
/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
@@ -238,28 +254,28 @@
/*Stage 1:*/ \
/*2-3 rotation by 6pi/16. \
mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
- "movq 0x60(%[c]),%%mm7\n\t" \
- "movq 0x20(%[c]),%%mm6\n\t" \
+ "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
+ "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \
"pmulhw %%mm2,%%mm6\n\t" \
"pmulhw %%mm2,%%mm7\n\t" \
- "movq 0x50(%[c]),%%mm5\n\t" \
+ "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \
"paddw %%mm6,%%mm2\n\t" \
"movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
- "movq 0x30(%[c]),%%mm2\n\t" \
+ "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \
"movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
/*5-6 rotation by 3pi/16. \
mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
"pmulhw %%mm3,%%mm5\n\t" \
"pmulhw %%mm3,%%mm2\n\t" \
- "movq 0x10(%[c]),%%mm7\n\t" \
+ "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \
"paddw %%mm3,%%mm5\n\t" \
"paddw %%mm3,%%mm2\n\t" \
- "movq 0x70(%[c]),%%mm3\n\t" \
+ "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
/*4-7 rotation by 7pi/16. \
mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
"pmulhw %%mm1,%%mm3\n\t" \
"pmulhw %%mm1,%%mm7\n\t" \
- "movq 0x40(%[c]),%%mm4\n\t" \
+ "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
"movq %%mm3,%%mm6\n\t" \
"paddw %%mm1,%%mm7\n\t" \
/*0-1 butterfly. \
@@ -319,28 +335,28 @@
/*Stage 1:*/ \
/*2-3 rotation by 6pi/16. \
xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
- "movdqa 0x60(%[c]),%%xmm7\n\t" \
- "movdqa 0x20(%[c]),%%xmm6\n\t" \
+ "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \
+ "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \
"pmulhw %%xmm2,%%xmm6\n\t" \
"pmulhw %%xmm2,%%xmm7\n\t" \
- "movdqa 0x50(%[c]),%%xmm5\n\t" \
+ "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \
"paddw %%xmm6,%%xmm2\n\t" \
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
- "movdqa 0x30(%[c]),%%xmm2\n\t" \
+ "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \
"movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
/*5-6 rotation by 3pi/16. \
xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
"pmulhw %%xmm3,%%xmm5\n\t" \
"pmulhw %%xmm3,%%xmm2\n\t" \
- "movdqa 0x10(%[c]),%%xmm7\n\t" \
+ "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \
"paddw %%xmm3,%%xmm5\n\t" \
"paddw %%xmm3,%%xmm2\n\t" \
- "movdqa 0x70(%[c]),%%xmm3\n\t" \
+ "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
/*4-7 rotation by 7pi/16. \
xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
"pmulhw %%xmm1,%%xmm3\n\t" \
"pmulhw %%xmm1,%%xmm7\n\t" \
- "movdqa 0x40(%[c]),%%xmm4\n\t" \
+ "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
"movdqa %%xmm3,%%xmm6\n\t" \
"paddw %%xmm1,%%xmm7\n\t" \
/*0-1 butterfly. \
@@ -378,27 +394,40 @@
"psubw %%xmm7,%%xmm4\n\t" \
"psubw %%xmm6,%%xmm5\n\t" \
-static void oc_idct8x8_10_sse2(ogg_int16_t _y[64]){
+static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
OC_ALIGN16(ogg_int16_t buf[16]);
/*This routine accepts an 8x8 matrix pre-transposed.*/
__asm__ __volatile__(
- "movq 0x20(%[y]),%%mm2\n\t"
- "movq 0x30(%[y]),%%mm3\n\t"
- "movq 0x10(%[y]),%%mm1\n\t"
- "movq 0x00(%[y]),%%mm0\n\t"
+ "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t"
+ "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t"
+ "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t"
+ "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t"
OC_IDCT_8x8_10_MMX
OC_TRANSPOSE_8x4_MMX2SSE
OC_IDCT_8x8_10_ABC
OC_IDCT_8x8_D_STORE
- :[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16))
- :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
+ :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
+ [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
+ :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
+ [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
);
+ if(_x!=_y){
+ /*Clear input data for next block (decoder only).*/
+ __asm__ __volatile__(
+ "pxor %%mm0,%%mm0\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+ :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
+ );
+ }
}
/*Performs an inverse 8x8 Type-II DCT transform.
The input is assumed to be scaled by a factor of 4 relative to orthonormal
version of the transform.*/
-void oc_idct8x8_sse2(ogg_int16_t _y[64],int _last_zzi){
+void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
/*_last_zzi is subtly different from an actual count of the number of
coefficients we decoded for this block.
It contains the value of zzi BEFORE the final token in the block was
@@ -424,8 +453,8 @@
gets.
Needless to say we inherited this approach from VP3.*/
/*Then perform the iDCT.*/
- if(_last_zzi<10)oc_idct8x8_10_sse2(_y);
- else oc_idct8x8_slow_sse2(_y);
+ if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x);
+ else oc_idct8x8_slow_sse2(_y,_x);
}
#endif
Copied: experimental/derf/theora-ptalarbvorm/lib/x86/x86cpu.c (from rev 17375, experimental/derf/theora-ptalarbvorm/lib/x86/cpu.c)
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86cpu.c (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86cpu.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,182 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ CPU capability detection for x86 processors.
+ Originally written by Rudolf Marek.
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#include "x86cpu.h"
+
+#if !defined(OC_X86_ASM)
+ogg_uint32_t oc_cpu_flags_get(void){
+ return 0;
+}
+#else
+# if defined(__amd64__)||defined(__x86_64__)
+/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
+ compiling with -fPIC.*/
+# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+ __asm__ __volatile__( \
+ "cpuid\n\t" \
+ :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+ :"a"(_op) \
+ :"cc" \
+ )
+# else
+/*On x86-32, not so much.*/
+# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+ __asm__ __volatile__( \
+ "xchgl %%ebx,%[ebx]\n\t" \
+ "cpuid\n\t" \
+ "xchgl %%ebx,%[ebx]\n\t" \
+ :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+ :"a"(_op) \
+ :"cc" \
+ )
+# endif
+
+static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+ ogg_uint32_t flags;
+ /*If there isn't even MMX, give up.*/
+ if(!(_edx&0x00800000))return 0;
+ flags=OC_CPU_X86_MMX;
+ if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
+ if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
+ if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
+ if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3;
+ if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
+ if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
+ return flags;
+}
+
+static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+ ogg_uint32_t flags;
+ /*If there isn't even MMX, give up.*/
+ if(!(_edx&0x00800000))return 0;
+ flags=OC_CPU_X86_MMX;
+ if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
+ if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
+ if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
+ if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A;
+ if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5;
+ return flags;
+}
+
+ogg_uint32_t oc_cpu_flags_get(void){
+ ogg_uint32_t flags;
+ ogg_uint32_t eax;
+ ogg_uint32_t ebx;
+ ogg_uint32_t ecx;
+ ogg_uint32_t edx;
+# if !defined(__amd64__)&&!defined(__x86_64__)
+ /*Not all x86-32 chips support cpuid, so we have to check.*/
+ __asm__ __volatile__(
+ "pushfl\n\t"
+ "pushfl\n\t"
+ "popl %[a]\n\t"
+ "movl %[a],%[b]\n\t"
+ "xorl $0x200000,%[a]\n\t"
+ "pushl %[a]\n\t"
+ "popfl\n\t"
+ "pushfl\n\t"
+ "popl %[a]\n\t"
+ "popfl\n\t"
+ :[a]"=r"(eax),[b]"=r"(ebx)
+ :
+ :"cc"
+ );
+ /*No cpuid.*/
+ if(eax==ebx)return 0;
+# endif
+ cpuid(0,eax,ebx,ecx,edx);
+ /* l e t n I e n i u n e G*/
+ if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
+ /* 6 8 x M T e n i u n e G*/
+ ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
+ int family;
+ int model;
+ /*Intel, Transmeta (tested with Crusoe TM5800):*/
+ cpuid(1,eax,ebx,ecx,edx);
+ flags=oc_parse_intel_flags(edx,ecx);
+ family=(eax>>8)&0xF;
+ model=(eax>>4)&0xF;
+ /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
+ unit, so don't use it.*/
+ if(family==6&&(model==9||model==13||model==14)){
+ flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
+ }
+ }
+ /* D M A c i t n e h t u A*/
+ else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
+ /* C S N y b e d o e G*/
+ ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
+ /*AMD, Geode:*/
+ cpuid(0x80000000,eax,ebx,ecx,edx);
+ if(eax<0x80000001)flags=0;
+ else{
+ cpuid(0x80000001,eax,ebx,ecx,edx);
+ flags=oc_parse_amd_flags(edx,ecx);
+ }
+ /*Also check for SSE.*/
+ cpuid(1,eax,ebx,ecx,edx);
+ flags|=oc_parse_intel_flags(edx,ecx);
+ }
+ /*Technically some VIA chips can be configured in the BIOS to return any
+ string here the user wants.
+ There is a special detection method that can be used to identify such
+ processors, but in my opinion, if the user really wants to change it, they
+ deserve what they get.*/
+ /* s l u a H r u a t n e C*/
+ else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
+ /*VIA:*/
+ /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
+ chips (thanks to the engineers from Centaur Technology who provided it).
+ These chips support Intel-like cpuid info.
+ The C3-2 (Nehemiah) cores appear to, as well.*/
+ cpuid(1,eax,ebx,ecx,edx);
+ flags=oc_parse_intel_flags(edx,ecx);
+ if(eax>=0x80000001){
+ /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
+ We need to check this even if the Intel test succeeds to pick up 3DNow!
+ support on these processors.
+ Unlike actual AMD processors, we cannot _rely_ on this info, since
+ some cores (e.g., the 693 stepping of the Nehemiah) claim to support
+ this function, yet return edx=0, despite the Intel test indicating
+ MMX support.
+ Therefore the features detected here are strictly added to those
+ detected by the Intel test.*/
+ /*TODO: How about earlier chips?*/
+ cpuid(0x80000001,eax,ebx,ecx,edx);
+ /*Note: As of the C7, this function returns Intel-style extended feature
+ flags, not AMD-style.
+ Currently, this only defines bits 11, 20, and 29 (0x20100800), which
+ do not conflict with any of the AMD flags we inspect.
+ For the remaining bits, Intel tells us, "Do not count on their value",
+ but VIA assures us that they will all be zero (at least on the C7 and
+ Isaiah chips).
+ In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
+ (0xC0C00000) for something else, we will have to add code to detect
+ the model to decide when it is appropriate to inspect them.*/
+ flags|=oc_parse_amd_flags(edx,ecx);
+ }
+ }
+ else{
+ /*Implement me.*/
+ flags=0;
+ }
+ return flags;
+}
+#endif
Copied: experimental/derf/theora-ptalarbvorm/lib/x86/x86cpu.h (from rev 17375, experimental/derf/theora-ptalarbvorm/lib/x86/cpu.h)
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86cpu.h (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86cpu.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,36 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#if !defined(_x86_x86cpu_H)
+# define _x86_x86cpu_H (1)
+#include "../internal.h"
+
+#define OC_CPU_X86_MMX (1<<0)
+#define OC_CPU_X86_3DNOW (1<<1)
+#define OC_CPU_X86_3DNOWEXT (1<<2)
+#define OC_CPU_X86_MMXEXT (1<<3)
+#define OC_CPU_X86_SSE (1<<4)
+#define OC_CPU_X86_SSE2 (1<<5)
+#define OC_CPU_X86_PNI (1<<6)
+#define OC_CPU_X86_SSSE3 (1<<7)
+#define OC_CPU_X86_SSE4_1 (1<<8)
+#define OC_CPU_X86_SSE4_2 (1<<9)
+#define OC_CPU_X86_SSE4A (1<<10)
+#define OC_CPU_X86_SSE5 (1<<11)
+
+ogg_uint32_t oc_cpu_flags_get(void);
+
+#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -28,16 +28,21 @@
call.*/
# define oc_frag_copy(_state,_dst,_src,_ystride) \
oc_frag_copy_mmx(_dst,_src,_ystride)
+# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs) \
+ oc_frag_copy_list_mmx(_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs)
# define oc_frag_recon_intra(_state,_dst,_ystride,_residue) \
oc_frag_recon_intra_mmx(_dst,_ystride,_residue)
# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue)
# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
oc_frag_recon_inter2_mmx(_dst,_src1,_src2,_ystride,_residue)
-# define oc_idct8x8(_state,_y,_last_zzi) \
- oc_idct8x8_sse2(_y,_last_zzi)
+# define oc_idct8x8(_state,_y,_x,_last_zzi) \
+ oc_idct8x8_sse2(_y,_x,_last_zzi)
# define oc_state_frag_recon oc_state_frag_recon_mmx
-# define oc_state_frag_copy_list oc_state_frag_copy_list_mmx
+# define oc_loop_filter_init(_state,_bv,_flimit) \
+ oc_loop_filter_init_mmxext(_bv,_flimit)
# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_mmxext
# define oc_restore_fpu(_state) \
oc_restore_fpu_mmx()
@@ -47,7 +52,7 @@
# endif
# include "../state.h"
-# include "cpu.h"
+# include "x86cpu.h"
/*Converts the expression in the argument to a string.*/
#define OC_M2STR(_s) #_s
@@ -74,7 +79,7 @@
stack pointer, without allocating a separate register to point to them.*/
#define OC_ARRAY_OPERAND(_type,_ptr,_size) \
(*({ \
- struct{_type array_value__[_size];} *array_addr__=(void *)_ptr; \
+ struct{_type array_value__[(_size)];} *array_addr__=(void *)(_ptr); \
array_addr__; \
}))
@@ -84,8 +89,8 @@
stack pointer, without allocating a separate register to point to them.*/
#define OC_CONST_ARRAY_OPERAND(_type,_ptr,_size) \
(*({ \
- const struct{_type array_value__[_size];} *array_addr__= \
- (const void *)_ptr; \
+ const struct{_type array_value__[(_size)];} *array_addr__= \
+ (const void *)(_ptr); \
array_addr__; \
}))
@@ -95,23 +100,25 @@
void oc_frag_copy_mmx(unsigned char *_dst,
const unsigned char *_src,int _ystride);
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
const ogg_int16_t *_residue);
void oc_frag_recon_inter_mmx(unsigned char *_dst,
const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
-void oc_idct8x8_sse2(ogg_int16_t _y[64],int _last_zzi);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
+void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit);
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
void oc_restore_fpu_mmx(void);
#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86/x86state.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86state.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86state.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -67,18 +67,20 @@
# if defined(OC_STATE_USE_VTABLE)
if(_state->cpu_flags&OC_CPU_X86_MMX){
_state->opt_vtable.frag_copy=oc_frag_copy_mmx;
+ _state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
_state->opt_vtable.idct8x8=oc_idct8x8_mmx;
_state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
- _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
+ _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
_state->opt_vtable.state_loop_filter_frag_rows=
oc_state_loop_filter_frag_rows_mmx;
_state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
_state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
}
if(_state->cpu_flags&OC_CPU_X86_MMXEXT){
+ _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmxext;
_state->opt_vtable.state_loop_filter_frag_rows=
oc_state_loop_filter_frag_rows_mmxext;
}
Deleted: experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -1,192 +0,0 @@
-/********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
- * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
- * *
- ********************************************************************
-
- CPU capability detection for x86 processors.
- Originally written by Rudolf Marek.
-
- function:
- last mod: $Id$
-
- ********************************************************************/
-
-#include "cpu.h"
-
-#if !defined(OC_X86_ASM)
-ogg_uint32_t oc_cpu_flags_get(void){
- return 0;
-}
-#else
-/*Why does MSVC need this complicated rigamarole?
- At this point I honestly do not care.*/
-
-/*Visual C cpuid helper function.
- For VS2005 we could as well use the _cpuid builtin, but that wouldn't work
- for VS2003 users, so we do it in inline assembler.*/
-static void oc_cpuid_helper(ogg_uint32_t _cpu_info[4],ogg_uint32_t _op){
- _asm{
- mov eax,[_op]
- mov esi,_cpu_info
- cpuid
- mov [esi+0],eax
- mov [esi+4],ebx
- mov [esi+8],ecx
- mov [esi+12],edx
- }
-}
-
-# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
- do{ \
- ogg_uint32_t cpu_info[4]; \
- oc_cpuid_helper(cpu_info,_op); \
- (_eax)=cpu_info[0]; \
- (_ebx)=cpu_info[1]; \
- (_ecx)=cpu_info[2]; \
- (_edx)=cpu_info[3]; \
- }while(0)
-
-static void oc_detect_cpuid_helper(ogg_uint32_t *_eax,ogg_uint32_t *_ebx){
- _asm{
- pushfd
- pushfd
- pop eax
- mov ebx,eax
- xor eax,200000h
- push eax
- popfd
- pushfd
- pop eax
- popfd
- mov ecx,_eax
- mov [ecx],eax
- mov ecx,_ebx
- mov [ecx],ebx
- }
-}
-
-static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
- ogg_uint32_t flags;
- /*If there isn't even MMX, give up.*/
- if(!(_edx&0x00800000))return 0;
- flags=OC_CPU_X86_MMX;
- if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
- if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
- if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
- if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3;
- if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
- if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
- return flags;
-}
-
-static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
- ogg_uint32_t flags;
- /*If there isn't even MMX, give up.*/
- if(!(_edx&0x00800000))return 0;
- flags=OC_CPU_X86_MMX;
- if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
- if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
- if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
- if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A;
- if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5;
- return flags;
-}
-
-ogg_uint32_t oc_cpu_flags_get(void){
- ogg_uint32_t flags;
- ogg_uint32_t eax;
- ogg_uint32_t ebx;
- ogg_uint32_t ecx;
- ogg_uint32_t edx;
-# if !defined(__amd64__)&&!defined(__x86_64__)
- /*Not all x86-32 chips support cpuid, so we have to check.*/
- oc_detect_cpuid_helper(&eax,&ebx);
- /*No cpuid.*/
- if(eax==ebx)return 0;
-# endif
- cpuid(0,eax,ebx,ecx,edx);
- /* l e t n I e n i u n e G*/
- if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
- /* 6 8 x M T e n i u n e G*/
- ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
- int family;
- int model;
- /*Intel, Transmeta (tested with Crusoe TM5800):*/
- cpuid(1,eax,ebx,ecx,edx);
- flags=oc_parse_intel_flags(edx,ecx);
- family=(eax>>8)&0xF;
- model=(eax>>4)&0xF;
- /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
- unit, so don't use it.*/
- if(family==6&&(model==9||model==13||model==14)){
- flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
- }
- }
- /* D M A c i t n e h t u A*/
- else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
- /* C S N y b e d o e G*/
- ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
- /*AMD, Geode:*/
- cpuid(0x80000000,eax,ebx,ecx,edx);
- if(eax<0x80000001)flags=0;
- else{
- cpuid(0x80000001,eax,ebx,ecx,edx);
- flags=oc_parse_amd_flags(edx,ecx);
- }
- /*Also check for SSE.*/
- cpuid(1,eax,ebx,ecx,edx);
- flags|=oc_parse_intel_flags(edx,ecx);
- }
- /*Technically some VIA chips can be configured in the BIOS to return any
- string here the user wants.
- There is a special detection method that can be used to identify such
- processors, but in my opinion, if the user really wants to change it, they
- deserve what they get.*/
- /* s l u a H r u a t n e C*/
- else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
- /*VIA:*/
- /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
- chips (thanks to the engineers from Centaur Technology who provided it).
- These chips support Intel-like cpuid info.
- The C3-2 (Nehemiah) cores appear to, as well.*/
- cpuid(1,eax,ebx,ecx,edx);
- flags=oc_parse_intel_flags(edx,ecx);
- if(eax>=0x80000001){
- /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
- We need to check this even if the Intel test succeeds to pick up 3DNow!
- support on these processors.
- Unlike actual AMD processors, we cannot _rely_ on this info, since
- some cores (e.g., the 693 stepping of the Nehemiah) claim to support
- this function, yet return edx=0, despite the Intel test indicating
- MMX support.
- Therefore the features detected here are strictly added to those
- detected by the Intel test.*/
- /*TODO: How about earlier chips?*/
- cpuid(0x80000001,eax,ebx,ecx,edx);
- /*Note: As of the C7, this function returns Intel-style extended feature
- flags, not AMD-style.
- Currently, this only defines bits 11, 20, and 29 (0x20100800), which
- do not conflict with any of the AMD flags we inspect.
- For the remaining bits, Intel tells us, "Do not count on their value",
- but VIA assures us that they will all be zero (at least on the C7 and
- Isaiah chips).
- In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
- (0xC0C00000) for something else, we will have to add code to detect
- the model to decide when it is appropriate to inspect them.*/
- flags|=oc_parse_amd_flags(edx,ecx);
- }
- }
- else{
- /*Implement me.*/
- flags=0;
- }
- return flags;
-}
-#endif
Deleted: experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -1,36 +0,0 @@
-/********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
- * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
- * *
- ********************************************************************
- function:
- last mod: $Id$
-
- ********************************************************************/
-
-#if !defined(_x86_cpu_H)
-# define _x86_cpu_H (1)
-#include "../internal.h"
-
-#define OC_CPU_X86_MMX (1<<0)
-#define OC_CPU_X86_3DNOW (1<<1)
-#define OC_CPU_X86_3DNOWEXT (1<<2)
-#define OC_CPU_X86_MMXEXT (1<<3)
-#define OC_CPU_X86_SSE (1<<4)
-#define OC_CPU_X86_SSE2 (1<<5)
-#define OC_CPU_X86_PNI (1<<6)
-#define OC_CPU_X86_SSSE3 (1<<7)
-#define OC_CPU_X86_SSE4_1 (1<<8)
-#define OC_CPU_X86_SSE4_2 (1<<9)
-#define OC_CPU_X86_SSE4A (1<<10)
-#define OC_CPU_X86_SSE5 (1<<11)
-
-ogg_uint32_t oc_cpu_flags_get(void);
-
-#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxfrag.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxfrag.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxfrag.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -22,12 +22,63 @@
The iteration each instruction belongs to is marked in the comments as #i.*/
#include <stddef.h>
#include "x86int.h"
-#include "mmxfrag.h"
#if defined(OC_X86_ASM)
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
between rows.*/
+# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
+ do{ \
+ const unsigned char *src; \
+ unsigned char *dst; \
+ src=(_src); \
+ dst=(_dst); \
+ __asm mov SRC,src \
+ __asm mov DST,dst \
+ __asm mov YSTRIDE,_ystride \
+ /*src+0*ystride*/ \
+ __asm movq mm0,[SRC] \
+ /*src+1*ystride*/ \
+ __asm movq mm1,[SRC+YSTRIDE] \
+ /*ystride3=ystride*3*/ \
+ __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
+ /*src+2*ystride*/ \
+ __asm movq mm2,[SRC+YSTRIDE*2] \
+ /*src+3*ystride*/ \
+ __asm movq mm3,[SRC+YSTRIDE3] \
+ /*dst+0*ystride*/ \
+ __asm movq [DST],mm0 \
+ /*dst+1*ystride*/ \
+ __asm movq [DST+YSTRIDE],mm1 \
+ /*Pointer to next 4.*/ \
+ __asm lea SRC,[SRC+YSTRIDE*4] \
+ /*dst+2*ystride*/ \
+ __asm movq [DST+YSTRIDE*2],mm2 \
+ /*dst+3*ystride*/ \
+ __asm movq [DST+YSTRIDE3],mm3 \
+ /*Pointer to next 4.*/ \
+ __asm lea DST,[DST+YSTRIDE*4] \
+ /*src+0*ystride*/ \
+ __asm movq mm0,[SRC] \
+ /*src+1*ystride*/ \
+ __asm movq mm1,[SRC+YSTRIDE] \
+ /*src+2*ystride*/ \
+ __asm movq mm2,[SRC+YSTRIDE*2] \
+ /*src+3*ystride*/ \
+ __asm movq mm3,[SRC+YSTRIDE3] \
+ /*dst+0*ystride*/ \
+ __asm movq [DST],mm0 \
+ /*dst+1*ystride*/ \
+ __asm movq [DST+YSTRIDE],mm1 \
+ /*dst+2*ystride*/ \
+ __asm movq [DST+YSTRIDE*2],mm2 \
+ /*dst+3*ystride*/ \
+ __asm movq [DST+YSTRIDE3],mm3 \
+ } \
+ while(0)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+ between rows.*/
void oc_frag_copy_mmx(unsigned char *_dst,
const unsigned char *_src,int _ystride){
#define SRC edx
@@ -41,6 +92,34 @@
#undef YSTRIDE3
}
+/*Copies the fragments specified by the lists of fragment indices from one
+ frame to another.
+ _dst_frame: The reference frame to copy to.
+ _src_frame: The reference frame to copy from.
+ _ystride: The row stride of the reference frames.
+ _fragis: A pointer to a list of fragment indices.
+ _nfragis: The number of fragment indices to copy.
+ _frag_buf_offs: The offsets of fragments in the reference frames.*/
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+ ptrdiff_t fragii;
+ for(fragii=0;fragii<_nfragis;fragii++){
+ ptrdiff_t frag_buf_off;
+ frag_buf_off=_frag_buf_offs[_fragis[fragii]];
+#define SRC edx
+#define DST eax
+#define YSTRIDE ecx
+#define YSTRIDE3 edi
+ OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
+ _src_frame+frag_buf_off,_ystride);
+#undef SRC
+#undef DST
+#undef YSTRIDE
+#undef YSTRIDE3
+ }
+}
+
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
const ogg_int16_t *_residue){
__asm{
Deleted: experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxfrag.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxfrag.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxfrag.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -1,61 +0,0 @@
-#if !defined(_x86_vc_mmxfrag_H)
-# define _x86_vc_mmxfrag_H (1)
-# include <stddef.h>
-# include "x86int.h"
-
-#if defined(OC_X86_ASM)
-
-/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
- between rows.*/
-#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
- do{ \
- const unsigned char *src; \
- unsigned char *dst; \
- src=(_src); \
- dst=(_dst); \
- __asm mov SRC,src \
- __asm mov DST,dst \
- __asm mov YSTRIDE,_ystride \
- /*src+0*ystride*/ \
- __asm movq mm0,[SRC] \
- /*src+1*ystride*/ \
- __asm movq mm1,[SRC+YSTRIDE] \
- /*ystride3=ystride*3*/ \
- __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
- /*src+2*ystride*/ \
- __asm movq mm2,[SRC+YSTRIDE*2] \
- /*src+3*ystride*/ \
- __asm movq mm3,[SRC+YSTRIDE3] \
- /*dst+0*ystride*/ \
- __asm movq [DST],mm0 \
- /*dst+1*ystride*/ \
- __asm movq [DST+YSTRIDE],mm1 \
- /*Pointer to next 4.*/ \
- __asm lea SRC,[SRC+YSTRIDE*4] \
- /*dst+2*ystride*/ \
- __asm movq [DST+YSTRIDE*2],mm2 \
- /*dst+3*ystride*/ \
- __asm movq [DST+YSTRIDE3],mm3 \
- /*Pointer to next 4.*/ \
- __asm lea DST,[DST+YSTRIDE*4] \
- /*src+0*ystride*/ \
- __asm movq mm0,[SRC] \
- /*src+1*ystride*/ \
- __asm movq mm1,[SRC+YSTRIDE] \
- /*src+2*ystride*/ \
- __asm movq mm2,[SRC+YSTRIDE*2] \
- /*src+3*ystride*/ \
- __asm movq mm3,[SRC+YSTRIDE3] \
- /*dst+0*ystride*/ \
- __asm movq [DST],mm0 \
- /*dst+1*ystride*/ \
- __asm movq [DST+YSTRIDE],mm1 \
- /*dst+2*ystride*/ \
- __asm movq [DST+YSTRIDE*2],mm2 \
- /*dst+3*ystride*/ \
- __asm movq [DST+YSTRIDE3],mm3 \
- } \
- while(0)
-
-# endif
-#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxidct.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxidct.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxidct.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -24,15 +24,16 @@
/*These are offsets into the table of constants below.*/
/*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
-#define OC_COSINE_OFFSET (0)
+#define OC_COSINE_OFFSET (8)
/*A row of 8's.*/
-#define OC_EIGHT_OFFSET (56)
+#define OC_EIGHT_OFFSET (0)
/*A table of constants used by the MMX routines.*/
static const __declspec(align(16))ogg_uint16_t
- OC_IDCT_CONSTS[(7+1)*4]={
+ OC_IDCT_CONSTS[(1+7)*4]={
+ 8, 8, 8, 8,
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
(ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
@@ -47,27 +48,26 @@
(ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
- 8, 8, 8, 8
};
/*38 cycles*/
-#define OC_IDCT_BEGIN __asm{ \
- __asm movq mm2,OC_I(3) \
+#define OC_IDCT_BEGIN(_y,_x) __asm{ \
+ __asm movq mm2,OC_I(3,_x) \
__asm movq mm6,OC_C(3) \
__asm movq mm4,mm2 \
- __asm movq mm7,OC_J(5) \
+ __asm movq mm7,OC_J(5,_x) \
__asm pmulhw mm4,mm6 \
__asm movq mm1,OC_C(5) \
__asm pmulhw mm6,mm7 \
__asm movq mm5,mm1 \
__asm pmulhw mm1,mm2 \
- __asm movq mm3,OC_I(1) \
+ __asm movq mm3,OC_I(1,_x) \
__asm pmulhw mm5,mm7 \
__asm movq mm0,OC_C(1) \
__asm paddw mm4,mm2 \
__asm paddw mm6,mm7 \
__asm paddw mm2,mm1 \
- __asm movq mm1,OC_J(7) \
+ __asm movq mm1,OC_J(7,_x) \
__asm paddw mm7,mm5 \
__asm movq mm5,mm0 \
__asm pmulhw mm0,mm3 \
@@ -77,13 +77,13 @@
__asm psubw mm6,mm2 \
__asm paddw mm0,mm3 \
__asm pmulhw mm3,mm7 \
- __asm movq mm2,OC_I(2) \
+ __asm movq mm2,OC_I(2,_x) \
__asm pmulhw mm7,mm1 \
__asm paddw mm5,mm1 \
__asm movq mm1,mm2 \
__asm pmulhw mm2,OC_C(2) \
__asm psubw mm3,mm5 \
- __asm movq mm5,OC_J(6) \
+ __asm movq mm5,OC_J(6,_x) \
__asm paddw mm0,mm7 \
__asm movq mm7,mm5 \
__asm psubw mm0,mm4 \
@@ -97,18 +97,18 @@
__asm paddw mm6,mm6 \
__asm pmulhw mm7,OC_C(6) \
__asm paddw mm6,mm3 \
- __asm movq OC_I(1),mm4 \
+ __asm movq OC_I(1,_y),mm4 \
__asm psubw mm1,mm5 \
__asm movq mm4,OC_C(4) \
__asm movq mm5,mm3 \
__asm pmulhw mm3,mm4 \
__asm paddw mm7,mm2 \
- __asm movq OC_I(2),mm6 \
+ __asm movq OC_I(2,_y),mm6 \
__asm movq mm2,mm0 \
- __asm movq mm6,OC_I(0) \
+ __asm movq mm6,OC_I(0,_x) \
__asm pmulhw mm0,mm4 \
__asm paddw mm5,mm3 \
- __asm movq mm3,OC_J(4) \
+ __asm movq mm3,OC_J(4,_x) \
__asm psubw mm5,mm1 \
__asm paddw mm2,mm0 \
__asm psubw mm6,mm3 \
@@ -122,17 +122,17 @@
__asm paddw mm6,mm0 \
__asm psubw mm6,mm2 \
__asm paddw mm2,mm2 \
- __asm movq mm0,OC_I(1) \
+ __asm movq mm0,OC_I(1,_y) \
__asm paddw mm2,mm6 \
__asm paddw mm4,mm3 \
__asm psubw mm2,mm1 \
}
/*38+8=46 cycles.*/
-#define OC_ROW_IDCT __asm{ \
- OC_IDCT_BEGIN \
+#define OC_ROW_IDCT(_y,_x) __asm{ \
+ OC_IDCT_BEGIN(_y,_x) \
/*r3=D'*/ \
- __asm movq mm3,OC_I(2) \
+ __asm movq mm3,OC_I(2,_y) \
/*r4=E'=E-G*/ \
__asm psubw mm4,mm7 \
/*r1=H'+H'*/ \
@@ -157,7 +157,7 @@
__asm psubw mm7,mm0 \
__asm paddw mm0,mm0 \
/*Save R1.*/ \
- __asm movq OC_I(1),mm1 \
+ __asm movq OC_I(1,_y),mm1 \
/*r0=R0=G.+C.*/ \
__asm paddw mm0,mm7 \
}
@@ -190,10 +190,10 @@
Since r1 is free at entry, we calculate the Js first.*/
/*19 cycles.*/
-#define OC_TRANSPOSE __asm{ \
+#define OC_TRANSPOSE(_y) __asm{ \
__asm movq mm1,mm4 \
__asm punpcklwd mm4,mm5 \
- __asm movq OC_I(0),mm0 \
+ __asm movq OC_I(0,_y),mm0 \
__asm punpckhwd mm1,mm5 \
__asm movq mm0,mm6 \
__asm punpcklwd mm6,mm7 \
@@ -201,17 +201,17 @@
__asm punpckldq mm4,mm6 \
__asm punpckhdq mm5,mm6 \
__asm movq mm6,mm1 \
- __asm movq OC_J(4),mm4 \
+ __asm movq OC_J(4,_y),mm4 \
__asm punpckhwd mm0,mm7 \
- __asm movq OC_J(5),mm5 \
+ __asm movq OC_J(5,_y),mm5 \
__asm punpckhdq mm6,mm0 \
- __asm movq mm4,OC_I(0) \
+ __asm movq mm4,OC_I(0,_y) \
__asm punpckldq mm1,mm0 \
- __asm movq mm5,OC_I(1) \
+ __asm movq mm5,OC_I(1,_y) \
__asm movq mm0,mm4 \
- __asm movq OC_J(7),mm6 \
+ __asm movq OC_J(7,_y),mm6 \
__asm punpcklwd mm0,mm5 \
- __asm movq OC_J(6),mm1 \
+ __asm movq OC_J(6,_y),mm1 \
__asm punpckhwd mm4,mm5 \
__asm movq mm5,mm2 \
__asm punpcklwd mm2,mm3 \
@@ -219,18 +219,18 @@
__asm punpckldq mm0,mm2 \
__asm punpckhdq mm1,mm2 \
__asm movq mm2,mm4 \
- __asm movq OC_I(0),mm0 \
+ __asm movq OC_I(0,_y),mm0 \
__asm punpckhwd mm5,mm3 \
- __asm movq OC_I(1),mm1 \
+ __asm movq OC_I(1,_y),mm1 \
__asm punpckhdq mm4,mm5 \
__asm punpckldq mm2,mm5 \
- __asm movq OC_I(3),mm4 \
- __asm movq OC_I(2),mm2 \
+ __asm movq OC_I(3,_y),mm4 \
+ __asm movq OC_I(2,_y),mm2 \
}
/*38+19=57 cycles.*/
-#define OC_COLUMN_IDCT __asm{ \
- OC_IDCT_BEGIN \
+#define OC_COLUMN_IDCT(_y) __asm{ \
+ OC_IDCT_BEGIN(_y,_y) \
__asm paddw mm2,OC_8 \
/*r1=H'+H'*/ \
__asm paddw mm1,mm1 \
@@ -243,15 +243,15 @@
/*r1=NR1*/ \
__asm psraw mm1,4 \
/*r3=D'*/ \
- __asm movq mm3,OC_I(2) \
+ __asm movq mm3,OC_I(2,_y) \
/*r7=G+G*/ \
__asm paddw mm7,mm7 \
/*Store NR2 at I(2).*/ \
- __asm movq OC_I(2),mm2 \
+ __asm movq OC_I(2,_y),mm2 \
/*r7=G'=E+G*/ \
__asm paddw mm7,mm4 \
/*Store NR1 at I(1).*/ \
- __asm movq OC_I(1),mm1 \
+ __asm movq OC_I(1,_y),mm1 \
/*r4=R4=E'-D'*/ \
__asm psubw mm4,mm3 \
__asm paddw mm4,OC_8 \
@@ -273,11 +273,11 @@
/*r6=NR6*/ \
__asm psraw mm6,4 \
/*Store NR4 at J(4).*/ \
- __asm movq OC_J(4),mm4 \
+ __asm movq OC_J(4,_y),mm4 \
/*r5=NR5*/ \
__asm psraw mm5,4 \
/*Store NR3 at I(3).*/ \
- __asm movq OC_I(3),mm3 \
+ __asm movq OC_I(3,_y),mm3 \
/*r7=R7=G'-C'*/ \
__asm psubw mm7,mm0 \
__asm paddw mm7,OC_8 \
@@ -288,71 +288,90 @@
/*r7=NR7*/ \
__asm psraw mm7,4 \
/*Store NR6 at J(6).*/ \
- __asm movq OC_J(6),mm6 \
+ __asm movq OC_J(6,_y),mm6 \
/*r0=NR0*/ \
__asm psraw mm0,4 \
/*Store NR5 at J(5).*/ \
- __asm movq OC_J(5),mm5 \
+ __asm movq OC_J(5,_y),mm5 \
/*Store NR7 at J(7).*/ \
- __asm movq OC_J(7),mm7 \
+ __asm movq OC_J(7,_y),mm7 \
/*Store NR0 at I(0).*/ \
- __asm movq OC_I(0),mm0 \
+ __asm movq OC_I(0,_y),mm0 \
}
#define OC_MID(_m,_i) [CONSTS+_m+(_i)*8]
#define OC_C(_i) OC_MID(OC_COSINE_OFFSET,_i-1)
#define OC_8 OC_MID(OC_EIGHT_OFFSET,0)
-static void oc_idct8x8_slow(ogg_int16_t _y[64]){
+static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+ int i;
/*This routine accepts an 8x8 matrix, but in partially transposed form.
Every 4x4 block is transposed.*/
__asm{
#define CONSTS eax
#define Y edx
+#define X ecx
mov CONSTS,offset OC_IDCT_CONSTS
mov Y,_y
-#define OC_I(_k) [Y+_k*16]
-#define OC_J(_k) [Y+(_k-4)*16+8]
- OC_ROW_IDCT
- OC_TRANSPOSE
+ mov X,_x
+#define OC_I(_k,_y) [(_y)+_k*16]
+#define OC_J(_k,_y) [(_y)+(_k-4)*16+8]
+ OC_ROW_IDCT(Y,X)
+ OC_TRANSPOSE(Y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) [Y+(_k*16)+64]
-#define OC_J(_k) [Y+(_k-4)*16+72]
- OC_ROW_IDCT
- OC_TRANSPOSE
+#define OC_I(_k,_y) [(_y)+(_k*16)+64]
+#define OC_J(_k,_y) [(_y)+(_k-4)*16+72]
+ OC_ROW_IDCT(Y,X)
+ OC_TRANSPOSE(Y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) [Y+_k*16]
-#define OC_J(_k) OC_I(_k)
- OC_COLUMN_IDCT
+#define OC_I(_k,_y) [(_y)+_k*16]
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT(Y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) [Y+_k*16+8]
-#define OC_J(_k) OC_I(_k)
- OC_COLUMN_IDCT
+#define OC_I(_k,_y) [(_y)+_k*16+8]
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT(Y)
#undef OC_I
#undef OC_J
#undef CONSTS
#undef Y
+#undef X
}
+ if(_x!=_y){
+ int i;
+ __asm pxor mm0,mm0;
+ for(i=0;i<4;i++){
+#define X ecx
+ ogg_int16_t *x;
+ x=_x+16*i;
+ __asm{
+ mov X,x
+ movq [X+0x00],mm0
+ movq [X+0x08],mm0
+ movq [X+0x10],mm0
+ movq [X+0x18],mm0
+ }
+#undef X
+ }
+ }
}
/*25 cycles.*/
-#define OC_IDCT_BEGIN_10 __asm{ \
- __asm movq mm2,OC_I(3) \
+#define OC_IDCT_BEGIN_10(_y,_x) __asm{ \
+ __asm movq mm2,OC_I(3,_x) \
__asm nop \
__asm movq mm6,OC_C(3) \
__asm movq mm4,mm2 \
__asm movq mm1,OC_C(5) \
__asm pmulhw mm4,mm6 \
- __asm movq mm3,OC_I(1) \
+ __asm movq mm3,OC_I(1,_x) \
__asm pmulhw mm1,mm2 \
__asm movq mm0,OC_C(1) \
__asm paddw mm4,mm2 \
__asm pxor mm6,mm6 \
__asm paddw mm2,mm1 \
- __asm movq mm5,OC_I(2) \
+ __asm movq mm5,OC_I(2,_x) \
__asm pmulhw mm0,mm3 \
__asm movq mm1,mm5 \
__asm paddw mm0,mm3 \
@@ -360,43 +379,43 @@
__asm psubw mm6,mm2 \
__asm pmulhw mm5,OC_C(2) \
__asm psubw mm0,mm4 \
- __asm movq mm7,OC_I(2) \
+ __asm movq mm7,OC_I(2,_x) \
__asm paddw mm4,mm4 \
__asm paddw mm7,mm5 \
__asm paddw mm4,mm0 \
__asm pmulhw mm1,OC_C(6) \
__asm psubw mm3,mm6 \
- __asm movq OC_I(1),mm4 \
+ __asm movq OC_I(1,_y),mm4 \
__asm paddw mm6,mm6 \
__asm movq mm4,OC_C(4) \
__asm paddw mm6,mm3 \
__asm movq mm5,mm3 \
__asm pmulhw mm3,mm4 \
- __asm movq OC_I(2),mm6 \
+ __asm movq OC_I(2,_y),mm6 \
__asm movq mm2,mm0 \
- __asm movq mm6,OC_I(0) \
+ __asm movq mm6,OC_I(0,_x) \
__asm pmulhw mm0,mm4 \
__asm paddw mm5,mm3 \
__asm paddw mm2,mm0 \
__asm psubw mm5,mm1 \
__asm pmulhw mm6,mm4 \
- __asm paddw mm6,OC_I(0) \
+ __asm paddw mm6,OC_I(0,_x) \
__asm paddw mm1,mm1 \
__asm movq mm4,mm6 \
__asm paddw mm1,mm5 \
__asm psubw mm6,mm2 \
__asm paddw mm2,mm2 \
- __asm movq mm0,OC_I(1) \
+ __asm movq mm0,OC_I(1,_y) \
__asm paddw mm2,mm6 \
__asm psubw mm2,mm1 \
__asm nop \
}
/*25+8=33 cycles.*/
-#define OC_ROW_IDCT_10 __asm{ \
- OC_IDCT_BEGIN_10 \
+#define OC_ROW_IDCT_10(_y,_x) __asm{ \
+ OC_IDCT_BEGIN_10(_y,_x) \
/*r3=D'*/ \
- __asm movq mm3,OC_I(2) \
+ __asm movq mm3,OC_I(2,_y) \
/*r4=E'=E-G*/ \
__asm psubw mm4,mm7 \
/*r1=H'+H'*/ \
@@ -421,14 +440,14 @@
__asm psubw mm7,mm0 \
__asm paddw mm0,mm0 \
/*Save R1.*/ \
- __asm movq OC_I(1),mm1 \
+ __asm movq OC_I(1,_y),mm1 \
/*r0=R0=G'+C'*/ \
__asm paddw mm0,mm7 \
}
/*25+19=44 cycles'*/
-#define OC_COLUMN_IDCT_10 __asm{ \
- OC_IDCT_BEGIN_10 \
+#define OC_COLUMN_IDCT_10(_y) __asm{ \
+ OC_IDCT_BEGIN_10(_y,_y) \
__asm paddw mm2,OC_8 \
/*r1=H'+H'*/ \
__asm paddw mm1,mm1 \
@@ -441,15 +460,15 @@
/*r1=NR1*/ \
__asm psraw mm1,4 \
/*r3=D'*/ \
- __asm movq mm3,OC_I(2) \
+ __asm movq mm3,OC_I(2,_y) \
/*r7=G+G*/ \
__asm paddw mm7,mm7 \
/*Store NR2 at I(2).*/ \
- __asm movq OC_I(2),mm2 \
+ __asm movq OC_I(2,_y),mm2 \
/*r7=G'=E+G*/ \
__asm paddw mm7,mm4 \
/*Store NR1 at I(1).*/ \
- __asm movq OC_I(1),mm1 \
+ __asm movq OC_I(1,_y),mm1 \
/*r4=R4=E'-D'*/ \
__asm psubw mm4,mm3 \
__asm paddw mm4,OC_8 \
@@ -471,11 +490,11 @@
/*r6=NR6*/ \
__asm psraw mm6,4 \
/*Store NR4 at J(4).*/ \
- __asm movq OC_J(4),mm4 \
+ __asm movq OC_J(4,_y),mm4 \
/*r5=NR5*/ \
__asm psraw mm5,4 \
/*Store NR3 at I(3).*/ \
- __asm movq OC_I(3),mm3 \
+ __asm movq OC_I(3,_y),mm3 \
/*r7=R7=G'-C'*/ \
__asm psubw mm7,mm0 \
__asm paddw mm7,OC_8 \
@@ -486,50 +505,65 @@
/*r7=NR7*/ \
__asm psraw mm7,4 \
/*Store NR6 at J(6).*/ \
- __asm movq OC_J(6),mm6 \
+ __asm movq OC_J(6,_y),mm6 \
/*r0=NR0*/ \
__asm psraw mm0,4 \
/*Store NR5 at J(5).*/ \
- __asm movq OC_J(5),mm5 \
+ __asm movq OC_J(5,_y),mm5 \
/*Store NR7 at J(7).*/ \
- __asm movq OC_J(7),mm7 \
+ __asm movq OC_J(7,_y),mm7 \
/*Store NR0 at I(0).*/ \
- __asm movq OC_I(0),mm0 \
+ __asm movq OC_I(0,_y),mm0 \
}
-static void oc_idct8x8_10(ogg_int16_t _y[64]){
+static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
__asm{
#define CONSTS eax
#define Y edx
+#define X ecx
mov CONSTS,offset OC_IDCT_CONSTS
mov Y,_y
-#define OC_I(_k) [Y+_k*16]
-#define OC_J(_k) [Y+(_k-4)*16+8]
+ mov X,_x
+#define OC_I(_k,_y) [(_y)+_k*16]
+#define OC_J(_k,_y) [(_y)+(_k-4)*16+8]
/*Done with dequant, descramble, and partial transpose.
Now do the iDCT itself.*/
- OC_ROW_IDCT_10
- OC_TRANSPOSE
+ OC_ROW_IDCT_10(Y,X)
+ OC_TRANSPOSE(Y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) [Y+_k*16]
-#define OC_J(_k) OC_I(_k)
- OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) [(_y)+_k*16]
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT_10(Y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) [Y+_k*16+8]
-#define OC_J(_k) OC_I(_k)
- OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) [(_y)+_k*16+8]
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT_10(Y)
#undef OC_I
#undef OC_J
#undef CONSTS
#undef Y
+#undef X
}
+ if(_x!=_y){
+#define X ecx
+ __asm{
+ pxor mm0,mm0;
+ mov X,_x
+ movq [X+0x00],mm0
+ movq [X+0x10],mm0
+ movq [X+0x20],mm0
+ movq [X+0x30],mm0
+ }
+#undef X
+ }
}
/*Performs an inverse 8x8 Type-II DCT transform.
The input is assumed to be scaled by a factor of 4 relative to orthonormal
version of the transform.*/
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
/*_last_zzi is subtly different from an actual count of the number of
coefficients we decoded for this block.
It contains the value of zzi BEFORE the final token in the block was
@@ -555,8 +589,8 @@
gets.
Needless to say we inherited this approach from VP3.*/
/*Perform the iDCT.*/
- if(_last_zzi<10)oc_idct8x8_10(_y);
- else oc_idct8x8_slow(_y);
+ if(_last_zzi<=10)oc_idct8x8_10(_y,_x);
+ else oc_idct8x8_slow(_y,_x);
}
#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxstate.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxstate.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxstate.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -19,13 +19,12 @@
Originally written by Rudolf Marek.*/
#include <string.h>
#include "x86int.h"
-#include "mmxfrag.h"
#include "mmxloop.h"
#if defined(OC_X86_ASM)
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
unsigned char *dst;
ptrdiff_t frag_buf_off;
int ystride;
@@ -45,6 +44,7 @@
#define P ecx
mov Y,_dct_coeffs
movzx P,p
+ lea Y,[Y+128]
/*mm0=0000 0000 0000 AAAA*/
movd mm0,P
/*mm0=0000 0000 AAAA AAAA*/
@@ -74,14 +74,14 @@
else{
/*Dequantize the DC coefficient.*/
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
- oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
+ oc_idct8x8_mmx(_dct_coeffs+64,_dct_coeffs,_last_zzi);
}
/*Fill in the target buffer.*/
frag_buf_off=_state->frag_buf_offs[_fragi];
mb_mode=_state->frags[_fragi].mb_mode;
ystride=_state->ref_ystride[_pli];
dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
- if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
+ if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
else{
const unsigned char *ref;
int mvoffsets[2];
@@ -91,48 +91,17 @@
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
_state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
- _dct_coeffs);
+ _dct_coeffs+64);
}
- else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
+ else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
}
}
/*We copy these entire function to inline the actual MMX routines so that we
use only a single indirect call.*/
-/*Copies the fragments specified by the lists of fragment indices from one
- frame to another.
- _fragis: A pointer to a list of fragment indices.
- _nfragis: The number of fragment indices to copy.
- _dst_frame: The reference frame to copy to.
- _src_frame: The reference frame to copy from.
- _pli: The color plane the fragments lie in.*/
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli){
- const ptrdiff_t *frag_buf_offs;
- const unsigned char *src_frame_data;
- unsigned char *dst_frame_data;
- ptrdiff_t fragii;
- int ystride;
- dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
- src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
- ystride=_state->ref_ystride[_pli];
- frag_buf_offs=_state->frag_buf_offs;
- for(fragii=0;fragii<_nfragis;fragii++){
- ptrdiff_t frag_buf_off;
- frag_buf_off=frag_buf_offs[_fragis[fragii]];
-#define SRC edx
-#define DST eax
-#define YSTRIDE ecx
-#define YSTRIDE3 edi
- OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off,
- src_frame_data+frag_buf_off,ystride);
-#undef SRC
-#undef DST
-#undef YSTRIDE
-#undef YSTRIDE3
- }
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
+ memset(_bv,~(_flimit<<1),8);
}
/*Apply the loop filter to a given set of fragment rows in the given plane.
@@ -144,8 +113,7 @@
_fragy0: The Y coordinate of the first fragment row to filter.
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
- OC_ALIGN8(unsigned char ll[8]);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
const oc_fragment_plane *fplane;
const oc_fragment *frags;
const ptrdiff_t *frag_buf_offs;
@@ -156,13 +124,12 @@
ptrdiff_t fragi0_end;
int ystride;
int nhfrags;
- memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
fplane=_state->fplanes+_pli;
nhfrags=fplane->nhfrags;
fragi_top=fplane->froffset;
fragi_bot=fragi_top+fplane->nfrags;
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
- fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+ fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
ystride=_state->ref_ystride[_pli];
frags=_state->frags;
frag_buf_offs=_state->frag_buf_offs;
@@ -187,13 +154,13 @@
#define LL edx
#define D esi
#define D_WORD si
- if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll);
- if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll);
+ if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,_bv);
+ if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,_bv);
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
- OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll);
+ OC_LOOP_FILTER_H_MMX(ref+8,ystride,_bv);
}
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
- OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll);
+ OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,_bv);
}
#undef PIX
#undef YSTRIDE3
Copied: experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86cpu.c (from rev 17379, experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.c)
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86cpu.c (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86cpu.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,192 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ CPU capability detection for x86 processors.
+ Originally written by Rudolf Marek.
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#include "x86cpu.h"
+
+#if !defined(OC_X86_ASM)
+ogg_uint32_t oc_cpu_flags_get(void){
+ return 0;
+}
+#else
+/*Why does MSVC need this complicated rigamarole?
+ At this point I honestly do not care.*/
+
+/*Visual C cpuid helper function.
+ For VS2005 we could as well use the _cpuid builtin, but that wouldn't work
+ for VS2003 users, so we do it in inline assembler.*/
+static void oc_cpuid_helper(ogg_uint32_t _cpu_info[4],ogg_uint32_t _op){
+ _asm{
+ mov eax,[_op]
+ mov esi,_cpu_info
+ cpuid
+ mov [esi+0],eax
+ mov [esi+4],ebx
+ mov [esi+8],ecx
+ mov [esi+12],edx
+ }
+}
+
+# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+ do{ \
+ ogg_uint32_t cpu_info[4]; \
+ oc_cpuid_helper(cpu_info,_op); \
+ (_eax)=cpu_info[0]; \
+ (_ebx)=cpu_info[1]; \
+ (_ecx)=cpu_info[2]; \
+ (_edx)=cpu_info[3]; \
+ }while(0)
+
+static void oc_detect_cpuid_helper(ogg_uint32_t *_eax,ogg_uint32_t *_ebx){
+ _asm{
+ pushfd
+ pushfd
+ pop eax
+ mov ebx,eax
+ xor eax,200000h
+ push eax
+ popfd
+ pushfd
+ pop eax
+ popfd
+ mov ecx,_eax
+ mov [ecx],eax
+ mov ecx,_ebx
+ mov [ecx],ebx
+ }
+}
+
+static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+ ogg_uint32_t flags;
+ /*If there isn't even MMX, give up.*/
+ if(!(_edx&0x00800000))return 0;
+ flags=OC_CPU_X86_MMX;
+ if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
+ if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
+ if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
+ if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3;
+ if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
+ if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
+ return flags;
+}
+
+static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+ ogg_uint32_t flags;
+ /*If there isn't even MMX, give up.*/
+ if(!(_edx&0x00800000))return 0;
+ flags=OC_CPU_X86_MMX;
+ if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
+ if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
+ if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
+ if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A;
+ if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5;
+ return flags;
+}
+
+ogg_uint32_t oc_cpu_flags_get(void){
+ ogg_uint32_t flags;
+ ogg_uint32_t eax;
+ ogg_uint32_t ebx;
+ ogg_uint32_t ecx;
+ ogg_uint32_t edx;
+# if !defined(__amd64__)&&!defined(__x86_64__)
+ /*Not all x86-32 chips support cpuid, so we have to check.*/
+ oc_detect_cpuid_helper(&eax,&ebx);
+ /*No cpuid.*/
+ if(eax==ebx)return 0;
+# endif
+ cpuid(0,eax,ebx,ecx,edx);
+ /* l e t n I e n i u n e G*/
+ if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
+ /* 6 8 x M T e n i u n e G*/
+ ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
+ int family;
+ int model;
+ /*Intel, Transmeta (tested with Crusoe TM5800):*/
+ cpuid(1,eax,ebx,ecx,edx);
+ flags=oc_parse_intel_flags(edx,ecx);
+ family=(eax>>8)&0xF;
+ model=(eax>>4)&0xF;
+ /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
+ unit, so don't use it.*/
+ if(family==6&&(model==9||model==13||model==14)){
+ flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
+ }
+ }
+ /* D M A c i t n e h t u A*/
+ else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
+ /* C S N y b e d o e G*/
+ ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
+ /*AMD, Geode:*/
+ cpuid(0x80000000,eax,ebx,ecx,edx);
+ if(eax<0x80000001)flags=0;
+ else{
+ cpuid(0x80000001,eax,ebx,ecx,edx);
+ flags=oc_parse_amd_flags(edx,ecx);
+ }
+ /*Also check for SSE.*/
+ cpuid(1,eax,ebx,ecx,edx);
+ flags|=oc_parse_intel_flags(edx,ecx);
+ }
+ /*Technically some VIA chips can be configured in the BIOS to return any
+ string here the user wants.
+ There is a special detection method that can be used to identify such
+ processors, but in my opinion, if the user really wants to change it, they
+ deserve what they get.*/
+ /* s l u a H r u a t n e C*/
+ else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
+ /*VIA:*/
+ /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
+ chips (thanks to the engineers from Centaur Technology who provided it).
+ These chips support Intel-like cpuid info.
+ The C3-2 (Nehemiah) cores appear to, as well.*/
+ cpuid(1,eax,ebx,ecx,edx);
+ flags=oc_parse_intel_flags(edx,ecx);
+ if(eax>=0x80000001){
+ /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
+ We need to check this even if the Intel test succeeds to pick up 3DNow!
+ support on these processors.
+ Unlike actual AMD processors, we cannot _rely_ on this info, since
+ some cores (e.g., the 693 stepping of the Nehemiah) claim to support
+ this function, yet return edx=0, despite the Intel test indicating
+ MMX support.
+ Therefore the features detected here are strictly added to those
+ detected by the Intel test.*/
+ /*TODO: How about earlier chips?*/
+ cpuid(0x80000001,eax,ebx,ecx,edx);
+ /*Note: As of the C7, this function returns Intel-style extended feature
+ flags, not AMD-style.
+ Currently, this only defines bits 11, 20, and 29 (0x20100800), which
+ do not conflict with any of the AMD flags we inspect.
+ For the remaining bits, Intel tells us, "Do not count on their value",
+ but VIA assures us that they will all be zero (at least on the C7 and
+ Isaiah chips).
+ In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
+ (0xC0C00000) for something else, we will have to add code to detect
+ the model to decide when it is appropriate to inspect them.*/
+ flags|=oc_parse_amd_flags(edx,ecx);
+ }
+ }
+ else{
+ /*Implement me.*/
+ flags=0;
+ }
+ return flags;
+}
+#endif
Copied: experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86cpu.h (from rev 17379, experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.h)
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86cpu.h (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86cpu.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,36 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#if !defined(_x86_vc_x86cpu_H)
+# define _x86_vc_x86cpu_H (1)
+#include "../internal.h"
+
+#define OC_CPU_X86_MMX (1<<0)
+#define OC_CPU_X86_3DNOW (1<<1)
+#define OC_CPU_X86_3DNOWEXT (1<<2)
+#define OC_CPU_X86_MMXEXT (1<<3)
+#define OC_CPU_X86_SSE (1<<4)
+#define OC_CPU_X86_SSE2 (1<<5)
+#define OC_CPU_X86_PNI (1<<6)
+#define OC_CPU_X86_SSSE3 (1<<7)
+#define OC_CPU_X86_SSE4_1 (1<<8)
+#define OC_CPU_X86_SSE4_2 (1<<9)
+#define OC_CPU_X86_SSE4A (1<<10)
+#define OC_CPU_X86_SSE5 (1<<11)
+
+ogg_uint32_t oc_cpu_flags_get(void);
+
+#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86enc.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86enc.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86enc.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -18,10 +18,10 @@
#if defined(OC_X86_ASM)
-void oc_enc_vtable_init_x86(oc_enc_ctx *_enc){
+void oc_enc_accel_init_x86(oc_enc_ctx *_enc){
ogg_uint32_t cpu_flags;
cpu_flags=_enc->state.cpu_flags;
- oc_enc_vtable_init_c(_enc);
+ oc_enc_accel_init_c(_enc);
if(cpu_flags&OC_CPU_X86_MMX){
_enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
_enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
Modified: experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86enc.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86enc.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86enc.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -17,10 +17,14 @@
#if !defined(_x86_vc_x86enc_H)
# define _x86_vc_x86enc_H (1)
+# include "x86int.h"
+# if defined(OC_X86_ASM)
+# define oc_enc_accel_init oc_enc_accel_init_x86
+# define OC_ENC_USE_VTABLE (1)
+# endif
# include "../encint.h"
-# include "x86int.h"
-void oc_enc_vtable_init_x86(oc_enc_ctx *_enc);
+void oc_enc_accel_init_x86(oc_enc_ctx *_enc);
unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
const unsigned char *_ref,int _ystride);
Modified: experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86int.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86int.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86int.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -18,26 +18,32 @@
#if !defined(_x86_vc_x86int_H)
# define _x86_vc_x86int_H (1)
# include "../internal.h"
-# include "cpu.h"
+# if defined(OC_X86_ASM)
+# define oc_state_accel_init oc_state_accel_init_x86
+# define OC_STATE_USE_VTABLE (1)
+# endif
+# include "../state.h"
+# include "x86cpu.h"
-void oc_state_vtable_init_x86(oc_theora_state *_state);
+void oc_state_accel_init_x86(oc_theora_state *_state);
void oc_frag_copy_mmx(unsigned char *_dst,
const unsigned char *_src,int _ystride);
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
const ogg_int16_t *_residue);
void oc_frag_recon_inter_mmx(unsigned char *_dst,
const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
void oc_restore_fpu_mmx(void);
#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86state.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86state.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86state.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -40,21 +40,22 @@
64,64,64,64,64,64,64,64,
};
-void oc_state_vtable_init_x86(oc_theora_state *_state){
+void oc_state_accel_init_x86(oc_theora_state *_state){
_state->cpu_flags=oc_cpu_flags_get();
if(_state->cpu_flags&OC_CPU_X86_MMX){
_state->opt_vtable.frag_copy=oc_frag_copy_mmx;
+ _state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
_state->opt_vtable.idct8x8=oc_idct8x8_mmx;
_state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
- _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
+ _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
_state->opt_vtable.state_loop_filter_frag_rows=
oc_state_loop_filter_frag_rows_mmx;
_state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
_state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
}
- else oc_state_vtable_init_c(_state);
+ else oc_state_accel_init_c(_state);
}
#endif
Added: experimental/derf/theora-ptalarbvorm/m4/as-gcc-inline-assembly.m4
===================================================================
--- experimental/derf/theora-ptalarbvorm/m4/as-gcc-inline-assembly.m4 (rev 0)
+++ experimental/derf/theora-ptalarbvorm/m4/as-gcc-inline-assembly.m4 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,106 @@
+dnl as-gcc-inline-assembly.m4 0.1.0
+
+dnl autostars m4 macro for detection of gcc inline assembly
+
+dnl David Schleef <ds at schleef.org>
+
+dnl $Id$
+
+dnl AS_GCC_INLINE_ASSEMBLY(ACTION-IF-ACCEPTED, [ACTION-IF-NOT-ACCEPTED])
+dnl Tries to compile a trivial gcc-style inline assembly statement.
+dnl Runs ACTION-IF-ACCEPTED if the compiler supports gcc-style inline
+dnl assembly, and ACTION-IF-NOT-ACCEPTED otherwise.
+
+AC_DEFUN([AS_GCC_INLINE_ASSEMBLY],
+[
+ AC_MSG_CHECKING([if compiler supports gcc-style inline assembly])
+
+ AC_TRY_COMPILE([], [
+#ifdef __GNUC_MINOR__
+#if (__GNUC__ * 1000 + __GNUC_MINOR__) < 3004
+#error GCC before 3.4 has critical bugs compiling inline assembly
+#endif
+#endif
+__asm__ (""::) ], [flag_ok=yes], [flag_ok=no])
+
+ if test "X$flag_ok" = Xyes ; then
+ $1
+ true
+ else
+ $2
+ true
+ fi
+ AC_MSG_RESULT([$flag_ok])
+])
+
+AC_DEFUN([AC_TRY_ASSEMBLE],
+[ac_c_ext=$ac_ext
+ ac_ext=${ac_s_ext-s}
+ cat > conftest.$ac_ext <<EOF
+ .file "configure"
+[$1]
+EOF
+if AC_TRY_EVAL(ac_compile); then
+ ac_ext=$ac_c_ext
+ ifelse([$2], , :, [ $2
+ rm -rf conftest*])
+else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.$ac_ext >&AC_FD_CC
+ ac_ext=$ac_c_ext
+ifelse([$3], , , [ rm -rf conftest*
+ $3
+])dnl
+fi
+rm -rf conftest*])
+
+
+AC_DEFUN([AS_ASM_ARM_NEON],
+[
+ AC_MSG_CHECKING([if assembler supports NEON instructions on ARM])
+
+ AC_TRY_ASSEMBLE([vorr d0,d0,d0], [flag_ok=yes], [flag_ok=no])
+
+ if test "X$flag_ok" = Xyes ; then
+ $1
+ true
+ else
+ $2
+ true
+ fi
+ AC_MSG_RESULT([$flag_ok])
+])
+
+
+AC_DEFUN([AS_ASM_ARM_MEDIA],
+[
+ AC_MSG_CHECKING([if assembler supports ARMv6 media instructions on ARM])
+
+ AC_TRY_ASSEMBLE([shadd8 r3,r3,r3], [flag_ok=yes], [flag_ok=no])
+
+ if test "X$flag_ok" = Xyes ; then
+ $1
+ true
+ else
+ $2
+ true
+ fi
+ AC_MSG_RESULT([$flag_ok])
+])
+
+
+AC_DEFUN([AS_ASM_ARM_EDSP],
+[
+ AC_MSG_CHECKING([if assembler supports EDSP instructions on ARM])
+
+ AC_TRY_ASSEMBLE([qadd r3,r3,r3], [flag_ok=yes], [flag_ok=no])
+
+ if test "X$flag_ok" = Xyes ; then
+ $1
+ true
+ else
+ $2
+ true
+ fi
+ AC_MSG_RESULT([$flag_ok])
+])
More information about the commits
mailing list