[xiph-commits] r17410 - in experimental/derf/theora-ptalarbvorm: . lib lib/arm lib/c64x lib/x86 lib/x86_vc m4
tterribe at svn.xiph.org
tterribe at svn.xiph.org
Tue Sep 21 14:53:48 PDT 2010
Author: tterribe
Date: 2010-09-21 14:53:48 -0700 (Tue, 21 Sep 2010)
New Revision: 17410
Added:
experimental/derf/theora-ptalarbvorm/lib/arm/
experimental/derf/theora-ptalarbvorm/lib/arm/arm2gnu.pl
experimental/derf/theora-ptalarbvorm/lib/arm/armbits.h
experimental/derf/theora-ptalarbvorm/lib/arm/armbits.s
experimental/derf/theora-ptalarbvorm/lib/arm/armcpu.c
experimental/derf/theora-ptalarbvorm/lib/arm/armcpu.h
experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s
experimental/derf/theora-ptalarbvorm/lib/arm/armidct.s
experimental/derf/theora-ptalarbvorm/lib/arm/armint.h
experimental/derf/theora-ptalarbvorm/lib/arm/armloop.s
experimental/derf/theora-ptalarbvorm/lib/arm/armopts.s.in
experimental/derf/theora-ptalarbvorm/lib/arm/armstate.c
experimental/derf/theora-ptalarbvorm/lib/x86/x86cpu.c
experimental/derf/theora-ptalarbvorm/lib/x86/x86cpu.h
experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86cpu.c
experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86cpu.h
experimental/derf/theora-ptalarbvorm/m4/as-gcc-inline-assembly.m4
Removed:
experimental/derf/theora-ptalarbvorm/lib/x86/cpu.c
experimental/derf/theora-ptalarbvorm/lib/x86/cpu.h
experimental/derf/theora-ptalarbvorm/lib/x86/mmxfrag.h
experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.c
experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.h
experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxfrag.h
Modified:
experimental/derf/theora-ptalarbvorm/AUTHORS
experimental/derf/theora-ptalarbvorm/autogen.sh
experimental/derf/theora-ptalarbvorm/configure.ac
experimental/derf/theora-ptalarbvorm/lib/Makefile.am
experimental/derf/theora-ptalarbvorm/lib/analyze.c
experimental/derf/theora-ptalarbvorm/lib/bitpack.c
experimental/derf/theora-ptalarbvorm/lib/bitpack.h
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.c
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xfrag.c
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xidct.c
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xint.h
experimental/derf/theora-ptalarbvorm/lib/c64x/c64xstate.c
experimental/derf/theora-ptalarbvorm/lib/decint.h
experimental/derf/theora-ptalarbvorm/lib/decode.c
experimental/derf/theora-ptalarbvorm/lib/encint.h
experimental/derf/theora-ptalarbvorm/lib/fragment.c
experimental/derf/theora-ptalarbvorm/lib/huffdec.c
experimental/derf/theora-ptalarbvorm/lib/huffdec.h
experimental/derf/theora-ptalarbvorm/lib/idct.c
experimental/derf/theora-ptalarbvorm/lib/internal.h
experimental/derf/theora-ptalarbvorm/lib/state.c
experimental/derf/theora-ptalarbvorm/lib/state.h
experimental/derf/theora-ptalarbvorm/lib/tokenize.c
experimental/derf/theora-ptalarbvorm/lib/x86/mmxfrag.c
experimental/derf/theora-ptalarbvorm/lib/x86/mmxidct.c
experimental/derf/theora-ptalarbvorm/lib/x86/mmxstate.c
experimental/derf/theora-ptalarbvorm/lib/x86/sse2idct.c
experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h
experimental/derf/theora-ptalarbvorm/lib/x86/x86state.c
experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxfrag.c
experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxidct.c
experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxstate.c
experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86enc.c
experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86enc.h
experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86int.h
experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86state.c
Log:
Initial port of Robin Watts's Theorarm optimizations.
This includes the assembly for motion compensation, the iDCT, and the loop
filter, with minor fixes, additions, and speed improvements.
David Schleef and Ralph Giles contributed to the autoconf and automake support.
Cristian Adam contributed to testing the CPU detection on Windows.
New assembly was written for the libtheora bitpacker, as well as new ARMv6
versions of the iDCT and loop filter, and a new NEON version of the iDCT.
The iDCT was also modified to zero out its input coefficients afterwards, so
they can be used for the next block, avoiding the need to clear many
coefficients which were already zero (as an alternative to Robin's
oc_memzero16_64, which always cleared all the coefficients).
CPU detection for Linux and Windows was written to allow execution of a single
binary on multiple ARM CPUs; iOS detection still needs to be written.
We currently do not try to detect the _minimum_ CPU the code is being compiled
for, and thus will include some code for processors older than is strictly
needed, though it is certainly possible to fix this.
A number of other updates were applied to the optimizations for other
architectures to adapt to interface changes that helped ARM, and some
long-standing bugs were fixed (such as an off-by-one in the iDCT for every
architecture that was harmless, but hurt performance).
The x86 Windows assembly had bitrotted a bit, and has been restored closer to
something that should work, but is completely untested.
What has _not_ yet been ported over is assembly for some of the less-frequently
used routines (zero filling, border padding, and coded-block-flag reading), the
libogg2 bitpacker, and any of the post-processing, as well as most of the C
code changes — in particular the MV handling and the use of pointer arithmetic
instead of indexing in loops.
Some or all of this may follow in subsequent commits.
The current code, however, already benchmarks faster than the original Theorarm
version on ARMv6 or NEON-capable hardware.
Some of that is due to the new routines written for these CPUs, while the rest
is due to other general decoder improvements since the 1.1 release Theorarm was
based on.
Modified: experimental/derf/theora-ptalarbvorm/AUTHORS
===================================================================
--- experimental/derf/theora-ptalarbvorm/AUTHORS 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/AUTHORS 2010-09-21 21:53:48 UTC (rev 17410)
@@ -48,5 +48,7 @@
Rodolphe Ortalo
- Bug fixes
+Robin Watts
+ - ARM code optimisations
and other Xiph.org contributors
Modified: experimental/derf/theora-ptalarbvorm/autogen.sh
===================================================================
--- experimental/derf/theora-ptalarbvorm/autogen.sh 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/autogen.sh 2010-09-21 21:53:48 UTC (rev 17410)
@@ -127,4 +127,4 @@
autoconf || exit 1
cd $olddir
-$srcdir/configure --enable-maintainer-mode "$@" && /bin/echo
+$srcdir/configure "$@" && /bin/echo
Modified: experimental/derf/theora-ptalarbvorm/configure.ac
===================================================================
--- experimental/derf/theora-ptalarbvorm/configure.ac 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/configure.ac 2010-09-21 21:53:48 UTC (rev 17410)
@@ -12,7 +12,6 @@
AM_CONFIG_HEADER([config.h])
AC_CONFIG_SRCDIR([lib/fdct.c])
AM_INIT_AUTOMAKE
-AM_MAINTAINER_MODE
dnl we use doc_DATA in doc/Makefile.am which requires autoconf >= 2.60
dnl to define docdir for us.
@@ -55,6 +54,8 @@
dnl Check for programs
dnl --------------------------------------------------
+AM_PROG_AS
+
dnl save $CFLAGS since AC_PROG_CC likes to insert "-g -O2"
dnl if $CFLAGS is blank
cflags_save="$CFLAGS"
@@ -195,6 +196,8 @@
cpu_x86_64=no
cpu_x86_32=no
+cpu_arm=no
+cpu_c64x=no
AC_ARG_ENABLE(asm,
AS_HELP_STRING([--disable-asm], [Disable assembly optimizations]),
[ ac_enable_asm=$enableval ], [ ac_enable_asm=yes] )
@@ -203,7 +206,7 @@
cpu_optimization="no optimization for your platform, please send a patch"
case $target_cpu in
i[[3456]]86)
- cpu_x86_32=yes
+ cpu_x86_32=yes
cpu_optimization="32 bit x86"
AC_DEFINE([OC_X86_ASM], [], [make use of x86 asm optimization])
if test "x$target_vendor" = "xapple"; then
@@ -216,6 +219,99 @@
AC_DEFINE([OC_X86_ASM], [], [make use of x86 asm optimization])
AC_DEFINE([OC_X86_64_ASM], [], [make use of x86_64 asm optimization])
;;
+ arm*)
+ cpu_arm=yes
+ cpu_optimization="ARM"
+ AC_DEFINE([OC_ARM_ASM], [], [make use of arm asm optimization])
+ AC_ARG_ENABLE(asflag-probe,
+ AS_HELP_STRING([--disable-asflag-probe], [Disable instructions not supported by the default ASFLAGS (ARM only).]),
+ [ ac_enable_asflag_probe=$enableval ], [ ac_enable_asflag_probe=yes] )
+
+ dnl our ARM assembly requires perl to run the arm2gnu reformatter
+ AC_CHECK_PROG([HAVE_PERL], perl, yes, no)
+ if test "x$HAVE_PERL" = "xno"; then
+ AC_MSG_WARN([*** ARM assembly requires perl -- disabling optimizations])
+ cpu_arm=no
+ cpu_optimization="(missing perl dependency for ARM)"
+ fi
+
+ dnl AC_TRY_ASSEMBLE uses CFLAGS instead of CCASFLAGS
+ save_CFLAGS="$CFLAGS"
+ ARM_CCASFLAGS=
+ dnl Test for instruction set support with the default CCASFLAGS.
+ AS_ASM_ARM_NEON([HAVE_ARM_ASM_NEON=1],[HAVE_ARM_ASM_NEON=0])
+ AS_ASM_ARM_MEDIA([HAVE_ARM_ASM_MEDIA=1],[HAVE_ARM_ASM_MEDIA=0])
+ AS_ASM_ARM_EDSP([HAVE_ARM_ASM_EDSP=1],[HAVE_ARM_ASM_EDSP=0])
+ dnl gas will not assemble instructions unless the architecture explicitly
+ dnl supports it (unlike on x86).
+ dnl Try to speculatively add ASFLAGS to enable usage of these instructions
+ dnl at assembly time (actual support is detected at runtime).
+ dnl If the user has already specified -march or -mcpu flags, this may give
+ dnl some spurious warnings (use --disable-asflag-probe to avoid this if
+ dnl you don't want run-time support for instructions not available on the
+ dnl architecture you specified).
+ dnl Order here is important.
+ if test "x${ac_enable_asflag_probe}" = xyes; then
+ if test x$HAVE_ARM_ASM_NEON != x1 ; then
+ dnl Try to set some flags to enable NEON instructions.
+ AC_MSG_NOTICE([trying custom CCASFLAGS to enable NEON instructions...])
+ ARM_CCASFLAGS="-mfpu=neon -march=armv7-a"
+ CFLAGS="$save_CFLAGS $ARM_CCASFLAGS"
+ AS_ASM_ARM_NEON([HAVE_ARM_ASM_NEON=1],[HAVE_ARM_ASM_NEON=0])
+ if test x$HAVE_ARM_ASM_NEON != x1 ; then
+ ARM_CCASFLAGS=
+ CFLAGS="$save_CFLAGS"
+ fi
+ fi
+ if test x$HAVE_ARM_ASM_MEDIA != x1 ; then
+ dnl Try to set some flags to enable ARMv6 media instructions.
+ AC_MSG_NOTICE([trying custom CCASFLAGS to enable ARMv6 media instructions...])
+ ARM_CCASFLAGS="-march=armv6j"
+ CFLAGS="$save_CFLAGS $ARM_CCASFLAGS"
+ AS_ASM_ARM_MEDIA([HAVE_ARM_ASM_MEDIA=1],[HAVE_ARM_ASM_MEDIA=0])
+ if test x$HAVE_ARM_ASM_MEDIA != x1 ; then
+ ARM_CCASFLAGS=
+ CFLAGS="$save_CFLAGS"
+ fi
+ fi
+ if test x$HAVE_ARM_ASM_EDSP != x1 ; then
+ dnl Try to set some flags to enable EDSP instructions.
+ AC_MSG_NOTICE([trying custom CCASFLAGS to enable EDSP compilation...])
+ ARM_CCASFLAGS="-march=armv5e"
+ CFLAGS="$save_CFLAGS $ARM_CCASFLAGS"
+ AS_ASM_ARM_EDSP([HAVE_ARM_ASM_EDSP=1],[HAVE_ARM_ASM_EDSP=0])
+ if test x$HAVE_ARM_ASM_MEDIA != x1 ; then
+ ARM_CCASFLAGS=
+ CFLAGS="$save_CFLAGS"
+ fi
+ fi
+ fi
+
+ dnl Only enable if we passed the perl test above
+ if test x$cpu_arm = xyes; then
+ if test x$HAVE_ARM_ASM_EDSP = x1 ; then
+ AC_DEFINE(OC_ARM_ASM_EDSP, 1,
+ [Define if assembler supports EDSP instructions])
+ cpu_optimization="$cpu_optimization (EDSP)"
+ fi
+ AC_SUBST(HAVE_ARM_ASM_EDSP)
+ if test x$HAVE_ARM_ASM_MEDIA = x1 ; then
+ AC_DEFINE(OC_ARM_ASM_MEDIA, 1,
+ [Define if assembler supports ARMv6 media instructions])
+ cpu_optimization="$cpu_optimization (Media)"
+ fi
+ AC_SUBST(HAVE_ARM_ASM_MEDIA)
+ if test x$HAVE_ARM_ASM_NEON = x1 ; then
+ AC_DEFINE(OC_ARM_ASM_NEON, 1,
+ [Define if compiler supports NEON instructions])
+ cpu_optimization="$cpu_optimization (NEON)"
+ fi
+ AC_SUBST(HAVE_ARM_ASM_NEON)
+ fi
+
+ CFLAGS="$save_CFLAGS"
+ CCASFLAGS="$CCASFLAGS $ARM_CCASFLAGS"
+ ;;
tic6x)
cpu_c64x=yes
cpu_optimization="TI C64x+"
@@ -227,6 +323,7 @@
fi
AM_CONDITIONAL([CPU_x86_64], [test x$cpu_x86_64 = xyes])
AM_CONDITIONAL([CPU_x86_32], [test x$cpu_x86_32 = xyes])
+AM_CONDITIONAL([CPU_arm], [test x$cpu_arm = xyes])
AM_CONDITIONAL([CPU_c64x], [test x$cpu_c64x = xyes])
# Test whenever ld supports -version-script
@@ -477,6 +574,7 @@
AC_OUTPUT([
Makefile
lib/Makefile
+ lib/arm/armopts.s
include/Makefile include/theora/Makefile
examples/Makefile
doc/Makefile doc/Doxyfile doc/spec/Makefile
Modified: experimental/derf/theora-ptalarbvorm/lib/Makefile.am
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/Makefile.am 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/Makefile.am 2010-09-21 21:53:48 UTC (rev 17410)
@@ -3,7 +3,17 @@
EXTRA_DIST = \
encoder_disabled.c \
- x86/cpu.c \
+ arm/armcpu.c \
+ arm/armbits.h \
+ arm/armbits.s \
+ arm/armfrag.s \
+ arm/armidct.s \
+ arm/armint.h \
+ arm/armopts.s.in \
+ arm/arm2gnu.pl \
+ c64x/c64xint.h \
+ c64x/c64xdec.h \
+ x86/x86cpu.c \
x86/mmxencfrag.c \
x86/mmxfdct.c \
x86/sse2encfrag.c \
@@ -13,16 +23,13 @@
x86/x86enc.h \
x86/x86enquant.c \
x86/mmxfrag.c \
- x86/mmxfrag.h \
x86/mmxidct.c \
x86/mmxloop.h \
x86/mmxstate.c \
x86/sse2idct.c \
x86/x86int.h \
x86/x86state.c \
- x86_vc \
- c64x/c64xint.h \
- c64x/c64xdec.h
+ x86_vc
lib_LTLIBRARIES = libtheoradec.la libtheoraenc.la libtheora.la
@@ -44,7 +51,7 @@
x86/sse2fdct.c
encoder_shared_x86_sources = \
- x86/cpu.c \
+ x86/x86cpu.c \
x86/mmxfrag.c \
x86/mmxidct.c \
x86/mmxstate.c \
@@ -53,6 +60,26 @@
encoder_shared_x86_64_sources =
+encoder_uniq_arm_sources =
+
+if CPU_arm
+BUILT_SOURCES = \
+ armbits-gnu.S \
+ armfrag-gnu.S \
+ armidct-gnu.S \
+ armloop-gnu.S \
+ armopts-gnu.S
+endif
+
+encoder_shared_arm_sources = \
+ armbits-gnu.S \
+ armfrag-gnu.S \
+ armidct-gnu.S \
+ armloop-gnu.S \
+ armopts-gnu.S \
+ arm/armcpu.c \
+ arm/armstate.c
+
if CPU_x86_64
encoder_uniq_arch_sources = \
$(encoder_uniq_x86_sources) \
@@ -65,10 +92,15 @@
encoder_uniq_arch_sources = $(encoder_uniq_x86_sources)
encoder_shared_arch_sources = $(encoder_shared_x86_sources)
else
+if CPU_arm
+encoder_uniq_arch_sources = $(encoder_uniq_arm_sources)
+encoder_shared_arch_sources = $(encoder_shared_arm_sources)
+else
encoder_uniq_arch_sources =
encoder_shared_arch_sources =
endif
endif
+endif
encoder_uniq_sources = \
analyze.c \
@@ -100,13 +132,24 @@
endif
decoder_x86_sources = \
- x86/cpu.c \
+ x86/x86cpu.c \
x86/mmxidct.c \
x86/mmxfrag.c \
x86/mmxstate.c \
x86/sse2idct.c \
x86/x86state.c
+decoder_arm_sources = \
+ arm/armcpu.c \
+ arm/armstate.c
+
+nodist_decoder_arm_sources = \
+ armbits-gnu.S \
+ armfrag-gnu.S \
+ armidct-gnu.S \
+ armloop-gnu.S \
+ armopts-gnu.S
+
decoder_c64x_sources = \
c64x/c64xdec.c \
c64x/c64xfrag.c \
@@ -115,17 +158,26 @@
if CPU_x86_64
decoder_arch_sources = $(decoder_x86_sources)
+nodist_decoder_arch_sources =
else
if CPU_x86_32
decoder_arch_sources = $(decoder_x86_sources)
+nodist_decoder_arch_sources =
else
+if CPU_arm
+decoder_arch_sources = $(decoder_arm_sources)
+nodist_decoder_arch_sources = $(nodist_decoder_arm_sources)
+else
if CPU_c64x
decoder_arch_sources = $(decoder_c64x_sources)
+nodist_decoder_arch_sources =
else
decoder_arch_sources =
+nodist_decoder_arch_sources =
endif
endif
endif
+endif
decoder_sources = \
apiwrapper.c \
@@ -160,8 +212,10 @@
ocintrin.h \
quant.h \
state.h \
- x86/cpu.h \
- x86/mmxfrag.h \
+ arm/armcpu.h \
+ c64x/c64xdec.h \
+ c64x/c64xint.h \
+ x86/x86cpu.h \
x86/mmxloop.h \
x86/sse2trans.h \
x86/x86enc.h \
@@ -170,6 +224,8 @@
libtheoradec_la_SOURCES = \
$(decoder_sources) \
Version_script-dec theoradec.exp
+nodist_libtheoradec_la_SOURCES = \
+ $(nodist_decoder_arch_sources)
libtheoradec_la_LDFLAGS = \
-version-info @THDEC_LIB_CURRENT@:@THDEC_LIB_REVISION@:@THDEC_LIB_AGE@ \
@THEORADEC_LDFLAGS@ @CAIRO_LIBS@ \
@@ -188,6 +244,8 @@
$(decoder_sources) \
$(encoder_uniq_sources) \
Version_script theora.exp
+nodist_libtheora_la_SOURCES = \
+ $(nodist_decoder_arch_sources)
libtheora_la_LDFLAGS = \
-version-info @TH_LIB_CURRENT@:@TH_LIB_REVISION@:@TH_LIB_AGE@ \
@THEORA_LDFLAGS@ @CAIRO_LIBS@ $(OGG_LIBS) \
@@ -202,3 +260,19 @@
# contstruct various symbol export list files
.def.exp : defexp.awk
awk -f defexp.awk $< > $@
+
+CLEANFILES = \
+ armbits-gnu.S \
+ armfrag-gnu.S \
+ armidct-gnu.S \
+ armloop-gnu.S \
+ armopts-gnu.S
+
+# automake doesn't do dependency tracking for asm files, that I can tell
+armfrag-gnu.S: armopts-gnu.S
+armidct-gnu.S: armopts-gnu.S
+armloop-gnu.S: armopts-gnu.S
+
+# convert ARM asm to GNU as format
+%-gnu.S: $(srcdir)/arm/%.s
+ $(srcdir)/arm/arm2gnu.pl < $< > $@
Modified: experimental/derf/theora-ptalarbvorm/lib/analyze.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/analyze.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/analyze.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -24,9 +24,6 @@
-typedef struct oc_fr_state oc_fr_state;
-typedef struct oc_qii_state oc_qii_state;
-typedef struct oc_enc_pipeline_state oc_enc_pipeline_state;
typedef struct oc_rd_metric oc_rd_metric;
typedef struct oc_mode_choice oc_mode_choice;
@@ -223,43 +220,6 @@
-/*State to track coded block flags and their bit cost.
- We use opportunity cost to measure the bits required to code or skip the next
- block, using the cheaper of the cost to code it fully or partially, so long
- as both are possible.*/
-struct oc_fr_state{
- /*The number of bits required for the coded block flags so far this frame.*/
- ptrdiff_t bits;
- /*The length of the current run for the partial super block flag, not
- including the current super block.*/
- unsigned sb_partial_count:16;
- /*The length of the current run for the full super block flag, not
- including the current super block.*/
- unsigned sb_full_count:16;
- /*The length of the coded block flag run when the current super block
- started.*/
- unsigned b_coded_count_prev:6;
- /*The coded block flag when the current super block started.*/
- signed int b_coded_prev:2;
- /*The length of the current coded block flag run.*/
- unsigned b_coded_count:6;
- /*The current coded block flag.*/
- signed int b_coded:2;
- /*The number of blocks processed in the current super block.*/
- unsigned b_count:5;
- /*Whether or not it is cheaper to code the current super block partially,
- even if it could still be coded fully.*/
- unsigned sb_prefer_partial:1;
- /*Whether the last super block was coded partially.*/
- signed int sb_partial:2;
- /*The number of bits required for the flags for the current super block.*/
- unsigned sb_bits:6;
- /*Whether the last non-partial super block was coded fully.*/
- signed int sb_full:2;
-};
-
-
-
static void oc_fr_state_init(oc_fr_state *_fr){
_fr->bits=0;
_fr->sb_partial_count=0;
@@ -492,16 +452,6 @@
-struct oc_qii_state{
- ptrdiff_t bits;
- unsigned qi01_count:14;
- signed int qi01:2;
- unsigned qi12_count:14;
- signed int qi12:2;
-};
-
-
-
static void oc_qii_state_init(oc_qii_state *_qs){
_qs->bits=0;
_qs->qi01_count=0;
@@ -555,41 +505,11 @@
-/*Temporary encoder state for the analysis pipeline.*/
-struct oc_enc_pipeline_state{
- int bounding_values[256];
- oc_fr_state fr[3];
- oc_qii_state qs[3];
- /*Skip SSD storage for the current MCU in each plane.*/
- unsigned *skip_ssd[3];
- /*Coded/uncoded fragment lists for each plane for the current MCU.*/
- ptrdiff_t *coded_fragis[3];
- ptrdiff_t *uncoded_fragis[3];
- ptrdiff_t ncoded_fragis[3];
- ptrdiff_t nuncoded_fragis[3];
- /*The starting fragment for the current MCU in each plane.*/
- ptrdiff_t froffset[3];
- /*The starting row for the current MCU in each plane.*/
- int fragy0[3];
- /*The ending row for the current MCU in each plane.*/
- int fragy_end[3];
- /*The starting superblock for the current MCU in each plane.*/
- unsigned sbi0[3];
- /*The ending superblock for the current MCU in each plane.*/
- unsigned sbi_end[3];
- /*The number of tokens for zzi=1 for each color plane.*/
- int ndct_tokens1[3];
- /*The outstanding eob_run count for zzi=1 for each color plane.*/
- int eob_run1[3];
- /*Whether or not the loop filter is enabled.*/
- int loop_filter;
-};
-
-
static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){
ptrdiff_t *coded_fragis;
unsigned mcu_nvsbs;
ptrdiff_t mcu_nfrags;
+ int flimit;
int hdec;
int vdec;
int pli;
@@ -649,8 +569,9 @@
_pipe->eob_run1[pli]=0;
}
/*Initialize the bounding value array for the loop filter.*/
- _pipe->loop_filter=!oc_state_loop_filter_init(&_enc->state,
- _pipe->bounding_values);
+ flimit=_enc->state.loop_filter_limits[_enc->state.qis[0]];
+ _pipe->loop_filter=flimit!=0;
+ if(flimit!=0)oc_loop_filter_init(&_enc->state,_pipe->bounding_values,flimit);
}
/*Sets the current MCU stripe to super block row _sby.
@@ -692,10 +613,15 @@
int refi;
/*Copy over all the uncoded fragments from this plane and advance the uncoded
fragment list.*/
- _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
- oc_state_frag_copy_list(&_enc->state,_pipe->uncoded_fragis[_pli],
- _pipe->nuncoded_fragis[_pli],OC_FRAME_SELF,OC_FRAME_PREV,_pli);
- _pipe->nuncoded_fragis[_pli]=0;
+ if(_pipe->nuncoded_fragis[_pli]>0){
+ _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
+ oc_frag_copy_list(&_enc->state,
+ _enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_SELF]],
+ _enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]],
+ _enc->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
+ _pipe->nuncoded_fragis[_pli],_enc->state.frag_buf_offs);
+ _pipe->nuncoded_fragis[_pli]=0;
+ }
/*Perform DC prediction.*/
oc_enc_pred_dc_frag_rows(_enc,_pli,
_pipe->fragy0[_pli],_pipe->fragy_end[_pli]);
@@ -741,8 +667,8 @@
oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,
unsigned _rd_scale,unsigned _rd_iscale,oc_rd_metric *_mo,
oc_fr_state *_fr,oc_token_checkpoint **_stack){
- OC_ALIGN16(ogg_int16_t dct[64]);
- OC_ALIGN16(ogg_int16_t data[64]);
+ ogg_int16_t *dct;
+ ogg_int16_t *data;
oc_qii_state qs;
const ogg_uint16_t *dequant;
ogg_uint16_t dequant_dc;
@@ -773,6 +699,8 @@
+frag_offs;
borderi=frags[_fragi].borderi;
qii=frags[_fragi].qii;
+ data=_enc->pipe.dct_data;
+ dct=data+64;
if(qii&~3){
#if !defined(OC_COLLECT_METRICS)
if(_enc->sp_level>=OC_SP_LEVEL_EARLY_SKIP){
@@ -872,7 +800,7 @@
}
else{
data[0]=dc*dequant_dc;
- oc_idct8x8(&_enc->state,data,nonzero+1);
+ oc_idct8x8(&_enc->state,data,data,nonzero+1);
}
if(nqis>1){
oc_qii_state_advance(&qs,_pipe->qs+_pli,qii);
@@ -1675,7 +1603,6 @@
/*Analysis stage for an INTRA frame.*/
void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
- oc_enc_pipeline_state pipe;
ogg_int64_t activity_sum;
ogg_int64_t luma_sum;
unsigned activity_avg;
@@ -1698,7 +1625,7 @@
int pli;
_enc->state.frame_type=OC_INTRA_FRAME;
oc_enc_tokenize_start(_enc);
- oc_enc_pipeline_init(_enc,&pipe);
+ oc_enc_pipeline_init(_enc,&_enc->pipe);
oc_enc_mode_rd_init(_enc);
activity_sum=luma_sum=0;
activity_avg=_enc->activity_avg;
@@ -1725,10 +1652,10 @@
ptrdiff_t cfroffset;
unsigned sbi;
unsigned sbi_end;
- notdone=oc_enc_pipeline_set_stripe(_enc,&pipe,stripe_sby);
- sbi_end=pipe.sbi_end[0];
- cfroffset=pipe.froffset[1];
- for(sbi=pipe.sbi0[0];sbi<sbi_end;sbi++){
+ notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
+ sbi_end=_enc->pipe.sbi_end[0];
+ cfroffset=_enc->pipe.froffset[1];
+ for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
int quadi;
/*Mode addressing is through Y plane, always 4 MB per SB.*/
for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
@@ -1763,10 +1690,10 @@
oc_mcenc_search(_enc,mbi);
}
if(_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
- oc_analyze_intra_mb_luma(_enc,pipe.qs+0,mbi,rd_scale);
+ oc_analyze_intra_mb_luma(_enc,_enc->pipe.qs+0,mbi,rd_scale);
}
mb_modes[mbi]=OC_MODE_INTRA;
- oc_enc_mb_transform_quantize_intra_luma(_enc,&pipe,
+ oc_enc_mb_transform_quantize_intra_luma(_enc,&_enc->pipe,
mbi,rd_scale,rd_iscale);
/*Propagate final MB mode and MVs to the chroma blocks.*/
for(mapii=4;mapii<nmap_idxs;mapii++){
@@ -1786,12 +1713,12 @@
}
}
}
- oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,0,notstart,notdone);
+ oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
/*Code chroma planes.*/
for(pli=1;pli<3;pli++){
- oc_enc_sb_transform_quantize_intra_chroma(_enc,&pipe,
- pli,pipe.sbi0[pli],pipe.sbi_end[pli]);
- oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,pli,notstart,notdone);
+ oc_enc_sb_transform_quantize_intra_chroma(_enc,&_enc->pipe,
+ pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
+ oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
}
notstart=1;
}
@@ -2316,7 +2243,6 @@
int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
oc_set_chroma_mvs_func set_chroma_mvs;
- oc_enc_pipeline_state pipe;
oc_qii_state intra_luma_qs;
oc_mv last_mv;
oc_mv prior_mv;
@@ -2356,7 +2282,7 @@
_enc->state.frame_type=OC_INTER_FRAME;
oc_mode_scheme_chooser_reset(&_enc->chooser);
oc_enc_tokenize_start(_enc);
- oc_enc_pipeline_init(_enc,&pipe);
+ oc_enc_pipeline_init(_enc,&_enc->pipe);
oc_enc_mode_rd_init(_enc);
if(_allow_keyframe)oc_qii_state_init(&intra_luma_qs);
_enc->mv_bits[0]=_enc->mv_bits[1]=0;
@@ -2391,10 +2317,10 @@
mcu_nvsbs=_enc->mcu_nvsbs;
for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
ptrdiff_t cfroffset;
- notdone=oc_enc_pipeline_set_stripe(_enc,&pipe,stripe_sby);
- sbi_end=pipe.sbi_end[0];
- cfroffset=pipe.froffset[1];
- for(sbi=pipe.sbi0[0];sbi<sbi_end;sbi++){
+ notdone=oc_enc_pipeline_set_stripe(_enc,&_enc->pipe,stripe_sby);
+ sbi_end=_enc->pipe.sbi_end[0];
+ cfroffset=_enc->pipe.froffset[1];
+ for(sbi=_enc->pipe.sbi0[0];sbi<sbi_end;sbi++){
int quadi;
/*Mode addressing is through Y plane, always 4 MB per SB.*/
for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
@@ -2448,7 +2374,7 @@
/*Estimate the cost of coding this MB in a keyframe.*/
if(_allow_keyframe){
oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
- pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP,rd_scale);
+ _enc->pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP,rd_scale);
intrabits+=modes[OC_MODE_INTRA].rate;
for(bi=0;bi<4;bi++){
oc_qii_state_advance(&intra_luma_qs,&intra_luma_qs,
@@ -2456,24 +2382,28 @@
}
}
/*Estimate the cost in a delta frame for various modes.*/
- oc_skip_cost(_enc,&pipe,mbi,rd_scale,skip_ssd);
+ oc_skip_cost(_enc,&_enc->pipe,mbi,rd_scale,skip_ssd);
if(sp_level<OC_SP_LEVEL_NOMC){
oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
- OC_MODE_INTER_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+ skip_ssd,rd_scale);
oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
- pipe.fr+0,pipe.qs+0,intra_satd,skip_ssd,rd_scale);
+ _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV],
- pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi,
- OC_MODE_INTER_MV_LAST,last_mv,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ OC_MODE_INTER_MV_LAST,last_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+ skip_ssd,rd_scale);
oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
- OC_MODE_INTER_MV_LAST2,prior_mv,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ OC_MODE_INTER_MV_LAST2,prior_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+ skip_ssd,rd_scale);
oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
- OC_MODE_GOLDEN_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+ skip_ssd,rd_scale);
mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD],
- pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
/*The explicit MV modes (2,6,7) have not yet gone through halfpel
refinement.
We choose the explicit MV mode that's already furthest ahead on
@@ -2483,7 +2413,8 @@
inter_mv_pref=_enc->lambda*3;
if(sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
- embs[mbi].block_mv,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ embs[mbi].block_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+ skip_ssd,rd_scale);
}
else{
modes[OC_MODE_INTER_MV_FOUR].cost=UINT_MAX;
@@ -2495,7 +2426,8 @@
embs[mbi].refined|=0x80;
}
oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
- embs[mbi].ref_mv,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ embs[mbi].ref_mv,_enc->pipe.fr+0,_enc->pipe.qs+0,
+ skip_ssd,rd_scale);
}
else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref<
modes[OC_MODE_INTER_MV].cost){
@@ -2505,7 +2437,7 @@
}
mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD],
- pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
}
if(!(embs[mbi].refined&0x04)){
oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV);
@@ -2513,7 +2445,7 @@
}
mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV],
- pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ _enc->pipe.fr+0,_enc->pipe.qs+0,skip_ssd,rd_scale);
/*Finally, pick the mode with the cheapest estimated R-D cost.*/
mb_mode=OC_MODE_INTER_NOMV;
if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
@@ -2544,11 +2476,13 @@
}
else{
oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
- OC_MODE_INTER_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ OC_MODE_INTER_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+ skip_ssd,rd_scale);
oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
- pipe.fr+0,pipe.qs+0,intra_satd,skip_ssd,rd_scale);
+ _enc->pipe.fr+0,_enc->pipe.qs+0,intra_satd,skip_ssd,rd_scale);
oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
- OC_MODE_GOLDEN_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd,rd_scale);
+ OC_MODE_GOLDEN_NOMV,_enc->pipe.fr+0,_enc->pipe.qs+0,
+ skip_ssd,rd_scale);
mb_mode=OC_MODE_INTER_NOMV;
if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
mb_mode=OC_MODE_INTRA;
@@ -2589,7 +2523,7 @@
fragi=sb_maps[mbi>>2][mbi&3][bi];
frags[fragi].qii=modes[mb_mode].qii[bi];
}
- if(oc_enc_mb_transform_quantize_inter_luma(_enc,&pipe,mbi,
+ if(oc_enc_mb_transform_quantize_inter_luma(_enc,&_enc->pipe,mbi,
modes[mb_mode].overhead>>OC_BIT_SCALE,rd_scale,rd_iscale)>0){
int orig_mb_mode;
orig_mb_mode=mb_mode;
@@ -2693,16 +2627,16 @@
mcu_rd_iscale[fragi-cfroffset]=(ogg_uint16_t)rd_iscale[4];
}
}
- oc_fr_state_flush_sb(pipe.fr+0);
- sb_flags[sbi].coded_fully=pipe.fr[0].sb_full;
- sb_flags[sbi].coded_partially=pipe.fr[0].sb_partial;
+ oc_fr_state_flush_sb(_enc->pipe.fr+0);
+ sb_flags[sbi].coded_fully=_enc->pipe.fr[0].sb_full;
+ sb_flags[sbi].coded_partially=_enc->pipe.fr[0].sb_partial;
}
- oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,0,notstart,notdone);
+ oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,0,notstart,notdone);
/*Code chroma planes.*/
for(pli=1;pli<3;pli++){
- oc_enc_sb_transform_quantize_inter_chroma(_enc,&pipe,
- pli,pipe.sbi0[pli],pipe.sbi_end[pli]);
- oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,pli,notstart,notdone);
+ oc_enc_sb_transform_quantize_inter_chroma(_enc,&_enc->pipe,
+ pli,_enc->pipe.sbi0[pli],_enc->pipe.sbi_end[pli]);
+ oc_enc_pipeline_finish_mcu_plane(_enc,&_enc->pipe,pli,notstart,notdone);
}
notstart=1;
}
@@ -2724,7 +2658,7 @@
inaccuracy is small.
We don't need to add the luma plane coding flag costs, because they are
already included in the MB rate estimates.*/
- for(pli=1;pli<3;pli++)interbits+=pipe.fr[pli].bits<<OC_BIT_SCALE;
+ for(pli=1;pli<3;pli++)interbits+=_enc->pipe.fr[pli].bits<<OC_BIT_SCALE;
if(interbits>intrabits)return 1;
}
_enc->ncoded_mbis=ncoded_mbis;
Copied: experimental/derf/theora-ptalarbvorm/lib/arm/arm2gnu.pl (from rev 17378, branches/theorarm-merge-branch/arm2gnu.pl)
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/arm2gnu.pl (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/arm2gnu.pl 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,271 @@
+#!/usr/bin/perl
+
+my $bigend; # little/big endian
+
+eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}'
+ if $running_under_some_shell;
+
+while ($ARGV[0] =~ /^-/) {
+ $_ = shift;
+ last if /^--/;
+ if (/^-n/) {
+ $nflag++;
+ next;
+ }
+ die "I don't recognize this switch: $_\\n";
+}
+$printit++ unless $nflag;
+
+$\ = "\n"; # automatically add newline on print
+$n=0;
+
+$thumb = 0; # ARM mode by default, not Thumb.
+
+LINE:
+while (<>) {
+
+ # For ADRLs we need to add a new line after the substituted one.
+ $addPadding = 0;
+
+ # First, we do not dare to touch *anything* inside double quotes, do we?
+ # Second, if you want a dollar character in the string,
+ # insert two of them -- that's how ARM C and assembler treat strings.
+ s/^([A-Za-z_]\w*)[ \t]+DCB[ \t]*\"/$1: .ascii \"/ && do { s/\$\$/\$/g; next };
+ s/\bDCB\b[ \t]*\"/.ascii \"/ && do { s/\$\$/\$/g; next };
+ s/^(\S+)\s+RN\s+(\S+)/$1 .req r$2/ && do { s/\$\$/\$/g; next };
+ # If there's nothing on a line but a comment, don't try to apply any further
+ # substitutions (this is a cheap hack to avoid mucking up the license header)
+ s/^([ \t]*);/$1@/ && do { s/\$\$/\$/g; next };
+ # If substituted -- leave immediately !
+
+ s/@/,:/;
+ s/;/@/;
+ while ( /@.*'/ ) {
+ s/(@.*)'/$1/g;
+ }
+ s/\{FALSE\}/0/g;
+ s/\{TRUE\}/1/g;
+ s/\{(\w\w\w\w+)\}/$1/g;
+ s/\bINCLUDE[ \t]*([^ \t\n]+)/.include \"$1\"/;
+ s/\bGET[ \t]*([^ \t\n]+)/.include \"${ my $x=$1; $x =~ s|\.s|-gnu.S|; \$x }\"/;
+ s/\bIMPORT\b/.extern/;
+ s/\bEXPORT\b/.global/;
+ s/^(\s+)\[/$1IF/;
+ s/^(\s+)\|/$1ELSE/;
+ s/^(\s+)\]/$1ENDIF/;
+ s/IF *:DEF:/ .ifdef/;
+ s/IF *:LNOT: *:DEF:/ .ifndef/;
+ s/ELSE/ .else/;
+ s/ENDIF/ .endif/;
+
+ if( /\bIF\b/ ) {
+ s/\bIF\b/ .if/;
+ s/=/==/;
+ }
+ if ( $n == 2) {
+ s/\$/\\/g;
+ }
+ if ($n == 1) {
+ s/\$//g;
+ s/label//g;
+ $n = 2;
+ }
+ if ( /MACRO/ ) {
+ s/MACRO *\n/.macro/;
+ $n=1;
+ }
+ if ( /\bMEND\b/ ) {
+ s/\bMEND\b/.endm/;
+ $n=0;
+ }
+
+ # ".rdata" doesn't work in 'as' version 2.13.2, as it is ".rodata" there.
+ #
+ if ( /\bAREA\b/ ) {
+ s/^(.+)CODE(.+)READONLY(.*)/ .text/;
+ s/^(.+)DATA(.+)READONLY(.*)/ .section .rdata\n .align 2/;
+ s/^(.+)\|\|\.data\|\|(.+)/ .data\n .align 2/;
+ s/^(.+)\|\|\.bss\|\|(.+)/ .bss/;
+ }
+
+ s/\|\|\.constdata\$(\d+)\|\|/.L_CONST$1/; # ||.constdata$3||
+ s/\|\|\.bss\$(\d+)\|\|/.L_BSS$1/; # ||.bss$2||
+ s/\|\|\.data\$(\d+)\|\|/.L_DATA$1/; # ||.data$2||
+ s/\|\|([a-zA-Z0-9_]+)\@([a-zA-Z0-9_]+)\|\|/@ $&/;
+ s/^(\s+)\%(\s)/ .space $1/;
+
+ s/\|(.+)\.(\d+)\|/\.$1_$2/; # |L80.123| -> .L80_123
+ s/\bCODE32\b/.code 32/ && do {$thumb = 0};
+ s/\bCODE16\b/.code 16/ && do {$thumb = 1};
+ if (/\bPROC\b/)
+ {
+ print " .thumb_func" if ($thumb);
+ s/\bPROC\b/@ $&/;
+ }
+ s/^(\s*)(S|Q|SH|U|UQ|UH)ASX\b/$1$2ADDSUBX/;
+ s/^(\s*)(S|Q|SH|U|UQ|UH)SAX\b/$1$2SUBADDX/;
+ s/\bENDP\b/@ $&/;
+ s/\bSUBT\b/@ $&/;
+ s/\bDATA\b/@ $&/; # DATA directive is deprecated -- Asm guide, p.7-25
+ s/\bKEEP\b/@ $&/;
+ s/\bEXPORTAS\b/@ $&/;
+ s/\|\|(.)+\bEQU\b/@ $&/;
+ s/\|\|([\w\$]+)\|\|/$1/;
+ s/\bENTRY\b/@ $&/;
+ s/\bASSERT\b/@ $&/;
+ s/\bGBLL\b/@ $&/;
+ s/\bGBLA\b/@ $&/;
+ s/^\W+OPT\b/@ $&/;
+ s/:OR:/|/g;
+ s/:SHL:/<</g;
+ s/:SHR:/>>/g;
+ s/:AND:/&/g;
+ s/:LAND:/&&/g;
+ s/CPSR/cpsr/;
+ s/SPSR/spsr/;
+ s/ALIGN$/.balign 4/;
+ s/ALIGN\s+([0-9x]+)$/.balign $1/;
+ s/psr_cxsf/psr_all/;
+ s/LTORG/.ltorg/;
+ s/^([A-Za-z_]\w*)[ \t]+EQU/ .set $1,/;
+ s/^([A-Za-z_]\w*)[ \t]+SETL/ .set $1,/;
+ s/^([A-Za-z_]\w*)[ \t]+SETA/ .set $1,/;
+ s/^([A-Za-z_]\w*)[ \t]+\*/ .set $1,/;
+
+ # {PC} + 0xdeadfeed --> . + 0xdeadfeed
+ s/\{PC\} \+/ \. +/;
+
+ # Single hex constant on the line !
+ #
+ # >>> NOTE <<<
+ # Double-precision floats in gcc are always mixed-endian, which means
+ # bytes in two words are little-endian, but words are big-endian.
+ # So, 0x0000deadfeed0000 would be stored as 0x0000dead at low address
+ # and 0xfeed0000 at high address.
+ #
+ s/\bDCFD\b[ \t]+0x([a-fA-F0-9]{8})([a-fA-F0-9]{8})/.long 0x$1, 0x$2/;
+ # Only decimal constants on the line, no hex !
+ s/\bDCFD\b[ \t]+([0-9\.\-]+)/.double $1/;
+
+ # Single hex constant on the line !
+# s/\bDCFS\b[ \t]+0x([a-f0-9]{8})([a-f0-9]{8})/.long 0x$1, 0x$2/;
+ # Only decimal constants on the line, no hex !
+# s/\bDCFS\b[ \t]+([0-9\.\-]+)/.double $1/;
+ s/\bDCFS[ \t]+0x/.word 0x/;
+ s/\bDCFS\b/.float/;
+
+ s/^([A-Za-z_]\w*)[ \t]+DCD/$1 .word/;
+ s/\bDCD\b/.word/;
+ s/^([A-Za-z_]\w*)[ \t]+DCW/$1 .short/;
+ s/\bDCW\b/.short/;
+ s/^([A-Za-z_]\w*)[ \t]+DCB/$1 .byte/;
+ s/\bDCB\b/.byte/;
+ s/^([A-Za-z_]\w*)[ \t]+\%/.comm $1,/;
+ s/^[A-Za-z_\.]\w+/$&:/;
+ s/^(\d+)/$1:/;
+ s/\%(\d+)/$1b_or_f/;
+ s/\%[Bb](\d+)/$1b/;
+ s/\%[Ff](\d+)/$1f/;
+ s/\%[Ff][Tt](\d+)/$1f/;
+ s/&([\dA-Fa-f]+)/0x$1/;
+ if ( /\b2_[01]+\b/ ) {
+ s/\b2_([01]+)\b/conv$1&&&&/g;
+ while ( /[01][01][01][01]&&&&/ ) {
+ s/0000&&&&/&&&&0/g;
+ s/0001&&&&/&&&&1/g;
+ s/0010&&&&/&&&&2/g;
+ s/0011&&&&/&&&&3/g;
+ s/0100&&&&/&&&&4/g;
+ s/0101&&&&/&&&&5/g;
+ s/0110&&&&/&&&&6/g;
+ s/0111&&&&/&&&&7/g;
+ s/1000&&&&/&&&&8/g;
+ s/1001&&&&/&&&&9/g;
+ s/1010&&&&/&&&&A/g;
+ s/1011&&&&/&&&&B/g;
+ s/1100&&&&/&&&&C/g;
+ s/1101&&&&/&&&&D/g;
+ s/1110&&&&/&&&&E/g;
+ s/1111&&&&/&&&&F/g;
+ }
+ s/000&&&&/&&&&0/g;
+ s/001&&&&/&&&&1/g;
+ s/010&&&&/&&&&2/g;
+ s/011&&&&/&&&&3/g;
+ s/100&&&&/&&&&4/g;
+ s/101&&&&/&&&&5/g;
+ s/110&&&&/&&&&6/g;
+ s/111&&&&/&&&&7/g;
+ s/00&&&&/&&&&0/g;
+ s/01&&&&/&&&&1/g;
+ s/10&&&&/&&&&2/g;
+ s/11&&&&/&&&&3/g;
+ s/0&&&&/&&&&0/g;
+ s/1&&&&/&&&&1/g;
+ s/conv&&&&/0x/g;
+ }
+
+ if ( /commandline/)
+ {
+ if( /-bigend/)
+ {
+ $bigend=1;
+ }
+ }
+
+ if ( /\bDCDU\b/ )
+ {
+ my $cmd=$_;
+ my $value;
+ my $w1;
+ my $w2;
+ my $w3;
+ my $w4;
+
+ s/\s+DCDU\b/@ $&/;
+
+ $cmd =~ /\bDCDU\b\s+0x(\d+)/;
+ $value = $1;
+ $value =~ /(\w\w)(\w\w)(\w\w)(\w\w)/;
+ $w1 = $1;
+ $w2 = $2;
+ $w3 = $3;
+ $w4 = $4;
+
+ if( $bigend ne "")
+ {
+ # big endian
+
+ print " .byte 0x".$w1;
+ print " .byte 0x".$w2;
+ print " .byte 0x".$w3;
+ print " .byte 0x".$w4;
+ }
+ else
+ {
+ # little endian
+
+ print " .byte 0x".$w4;
+ print " .byte 0x".$w3;
+ print " .byte 0x".$w2;
+ print " .byte 0x".$w1;
+ }
+
+ }
+
+
+ if ( /\badrl\b/i )
+ {
+ s/\badrl\s+(\w+)\s*,\s*(\w+)/ldr $1,=$2/i;
+ $addPadding = 1;
+ }
+ s/\bEND\b/@ END/;
+} continue {
+ printf ("%s", $_) if $printit;
+ if ($addPadding != 0)
+ {
+ printf (" mov r0,r0\n");
+ $addPadding = 0;
+ }
+}
+
Added: experimental/derf/theora-ptalarbvorm/lib/arm/armbits.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armbits.h (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armbits.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,32 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armbits_H)
+# define _arm_armbits_H (1)
+# include "../bitpack.h"
+# include "armcpu.h"
+
+# if defined(OC_ARM_ASM)
+# define oc_pack_read oc_pack_read_arm
+# define oc_pack_read1 oc_pack_read1_arm
+# define oc_huff_token_decode oc_huff_token_decode_arm
+# endif
+
+long oc_pack_read_arm(oc_pack_buf *_b,int _bits);
+int oc_pack_read1_arm(oc_pack_buf *_b);
+int oc_huff_token_decode_arm(oc_pack_buf *_b,const ogg_int16_t *_tree);
+
+#endif
Added: experimental/derf/theora-ptalarbvorm/lib/arm/armbits.s
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armbits.s (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armbits.s 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,227 @@
+;********************************************************************
+;* *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+;* *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;* *
+;********************************************************************
+;
+; function:
+; last mod: $Id: armbits.c 17344 2010-07-21 01:42:18Z tterribe $
+;
+;********************************************************************
+
+ AREA |.text|, CODE, READONLY
+
+ EXPORT oc_pack_read_arm
+ EXPORT oc_pack_read1_arm
+ EXPORT oc_huff_token_decode_arm
+
+oc_pack_read_arm
+ ; r0 = oc_pack_buf *_b
+ ; r1 = int _bits
+ ADD r12,r0,#8
+ LDMIA r12,{r2,r3} ; r2 = window
+ ; Stall... ; r3 = available
+ ; Stall...
+ SUBS r3,r3,r1 ; r3 = available-_bits, available<_bits => LT
+ BLT oc_pack_read_refill
+ RSB r0,r1,#32 ; r0 = 32-_bits
+ MOV r0,r2,LSR r0 ; r0 = window>>32-_bits
+ MOV r2,r2,LSL r1 ; r2 = window<<=_bits
+ STMIA r12,{r2,r3} ; window = r2
+ ; available = r3
+ MOV PC,r14
+
+oc_pack_read1_arm
+ ; r0 = oc_pack_buf *_b
+ ADD r12,r0,#8
+ LDMIA r12,{r2,r3} ; r2 = window
+ ; Stall... ; r3 = available
+ ; Stall...
+ SUBS r3,r3,#1 ; r3 = available-1, available<1 => LT
+ BLT oc_pack_read1_refill
+ MOV r0,r2,LSR #31 ; r0 = window>>31
+ MOV r2,r2,LSL #1 ; r2 = window<<=1
+ STMIA r12,{r2,r3} ; window = r2
+ ; available = r3
+ MOV PC,r14
+
+; We need to refill window.
+oc_pack_read1_refill
+ MOV r1,#1
+oc_pack_read_refill
+ STMFD r13!,{r10,r11,r14}
+ LDMIA r0,{r10,r11} ; r10 = stop
+ ; r11 = ptr
+ RSB r0,r1,#32 ; r0 = 32-_bits
+ RSB r3,r3,r0 ; r3 = 32-available
+; We can use unsigned compares for both the pointers and for available
+; (allowing us to chain condition codes) because available will never be
+; larger than 32 (or we wouldn't be here), and thus 32-available will never be
+; negative.
+ CMP r10,r11 ; ptr<stop => HI
+ CMPHI r3,#7 ; available<=24 => HI
+ LDRHIB r14,[r11],#1 ; r14 = *ptr++
+ SUBHI r3,#8 ; available += 8
+ ; (HI) Stall...
+ ORRHI r2,r14,LSL r3 ; r2 = window|=r14<<32-available
+ CMPHI r10,r11 ; ptr<stop => HI
+ CMPHI r3,#7 ; available<=24 => HI
+ LDRHIB r14,[r11],#1 ; r14 = *ptr++
+ SUBHI r3,#8 ; available += 8
+ ; (HI) Stall...
+ ORRHI r2,r14,LSL r3 ; r2 = window|=r14<<32-available
+ CMPHI r10,r11 ; ptr<stop => HI
+ CMPHI r3,#7 ; available<=24 => HI
+ LDRHIB r14,[r11],#1 ; r14 = *ptr++
+ SUBHI r3,#8 ; available += 8
+ ; (HI) Stall...
+ ORRHI r2,r14,LSL r3 ; r2 = window|=r14<<32-available
+ CMPHI r10,r11 ; ptr<stop => HI
+ CMPHI r3,#7 ; available<=24 => HI
+ LDRHIB r14,[r11],#1 ; r14 = *ptr++
+ SUBHI r3,#8 ; available += 8
+ ; (HI) Stall...
+ ORRHI r2,r14,LSL r3 ; r2 = window|=r14<<32-available
+ SUBS r3,r0,r3 ; r3 = available-=_bits, available<_bits => LT
+ BLT oc_pack_read_refill_last
+ MOV r0,r2,LSR r0 ; r0 = window>>32-_bits
+ MOV r2,r2,LSL r1 ; r2 = window<<=_bits
+ STR r11,[r12,#-4] ; ptr = r11
+ STMIA r12,{r2,r3} ; window = r2
+ ; available = r3
+ LDMFD r13!,{r10,r11,PC}
+
+; Either we wanted to read more than 24 bits and didn't have enough room to
+; stuff the last byte into the window, or we hit the end of the packet.
+oc_pack_read_refill_last
+ CMP r11,r10 ; ptr<stop => LO
+; If we didn't hit the end of the packet, then pull enough of the next byte to
+; to fill up the window.
+ LDRLOB r14,[r11] ; (LO) r14 = *ptr
+; Otherwise, set the EOF flag and pretend we have lots of available bits.
+ MOVHS r14,#1 ; (HS) r14 = 1
+ ADDLO r10,r3,r1 ; (LO) r10 = available
+ STRHS r14,[r12,#8] ; (HS) eof = 1
+ ANDLO r10,r10,#7 ; (LO) r10 = available&7
+ MOVHS r3,#1<<30 ; (HS) available = OC_LOTS_OF_BITS
+ ORRLO r2,r14,LSL r10 ; (LO) r2 = window|=*ptr>>(available&7)
+ MOV r0,r2,LSR r0 ; r0 = window>>32-_bits
+ MOV r2,r2,LSL r1 ; r2 = window<<=_bits
+ STR r11,[r12,#-4] ; ptr = r11
+ STMIA r12,{r2,r3} ; window = r2
+ ; available = r3
+ LDMFD r13!,{r10,r11,PC}
+
+
+
+oc_huff_token_decode_arm
+ ; r0 = oc_pack_buf *_b
+ ; r1 = const ogg_int16_t *_tree
+ STMFD r13!,{r4,r5,r10,r14}
+ LDRSH r10,[r1] ; r10 = n=_tree[0]
+ LDMIA r0,{r2-r5} ; r2 = stop
+ ; Stall... ; r3 = ptr
+ ; Stall... ; r4 = window
+ ; r5 = available
+ CMP r10,r5 ; n>available => GT
+ BGT oc_huff_token_decode_refill0
+ RSB r14,r10,#32 ; r14 = 32-n
+ MOV r14,r4,LSR r14 ; r14 = bits=window>>32-n
+ ADD r14,r1,r14,LSL #1 ; r14 = _tree+bits
+ LDRSH r12,[r14,#2] ; r12 = node=_tree[1+bits]
+ ; Stall...
+ ; Stall...
+ RSBS r14,r12,#0 ; r14 = -node, node>0 => MI
+ BMI oc_huff_token_decode_continue
+ MOV r10,r14,LSR #8 ; r10 = n=node>>8
+ MOV r4,r4,LSL r10 ; r4 = window<<=n
+ SUB r5,r10 ; r5 = available-=n
+ STMIB r0,{r3-r5} ; ptr = r3
+ ; window = r4
+ ; available = r5
+ AND r0,r14,#255 ; r0 = node&255
+ LDMFD r13!,{r4,r5,r10,pc}
+
+; The first tree node wasn't enough to reach a leaf, read another
+oc_huff_token_decode_continue
+ ADD r12,r1,r12,LSL #1 ; r12 = _tree+node
+ MOV r4,r4,LSL r10 ; r4 = window<<=n
+ SUB r5,r5,r10 ; r5 = available-=n
+ LDRSH r10,[r12],#2 ; r10 = n=_tree[node]
+ ; Stall... ; r12 = _tree+node+1
+ ; Stall...
+ CMP r10,r5 ; n>available => GT
+ BGT oc_huff_token_decode_refill
+ RSB r14,r10,#32 ; r14 = 32-n
+ MOV r14,r4,LSR r14 ; r14 = bits=window>>32-n
+ ADD r12,r12,r14 ;
+ LDRSH r12,[r12,r14] ; r12 = node=_tree[node+1+bits]
+ ; Stall...
+ ; Stall...
+ RSBS r14,r12,#0 ; r14 = -node, node>0 => MI
+ BMI oc_huff_token_decode_continue
+ MOV r10,r14,LSR #8 ; r10 = n=node>>8
+ MOV r4,r4,LSL r10 ; r4 = window<<=n
+ SUB r5,r10 ; r5 = available-=n
+ STMIB r0,{r3-r5} ; ptr = r3
+ ; window = r4
+ ; available = r5
+ AND r0,r14,#255 ; r0 = node&255
+ LDMFD r13!,{r4,r5,r10,pc}
+
+oc_huff_token_decode_refill0
+ ADD r12,r1,#2 ; r12 = _tree+1
+oc_huff_token_decode_refill
+; We can't possibly need more than 15 bits, so available must be <= 15.
+; Therefore we can load at least two bytes without checking it.
+ CMP r2,r3 ; ptr<stop => HI
+ LDRHIB r14,[r3],#1 ; r14 = *ptr++
+ RSBHI r5,r5,#24 ; (HI) available = 32-(available+=8)
+ RSBLS r5,r5,#32 ; (LS) r5 = 32-available
+ ORRHI r4,r14,LSL r5 ; r4 = window|=r14<<32-available
+ CMPHI r2,r3 ; ptr<stop => HI
+ LDRHIB r14,[r3],#1 ; r14 = *ptr++
+ SUBHI r5,#8 ; available += 8
+ ; (HI) Stall...
+ ORRHI r4,r14,LSL r5 ; r4 = window|=r14<<32-available
+; We can use unsigned compares for both the pointers and for available
+; (allowing us to chain condition codes) because available will never be
+; larger than 32 (or we wouldn't be here), and thus 32-available will never be
+; negative.
+ CMPHI r2,r3 ; ptr<stop => HI
+ CMPHI r5,#7 ; available<=24 => HI
+ LDRHIB r14,[r3],#1 ; r14 = *ptr++
+ SUBHI r5,#8 ; available += 8
+ ; (HI) Stall...
+ ORRHI r4,r14,LSL r5 ; r4 = window|=r14<<32-available
+ CMP r2,r3 ; ptr<stop => HI
+ MOVLS r5,#-1<<30 ; (LS) available = OC_LOTS_OF_BITS+32
+ CMPHI r5,#7 ; (HI) available<=24 => HI
+ LDRHIB r14,[r3],#1 ; (HI) r14 = *ptr++
+ SUBHI r5,#8 ; (HI) available += 8
+ ; (HI) Stall...
+ ORRHI r4,r14,LSL r5 ; (HI) r4 = window|=r14<<32-available
+ RSB r14,r10,#32 ; r14 = 32-n
+ MOV r14,r4,LSR r14 ; r14 = bits=window>>32-n
+ ADD r12,r12,r14 ;
+ LDRSH r12,[r12,r14] ; r12 = node=_tree[node+1+bits]
+ RSB r5,r5,#32 ; r5 = available
+ ; Stall...
+ RSBS r14,r12,#0 ; r14 = -node, node>0 => MI
+ BMI oc_huff_token_decode_continue
+ MOV r10,r14,LSR #8 ; r10 = n=node>>8
+ MOV r4,r4,LSL r10 ; r4 = window<<=n
+ SUB r5,r10 ; r5 = available-=n
+ STMIB r0,{r3-r5} ; ptr = r3
+ ; window = r4
+ ; available = r5
+ AND r0,r14,#255 ; r0 = node&255
+ LDMFD r13!,{r4,r5,r10,pc}
+
+ END
Added: experimental/derf/theora-ptalarbvorm/lib/arm/armcpu.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armcpu.c (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armcpu.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,116 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ CPU capability detection for ARM processors.
+
+ function:
+ last mod: $Id: cpu.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+
+#include "armcpu.h"
+
+#if !defined(OC_ARM_ASM)|| \
+ !defined(OC_ARM_ASM_EDSP)&&!defined(OC_ARM_ASM_ARMV6)&& \
+ !defined(OC_ARM_ASM_NEON)
+ogg_uint32_t oc_cpu_flags_get(void){
+ return 0;
+}
+
+#elif defined(_MSC_VER)
+/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/
+# define WIN32_LEAN_AND_MEAN
+# define WIN32_EXTRA_LEAN
+# include <windows.h>
+
+ogg_uint32_t oc_cpu_flags_get(void){
+ ogg_uint32_t flags;
+ flags=0;
+ /*MSVC has no inline __asm support for ARM, but it does let you __emit
+ instructions via their assembled hex code.
+ All of these instructions should be essentially nops.*/
+# if defined(OC_ARM_ASM_EDSP)
+ __try{
+ /*PLD [r13]*/
+ __emit(0xF5DDF000);
+ flags|=OC_CPU_ARM_EDSP;
+ }
+ __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+ /*Ignore exception.*/
+ }
+# if defined(OC_ARM_ASM_MEDIA)
+ __try{
+ /*SHADD8 r3,r3,r3*/
+ __emit(0xE6333F93);
+ flags|=OC_CPU_ARM_MEDIA;
+ }
+ __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+ /*Ignore exception.*/
+ }
+# if defined(OC_ARM_ASM_NEON)
+ __try{
+ /*VORR q0,q0,q0*/
+ __emit(0xF2200150);
+ flags|=OC_CPU_ARM_NEON;
+ }
+ __except(GetExceptionCode()==EXCEPTION_ILLEGAL_INSTRUCTION){
+ /*Ignore exception.*/
+ }
+# endif
+# endif
+# endif
+ return flags;
+}
+
+#elif defined(__linux__)
+# include <stdio.h>
+# include <stdlib.h>
+# include <string.h>
+
+ogg_uint32_t oc_cpu_flags_get(void){
+ ogg_uint32_t flags;
+ FILE *fin;
+ flags=0;
+ /*Reading /proc/self/auxv would be easier, but that doesn't work reliably on
+ Android.
+ This also means that detection will fail in Scratchbox.*/
+ fin=fopen("/proc/cpuinfo","r");
+ if(fin!=NULL){
+ /*512 should be enough for anybody (it's even enough for all the flags that
+ x86 has accumulated... so far).*/
+ char buf[512];
+ while(fgets(buf,511,fin)!=NULL){
+ if(memcmp(buf,"Features",8)==0){
+ char *p;
+ p=strstr(buf," edsp");
+ if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_EDSP;
+ p=strstr(buf," neon");
+ if(p!=NULL&&(p[5]==' '||p[5]=='\n'))flags|=OC_CPU_ARM_NEON;
+ }
+ if(memcmp(buf,"CPU architecture:",17)==0){
+ int version;
+ version=atoi(buf+17);
+ if(version>=6)flags|=OC_CPU_ARM_MEDIA;
+ }
+ }
+ fclose(fin);
+ }
+ return flags;
+}
+
+#else
+/*The feature registers which can tell us what the processor supports are
+ accessible in priveleged modes only, so we can't have a general user-space
+ detection method like on x86.*/
+# error "Configured to use ARM asm but no CPU detection method available for " \
+ "your platform. Reconfigure with --disable-asm (or send patches)."
+#endif
Added: experimental/derf/theora-ptalarbvorm/lib/arm/armcpu.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armcpu.h (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armcpu.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,29 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+ function:
+ last mod: $Id: cpu.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_arm_armcpu_H)
+# define _arm_armcpu_H (1)
+#include "../internal.h"
+
+/*"Parallel instructions" from ARM v6 and above.*/
+#define OC_CPU_ARM_MEDIA (1<<24)
+/*Flags chosen to match arch/arm/include/asm/hwcap.h in the Linux kernel.*/
+#define OC_CPU_ARM_EDSP (1<<7)
+#define OC_CPU_ARM_NEON (1<<12)
+
+ogg_uint32_t oc_cpu_flags_get(void);
+
+#endif
Copied: experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s (from rev 17378, branches/theorarm-merge-branch/lib/arm/ARMfrag.s)
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armfrag.s 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,633 @@
+;********************************************************************
+;* *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+;* *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;* *
+;********************************************************************
+; Original implementation:
+; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: cpu.h 17344 2010-07-21 01:42:18Z tterribe $
+;********************************************************************
+
+ AREA |.text|, CODE, READONLY
+
+ GET armopts.s
+
+; Vanilla ARM v4 versions
+ EXPORT oc_frag_copy_list_arm
+ EXPORT oc_frag_recon_intra_arm
+ EXPORT oc_frag_recon_inter_arm
+ EXPORT oc_frag_recon_inter2_arm
+
+oc_frag_copy_list_arm
+ ; r0 = _dst_frame
+ ; r1 = _src_frame
+ ; r2 = _ystride
+ ; r3 = _fragis
+ ; <> = _nfragis
+ ; <> = _frag_buf_offs
+ LDR r12,[r13] ; r12 = _nfragis
+ STMFD r13!,{r4-r6,r11,r14}
+ SUBS r12, r12, #1
+ LDR r4,[r3],#4 ; r4 = _fragis[fragii]
+ LDRGE r14,[r13,#4*6] ; r14 = _frag_buf_offs
+ BLT ofcl_arm_end
+ SUB r2, r2, #4
+ofcl_arm_lp
+ LDR r11,[r14,r4,LSL #2] ; r11 = _frag_buf_offs[_fragis[fragii]]
+ SUBS r12, r12, #1
+ ; Stall (on XScale)
+ ADD r4, r1, r11 ; r4 = _src_frame+frag_buf_off
+ LDR r6, [r4], #4
+ ADD r11,r0, r11 ; r11 = _dst_frame+frag_buf_off
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4], r2
+ STR r6, [r11],#4
+ LDR r6, [r4], #4
+ STR r5, [r11],r2
+ LDR r5, [r4]
+ LDRGE r4,[r3],#4 ; r4 = _fragis[fragii]
+ STR r6, [r11],#4
+ STR r5, [r11]
+ BGE ofcl_arm_lp
+ofcl_arm_end
+ LDMFD r13!,{r4-r6,r11,PC}
+oc_frag_recon_intra_arm
+ ; r0 = unsigned char *_dst
+ ; r1 = int _ystride
+ ; r2 = const ogg_int16_t _residue[64]
+ STMFD r13!,{r4,r5,r14}
+ MOV r14,#8
+ MOV r5, #255
+ SUB r1, r1, #7
+ofrintra_lp_arm
+ LDRSH r3, [r2], #2
+ LDRSH r4, [r2], #2
+ LDRSH r12,[r2], #2
+ ADDS r3, r3, #128
+ CMPGT r5, r3
+ EORLT r3, r5, r3, ASR #32
+ STRB r3, [r0], #1
+ ADDS r4, r4, #128
+ CMPGT r5, r4
+ EORLT r4, r5, r4, ASR #32
+ LDRSH r3, [r2], #2
+ STRB r4, [r0], #1
+ ADDS r12,r12,#128
+ CMPGT r5, r12
+ EORLT r12,r5, r12,ASR #32
+ LDRSH r4, [r2], #2
+ STRB r12,[r0], #1
+ ADDS r3, r3, #128
+ CMPGT r5, r3
+ EORLT r3, r5, r3, ASR #32
+ LDRSH r12,[r2], #2
+ STRB r3, [r0], #1
+ ADDS r4, r4, #128
+ CMPGT r5, r4
+ EORLT r4, r5, r4, ASR #32
+ LDRSH r3, [r2], #2
+ STRB r4, [r0], #1
+ ADDS r12,r12,#128
+ CMPGT r5, r12
+ EORLT r12,r5, r12,ASR #32
+ LDRSH r4, [r2], #2
+ STRB r12,[r0], #1
+ ADDS r3, r3, #128
+ CMPGT r5, r3
+ EORLT r3, r5, r3, ASR #32
+ STRB r3, [r0], #1
+ ADDS r4, r4, #128
+ CMPGT r5, r4
+ EORLT r4, r5, r4, ASR #32
+ STRB r4, [r0], r1
+ SUBS r14,r14,#1
+ BGT ofrintra_lp_arm
+ LDMFD r13!,{r4,r5,PC}
+
+oc_frag_recon_inter_arm
+ ; r0 = unsigned char *dst
+ ; r1 = const unsigned char *src
+ ; r2 = int ystride
+ ; r3 = const ogg_int16_t residue[64]
+ STMFD r13!,{r5,r9-r11,r14}
+ MOV r9, #8
+ MOV r5, #255
+ SUB r2, r2, #7
+ofrinter_lp_arm
+ LDRSH r12,[r3], #2
+ LDRB r14,[r1], #1
+ LDRSH r11,[r3], #2
+ LDRB r10,[r1], #1
+ ADDS r12,r12,r14
+ CMPGT r5, r12
+ EORLT r12,r5, r12,ASR #32
+ STRB r12,[r0], #1
+ ADDS r11,r11,r10
+ CMPGT r5, r11
+ LDRSH r12,[r3], #2
+ LDRB r14,[r1], #1
+ EORLT r11,r5, r11,ASR #32
+ STRB r11,[r0], #1
+ ADDS r12,r12,r14
+ CMPGT r5, r12
+ LDRSH r11,[r3], #2
+ LDRB r10,[r1], #1
+ EORLT r12,r5, r12,ASR #32
+ STRB r12,[r0], #1
+ ADDS r11,r11,r10
+ CMPGT r5, r11
+ LDRSH r12,[r3], #2
+ LDRB r14,[r1], #1
+ EORLT r11,r5, r11,ASR #32
+ STRB r11,[r0], #1
+ ADDS r12,r12,r14
+ CMPGT r5, r12
+ LDRSH r11,[r3], #2
+ LDRB r10,[r1], #1
+ EORLT r12,r5, r12,ASR #32
+ STRB r12,[r0], #1
+ ADDS r11,r11,r10
+ CMPGT r5, r11
+ LDRSH r12,[r3], #2
+ LDRB r14,[r1], #1
+ EORLT r11,r5, r11,ASR #32
+ STRB r11,[r0], #1
+ ADDS r12,r12,r14
+ CMPGT r5, r12
+ LDRSH r11,[r3], #2
+ LDRB r10,[r1], r2
+ EORLT r12,r5, r12,ASR #32
+ STRB r12,[r0], #1
+ ADDS r11,r11,r10
+ CMPGT r5, r11
+ EORLT r11,r5, r11,ASR #32
+ STRB r11,[r0], r2
+ SUBS r9, r9, #1
+ BGT ofrinter_lp_arm
+ LDMFD r13!,{r5,r9-r11,PC}
+
+oc_frag_recon_inter2_arm
+ ; r0 = unsigned char *dst
+ ; r1 = const unsigned char *src1
+ ; r2 = const unsigned char *src2
+ ; r3 = int ystride
+ LDR r12,[r13]
+ ; r12= const ogg_int16_t residue[64]
+ STMFD r13!,{r4-r8,r14}
+ MOV r14,#8
+ MOV r8, #255
+ SUB r3, r3, #7
+ofrinter2_lp_arm
+ LDRB r5, [r1], #1
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ LDRB r7, [r1], #1
+ ADD r5, r5, r6
+ ADDS r5, r4, r5, LSR #1
+ CMPGT r8, r5
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ EORLT r5, r8, r5, ASR #32
+ STRB r5, [r0], #1
+ ADD r7, r7, r6
+ ADDS r7, r4, r7, LSR #1
+ CMPGT r8, r7
+ LDRB r5, [r1], #1
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ EORLT r7, r8, r7, ASR #32
+ STRB r7, [r0], #1
+ ADD r5, r5, r6
+ ADDS r5, r4, r5, LSR #1
+ CMPGT r8, r5
+ LDRB r7, [r1], #1
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ EORLT r5, r8, r5, ASR #32
+ STRB r5, [r0], #1
+ ADD r7, r7, r6
+ ADDS r7, r4, r7, LSR #1
+ CMPGT r8, r7
+ LDRB r5, [r1], #1
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ EORLT r7, r8, r7, ASR #32
+ STRB r7, [r0], #1
+ ADD r5, r5, r6
+ ADDS r5, r4, r5, LSR #1
+ CMPGT r8, r5
+ LDRB r7, [r1], #1
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ EORLT r5, r8, r5, ASR #32
+ STRB r5, [r0], #1
+ ADD r7, r7, r6
+ ADDS r7, r4, r7, LSR #1
+ CMPGT r8, r7
+ LDRB r5, [r1], #1
+ LDRB r6, [r2], #1
+ LDRSH r4, [r12],#2
+ EORLT r7, r8, r7, ASR #32
+ STRB r7, [r0], #1
+ ADD r5, r5, r6
+ ADDS r5, r4, r5, LSR #1
+ CMPGT r8, r5
+ LDRB r7, [r1], r3
+ LDRB r6, [r2], r3
+ LDRSH r4, [r12],#2
+ EORLT r5, r8, r5, ASR #32
+ STRB r5, [r0], #1
+ ADD r7, r7, r6
+ ADDS r7, r4, r7, LSR #1
+ CMPGT r8, r7
+ EORLT r7, r8, r7, ASR #32
+ STRB r7, [r0], r3
+ SUBS r14,r14,#1
+ BGT ofrinter2_lp_arm
+ LDMFD r13!,{r4-r8,PC}
+
+ [ OC_ARM_ASM_EDSP
+ EXPORT oc_frag_copy_list_edsp
+
+oc_frag_copy_list_edsp
+ ; r0 = _dst_frame
+ ; r1 = _src_frame
+ ; r2 = _ystride
+ ; r3 = _fragis
+ ; <> = _nfragis
+ ; <> = _frag_buf_offs
+ LDR r12,[r13] ; r12 = _nfragis
+ STMFD r13!,{r4-r11,r14}
+ SUBS r12, r12, #1
+ LDRGE r5, [r3],#4 ; r5 = _fragis[fragii]
+ LDRGE r14,[r13,#4*10] ; r14 = _frag_buf_offs
+ BLT ofcl_edsp_end
+ofcl_edsp_lp
+ MOV r4, r1
+ LDR r5, [r14,r5, LSL #2] ; r5 = _frag_buf_offs[_fragis[fragii]]
+ SUBS r12, r12, #1
+ ; Stall (on XScale)
+ LDRD r6, [r4, r5]! ; r4 = _src_frame+frag_buf_off
+ LDRD r8, [r4, r2]!
+ ; Stall
+ STRD r6, [r5, r0]! ; r5 = _dst_frame+frag_buf_off
+ STRD r8, [r5, r2]!
+ ; Stall
+ LDRD r6, [r4, r2]! ; On Xscale at least, doing 3 consecutive
+ LDRD r8, [r4, r2]! ; loads causes a stall, but that's no worse
+ LDRD r10,[r4, r2]! ; than us only doing 2, and having to do
+ ; another pair of LDRD/STRD later on.
+ ; Stall
+ STRD r6, [r5, r2]!
+ STRD r8, [r5, r2]!
+ STRD r10,[r5, r2]!
+ LDRD r6, [r4, r2]!
+ LDRD r8, [r4, r2]!
+ LDRD r10,[r4, r2]!
+ STRD r6, [r5, r2]!
+ STRD r8, [r5, r2]!
+ STRD r10,[r5, r2]!
+ LDRGE r5, [r3],#4 ; r5 = _fragis[fragii]
+ BGE ofcl_edsp_lp
+ofcl_edsp_end
+ LDMFD r13!,{r4-r11,PC}
+ ]
+
+ [ OC_ARM_ASM_MEDIA
+ EXPORT oc_frag_recon_intra_v6
+ EXPORT oc_frag_recon_inter_v6
+ EXPORT oc_frag_recon_inter2_v6
+
+oc_frag_recon_intra_v6
+ ; r0 = unsigned char *_dst
+ ; r1 = int _ystride
+ ; r2 = const ogg_int16_t _residue[64]
+ STMFD r13!,{r4-r6,r14}
+ MOV r14,#8
+ MOV r12,r2
+ LDR r6, =0x00800080
+ofrintra_v6_lp
+ LDRD r2, [r12],#8 ; r2 = 11110000 r3 = 33332222
+ LDRD r4, [r12],#8 ; r4 = 55554444 r5 = 77776666
+ SUBS r14,r14,#1
+ QADD16 r2, r2, r6
+ QADD16 r3, r3, r6
+ QADD16 r4, r4, r6
+ QADD16 r5, r5, r6
+ USAT16 r2, #8, r2 ; r2 = __11__00
+ USAT16 r3, #8, r3 ; r3 = __33__22
+ USAT16 r4, #8, r4 ; r4 = __55__44
+ USAT16 r5, #8, r5 ; r5 = __77__66
+ ORR r2, r2, r2, LSR #8 ; r2 = __111100
+ ORR r3, r3, r3, LSR #8 ; r3 = __333322
+ ORR r4, r4, r4, LSR #8 ; r4 = __555544
+ ORR r5, r5, r5, LSR #8 ; r5 = __777766
+ PKHBT r2, r2, r3, LSL #16 ; r2 = 33221100
+ PKHBT r3, r4, r5, LSL #16 ; r3 = 77665544
+ STRD r2, [r0], r1
+ BGT ofrintra_v6_lp
+ LDMFD r13!,{r4-r6,PC}
+
+oc_frag_recon_inter_v6
+ ; r0 = unsigned char *_dst
+ ; r1 = const unsigned char *_src
+ ; r2 = int _ystride
+ ; r3 = const ogg_int16_t _residue[64]
+ STMFD r13!,{r4-r7,r14}
+ MOV r14,#8
+ofrinter_v6_lp
+ LDRD r6, [r3], #8 ; r6 = 11110000 r7 = 33332222
+ SUBS r14,r14,#1
+ [ OC_ARM_CAN_UNALIGN_LDRD
+ LDRD r4, [r1], r2 ; Unaligned ; r4 = 33221100 r5 = 77665544
+ |
+ LDR r5, [r1, #4]
+ LDR r4, [r1], r2
+ ]
+ PKHBT r12,r6, r7, LSL #16 ; r12= 22220000
+ PKHTB r7, r7, r6, ASR #16 ; r7 = 33331111
+ UXTB16 r6,r4 ; r6 = __22__00
+ UXTB16 r4,r4, ROR #8 ; r4 = __33__11
+ QADD16 r12,r12,r6 ; r12= xx22xx00
+ QADD16 r4, r7, r4 ; r4 = xx33xx11
+ LDRD r6, [r3], #8 ; r6 = 55554444 r7 = 77776666
+ USAT16 r4, #8, r4 ; r4 = __33__11
+ USAT16 r12,#8,r12 ; r12= __22__00
+ ORR r4, r12,r4, LSL #8 ; r4 = 33221100
+ PKHBT r12,r6, r7, LSL #16 ; r12= 66664444
+ PKHTB r7, r7, r6, ASR #16 ; r7 = 77775555
+ UXTB16 r6,r5 ; r6 = __66__44
+ UXTB16 r5,r5, ROR #8 ; r5 = __77__55
+ QADD16 r12,r12,r6 ; r12= xx66xx44
+ QADD16 r5, r7, r5 ; r5 = xx77xx55
+ USAT16 r12,#8, r12 ; r12= __66__44
+ USAT16 r5, #8, r5 ; r5 = __77__55
+ ORR r5, r12,r5, LSL #8 ; r5 = 77665544
+ STRD r4, [r0], r2
+ BGT ofrinter_v6_lp
+ LDMFD r13!,{r4-r7,PC}
+
+oc_frag_recon_inter2_v6
+ ; r0 = unsigned char *_dst
+ ; r1 = const unsigned char *_src1
+ ; r2 = const unsigned char *_src2
+ ; r3 = int _ystride
+ ; Bi-predicted 8x8 fragment reconstruction using ARMv6 media (SIMD)
+ ; instructions: dst = clamp255(((src1+src2)>>1) + residue), one row per
+ ; loop iteration. r4-r9 are saved/restored; r12 and r14 are clobbered.
+ ; NOTE(review): the row loads from _src1/_src2 are unaligned LDRs; this
+ ; relies on ARMv6 unaligned-access support being enabled -- confirm.
+ LDR r12,[r13]
+ ; r12= const ogg_int16_t _residue[64]
+ STMFD r13!,{r4-r9,r14}
+ MOV r14,#8 ; Loop counter: 8 rows.
+ofrinter2_v6_lp
+ LDRD r6, [r12,#8] ; r6 = 55554444 r7 = 77776666
+ SUBS r14,r14,#1
+ LDR r4, [r1, #4] ; Unaligned ; r4 = src1[1] = 77665544
+ LDR r5, [r2, #4] ; Unaligned ; r5 = src2[1] = 77665544
+ PKHBT r8, r6, r7, LSL #16 ; r8 = 66664444
+ PKHTB r9, r7, r6, ASR #16 ; r9 = 77775555
+ UHADD8 r4, r4, r5 ; r4 = (src1[7,6,5,4] + src2[7,6,5,4])>>1
+ UXTB16 r5, r4 ; r5 = __66__44
+ UXTB16 r4, r4, ROR #8 ; r4 = __77__55
+ QADD16 r8, r8, r5 ; r8 = xx66xx44
+ QADD16 r9, r9, r4 ; r9 = xx77xx55
+ LDRD r6,[r12],#16 ; r6 = 33332222 r7 = 11110000
+ USAT16 r8, #8, r8 ; r8 = __66__44
+ LDR r4, [r1], r3 ; Unaligned ; r4 = src1[0] = 33221100
+ USAT16 r9, #8, r9 ; r9 = __77__55
+ LDR r5, [r2], r3 ; Unaligned ; r5 = src2[0] = 33221100
+ ORR r9, r8, r9, LSL #8 ; r9 = 77665544
+ PKHBT r8, r6, r7, LSL #16 ; r8 = 22220000
+ UHADD8 r4, r4, r5 ; r4 = (src1[3,2,1,0] + src2[3,2,1,0])>>1
+ PKHTB r7, r7, r6, ASR #16 ; r7 = 33331111
+ UXTB16 r5, r4 ; r5 = __22__00
+ UXTB16 r4, r4, ROR #8 ; r4 = __33__11
+ QADD16 r8, r8, r5 ; r8 = xx22xx00
+ QADD16 r7, r7, r4 ; r7 = xx33xx11
+ USAT16 r8, #8, r8 ; r8 = __22__00
+ USAT16 r7, #8, r7 ; r7 = __33__11
+ ORR r8, r8, r7, LSL #8 ; r8 = 33221100
+ STRD r8, [r0], r3 ; Store the whole reconstructed row at once.
+ BGT ofrinter2_v6_lp
+ LDMFD r13!,{r4-r9,PC}
+ ]
+
+ [ OC_ARM_ASM_NEON
+ EXPORT oc_frag_copy_list_neon
+ EXPORT oc_frag_recon_intra_neon
+ EXPORT oc_frag_recon_inter_neon
+ EXPORT oc_frag_recon_inter2_neon
+
+oc_frag_copy_list_neon
+ ; r0 = _dst_frame
+ ; r1 = _src_frame
+ ; r2 = _ystride
+ ; r3 = _fragis
+ ; <> = _nfragis
+ ; <> = _frag_buf_offs
+ ; Copies each listed 8x8 fragment from _src_frame to _dst_frame with
+ ; eight 64-bit NEON loads/stores per fragment. The next fragment's
+ ; buffer offset is fetched during the current fragment's stores to hide
+ ; load latency. Uses D0-D7; r4-r6 are saved/restored.
+ LDR r12,[r13] ; r12 = _nfragis
+ STMFD r13!,{r4-r6,r14}
+ SUBS r12, r12, #1
+ LDRGE r6, [r3],#4 ; r6 = _fragis[fragii]
+ LDRGE r14,[r13,#4*5] ; r14 = _frag_buf_offs
+ BLT ofcl_neon_end ; Nothing to copy.
+ ; Stall (2 on Xscale)
+ LDR r6, [r14,r6, LSL #2] ; r6 = _frag_buf_offs[_fragis[fragii]]
+ ; Stall (on XScale)
+ofcl_neon_lp
+ ADD r4, r1, r6 ; r4 = source fragment address.
+ VLD1.64 {D0}, [r4 at 64], r2
+ VLD1.64 {D1}, [r4 at 64], r2
+ VLD1.64 {D2}, [r4 at 64], r2
+ VLD1.64 {D3}, [r4 at 64], r2
+ ADD r5, r6, r0 ; r5 = destination fragment address.
+ VLD1.64 {D4}, [r4], r2 ; NOTE(review): no 64-bit alignment hint here, unlike the other loads -- confirm intentional.
+ SUBS r12, r12, #1
+ VLD1.64 {D5}, [r4 at 64], r2
+ VLD1.64 {D6}, [r4 at 64], r2
+ VLD1.64 {D7}, [r4 at 64], r2
+ VST1.64 {D0}, [r5 at 64], r2
+ LDRGE r6, [r3],#4 ; r6 = _fragis[fragii]
+ VST1.64 {D1}, [r5 at 64], r2
+ VST1.64 {D2}, [r5 at 64], r2
+ VST1.64 {D3}, [r5 at 64], r2
+ VST1.64 {D4}, [r5 at 64], r2
+ LDRGE r6, [r14,r6, LSL #2] ; r6 = _frag_buf_offs[_fragis[fragii]]
+ VST1.64 {D5}, [r5 at 64], r2
+ VST1.64 {D6}, [r5 at 64], r2
+ VST1.64 {D7}, [r5 at 64], r2
+ BGE ofcl_neon_lp
+ofcl_neon_end
+ LDMFD r13!,{r4-r6,PC}
+
+oc_frag_recon_intra_neon
+ ; r0 = unsigned char *_dst
+ ; r1 = int _ystride
+ ; r2 = const ogg_int16_t _residue[64]
+ ; Intra fragment reconstruction: dst = clamp255(residue + 128) for all
+ ; 64 coefficients. The whole residue block is loaded into Q8-Q15, the
+ ; bias is added with saturation, and VQMOVUN narrows to unsigned bytes.
+ ; Narrowing is interleaved with the stores to hide latency. Uses
+ ; Q0/Q8-Q15; no core registers beyond r3 are clobbered.
+ MOV r3, #128
+ VDUP.S16 Q0, r3 ; Q0 = eight lanes of the +128 intra bias.
+ VLDMIA r2, {D16-D31} ; D16= 3333222211110000 etc ; 9(8) cycles
+ VQADD.S16 Q8, Q8, Q0
+ VQADD.S16 Q9, Q9, Q0
+ VQADD.S16 Q10,Q10,Q0
+ VQADD.S16 Q11,Q11,Q0
+ VQADD.S16 Q12,Q12,Q0
+ VQADD.S16 Q13,Q13,Q0
+ VQADD.S16 Q14,Q14,Q0
+ VQADD.S16 Q15,Q15,Q0
+ VQMOVUN.S16 D16,Q8 ; D16= 7766554433221100 ; 1 cycle
+ VQMOVUN.S16 D17,Q9 ; D17= FFEEDDCCBBAA9988 ; 1 cycle
+ VQMOVUN.S16 D18,Q10 ; D18= NNMMLLKKJJIIHHGG ; 1 cycle
+ VST1.64 {D16},[r0 at 64], r1
+ VQMOVUN.S16 D19,Q11 ; D19= VVUUTTSSRRQQPPOO ; 1 cycle
+ VST1.64 {D17},[r0 at 64], r1
+ VQMOVUN.S16 D20,Q12 ; D20= ddccbbaaZZYYXXWW ; 1 cycle
+ VST1.64 {D18},[r0 at 64], r1
+ VQMOVUN.S16 D21,Q13 ; D21= llkkjjiihhggffee ; 1 cycle
+ VST1.64 {D19},[r0 at 64], r1
+ VQMOVUN.S16 D22,Q14 ; D22= ttssrrqqppoonnmm ; 1 cycle
+ VST1.64 {D20},[r0 at 64], r1
+ VQMOVUN.S16 D23,Q15 ; D23= !!@@zzyyxxwwvvuu ; 1 cycle
+ VST1.64 {D21},[r0 at 64], r1
+ VST1.64 {D22},[r0 at 64], r1
+ VST1.64 {D23},[r0 at 64], r1
+ MOV PC,R14
+
+oc_frag_recon_inter_neon
+ ; r0 = unsigned char *_dst
+ ; r1 = const unsigned char *_src
+ ; r2 = int _ystride
+ ; r3 = const ogg_int16_t _residue[64]
+ ; Inter fragment reconstruction: dst = clamp255(src + residue) for an
+ ; 8x8 block. Predictor rows are widened to 16 bits with VMOVL.U8, added
+ ; to the residue with saturation, then narrowed back to bytes. Loads,
+ ; adds, and stores are interleaved to hide latency. Uses Q0-Q3/Q8-Q15.
+ VLDMIA r3, {D16-D31} ; D16= 3333222211110000 etc ; 9(8) cycles
+ VLD1.64 {D0}, [r1], r2
+ VLD1.64 {D2}, [r1], r2
+ VMOVL.U8 Q0, D0 ; Q0 = __77__66__55__44__33__22__11__00
+ VLD1.64 {D4}, [r1], r2
+ VMOVL.U8 Q1, D2 ; etc
+ VLD1.64 {D6}, [r1], r2
+ VMOVL.U8 Q2, D4
+ VMOVL.U8 Q3, D6
+ VQADD.S16 Q8, Q8, Q0
+ VLD1.64 {D0}, [r1], r2 ; Start loading the lower four rows.
+ VQADD.S16 Q9, Q9, Q1
+ VLD1.64 {D2}, [r1], r2
+ VQADD.S16 Q10,Q10,Q2
+ VLD1.64 {D4}, [r1], r2
+ VQADD.S16 Q11,Q11,Q3
+ VLD1.64 {D6}, [r1], r2
+ VMOVL.U8 Q0, D0
+ VMOVL.U8 Q1, D2
+ VMOVL.U8 Q2, D4
+ VMOVL.U8 Q3, D6
+ VQADD.S16 Q12,Q12,Q0
+ VQADD.S16 Q13,Q13,Q1
+ VQADD.S16 Q14,Q14,Q2
+ VQADD.S16 Q15,Q15,Q3
+ VQMOVUN.S16 D16,Q8 ; Narrow each row back to unsigned bytes.
+ VQMOVUN.S16 D17,Q9
+ VQMOVUN.S16 D18,Q10
+ VST1.64 {D16},[r0 at 64], r2
+ VQMOVUN.S16 D19,Q11
+ VST1.64 {D17},[r0 at 64], r2
+ VQMOVUN.S16 D20,Q12
+ VST1.64 {D18},[r0 at 64], r2
+ VQMOVUN.S16 D21,Q13
+ VST1.64 {D19},[r0 at 64], r2
+ VQMOVUN.S16 D22,Q14
+ VST1.64 {D20},[r0 at 64], r2
+ VQMOVUN.S16 D23,Q15
+ VST1.64 {D21},[r0 at 64], r2
+ VST1.64 {D22},[r0 at 64], r2
+ VST1.64 {D23},[r0 at 64], r2
+ MOV PC,R14
+
+oc_frag_recon_inter2_neon
+ ; r0 = unsigned char *_dst
+ ; r1 = const unsigned char *_src1
+ ; r2 = const unsigned char *_src2
+ ; r3 = int _ystride
+ ; Bi-predicted fragment reconstruction:
+ ; dst = clamp255(((src1+src2)>>1) + residue) for an 8x8 block.
+ ; VHADD.U8 averages two predictor rows at a time; the result is widened,
+ ; added to the residue with saturation, and narrowed back to bytes, with
+ ; loads and arithmetic interleaved to hide latency. Uses Q0-Q3/Q8-Q15
+ ; and r12.
+ LDR r12,[r13]
+ ; r12= const ogg_int16_t _residue[64]
+ VLDMIA r12,{D16-D31}
+ VLD1.64 {D0}, [r1], r3
+ VLD1.64 {D4}, [r2], r3
+ VLD1.64 {D1}, [r1], r3
+ VLD1.64 {D5}, [r2], r3
+ VHADD.U8 Q2, Q0, Q2 ; Q2 = FFEEDDCCBBAA99887766554433221100
+ VLD1.64 {D2}, [r1], r3
+ VLD1.64 {D6}, [r2], r3
+ VMOVL.U8 Q0, D4 ; Q0 = __77__66__55__44__33__22__11__00
+ VLD1.64 {D3}, [r1], r3
+ VMOVL.U8 Q2, D5 ; etc
+ VLD1.64 {D7}, [r2], r3
+ VHADD.U8 Q3, Q1, Q3
+ VQADD.S16 Q8, Q8, Q0
+ VQADD.S16 Q9, Q9, Q2
+ VLD1.64 {D0}, [r1], r3 ; Start loading the lower four rows.
+ VMOVL.U8 Q1, D6
+ VLD1.64 {D4}, [r2], r3
+ VMOVL.U8 Q3, D7
+ VLD1.64 {D1}, [r1], r3
+ VQADD.S16 Q10,Q10,Q1
+ VLD1.64 {D5}, [r2], r3
+ VQADD.S16 Q11,Q11,Q3
+ VLD1.64 {D2}, [r1], r3
+ VHADD.U8 Q2, Q0, Q2
+ VLD1.64 {D6}, [r2], r3
+ VLD1.64 {D3}, [r1], r3
+ VMOVL.U8 Q0, D4
+ VLD1.64 {D7}, [r2], r3
+ VMOVL.U8 Q2, D5
+ VHADD.U8 Q3, Q1, Q3
+ VQADD.S16 Q12,Q12,Q0
+ VQADD.S16 Q13,Q13,Q2
+ VMOVL.U8 Q1, D6
+ VMOVL.U8 Q3, D7
+ VQADD.S16 Q14,Q14,Q1
+ VQADD.S16 Q15,Q15,Q3
+ VQMOVUN.S16 D16,Q8 ; Narrow each row back to unsigned bytes.
+ VQMOVUN.S16 D17,Q9
+ VQMOVUN.S16 D18,Q10
+ VST1.64 {D16},[r0 at 64], r3
+ VQMOVUN.S16 D19,Q11
+ VST1.64 {D17},[r0 at 64], r3
+ VQMOVUN.S16 D20,Q12
+ VST1.64 {D18},[r0 at 64], r3
+ VQMOVUN.S16 D21,Q13
+ VST1.64 {D19},[r0 at 64], r3
+ VQMOVUN.S16 D22,Q14
+ VST1.64 {D20},[r0 at 64], r3
+ VQMOVUN.S16 D23,Q15
+ VST1.64 {D21},[r0 at 64], r3
+ VST1.64 {D22},[r0 at 64], r3
+ VST1.64 {D23},[r0 at 64], r3
+ MOV PC,R14
+ ]
+
+ END
Copied: experimental/derf/theora-ptalarbvorm/lib/arm/armidct.s (from rev 17378, branches/theorarm-merge-branch/lib/arm/ARMidct.s)
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armidct.s (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armidct.s 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,1822 @@
+;********************************************************************
+;* *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+;* *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;* *
+;********************************************************************
+; Original implementation:
+; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: armidct.s 17344 2010-07-21 01:42:18Z tterribe $
+;********************************************************************
+
+ AREA |.text|, CODE, READONLY
+
+ GET armopts.s
+
+ EXPORT oc_idct8x8_arm
+
+oc_idct8x8_arm
+ ; r0 = ogg_int16_t *_y
+ ; r1 = ogg_int16_t *_x
+ ; r2 = int _last_zzi
+ ; Dispatch on _last_zzi (presumably the last nonzero zig-zag coefficient
+ ; index -- confirm against caller): small counts take cheaper partial
+ ; transforms; larger counts fall through to the full transform below.
+ CMP r2, #3
+ BLE oc_idct8x8_3_arm
+ CMP r2, #6
+ BLE oc_idct8x8_6_arm
+ CMP r2, #10
+ BLE oc_idct8x8_10_arm
+oc_idct8x8_slow_arm
+ ; Full 8x8 inverse DCT: eight row transforms into a 64-coefficient
+ ; temporary on the stack, then eight column transforms (with the final
+ ; rounding/downshift) into _y. If _y != _x, the input block is zeroed
+ ; so it is ready for the next fragment (decoder only).
+ STMFD r13!,{r4-r11,r14}
+ SUB r13,r13,#64*2 ; Temp storage for the transposed rows.
+; Row transforms
+ STR r0, [r13,#-4]!
+ ADD r0, r13, #4 ; Write to temp storage.
+ BL idct8core_arm
+ BL idct8core_arm
+ BL idct8core_arm
+ BL idct8core_arm
+ BL idct8core_arm
+ BL idct8core_arm
+ BL idct8core_arm
+ BL idct8core_arm
+ LDR r0, [r13], #4 ; Write to the final destination.
+ ; Clear input data for next block (decoder only).
+ SUB r2, r1, #8*16 ; r2 = original _x (r1 was advanced by the rows).
+ CMP r0, r2 ; In-place transform? Then do not clear.
+ MOV r1, r13 ; And read from temp storage.
+ BEQ oc_idct8x8_slow_arm_cols
+ MOV r4, #0
+ MOV r5, #0
+ MOV r6, #0
+ MOV r7, #0
+ STMIA r2!,{r4,r5,r6,r7}
+ STMIA r2!,{r4,r5,r6,r7}
+ STMIA r2!,{r4,r5,r6,r7}
+ STMIA r2!,{r4,r5,r6,r7}
+ STMIA r2!,{r4,r5,r6,r7}
+ STMIA r2!,{r4,r5,r6,r7}
+ STMIA r2!,{r4,r5,r6,r7}
+ STMIA r2!,{r4,r5,r6,r7}
+oc_idct8x8_slow_arm_cols
+; Column transforms
+ BL idct8core_down_arm
+ BL idct8core_down_arm
+ BL idct8core_down_arm
+ BL idct8core_down_arm
+ BL idct8core_down_arm
+ BL idct8core_down_arm
+ BL idct8core_down_arm
+ BL idct8core_down_arm
+ ADD r13,r13,#64*2
+ LDMFD r13!,{r4-r11,PC}
+
+oc_idct8x8_10_arm
+ ; Partial iDCT for at most 10 nonzero coefficients: only the first four
+ ; rows can contain data (4, 3, 2 and 1 coefficients respectively), so
+ ; only those rows are transformed; the column pass then only needs the
+ ; first four entries of each column.
+ STMFD r13!,{r4-r11,r14}
+ SUB r13,r13,#64*2 ; Temp storage for the transposed rows.
+; Row transforms
+ MOV r2, r0
+ MOV r0, r13 ; Write to temp storage.
+ BL idct4core_arm
+ BL idct3core_arm
+ BL idct2core_arm
+ BL idct1core_arm
+ ; Clear input data for next block (decoder only).
+ SUB r0, r1, #4*16 ; r0 = original _x (r1 was advanced by the rows).
+ CMP r0, r2 ; In-place transform? Then do not clear.
+ MOV r1, r13 ; Read from temp storage.
+ BEQ oc_idct8x8_10_arm_cols
+ MOV r4, #0
+ ; Zero only the words that can hold the 10 coefficients.
+ STR r4, [r0]
+ STR r4, [r0,#4]
+ STR r4, [r0,#16]
+ STR r4, [r0,#20]
+ STR r4, [r0,#32]
+ STR r4, [r0,#48]
+ MOV r0, r2 ; Write to the final destination
+oc_idct8x8_10_arm_cols
+; Column transforms
+ BL idct4core_down_arm
+ BL idct4core_down_arm
+ BL idct4core_down_arm
+ BL idct4core_down_arm
+ BL idct4core_down_arm
+ BL idct4core_down_arm
+ BL idct4core_down_arm
+ BL idct4core_down_arm
+ ADD r13,r13,#64*2
+ LDMFD r13!,{r4-r11,PC}
+
+oc_idct8x8_6_arm
+ ; Partial iDCT for at most 6 nonzero coefficients: only the first three
+ ; rows can contain data (3, 2 and 1 coefficients respectively).
+ STMFD r13!,{r4-r7,r9-r11,r14}
+ SUB r13,r13,#64*2 ; Temp storage for the transposed rows.
+; Row transforms
+ MOV r2, r0
+ MOV r0, r13 ; Write to temp storage.
+ BL idct3core_arm
+ BL idct2core_arm
+ BL idct1core_arm
+ ; Clear input data for next block (decoder only).
+ SUB r0, r1, #3*16 ; r0 = original _x (r1 was advanced by the rows).
+ CMP r0, r2 ; In-place transform? Then do not clear.
+ MOV r1, r13 ; Read from temp storage.
+ BEQ oc_idct8x8_6_arm_cols
+ MOV r4, #0
+ ; Zero only the words that can hold the 6 coefficients.
+ STR r4, [r0]
+ STR r4, [r0,#4]
+ STR r4, [r0,#16]
+ STR r4, [r0,#32]
+ MOV r0, r2 ; Write to the final destination
+oc_idct8x8_6_arm_cols
+; Column transforms
+ BL idct3core_down_arm
+ BL idct3core_down_arm
+ BL idct3core_down_arm
+ BL idct3core_down_arm
+ BL idct3core_down_arm
+ BL idct3core_down_arm
+ BL idct3core_down_arm
+ BL idct3core_down_arm
+ ADD r13,r13,#64*2
+ LDMFD r13!,{r4-r7,r9-r11,PC}
+
+oc_idct8x8_3_arm
+ ; Partial iDCT for at most 3 nonzero coefficients: only the first two
+ ; rows can contain data (2 and 1 coefficients respectively).
+ STMFD r13!,{r4-r7,r9-r11,r14}
+ SUB r13,r13,#64*2 ; Temp storage for the transposed rows.
+; Row transforms
+ MOV r2, r0
+ MOV r0, r13 ; Write to temp storage.
+ BL idct2core_arm
+ BL idct1core_arm
+ ; Clear input data for next block (decoder only).
+ SUB r0, r1, #2*16 ; r0 = original _x (r1 was advanced by the rows).
+ CMP r0, r2 ; In-place transform? Then do not clear.
+ MOV r1, r13 ; Read from temp storage.
+ MOVNE r4, #0
+ STRNE r4, [r0]
+ STRNE r4, [r0,#16]
+ MOVNE r0, r2 ; Write to the final destination
+; Column transforms
+ BL idct2core_down_arm
+ BL idct2core_down_arm
+ BL idct2core_down_arm
+ BL idct2core_down_arm
+ BL idct2core_down_arm
+ BL idct2core_down_arm
+ BL idct2core_down_arm
+ BL idct2core_down_arm
+ ADD r13,r13,#64*2
+ LDMFD r13!,{r4-r7,r9-r11,PC}
+
+idct1core_arm
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; 1-coefficient transform: the row is just the DC term scaled by
+ ; OC_C4S4, replicated to all eight outputs. Advances r1 by one row (16
+ ; bytes) and r0 by one element (2 bytes) for the next core call.
+ LDRSH r3, [r1], #16
+ MOV r12,#0x05
+ ORR r12,r12,#0xB500 ; r12= 0xB505 = OC_C4S4.
+ MUL r3, r12, r3
+ ; Stall ?
+ MOV r3, r3, ASR #16
+ STRH r3, [r0], #2
+ STRH r3, [r0, #14]
+ STRH r3, [r0, #30]
+ STRH r3, [r0, #46]
+ STRH r3, [r0, #62]
+ STRH r3, [r0, #78]
+ STRH r3, [r0, #94]
+ STRH r3, [r0, #110]
+ MOV PC,R14
+
+idct2core_arm
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; 2-coefficient row transform (x[0], x[1] nonzero), no rounding or
+ ; downshift. Clobbers r3, r9-r12. Advances r1/r0 for the next core.
+ LDRSH r9, [r1], #16 ; r9 = x[0]
+ LDR r12,OC_C4S4
+ LDRSH r11,[r1, #-14] ; r11= x[1]
+ LDR r3, OC_C7S1
+ MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
+ LDR r10,OC_C1S7
+ MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1]
+ MOV r9, r9, ASR #16 ; r9 = t[0]
+ MUL r11,r10,r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
+ MOV r3, r3, ASR #16 ; r3 = t[4]
+ MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4]
+ MOV r11,r11,ASR #16 ; r11= t[7]
+ MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
+ MOV r10,r10,ASR #16 ; r10= t[5]
+ ADD r12,r9,r12,ASR #16 ; r12= t[0]+t[6]
+ ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5]
+ SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5]
+ ADD r3, r3, r9 ; r3 = t[0]+t[4]
+ ADD r11,r11,r9 ; r11= t[0]+t[7]
+ STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
+ STRH r12,[r0, #14] ; y[1] = t[0]+t[6]
+ STRH r10,[r0, #30] ; y[2] = t[0]+t[5]
+ STRH r3, [r0, #46] ; y[3] = t[0]+t[4]
+ RSB r3, r3, r9, LSL #1 ; r3 = t[0]*2-(t[0]+t[4])=t[0]-t[4]
+ RSB r10,r10,r9, LSL #1 ; r10= t[0]*2-(t[0]+t[5])=t[0]-t[5]
+ RSB r12,r12,r9, LSL #1 ; r12= t[0]*2-(t[0]+t[6])=t[0]-t[6]
+ RSB r11,r11,r9, LSL #1 ; r11= t[0]*2-(t[0]+t[7])=t[0]-t[7]
+ STRH r3, [r0, #62] ; y[4] = t[0]-t[4]
+ STRH r10,[r0, #78] ; y[5] = t[0]-t[5]
+ STRH r12,[r0, #94] ; y[6] = t[0]-t[6]
+ STRH r11,[r0, #110] ; y[7] = t[0]-t[7]
+ MOV PC,r14
+
+idct2core_down_arm
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; Column-pass variant of idct2core_arm: adds the rounding bias of 8 and
+ ; downshifts the outputs by 4. Clobbers r3-r7, r9-r12.
+ LDRSH r9, [r1], #16 ; r9 = x[0]
+ LDR r12,OC_C4S4
+ LDRSH r11,[r1, #-14] ; r11= x[1]
+ LDR r3, OC_C7S1
+ MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
+ LDR r10,OC_C1S7
+ MUL r3, r11,r3 ; r3 = t[4]<<16 = OC_C7S1*x[1]
+ MOV r9, r9, ASR #16 ; r9 = t[0]
+ MUL r11,r10,r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
+ ADD r9, r9, #8 ; r9 = t[0]+8
+ MOV r3, r3, ASR #16 ; r3 = t[4]
+ MUL r10,r12,r3 ; r10= t[5]<<16 = OC_C4S4*t[4]
+ MOV r11,r11,ASR #16 ; r11= t[7]
+ MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
+ MOV r10,r10,ASR #16 ; r10= t[5]
+ ADD r12,r9,r12,ASR #16 ; r12= t[0]+t[6]+8
+ ADD r12,r12,r10 ; r12= t[0]+t2[6] = t[0]+t[6]+t[5]+8
+ SUB r10,r12,r10,LSL #1 ; r10= t[0]+t2[5] = t[0]+t[6]-t[5]+8
+ ADD r3, r3, r9 ; r3 = t[0]+t[4]+8
+ ADD r11,r11,r9 ; r11= t[0]+t[7]+8
+ ; TODO: This is wrong.
+ ; The C code truncates to 16 bits by storing to RAM and doing the
+ ; shifts later; we've got an extra 4 bits here.
+ MOV r4, r11,ASR #4
+ MOV r5, r12,ASR #4
+ MOV r6, r10,ASR #4
+ MOV r7, r3, ASR #4
+ RSB r3, r3, r9, LSL #1 ;r3 =t[0]*2+8-(t[0]+t[4])=t[0]-t[4]+8
+ RSB r10,r10,r9, LSL #1 ;r10=t[0]*2+8-(t[0]+t[5])=t[0]-t[5]+8
+ RSB r12,r12,r9, LSL #1 ;r12=t[0]*2+8-(t[0]+t[6])=t[0]-t[6]+8
+ RSB r11,r11,r9, LSL #1 ;r11=t[0]*2+8-(t[0]+t[7])=t[0]-t[7]+8
+ MOV r3, r3, ASR #4
+ MOV r10,r10,ASR #4
+ MOV r12,r12,ASR #4
+ MOV r11,r11,ASR #4
+ STRH r4, [r0], #2 ; y[0] = t[0]+t[7]
+ STRH r5, [r0, #14] ; y[1] = t[0]+t[6]
+ STRH r6, [r0, #30] ; y[2] = t[0]+t[5]
+ STRH r7, [r0, #46] ; y[3] = t[0]+t[4]
+ STRH r3, [r0, #62] ; y[4] = t[0]-t[4]
+ STRH r10,[r0, #78] ; y[5] = t[0]-t[5]
+ STRH r12,[r0, #94] ; y[6] = t[0]-t[6]
+ STRH r11,[r0, #110] ; y[7] = t[0]-t[7]
+ MOV PC,r14
+
+idct3core_arm
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; 3-coefficient row transform (x[0..2] nonzero), no rounding or
+ ; downshift. Clobbers r3-r6, r9-r12.
+ LDRSH r9, [r1], #16 ; r9 = x[0]
+ LDR r12,OC_C4S4 ; r12= OC_C4S4
+ LDRSH r3, [r1, #-12] ; r3 = x[2]
+ LDR r10,OC_C6S2 ; r10= OC_C6S2
+ MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
+ LDR r4, OC_C2S6 ; r4 = OC_C2S6
+ MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2]
+ LDRSH r11,[r1, #-14] ; r11= x[1]
+ MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2]
+ LDR r4, OC_C7S1 ; r4 = OC_C7S1
+ LDR r5, OC_C1S7 ; r5 = OC_C1S7
+ MOV r9, r9, ASR #16 ; r9 = t[0]
+ MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1]
+ ADD r3, r9, r3, ASR #16 ; r3 = t[0]+t[3]
+ MUL r11,r5, r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
+ MOV r4, r4, ASR #16 ; r4 = t[4]
+ MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4]
+ MOV r11,r11,ASR #16 ; r11= t[7]
+ MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
+ ADD r10,r9, r10,ASR #16 ; r10= t[1] = t[0]+t[2]
+ RSB r6, r10,r9, LSL #1 ; r6 = t[2] = t[0]-t[2]
+ ; r3 = t2[0] = t[0]+t[3]
+ RSB r9, r3, r9, LSL #1 ; r9 = t2[3] = t[0]-t[3]
+ MOV r12,r12,ASR #16 ; r12= t[6]
+ ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5]
+ RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5]
+ ADD r11,r3, r11 ; r11= t2[0]+t[7]
+ ADD r5, r10,r5 ; r5 = t[1]+t2[6]
+ ADD r12,r6, r12 ; r12= t[2]+t2[5]
+ ADD r4, r9, r4 ; r4 = t2[3]+t[4]
+ STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
+ STRH r5, [r0, #14] ; y[1] = t[1]+t2[6]
+ STRH r12,[r0, #30] ; y[2] = t[2]+t2[5]
+ STRH r4, [r0, #46] ; y[3] = t2[3]+t[4]
+ RSB r11,r11,r3, LSL #1 ; r11= t2[0] - t[7]
+ RSB r5, r5, r10,LSL #1 ; r5 = t[1] - t2[6]
+ RSB r12,r12,r6, LSL #1 ; r12= t[2] - t2[5]
+ RSB r4, r4, r9, LSL #1 ; r4 = t2[3] - t[4]
+ STRH r4, [r0, #62] ; y[4] = t2[3]-t[4]
+ STRH r12,[r0, #78] ; y[5] = t[2]-t2[5]
+ STRH r5, [r0, #94] ; y[6] = t[1]-t2[6]
+ STRH r11,[r0, #110] ; y[7] = t2[0]-t[7]
+ MOV PC,R14
+
+idct3core_down_arm
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; Column-pass variant of idct3core_arm: adds the rounding bias of 8 and
+ ; downshifts the outputs by 4. Clobbers r3-r6, r9-r12.
+ LDRSH r9, [r1], #16 ; r9 = x[0]
+ LDR r12,OC_C4S4 ; r12= OC_C4S4
+ LDRSH r3, [r1, #-12] ; r3 = x[2]
+ LDR r10,OC_C6S2 ; r10= OC_C6S2
+ MUL r9, r12,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
+ LDR r4, OC_C2S6 ; r4 = OC_C2S6
+ MUL r10,r3, r10 ; r10= t[2]<<16 = OC_C6S2*x[2]
+ LDRSH r11,[r1, #-14] ; r11= x[1]
+ MUL r3, r4, r3 ; r3 = t[3]<<16 = OC_C2S6*x[2]
+ LDR r4, OC_C7S1 ; r4 = OC_C7S1
+ LDR r5, OC_C1S7 ; r5 = OC_C1S7
+ MOV r9, r9, ASR #16 ; r9 = t[0]
+ MUL r4, r11,r4 ; r4 = t[4]<<16 = OC_C7S1*x[1]
+ ADD r9, r9, #8 ; r9 = t[0]+8
+ MUL r11,r5, r11 ; r11= t[7]<<16 = OC_C1S7*x[1]
+ ADD r3, r9, r3, ASR #16 ; r3 = t[0]+t[3]+8
+ MOV r4, r4, ASR #16 ; r4 = t[4]
+ MUL r5, r12,r4 ; r5 = t[5]<<16 = OC_C4S4*t[4]
+ MOV r11,r11,ASR #16 ; r11= t[7]
+ MUL r12,r11,r12 ; r12= t[6]<<16 = OC_C4S4*t[7]
+ ADD r10,r9, r10,ASR #16 ; r10= t[1]+8 = t[0]+t[2]+8
+ RSB r6, r10,r9, LSL #1 ; r6 = t[2]+8 = t[0]-t[2]+8
+ ; r3 = t2[0]+8 = t[0]+t[3]+8
+ RSB r9, r3, r9, LSL #1 ; r9 = t2[3]+8 = t[0]-t[3]+8
+ MOV r12,r12,ASR #16 ; r12= t[6]
+ ADD r5, r12,r5, ASR #16 ; r5 = t2[6] = t[6]+t[5]
+ RSB r12,r5, r12,LSL #1 ; r12= t2[5] = t[6]-t[5]
+ ADD r11,r3, r11 ; r11= t2[0]+t[7] +8
+ ADD r5, r10,r5 ; r5 = t[1] +t2[6]+8
+ ADD r12,r6, r12 ; r12= t[2] +t2[5]+8
+ ADD r4, r9, r4 ; r4 = t2[3]+t[4] +8
+ RSB r3, r11,r3, LSL #1 ; r3 = t2[0] - t[7] + 8
+ RSB r10,r5, r10,LSL #1 ; r10= t[1] - t2[6] + 8
+ RSB r6, r12,r6, LSL #1 ; r6 = t[2] - t2[5] + 8
+ RSB r9, r4, r9, LSL #1 ; r9 = t2[3] - t[4] + 8
+ ; TODO: This is wrong.
+ ; The C code truncates to 16 bits by storing to RAM and doing the
+ ; shifts later; we've got an extra 4 bits here.
+ MOV r11,r11,ASR #4
+ MOV r5, r5, ASR #4
+ MOV r12,r12,ASR #4
+ MOV r4, r4, ASR #4
+ MOV r9, r9, ASR #4
+ MOV r6, r6, ASR #4
+ MOV r10,r10,ASR #4
+ MOV r3, r3, ASR #4
+ STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
+ STRH r5, [r0, #14] ; y[1] = t[1]+t2[6]
+ STRH r12,[r0, #30] ; y[2] = t[2]+t2[5]
+ STRH r4, [r0, #46] ; y[3] = t2[3]+t[4]
+ STRH r9, [r0, #62] ; y[4] = t2[3]-t[4]
+ STRH r6, [r0, #78] ; y[5] = t[2]-t2[5]
+ STRH r10,[r0, #94] ; y[6] = t[1]-t2[6]
+ STRH r3, [r0, #110] ; y[7] = t2[0]-t[7]
+ MOV PC,R14
+
+idct4core_arm
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; 4-coefficient row transform (x[0..3] nonzero), no rounding or
+ ; downshift. Clobbers r3-r12.
+ LDRSH r9, [r1], #16 ; r9 = x[0]
+ LDR r10,OC_C4S4 ; r10= OC_C4S4
+ LDRSH r12,[r1, #-12] ; r12= x[2]
+ LDR r4, OC_C6S2 ; r4 = OC_C6S2
+ MUL r9, r10,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
+ LDR r5, OC_C2S6 ; r5 = OC_C2S6
+ MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2]
+ LDRSH r3, [r1, #-14] ; r3 = x[1]
+ MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2]
+ LDR r6, OC_C7S1 ; r6 = OC_C7S1
+ LDR r12,OC_C1S7 ; r12= OC_C1S7
+ LDRSH r11,[r1, #-10] ; r11= x[3]
+ MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1]
+ LDR r7, OC_C5S3 ; r7 = OC_C5S3
+ MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1]
+ LDR r8, OC_C3S5 ; r8 = OC_C3S5
+ MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3]
+ MOV r9, r9, ASR #16 ; r9 = t[0]
+ MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3]
+ MOV r6, r6, ASR #16 ; r6 = t[4]
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+ SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+ RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5]
+ MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+ MOV r3, r3, ASR #16 ; r3 = t[7]
+ ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6]
+ RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6]
+ MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+ ADD r4, r9, r4, ASR #16 ; r4 = t[1] = t[0] + t[2]
+ RSB r10,r4, r9, LSL #1 ; r10= t[2] = t[0] - t[2]
+ ADD r5, r9, r5, ASR #16 ; r5 = t[0] = t[0] + t[3]
+ RSB r9, r5, r9, LSL #1 ; r9 = t[3] = t[0] - t[3]
+ MOV r3, r3, ASR #16 ; r3 = t2[6]
+ ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5]
+ RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5]
+ ADD r11,r5, r11 ; r11= t[0]+t2[7]
+ ADD r6, r4, r6 ; r6 = t[1]+t3[6]
+ ADD r3, r10,r3 ; r3 = t[2]+t3[5]
+ ADD r7, r9, r7 ; r7 = t[3]+t2[4]
+ STRH r11,[r0], #2 ; y[0] = t[0]+t[7]
+ STRH r6, [r0, #14] ; y[1] = t[1]+t2[6]
+ STRH r3, [r0, #30] ; y[2] = t[2]+t2[5]
+ STRH r7, [r0, #46] ; y[3] = t2[3]+t[4]
+ RSB r11,r11,r5, LSL #1 ; r11= t[0]-t2[7]
+ RSB r6, r6, r4, LSL #1 ; r6 = t[1]-t3[6]
+ RSB r3, r3, r10,LSL #1 ; r3 = t[2]-t3[5]
+ RSB r7, r7, r9, LSL #1 ; r7 = t[3]-t2[4]
+ STRH r7, [r0, #62] ; y[4] = t2[3]-t[4]
+ STRH r3, [r0, #78] ; y[5] = t[2]-t2[5]
+ STRH r6, [r0, #94] ; y[6] = t[1]-t2[6]
+ STRH r11, [r0, #110] ; y[7] = t2[0]-t[7]
+ MOV PC,r14
+
+idct4core_down_arm
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; Column-pass variant of idct4core_arm: adds the rounding bias of 8 and
+ ; downshifts the outputs by 4. Clobbers r3-r12.
+ LDRSH r9, [r1], #16 ; r9 = x[0]
+ LDR r10,OC_C4S4 ; r10= OC_C4S4
+ LDRSH r12,[r1, #-12] ; r12= x[2]
+ LDR r4, OC_C6S2 ; r4 = OC_C6S2
+ MUL r9, r10,r9 ; r9 = t[0]<<16 = OC_C4S4*x[0]
+ LDR r5, OC_C2S6 ; r5 = OC_C2S6
+ MUL r4, r12,r4 ; r4 = t[2]<<16 = OC_C6S2*x[2]
+ LDRSH r3, [r1, #-14] ; r3 = x[1]
+ MUL r5, r12,r5 ; r5 = t[3]<<16 = OC_C2S6*x[2]
+ LDR r6, OC_C7S1 ; r6 = OC_C7S1
+ LDR r12,OC_C1S7 ; r12= OC_C1S7
+ LDRSH r11,[r1, #-10] ; r11= x[3]
+ MUL r6, r3, r6 ; r6 = t[4]<<16 = OC_C7S1*x[1]
+ LDR r7, OC_C5S3 ; r7 = OC_C5S3
+ MUL r3, r12,r3 ; r3 = t[7]<<16 = OC_C1S7*x[1]
+ LDR r8, OC_C3S5 ; r8 = OC_C3S5
+ MUL r7, r11,r7 ; r7 = -t[5]<<16 = OC_C5S3*x[3]
+ MOV r9, r9, ASR #16 ; r9 = t[0]
+ MUL r11,r8, r11 ; r11= t[6]<<16 = OC_C3S5*x[3]
+ MOV r6, r6, ASR #16 ; r6 = t[4]
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+ SUB r7, r6, r7, ASR #16 ; r7 = t2[4]=t[4]+t[5] (as r7=-t[5])
+ RSB r6, r7, r6, LSL #1 ; r6 = t[4]-t[5]
+ MUL r6, r10,r6 ; r6 = t2[5]<<16 =OC_C4S4*(t[4]-t[5])
+ MOV r3, r3, ASR #16 ; r3 = t[7]
+ ADD r11,r3, r11,ASR #16 ; r11= t2[7]=t[7]+t[6]
+ RSB r3, r11,r3, LSL #1 ; r3 = t[7]-t[6]
+ ADD r9, r9, #8 ; r9 = t[0]+8
+ MUL r3, r10,r3 ; r3 = t2[6]<<16 =OC_C4S4*(t[7]-t[6])
+ ADD r4, r9, r4, ASR #16 ; r4 = t[1] = t[0] + t[2] + 8
+ RSB r10,r4, r9, LSL #1 ; r10= t[2] = t[0] - t[2] + 8
+ ADD r5, r9, r5, ASR #16 ; r5 = t[0] = t[0] + t[3] + 8
+ RSB r9, r5, r9, LSL #1 ; r9 = t[3] = t[0] - t[3] + 8
+ MOV r3, r3, ASR #16 ; r3 = t2[6]
+ ADD r6, r3, r6, ASR #16 ; r6 = t3[6] = t2[6]+t2[5]
+ RSB r3, r6, r3, LSL #1 ; r3 = t3[5] = t2[6]-t2[5]
+ ADD r5, r5, r11 ; r5 = t[0]+t2[7]+8
+ ADD r4, r4, r6 ; r4 = t[1]+t3[6]+8
+ ADD r10,r10,r3 ; r10= t[2]+t3[5]+8
+ ADD r9, r9, r7 ; r9 = t[3]+t2[4]+8
+ SUB r11,r5, r11,LSL #1 ; r11= t[0]-t2[7]+8
+ SUB r6, r4, r6, LSL #1 ; r6 = t[1]-t3[6]+8
+ SUB r3, r10,r3, LSL #1 ; r3 = t[2]-t3[5]+8
+ SUB r7, r9, r7, LSL #1 ; r7 = t[3]-t2[4]+8
+ ; TODO: This is wrong.
+ ; The C code truncates to 16 bits by storing to RAM and doing the
+ ; shifts later; we've got an extra 4 bits here.
+ MOV r11,r11,ASR #4
+ MOV r6, r6, ASR #4
+ MOV r3, r3, ASR #4
+ MOV r7, r7, ASR #4
+ MOV r9, r9, ASR #4
+ MOV r10,r10,ASR #4
+ MOV r4, r4, ASR #4
+ MOV r5, r5, ASR #4
+ STRH r5,[r0], #2 ; y[0] = t[0]+t[7]
+ STRH r4, [r0, #14] ; y[1] = t[1]+t2[6]
+ STRH r10,[r0, #30] ; y[2] = t[2]+t2[5]
+ STRH r9, [r0, #46] ; y[3] = t2[3]+t[4]
+ STRH r7, [r0, #62] ; y[4] = t2[3]-t[4]
+ STRH r3, [r0, #78] ; y[5] = t[2]-t2[5]
+ STRH r6, [r0, #94] ; y[6] = t[1]-t2[6]
+ STRH r11,[r0, #110] ; y[7] = t2[0]-t[7]
+ MOV PC,r14
+
+idct8core_arm
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; Full 8-coefficient row transform, no rounding or downshift. The
+ ; advanced source pointer is saved across the body (r1 is reused as a
+ ; scratch register) and restored on exit, so consecutive BLs walk
+ ; successive rows. Clobbers r2-r12.
+ LDRSH r2, [r1],#16 ; r2 = x[0]
+ STMFD r13!,{r1,r14} ; Save the advanced source pointer and LR.
+ LDRSH r6, [r1, #-8] ; r6 = x[4]
+ LDR r12,OC_C4S4 ; r12= C4S4
+ LDRSH r4, [r1, #-12] ; r4 = x[2]
+ ADD r2, r2, r6 ; r2 = x[0] + x[4]
+ SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4]
+ ; For spec compliance, these sums must be truncated to 16-bit precision
+ ; _before_ the multiply (not after).
+ ; Sadly, ARMv4 provides no simple way to do that.
+ MOV r2, r2, LSL #16
+ MOV r6, r6, LSL #16
+ MOV r2, r2, ASR #16
+ MOV r6, r6, ASR #16
+ MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+ LDRSH r8, [r1, #-4] ; r8 = x[6]
+ LDR r7, OC_C6S2 ; r7 = OC_C6S2
+ MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+ LDR r14,OC_C2S6 ; r14= OC_C2S6
+ MUL r3, r4, r7 ; r3 = OC_C6S2*x[2]
+ LDR r5, OC_C7S1 ; r5 = OC_C7S1
+ MUL r4, r14,r4 ; r4 = OC_C2S6*x[2]
+ MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16
+ MUL r14,r8, r14 ; r14= OC_C2S6*x[6]
+ MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16
+ MUL r8, r7, r8 ; r8 = OC_C6S2*x[6]
+ LDR r7, OC_C1S7 ; r7 = OC_C1S7
+ SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+ LDRSH r14,[r1, #-14] ; r14= x[1]
+ ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+ LDRSH r8, [r1, #-2] ; r8 = x[7]
+ MUL r9, r5, r14 ; r9 = OC_C7S1*x[1]
+ LDRSH r10,[r1, #-6] ; r10= x[5]
+ MUL r14,r7, r14 ; r14= OC_C1S7*x[1]
+ MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16
+ MUL r7, r8, r7 ; r7 = OC_C1S7*x[7]
+ MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16
+ MUL r8, r5, r8 ; r8 = OC_C7S1*x[7]
+ LDRSH r1, [r1, #-10] ; r1 = x[3] (r1 is now scratch).
+ LDR r5, OC_C3S5 ; r5 = OC_C3S5
+ LDR r11,OC_C5S3 ; r11= OC_C5S3
+ ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+ MUL r14,r5, r10 ; r14= OC_C3S5*x[5]
+ SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+ MUL r10,r11,r10 ; r10= OC_C5S3*x[5]
+ MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16
+ MUL r11,r1, r11 ; r11= OC_C5S3*x[3]
+ MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16
+ MUL r1, r5, r1 ; r1 = OC_C3S5*x[3]
+ SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+ ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+ ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+ ; r10=t[6] r12=C4S4 r14=t[5]
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+ ; Stage 2
+ ; 4-5 butterfly
+ ADD r9, r9, r14 ; r9 = t2[4] = t[4]+t[5]
+ SUB r14,r9, r14, LSL #1 ; r14= t[4]-t[5]
+ MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+ ; 7-6 butterfly
+ ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6]
+ SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6]
+ MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+ ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+ ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+ ; Stage 3
+ ; 0-3 butterfly
+ ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3]
+ SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3]
+ ; 1-2 butterfly
+ ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2]
+ SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2]
+ ; 6-5 butterfly
+ MOV r14,r14,ASR #16 ; r14= t2[5]
+ ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5]
+ SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5]
+ ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+ ; r10=t3[6] r14=t3[5]
+ ; Stage 4
+ ADD r2, r2, r8 ; r2 = t[0] + t[7]
+ ADD r6, r6, r10 ; r6 = t[1] + t[6]
+ ADD r3, r3, r14 ; r3 = t[2] + t[5]
+ ADD r4, r4, r9 ; r4 = t[3] + t[4]
+ SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7]
+ SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6]
+ SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5]
+ SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4]
+ STRH r2, [r0], #2 ; y[0] = t[0]+t[7]
+ STRH r6, [r0, #14] ; y[1] = t[1]+t[6]
+ STRH r3, [r0, #30] ; y[2] = t[2]+t[5]
+ STRH r4, [r0, #46] ; y[3] = t[3]+t[4]
+ STRH r9, [r0, #62] ; y[4] = t[3]-t[4]
+ STRH r14,[r0, #78] ; y[5] = t[2]-t[5]
+ STRH r10,[r0, #94] ; y[6] = t[1]-t[6]
+ STRH r8, [r0, #110] ; y[7] = t[0]-t[7]
+ LDMFD r13!,{r1,PC} ; Restore the advanced source pointer.
+
+idct8core_down_arm
+ ; r0 = ogg_int16_t *_y (destination)
+ ; r1 = const ogg_int16_t *_x (source)
+ ; Column-pass variant of idct8core_arm: adds the rounding bias of 8
+ ; (folded into t[0] and t[1] before Stage 3) and downshifts the outputs
+ ; by 4. Clobbers r2-r12; r1 is saved/restored advanced by one row.
+ LDRSH r2, [r1],#16 ; r2 = x[0]
+ STMFD r13!,{r1,r14} ; Save the advanced source pointer and LR.
+ LDRSH r6, [r1, #-8] ; r6 = x[4]
+ LDR r12,OC_C4S4 ; r12= C4S4
+ LDRSH r4, [r1, #-12] ; r4 = x[2]
+ ADD r2, r2, r6 ; r2 = x[0] + x[4]
+ SUB r6, r2, r6, LSL #1 ; r6 = x[0] - x[4]
+ ; For spec compliance, these sums must be truncated to 16-bit precision
+ ; _before_ the multiply (not after).
+ ; Sadly, ARMv4 provides no simple way to do that.
+ MOV r2, r2, LSL #16
+ MOV r6, r6, LSL #16
+ MOV r2, r2, ASR #16
+ MOV r6, r6, ASR #16
+ MUL r2, r12,r2 ; r2 = t[0]<<16 = C4S4*(x[0]+x[4])
+ LDRSH r8, [r1, #-4] ; r8 = x[6]
+ LDR r7, OC_C6S2 ; r7 = OC_C6S2
+ MUL r6, r12,r6 ; r6 = t[1]<<16 = C4S4*(x[0]-x[4])
+ LDR r14,OC_C2S6 ; r14= OC_C2S6
+ MUL r3, r4, r7 ; r3 = OC_C6S2*x[2]
+ LDR r5, OC_C7S1 ; r5 = OC_C7S1
+ MUL r4, r14,r4 ; r4 = OC_C2S6*x[2]
+ MOV r3, r3, ASR #16 ; r3 = OC_C6S2*x[2]>>16
+ MUL r14,r8, r14 ; r14= OC_C2S6*x[6]
+ MOV r4, r4, ASR #16 ; r4 = OC_C2S6*x[2]>>16
+ MUL r8, r7, r8 ; r8 = OC_C6S2*x[6]
+ LDR r7, OC_C1S7 ; r7 = OC_C1S7
+ SUB r3, r3, r14,ASR #16 ; r3=t[2]=C6S2*x[2]>>16-C2S6*x[6]>>16
+ LDRSH r14,[r1, #-14] ; r14= x[1]
+ ADD r4, r4, r8, ASR #16 ; r4=t[3]=C2S6*x[2]>>16+C6S2*x[6]>>16
+ LDRSH r8, [r1, #-2] ; r8 = x[7]
+ MUL r9, r5, r14 ; r9 = OC_C7S1*x[1]
+ LDRSH r10,[r1, #-6] ; r10= x[5]
+ MUL r14,r7, r14 ; r14= OC_C1S7*x[1]
+ MOV r9, r9, ASR #16 ; r9 = OC_C7S1*x[1]>>16
+ MUL r7, r8, r7 ; r7 = OC_C1S7*x[7]
+ MOV r14,r14,ASR #16 ; r14= OC_C1S7*x[1]>>16
+ MUL r8, r5, r8 ; r8 = OC_C7S1*x[7]
+ LDRSH r1, [r1, #-10] ; r1 = x[3] (r1 is now scratch).
+ LDR r5, OC_C3S5 ; r5 = OC_C3S5
+ LDR r11,OC_C5S3 ; r11= OC_C5S3
+ ADD r8, r14,r8, ASR #16 ; r8=t[7]=C1S7*x[1]>>16+C7S1*x[7]>>16
+ MUL r14,r5, r10 ; r14= OC_C3S5*x[5]
+ SUB r9, r9, r7, ASR #16 ; r9=t[4]=C7S1*x[1]>>16-C1S7*x[7]>>16
+ MUL r10,r11,r10 ; r10= OC_C5S3*x[5]
+ MOV r14,r14,ASR #16 ; r14= OC_C3S5*x[5]>>16
+ MUL r11,r1, r11 ; r11= OC_C5S3*x[3]
+ MOV r10,r10,ASR #16 ; r10= OC_C5S3*x[5]>>16
+ MUL r1, r5, r1 ; r1 = OC_C3S5*x[3]
+ SUB r14,r14,r11,ASR #16 ;r14=t[5]=C3S5*x[5]>>16-C5S3*x[3]>>16
+ ADD r10,r10,r1, ASR #16 ;r10=t[6]=C5S3*x[5]>>16+C3S5*x[3]>>16
+ ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t[7] r9=t[4]
+ ; r10=t[6] r12=C4S4 r14=t[5]
+ ; Stage 2
+; TODO: This is wrong; t[4]-t[5] and t[7]-t[6] need to be truncated to 16-bit
+; before multiplying, not after (this is not equivalent)
+ ; 4-5 butterfly
+ ADD r9, r9, r14 ; r9 = t2[4] = t[4]+t[5]
+ SUB r14,r9, r14, LSL #1 ; r14= t[4]-t[5]
+ MUL r14,r12,r14 ; r14= t2[5]<<16 = C4S4*(t[4]-t[5])
+ ; 7-6 butterfly
+ ADD r8, r8, r10 ; r8 = t2[7] = t[7]+t[6]
+ SUB r10,r8, r10, LSL #1 ; r10= t[7]-t[6]
+ MUL r10,r12,r10 ; r10= t2[6]<<16 = C4S4*(t[7]-t[6])
+ ; r2=t[0]<<16 r3=t[2] r4=t[3] r6=t[1]<<16 r8=t2[7] r9=t2[4]
+ ; r10=t2[6]<<16 r12=C4S4 r14=t2[5]<<16
+ ; Stage 3
+ ADD r2, r2, #8<<16 ; r2 = t[0]+8<<16
+ ADD r6, r6, #8<<16 ; r6 = t[1]+8<<16
+ ; 0-3 butterfly
+ ADD r2, r4, r2, ASR #16 ; r2 = t2[0] = t[0] + t[3] + 8
+ SUB r4, r2, r4, LSL #1 ; r4 = t2[3] = t[0] - t[3] + 8
+ ; 1-2 butterfly
+ ADD r6, r3, r6, ASR #16 ; r6 = t2[1] = t[1] + t[2] + 8
+ SUB r3, r6, r3, LSL #1 ; r3 = t2[2] = t[1] - t[2] + 8
+ ; 6-5 butterfly
+ MOV r14,r14,ASR #16 ; r14= t2[5]
+ ADD r10,r14,r10,ASR #16 ; r10= t3[6] = t[6] + t[5]
+ SUB r14,r10,r14,LSL #1 ; r14= t3[5] = t[6] - t[5]
+ ; r2=t2[0] r3=t2[2] r4=t2[3] r6=t2[1] r8=t2[7] r9=t2[4]
+ ; r10=t3[6] r14=t3[5]
+ ; Stage 4
+ ADD r2, r2, r8 ; r2 = t[0] + t[7] + 8
+ ADD r6, r6, r10 ; r6 = t[1] + t[6] + 8
+ ADD r3, r3, r14 ; r3 = t[2] + t[5] + 8
+ ADD r4, r4, r9 ; r4 = t[3] + t[4] + 8
+ SUB r8, r2, r8, LSL #1 ; r8 = t[0] - t[7] + 8
+ SUB r10,r6, r10,LSL #1 ; r10= t[1] - t[6] + 8
+ SUB r14,r3, r14,LSL #1 ; r14= t[2] - t[5] + 8
+ SUB r9, r4, r9, LSL #1 ; r9 = t[3] - t[4] + 8
+ ; TODO: This is wrong.
+ ; The C code truncates to 16 bits by storing to RAM and doing the
+ ; shifts later; we've got an extra 4 bits here.
+ MOV r2, r2, ASR #4
+ MOV r6, r6, ASR #4
+ MOV r3, r3, ASR #4
+ MOV r4, r4, ASR #4
+ MOV r8, r8, ASR #4
+ MOV r10,r10,ASR #4
+ MOV r14,r14,ASR #4
+ MOV r9, r9, ASR #4
+ STRH r2, [r0], #2 ; y[0] = t[0]+t[7]
+ STRH r6, [r0, #14] ; y[1] = t[1]+t[6]
+ STRH r3, [r0, #30] ; y[2] = t[2]+t[5]
+ STRH r4, [r0, #46] ; y[3] = t[3]+t[4]
+ STRH r9, [r0, #62] ; y[4] = t[3]-t[4]
+ STRH r14,[r0, #78] ; y[5] = t[2]-t[5]
+ STRH r10,[r0, #94] ; y[6] = t[1]-t[6]
+ STRH r8, [r0, #110] ; y[7] = t[0]-t[7]
+ LDMFD r13!,{r1,PC} ; Restore the advanced source pointer.
+
+ [ OC_ARM_ASM_MEDIA
+ EXPORT oc_idct8x8_v6
+
+oc_idct8x8_v6
+ ; r0 = ogg_int16_t *_y
+ ; r1 = ogg_int16_t *_x
+ ; r2 = int _last_zzi
+ ; ARMv6 (media instruction) dispatcher, mirroring oc_idct8x8_arm.
+ ; Note: the 6-coefficient special case is disabled here; 4..10
+ ; coefficients all take the 10-coefficient path.
+ CMP r2, #3
+ BLE oc_idct8x8_3_v6
+ ;CMP r2, #6
+ ;BLE oc_idct8x8_6_v6
+ CMP r2, #10
+ BLE oc_idct8x8_10_v6
+oc_idct8x8_slow_v6
+ ; Full 8x8 iDCT using ARMv6 media instructions; each core call handles
+ ; two rows (or two columns) at once, so only four calls per pass.
+ STMFD r13!,{r4-r11,r14}
+ SUB r13,r13,#64*2 ; Temp storage for the transposed rows.
+; Row transforms
+ STR r0, [r13,#-4]!
+ ADD r0, r13, #4 ; Write to temp storage.
+ BL idct8_8core_v6
+ BL idct8_8core_v6
+ BL idct8_8core_v6
+ BL idct8_8core_v6
+ LDR r0, [r13], #4 ; Write to the final destination.
+ ; Clear input data for next block (decoder only).
+ SUB r2, r1, #8*16 ; r2 = original _x (r1 was advanced by the rows).
+ CMP r0, r2 ; In-place transform? Then do not clear.
+ MOV r1, r13 ; And read from temp storage.
+ BEQ oc_idct8x8_slow_v6_cols
+ MOV r4, #0
+ MOV r5, #0
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+ STRD r4, [r2], #8
+oc_idct8x8_slow_v6_cols
+; Column transforms
+ BL idct8_8core_down_v6
+ BL idct8_8core_down_v6
+ BL idct8_8core_down_v6
+ BL idct8_8core_down_v6
+ ADD r13,r13,#64*2
+ LDMFD r13!,{r4-r11,PC}
+
+; oc_idct8x8_10_v6: IDCT specialized for _last_zzi <= 10 (only the upper-left
+; 4x4 corner of the input can be non-zero). Two row passes (4-point then
+; 2-point) feed four 4-point column passes.
+oc_idct8x8_10_v6
+	STMFD	r13!,{r4-r11,r14}
+	SUB	r13,r13,#64*2+4
+; Row transforms
+	MOV	r2, r13
+	STR	r0, [r13,#-4]!
+	AND	r0, r2, #4		; Align the stack.
+	ADD	r0, r0, r2		; Write to temp storage.
+	BL	idct4_3core_v6
+	BL	idct2_1core_v6
+	LDR	r0, [r13], #4		; Write to the final destination.
+	; Clear input data for next block (decoder only).
+	SUB	r2, r1, #4*16
+	CMP	r0, r2			; Skip the clear if the IDCT is in-place.
+	AND	r1, r13,#4		; Align the stack.
+	BEQ	oc_idct8x8_10_v6_cols
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r2]		; Zero the 4x4 corner actually used.
+	STRD	r4, [r2,#16]
+	STR	r4, [r2,#32]
+	STR	r4, [r2,#48]
+oc_idct8x8_10_v6_cols
+; Column transforms
+	ADD	r1, r1, r13		; And read from temp storage.
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	BL	idct4_4core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r11,PC}
+
+; oc_idct8x8_3_v6: IDCT specialized for _last_zzi <= 3 (only the upper-left
+; 2x2 corner of the input can be non-zero). One 2-point row pass feeds four
+; 2-point column passes.
+oc_idct8x8_3_v6
+	STMFD	r13!,{r4-r8,r14}
+	SUB	r13,r13,#64*2
+; Row transforms
+	MOV	r8, r0
+	MOV	r0, r13			; Write to temp storage.
+	BL	idct2_1core_v6
+	; Clear input data for next block (decoder only).
+	SUB	r0, r1, #2*16
+	CMP	r0, r8			; Skip the clear if the IDCT is in-place.
+	MOV	r1, r13			; Read from temp storage.
+	MOVNE	r4, #0
+	STRNE	r4, [r0]		; Zero the 2x2 corner actually used.
+	STRNE	r4, [r0,#16]
+	MOVNE	r0, r8			; Write to the final destination.
+; Column transforms
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	BL	idct2_2core_down_v6
+	ADD	r13,r13,#64*2
+	LDMFD	r13!,{r4-r8,PC}
+
+; idct2_1core_v6: 8-point IDCT of two rows where row 0 has two coefficients
+; and row 1 has only a DC term; no rounding (row pass).
+idct2_1core_v6
+	; r0 = ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDR	r2, [r1], #16		; r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	LDRSH	r6, [r1], #16		; r6 = x[1,0]
+	SMULWB	r12,r3, r2		; r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	LDRD	r4, OC_C7S1		; r4 = OC_C7S1; r5 = OC_C1S7
+	SMULWB	r6, r3, r6		; r6 = t[1,0]=OC_C4S4*x[1,0]>>16
+	SMULWT	r4, r4, r2		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+	SMULWT	r7, r5, r2		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+; Stage 2:
+	SMULWB	r5, r3, r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r12,r12,r6, LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWB	r6, r3, r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r7, r7, r3		; r7 = <0|t[0,7]> (OC_C4S4's high half is 0)
+; Stage 3:
+	PKHBT	r5, r6, r5, LSL #16	; r5 = <t[0,5]|t[0,6]>
+	PKHBT	r4, r4, r3		; r4 = <0|t[0,4]>
+	SASX	r5, r5, r5		; r5 = <t[0,6]+t[0,5]|t[0,6]-t[0,5]>
+; Stage 4:
+	PKHTB	r6, r3, r5, ASR #16	; r6 = <0|t[0,6]>
+	PKHBT	r5, r5, r3		; r5 = <0|t[0,5]>
+	SADD16	r3, r12,r7		; r3 = t[0]+t[7]
+	STR	r3, [r0], #4		; y[0<<3] = t[0]+t[7]
+	SADD16	r3, r12,r6		; r3 = t[0]+t[6]
+	STR	r3, [r0, #12]		; y[1<<3] = t[0]+t[6]
+	SADD16	r3, r12,r5		; r3 = t[0]+t[5]
+	STR	r3, [r0, #28]		; y[2<<3] = t[0]+t[5]
+	SADD16	r3, r12,r4		; r3 = t[0]+t[4]
+	STR	r3, [r0, #44]		; y[3<<3] = t[0]+t[4]
+	SSUB16	r4, r12,r4		; r4 = t[0]-t[4]
+	STR	r4, [r0, #60]		; y[4<<3] = t[0]-t[4]
+	SSUB16	r5, r12,r5		; r5 = t[0]-t[5]
+	STR	r5, [r0, #76]		; y[5<<3] = t[0]-t[5]
+	SSUB16	r6, r12,r6		; r6 = t[0]-t[6]
+	STR	r6, [r0, #92]		; y[6<<3] = t[0]-t[6]
+	SSUB16	r7, r12,r7		; r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		; y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+ ]
+
+; 16.16 fixed-point DCT cosine constants: OC_CnSm = 65536*cos(n*pi/16)
+; (= 65536*sin(m*pi/16)). Aligned so LDRD can fetch adjacent pairs.
+	ALIGN 8
+OC_C7S1
+	DCD	12785 ; 31F1
+OC_C1S7
+	DCD	64277 ; FB15
+OC_C6S2
+	DCD	25080 ; 61F8
+OC_C2S6
+	DCD	60547 ; EC83
+OC_C5S3
+	DCD	36410 ; 8E3A
+OC_C3S5
+	DCD	54491 ; D4DB
+OC_C4S4
+	DCD	46341 ; B505
+
+ [ OC_ARM_ASM_MEDIA
+; idct2_2core_down_v6: 8-point IDCT of two columns with two input
+; coefficients each, including the final +8 bias and >>4 scaling (column
+; pass). Two 16-bit columns are processed in parallel per 32-bit register.
+idct2_2core_down_v6
+	; r0 = ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDR	r2, [r1], #16		; r2 = <x[0,1]|x[0,0]>
+	LDR	r3, OC_C4S4
+	MOV	r7 ,#8			; r7  = 8 (rounding bias)
+	LDR	r6, [r1], #16		; r6 = <x[1,1]|x[1,0]>
+	SMLAWB	r12,r3, r2, r7		; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)+8
+	LDRD	r4, OC_C7S1		; r4 = OC_C7S1; r5 = OC_C1S7
+	SMLAWB	r7, r3, r6, r7		; r7 = (t[1,0]=OC_C4S4*x[1,0]>>16)+8
+	SMULWT	r5, r5, r2		; r5 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r12,r12,r7, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT	r4, r4, r2		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+; Here we cheat: row 1 had just a DC, so x[0,1]==x[1,1] by definition.
+	PKHBT	r7, r5, r5, LSL #16	; r7 = <t[0,7]|t[0,7]>
+; Stage 2:
+	SMULWB	r6, r3, r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r4, LSL #16	; r4 = <t[0,4]|t[0,4]>
+	SMULWT	r2, r3, r7		; r2 = t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r3, r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r2, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r2, r3, r4		; r2 = t[1,5]=OC_C4S4*t[1,4]>>16
+	PKHBT	r2, r5, r2, LSL #16	; r2 = <t[1,5]|t[0,5]>
+; Stage 3:
+	SSUB16	r5, r6, r2		; r5 = <t[1,6]-t[1,5]|t[0,6]-t[0,5]>
+	SADD16	r6, r6, r2		; r6 = <t[1,6]+t[1,5]|t[0,6]+t[0,5]>
+; Stage 4:
+	SADD16	r2, r12,r7		; r2 = t[0]+t[7]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[7]+8>>4
+	STR	r3, [r0], #4		; y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r2, r12,r6		; r2 = t[0]+t[6]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[6]+8>>4
+	STR	r3, [r0, #12]		; y[1<<3] = t[0]+t[6]+8>>4
+	SADD16	r2, r12,r5		; r2 = t[0]+t[5]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[5]+8>>4
+	STR	r3, [r0, #28]		; y[2<<3] = t[0]+t[5]+8>>4
+	SADD16	r2, r12,r4		; r2 = t[0]+t[4]+8
+	MOV	r3, r2, ASR #4
+	MOV	r2, r2, LSL #16
+	PKHTB	r3, r3, r2, ASR #20	; r3 = t[0]+t[4]+8>>4
+	STR	r3, [r0, #44]		; y[3<<3] = t[0]+t[4]+8>>4
+	SSUB16	r4, r12,r4		; r4 = t[0]-t[4]+8
+	MOV	r3, r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r3, r3, r4, ASR #20	; r3 = t[0]-t[4]+8>>4
+	STR	r3, [r0, #60]		; y[4<<3] = t[0]-t[4]+8>>4
+	SSUB16	r5, r12,r5		; r5 = t[0]-t[5]+8
+	MOV	r3, r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r3, r3, r5, ASR #20	; r3 = t[0]-t[5]+8>>4
+	STR	r3, [r0, #76]		; y[5<<3] = t[0]-t[5]+8>>4
+	SSUB16	r6, r12,r6		; r6 = t[0]-t[6]+8
+	MOV	r3, r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r3, r3, r6, ASR #20	; r3 = t[0]-t[6]+8>>4
+	STR	r3, [r0, #92]		; y[6<<3] = t[0]-t[6]+8>>4
+	SSUB16	r7, r12,r7		; r7 = t[0]-t[7]+8
+	MOV	r3, r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r3, r3, r7, ASR #20	; r3 = t[0]-t[7]+8>>4
+	STR	r3, [r0, #108]		; y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+
+; In theory this should save ~75 cycles over oc_idct8x8_10, more than enough to
+; pay for increased branch mis-prediction to get here, but in practice it
+; doesn't seem to slow anything down to take it out, and it's less code this
+; way.
+ [ 0
+; oc_idct8x8_6_v6: (disabled) IDCT specialized for _last_zzi <= 6 (only the
+; upper-left 3x3 corner of the input can be non-zero).
+oc_idct8x8_6_v6
+	STMFD	r13!,{r4-r8,r10,r11,r14}
+	SUB	r13,r13,#64*2+4
+; Row transforms
+	MOV	r8, r0
+	AND	r0, r13,#4		; Align the stack.
+	ADD	r0, r0, r13		; Write to temp storage.
+	BL	idct3_2core_v6
+	BL	idct1core_v6
+	; Clear input data for next block (decoder only).
+	SUB	r0, r1, #3*16
+	CMP	r0, r8			; Skip the clear if the IDCT is in-place.
+	AND	r1, r13,#4		; Align the stack.
+	BEQ	oc_idct8x8_6_v6_cols
+	MOV	r4, #0
+	MOV	r5, #0
+	STRD	r4, [r0]		; Zero the 3x3 corner actually used.
+	STR	r4, [r0,#16]
+	STR	r4, [r0,#32]
+	MOV	r0, r8			; Write to the final destination.
+oc_idct8x8_6_v6_cols
+; Column transforms
+	ADD	r1, r1, r13		; And read from temp storage.
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	BL	idct3_3core_down_v6
+	ADD	r13,r13,#64*2+4
+	LDMFD	r13!,{r4-r8,r10,r11,PC}
+
+; idct1core_v6: 8-point IDCT of one row with a single (DC) coefficient:
+; every output is OC_C4S4*x[0]>>16 (0xB505 built inline instead of loaded).
+idct1core_v6
+	; r0 = ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+	LDRSH	r3, [r1], #16
+	MOV	r12,#0x05
+	ORR	r12,r12,#0xB500		; r12= OC_C4S4 = 0xB505
+	MUL	r3, r12, r3
+	; Stall ?
+	MOV	r3, r3, ASR #16
+	; Don't need to actually store the odd lines; they won't be read.
+	STRH	r3, [r0], #2
+	STRH	r3, [r0, #30]
+	STRH	r3, [r0, #62]
+	STRH	r3, [r0, #94]
+	MOV	PC,R14
+
+; idct3_2core_v6: 8-point IDCT of two rows where row 0 has three coefficients
+; and row 1 has only a DC term; no rounding (row pass). Tail-calls into the
+; shared stage 3/4 of idct4_3core_v6.
+idct3_2core_v6
+	; r0 = ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r4, [r1], #16		; r4 = <x[0,1]|x[0,0]>; r5 = <*|x[0,2]>
+	LDRD	r10,OC_C6S2_3_v6	; r10= OC_C6S2; r11= OC_C2S6
+	; Stall
+	SMULWB	r3, r11,r5		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r2, r10,r5		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r5, [r1], #16		; r5 = <x[1,1]|x[1,0]>
+	SMULWB	r12,r11,r4		; r12= (t[0,0]=OC_C4S4*x[0,0]>>16)
+	LDRD	r6, OC_C7S1_3_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	SMULWB	r10,r11,r5		; r10= (t[1,0]=OC_C4S4*x[1,0]>>16)
+	PKHBT	r12,r12,r10,LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWT	r10,r7, r5		; r10= t[1,7]=OC_C1S7*x[1,1]>>16
+	PKHBT	r2, r2, r11		; r2 = <0|t[0,2]>
+	SMULWT	r7, r7, r4		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	PKHBT	r3, r3, r11		; r3 = <0|t[0,3]>
+	SMULWT	r5, r6, r5		; r5 = t[1,4]=OC_C7S1*x[1,1]>>16
+	PKHBT	r7, r7, r10,LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[0,4]=OC_C7S1*x[0,1]>>16
+; Stage 2:
+	SMULWB	r6, r11,r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r4, r5, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		; r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		; r10= t[1,5]=OC_C4S4*t[1,4]>>16
+; Stage 3:
+	B	idct4_3core_stage3_v6
+
+; Another copy so the LDRD offsets are less than +/- 255.
+	ALIGN 8
+OC_C7S1_3_v6
+	DCD	12785 ; 31F1
+OC_C1S7_3_v6
+	DCD	64277 ; FB15
+OC_C6S2_3_v6
+	DCD	25080 ; 61F8
+OC_C2S6_3_v6
+	DCD	60547 ; EC83
+
+; idct3_3core_down_v6: 8-point IDCT of two columns with three input
+; coefficients each, with the +8 rounding bias folded in; tail-calls into the
+; shared stage 3/4 of idct4_4core_down_v6 (which applies the >>4).
+idct3_3core_down_v6
+	; r0 = ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r10,[r1], #16		; r10= <x[0,1]|x[0,0]>; r11= <??|x[0,2]>
+	LDRD	r6, OC_C6S2_3_v6	; r6 = OC_C6S2; r7 = OC_C2S6
+	LDR	r4, [r1], #16		; r4 = <x[1,1]|x[1,0]>
+	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	MOV	r7,#8			; r7 = 8 (rounding bias)
+	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMLAWB	r12,r11,r10,r7		; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+; Here we cheat: row 2 had just a DC, so x[0,2]==x[1,2] by definition.
+	PKHBT	r3, r3, r3, LSL #16	; r3 = <t[0,3]|t[0,3]>
+	SMLAWB	r5, r11,r4, r7		; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	PKHBT	r2, r2, r2, LSL #16	; r2 = <t[0,2]|t[0,2]>
+	LDRD	r6, OC_C7S1_3_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT	r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT	r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+	SMULWB	r6, r11,r7		; r6 = t[0,6]=OC_C4S4*t[0,7]>>16
+	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SMULWT	r10,r11,r7		; r10= t[1,6]=OC_C4S4*t[1,7]>>16
+	SMULWB	r5, r11,r4		; r5 = t[0,5]=OC_C4S4*t[0,4]>>16
+	PKHBT	r6, r6, r10,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r10,r11,r4		; r10= t[1,5]=OC_C4S4*t[1,4]>>16
+; Stage 3:
+	B	idct4_4core_down_stage3_v6
+ ]
+
+; idct4_3core_v6: 8-point IDCT of two rows where row 0 has four coefficients
+; and row 1 has three; no rounding (row pass). The stage 3/4 labels are shared
+; with idct3_2core_v6 and idct8_8core_v6.
+idct4_3core_v6
+	; r0 = ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r10,[r1], #16		; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, OC_C5S3_4_v6	; r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, [r1], #16		; r4 = <x[1,1]|x[1,0]>; r5 = <??|x[1,2]>
+	SMULWT	r9, r3, r11		; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	SMULWT	r8, r2, r11		; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+	PKHBT	r9, r9, r2		; r9 = <0|t[0,6]>
+	LDRD	r6, OC_C6S2_4_v6	; r6 = OC_C6S2; r7 = OC_C2S6
+	PKHBT	r8, r8, r2		; r8 = <0|-t[0,5]>
+	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		; r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	SMULWB	r5, r6, r5		; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r10		; r12= t[0,0]=OC_C4S4*x[0,0]>>16
+	PKHBT	r2, r2, r5, LSL #16	; r2 = <t[1,2]|t[0,2]>
+	SMULWB	r5, r11,r4		; r5 = t[1,0]=OC_C4S4*x[1,0]>>16
+	LDRD	r6, OC_C7S1_4_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWT	r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT	r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+	SSUB16	r6, r7, r9		; r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		; r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		; r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		; r5 = t[4]-t[5] (r8 holds -t[5])
+	SMULWB	r6, r11,r6		; r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		; r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		; r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		; r5 = t[0,5]=OC_C4S4*r5B>>16
+; Stage 3:
+idct4_3core_stage3_v6
+	SADD16	r11,r12,r2		; r11= t[1]=t[0]+t[2]
+	PKHBT	r10,r5, r10,LSL #16	; r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		; r2 = t[2]=t[0]-t[2]
+idct4_3core_stage3_5_v6
+	SSUB16	r5, r6, r10		; r5 = t[5]'=t[6]-t[5]
+	SADD16	r6, r6, r10		; r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		; r10= t[0]'=t[0]+t[3]
+	SSUB16	r3, r12,r3		; r3 = t[3]=t[0]-t[3]
+; Stage 4:
+	SADD16	r12,r10,r7		; r12= t[0]+t[7]
+	STR	r12,[r0], #4		; y[0<<3] = t[0]+t[7]
+	SADD16	r12,r11,r6		; r12= t[1]+t[6]
+	STR	r12,[r0, #12]		; y[1<<3] = t[1]+t[6]
+	SADD16	r12,r2, r5		; r12= t[2]+t[5]
+	STR	r12,[r0, #28]		; y[2<<3] = t[2]+t[5]
+	SADD16	r12,r3, r4		; r12= t[3]+t[4]
+	STR	r12,[r0, #44]		; y[3<<3] = t[3]+t[4]
+	SSUB16	r4, r3, r4		; r4 = t[3]-t[4]
+	STR	r4, [r0, #60]		; y[4<<3] = t[3]-t[4]
+	SSUB16	r5, r2, r5		; r5 = t[2]-t[5]
+	STR	r5, [r0, #76]		; y[5<<3] = t[2]-t[5]
+	SSUB16	r6, r11,r6		; r6 = t[1]-t[6]
+	STR	r6, [r0, #92]		; y[6<<3] = t[1]-t[6]
+	SSUB16	r7, r10,r7		; r7 = t[0]-t[7]
+	STR	r7, [r0, #108]		; y[7<<3] = t[0]-t[7]
+	MOV	PC,r14
+
+; Another copy so the LDRD offsets are less than +/- 255.
+	ALIGN 8
+OC_C7S1_4_v6
+	DCD	12785 ; 31F1
+OC_C1S7_4_v6
+	DCD	64277 ; FB15
+OC_C6S2_4_v6
+	DCD	25080 ; 61F8
+OC_C2S6_4_v6
+	DCD	60547 ; EC83
+OC_C5S3_4_v6
+	DCD	36410 ; 8E3A
+OC_C3S5_4_v6
+	DCD	54491 ; D4DB
+
+; idct4_4core_down_v6: 8-point IDCT of two columns with four input
+; coefficients each, with the +8 rounding bias folded in; tail-calls into the
+; shared stage 3/4 of idct8_8core_down_v6 (which applies the >>4).
+idct4_4core_down_v6
+	; r0 = ogg_int16_t *_y (destination)
+	; r1 = const ogg_int16_t *_x (source)
+; Stage 1:
+	LDRD	r10,[r1], #16		; r10= <x[0,1]|x[0,0]>; r11= <x[0,3]|x[0,2]>
+	LDRD	r2, OC_C5S3_4_v6	; r2 = OC_C5S3; r3 = OC_C3S5
+	LDRD	r4, [r1], #16		; r4 = <x[1,1]|x[1,0]>; r5 = <x[1,3]|x[1,2]>
+	SMULWT	r9, r3, r11		; r9 = t[0,6]=OC_C3S5*x[0,3]>>16
+	LDRD	r6, OC_C6S2_4_v6	; r6 = OC_C6S2; r7 = OC_C2S6
+	SMULWT	r8, r2, r11		; r8 = -t[0,5]=OC_C5S3*x[0,3]>>16
+; Here we cheat: row 3 had just a DC, so x[0,3]==x[1,3] by definition.
+	PKHBT	r9, r9, r9, LSL #16	; r9 = <t[0,6]|t[0,6]>
+	SMULWB	r3, r7, r11		; r3 = t[0,3]=OC_C2S6*x[0,2]>>16
+	PKHBT	r8, r8, r8, LSL #16	; r8 = <-t[0,5]|-t[0,5]>
+	SMULWB	r2, r6, r11		; r2 = t[0,2]=OC_C6S2*x[0,2]>>16
+	LDR	r11,OC_C4S4
+	SMULWB	r12,r7, r5		; r12= t[1,3]=OC_C2S6*x[1,2]>>16
+	MOV	r7,#8			; r7 = 8 (rounding bias)
+	SMULWB	r5, r6, r5		; r5 = t[1,2]=OC_C6S2*x[1,2]>>16
+	PKHBT	r3, r3, r12,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMLAWB	r12,r11,r10,r7		; r12= t[0,0]+8=(OC_C4S4*x[0,0]>>16)+8
+	PKHBT	r2, r2, r5, LSL #16	; r2 = <t[1,2]|t[0,2]>
+	SMLAWB	r5, r11,r4 ,r7		; r5 = t[1,0]+8=(OC_C4S4*x[1,0]>>16)+8
+	LDRD	r6, OC_C7S1_4_v6	; r6 = OC_C7S1; r7 = OC_C1S7
+	PKHBT	r12,r12,r5, LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMULWT	r5, r7, r4		; r5 = t[1,7]=OC_C1S7*x[1,1]>>16
+	SMULWT	r7, r7, r10		; r7 = t[0,7]=OC_C1S7*x[0,1]>>16
+	SMULWT	r10,r6, r10		; r10= t[0,4]=OC_C7S1*x[0,1]>>16
+	PKHBT	r7, r7, r5, LSL #16	; r7 = <t[1,7]|t[0,7]>
+	SMULWT	r4, r6, r4		; r4 = t[1,4]=OC_C7S1*x[1,1]>>16
+; Stage 2:
+	SSUB16	r6, r7, r9		; r6 = t[7]-t[6]
+	PKHBT	r4, r10,r4, LSL #16	; r4 = <t[1,4]|t[0,4]>
+	SADD16	r7, r7, r9		; r7 = t[7]=t[7]+t[6]
+	SMULWT	r9, r11,r6		; r9 = t[1,6]=OC_C4S4*r6T>>16
+	SADD16	r5, r4, r8		; r5 = t[4]-t[5] (r8 holds -t[5])
+	SMULWB	r6, r11,r6		; r6 = t[0,6]=OC_C4S4*r6B>>16
+	SSUB16	r4, r4, r8		; r4 = t[4]=t[4]+t[5]
+	SMULWT	r10,r11,r5		; r10= t[1,5]=OC_C4S4*r5T>>16
+	PKHBT	r6, r6, r9, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWB	r5, r11,r5		; r5 = t[0,5]=OC_C4S4*r5B>>16
+; Stage 3:
+idct4_4core_down_stage3_v6
+	SADD16	r11,r12,r2		; r11= t[1]+8=t[0]+t[2]+8
+	PKHBT	r10,r5, r10,LSL #16	; r10= <t[1,5]|t[0,5]>
+	SSUB16	r2, r12,r2		; r2 = t[2]+8=t[0]-t[2]+8
+	B	idct8_8core_down_stage3_5_v6
+
+; idct8_8core_v6: full 8-point IDCT of two rows with all eight coefficients;
+; no rounding (row pass). Tail-calls into the shared stage 3.5/4 of
+; idct4_3core_v6.
+idct8_8core_v6
+	STMFD	r13!,{r0,r14}
+; Stage 1:
+	;5-6 rotation by 3pi/16
+	LDRD	r10,OC_C5S3_4_v6	; r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		; r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		; r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		; r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		; r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		; r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		; r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		; r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		; r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	; r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	; r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		; r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		; r8 = OC_C5S3*x[1,3]>>16
+	;2-3 rotation by 6pi/16
+	LDRD	r10,OC_C6S2_4_v6	; r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	; r3 = <r8|r3>
+	LDR	r8, [r1,#12]		; r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		; r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		; r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		; r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		; r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		; r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		; r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	; r2 = <r2|r9>
+	SMLAWB	r3, r11,r0, r3		; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		; r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r7		; r12= OC_C2S6*x[1,6]>>16
+	;4-7 rotation by 7pi/16
+	LDRD	r10,OC_C7S1_8_v6	; r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <r9|r12>
+	LDR	r0, [r1],#16		; r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	; r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		; r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		; r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		; r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		; r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		; r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		; r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	; r8 = <r10|r8>
+	SMLAWT	r12,r11,r14,r12		; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	; r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		; r10= OC_C1S7*x[0,7]>>16
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		; r12= OC_C1S7*x[1,7]>>16
+	;0-1 butterfly
+	LDR	r11,OC_C4S4
+	PKHBT	r10,r10,r12,LSL #16	; r10= <r12|r10>
+	SADD16	r7, r0, r4		; r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		; r10= <t[1,4]|t[0,4]>
+	SSUB16	r4, r0, r4		; r4 = x[0]-x[4]
+	SMULWB	r8, r11,r7		; r8 = t[0,0]=OC_C4S4*r7B>>16
+	SMULWT	r12,r11,r7		; r12= t[1,0]=OC_C4S4*r7T>>16
+	SMULWB	r7, r11,r4		; r7 = t[0,1]=OC_C4S4*r4B>>16
+	PKHBT	r12,r8, r12,LSL #16	; r12= <t[1,0]|t[0,0]>
+	SMULWT	r8, r11,r4		; r8 = t[1,1]=OC_C4S4*r4T>>16
+; Stage 2:
+	SADD16	r4, r10,r5		; r4 = t[4]'=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	; r8 = <t[1,1]|t[0,1]>
+	SSUB16	r5, r10,r5		; r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		; r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		; r7 = t[7]'=t[7]+t[6]
+	SMULWT	r5, r11,r5		; r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		; r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		; r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	; r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		; r6 = t[1,6]=OC_C4S4*r6T>>16
+; Stage 3:
+	SADD16	r11,r8, r2		; r11= t[1]'=t[1]+t[2]
+	PKHBT	r6, r9, r6, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		; r2 = t[2]=t[1]-t[2]
+	LDMFD	r13!,{r0,r14}
+	B	idct4_3core_stage3_5_v6
+
+; Another copy so the LDRD offsets are less than +/- 255.
+	ALIGN 8
+OC_C7S1_8_v6
+	DCD	12785 ; 31F1
+OC_C1S7_8_v6
+	DCD	64277 ; FB15
+OC_C6S2_8_v6
+	DCD	25080 ; 61F8
+OC_C2S6_8_v6
+	DCD	60547 ; EC83
+OC_C5S3_8_v6
+	DCD	36410 ; 8E3A
+OC_C3S5_8_v6
+	DCD	54491 ; D4DB
+
+; idct8_8core_down_v6: full 8-point IDCT of two columns with all eight
+; coefficients, including the final +8 bias and >>4 scaling (column pass).
+; The stage 3.5/4 label is shared with idct4_4core_down_v6.
+idct8_8core_down_v6
+	STMFD	r13!,{r0,r14}
+; Stage 1:
+	;5-6 rotation by 3pi/16
+	LDRD	r10,OC_C5S3_8_v6	; r10= OC_C5S3, r11= OC_C3S5
+	LDR	r4, [r1,#8]		; r4 = <x[0,5]|x[0,4]>
+	LDR	r7, [r1,#24]		; r7 = <x[1,5]|x[1,4]>
+	SMULWT	r5, r11,r4		; r5 = OC_C3S5*x[0,5]>>16
+	LDR	r0, [r1,#4]		; r0 = <x[0,3]|x[0,2]>
+	SMULWT	r3, r11,r7		; r3 = OC_C3S5*x[1,5]>>16
+	LDR	r12,[r1,#20]		; r12= <x[1,3]|x[1,2]>
+	SMULWT	r6, r11,r0		; r6 = OC_C3S5*x[0,3]>>16
+	SMULWT	r11,r11,r12		; r11= OC_C3S5*x[1,3]>>16
+	SMLAWT	r6, r10,r4, r6		; r6 = t[0,6]=r6+(OC_C5S3*x[0,5]>>16)
+	PKHBT	r5, r5, r3, LSL #16	; r5 = <r3|r5>
+	SMLAWT	r11,r10,r7, r11		; r11= t[1,6]=r11+(OC_C5S3*x[1,5]>>16)
+	PKHBT	r4, r4, r7, LSL #16	; r4 = <x[1,4]|x[0,4]>
+	SMULWT	r3, r10,r0		; r3 = OC_C5S3*x[0,3]>>16
+	PKHBT	r6, r6, r11,LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SMULWT	r8, r10,r12		; r8 = OC_C5S3*x[1,3]>>16
+	;2-3 rotation by 6pi/16
+	LDRD	r10,OC_C6S2_8_v6	; r10= OC_C6S2, r11= OC_C2S6
+	PKHBT	r3, r3, r8, LSL #16	; r3 = <r8|r3>
+	LDR	r8, [r1,#12]		; r8 = <x[0,7]|x[0,6]>
+	SMULWB	r2, r10,r0		; r2 = OC_C6S2*x[0,2]>>16
+	SSUB16	r5, r5, r3		; r5 = <t[1,5]|t[0,5]>
+	SMULWB	r9, r10,r12		; r9 = OC_C6S2*x[1,2]>>16
+	LDR	r7, [r1,#28]		; r7 = <x[1,7]|x[1,6]>
+	SMULWB	r3, r10,r8		; r3 = OC_C6S2*x[0,6]>>16
+	SMULWB	r10,r10,r7		; r10= OC_C6S2*x[1,6]>>16
+	PKHBT	r2, r2, r9, LSL #16	; r2 = <r2|r9>
+	SMLAWB	r3, r11,r0, r3		; r3 = t[0,3]=r3+(OC_C2S6*x[0,2]>>16)
+	SMLAWB	r10,r11,r12,r10		; r10= t[1,3]=r10+(OC_C2S6*x[1,2]>>16)
+	SMULWB	r9, r11,r8		; r9 = OC_C2S6*x[0,6]>>16
+	PKHBT	r3, r3, r10,LSL #16	; r3 = <t[1,3]|t[0,3]>
+	SMULWB	r12,r11,r7		; r12= OC_C2S6*x[1,6]>>16
+	;4-7 rotation by 7pi/16
+	LDRD	r10,OC_C7S1_8_v6	; r10= OC_C7S1, r11= OC_C1S7
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <r9|r12>
+	LDR	r0, [r1],#16		; r0 = <x[0,1]|x[0,0]>
+	PKHTB	r7, r7, r8, ASR #16	; r7 = <x[1,7]|x[0,7]>
+	SSUB16	r2, r2, r9		; r2 = <t[1,2]|t[0,2]>
+	SMULWB	r9, r10,r7		; r9 = OC_C7S1*x[0,7]>>16
+	LDR	r14,[r1],#16		; r14= <x[1,1]|x[1,0]>
+	SMULWT	r12,r10,r7		; r12= OC_C7S1*x[1,7]>>16
+	SMULWT	r8, r10,r0		; r8 = OC_C7S1*x[0,1]>>16
+	SMULWT	r10,r10,r14		; r10= OC_C7S1*x[1,1]>>16
+	SMLAWT	r9, r11,r0, r9		; r9 = t[0,7]=r9+(OC_C1S7*x[0,1]>>16)
+	PKHBT	r8, r8, r10,LSL #16	; r8 = <r10|r8>
+	SMLAWT	r12,r11,r14,r12		; r12= t[1,7]=r12+(OC_C1S7*x[1,1]>>16)
+	PKHBT	r0, r0, r14,LSL #16	; r0 = <x[1,0]|x[0,0]>
+	SMULWB	r10,r11,r7		; r10= OC_C1S7*x[0,7]>>16
+	PKHBT	r9, r9, r12,LSL #16	; r9 = <t[1,7]|t[0,7]>
+	SMULWT	r12,r11,r7		; r12= OC_C1S7*x[1,7]>>16
+	;0-1 butterfly
+	LDR	r11,OC_C4S4
+	MOV	r14,#8			; r14= 8 (rounding bias)
+	PKHBT	r10,r10,r12,LSL #16	; r10= <r12|r10>
+	SADD16	r7, r0, r4		; r7 = x[0]+x[4]
+	SSUB16	r10,r8, r10		; r10= <t[1,4]|t[0,4]>
+	SMLAWB	r8, r11,r7, r14		; r8 = t[0,0]+8=(OC_C4S4*r7B>>16)+8
+	SSUB16	r4, r0, r4		; r4 = x[0]-x[4]
+	SMLAWT	r12,r11,r7, r14		; r12= t[1,0]+8=(OC_C4S4*r7T>>16)+8
+	SMLAWB	r7, r11,r4, r14		; r7 = t[0,1]+8=(OC_C4S4*r4B>>16)+8
+	PKHBT	r12,r8, r12,LSL #16	; r12= <t[1,0]+8|t[0,0]+8>
+	SMLAWT	r8, r11,r4, r14		; r8 = t[1,1]+8=(OC_C4S4*r4T>>16)+8
+; Stage 2:
+	SADD16	r4, r10,r5		; r4 = t[4]'=t[4]+t[5]
+	PKHBT	r8, r7, r8, LSL #16	; r8 = <t[1,1]+8|t[0,1]+8>
+	SSUB16	r5, r10,r5		; r5 = t[4]-t[5]
+	SMULWB	r10,r11,r5		; r10= t[0,5]=OC_C4S4*r5B>>16
+	SADD16	r7, r9, r6		; r7 = t[7]'=t[7]+t[6]
+	SMULWT	r5, r11,r5		; r5 = t[1,5]=OC_C4S4*r5T>>16
+	SSUB16	r6, r9, r6		; r6 = t[7]-t[6]
+	SMULWB	r9, r11,r6		; r9 = t[0,6]=OC_C4S4*r6B>>16
+	PKHBT	r10,r10,r5, LSL #16	; r10= <t[1,5]|t[0,5]>
+	SMULWT	r6, r11,r6		; r6 = t[1,6]=OC_C4S4*r6T>>16
+; Stage 3:
+	SADD16	r11,r8, r2		; r11= t[1]'+8=t[1]+t[2]+8
+	PKHBT	r6, r9, r6, LSL #16	; r6 = <t[1,6]|t[0,6]>
+	SSUB16	r2, r8, r2		; r2 = t[2]+8=t[1]-t[2]+8
+	LDMFD	r13!,{r0,r14}
+idct8_8core_down_stage3_5_v6
+	SSUB16	r5, r6, r10		; r5 = t[5]'=t[6]-t[5]
+	SADD16	r6, r6, r10		; r6 = t[6]=t[6]+t[5]
+	SADD16	r10,r12,r3		; r10= t[0]'+8=t[0]+t[3]+8
+	SSUB16	r3, r12,r3		; r3 = t[3]+8=t[0]-t[3]+8
+; Stage 4:
+	SADD16	r12,r10,r7		; r12= t[0]+t[7]+8
+	SSUB16	r7, r10,r7		; r7 = t[0]-t[7]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[0]+t[7]+8>>4
+	STR	r10,[r0], #4		; y[0<<3] = t[0]+t[7]+8>>4
+	SADD16	r12,r11,r6		; r12= t[1]+t[6]+8
+	SSUB16	r6, r11,r6		; r6 = t[1]-t[6]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[1]+t[6]+8>>4
+	STR	r10,[r0, #12]		; y[1<<3] = t[1]+t[6]+8>>4
+	SADD16	r12,r2, r5		; r12= t[2]+t[5]+8
+	SSUB16	r5, r2, r5		; r5 = t[2]-t[5]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[2]+t[5]+8>>4
+	STR	r10,[r0, #28]		; y[2<<3] = t[2]+t[5]+8>>4
+	SADD16	r12,r3, r4		; r12= t[3]+t[4]+8
+	SSUB16	r4, r3, r4		; r4 = t[3]-t[4]+8
+	MOV	r10,r12,ASR #4
+	MOV	r12,r12,LSL #16
+	PKHTB	r10,r10,r12,ASR #20	; r10= t[3]+t[4]+8>>4
+	STR	r10,[r0, #44]		; y[3<<3] = t[3]+t[4]+8>>4
+	MOV	r10,r4, ASR #4
+	MOV	r4, r4, LSL #16
+	PKHTB	r10,r10,r4, ASR #20	; r10= t[3]-t[4]+8>>4
+	STR	r10,[r0, #60]		; y[4<<3] = t[3]-t[4]+8>>4
+	MOV	r10,r5, ASR #4
+	MOV	r5, r5, LSL #16
+	PKHTB	r10,r10,r5, ASR #20	; r10= t[2]-t[5]+8>>4
+	STR	r10,[r0, #76]		; y[5<<3] = t[2]-t[5]+8>>4
+	MOV	r10,r6, ASR #4
+	MOV	r6, r6, LSL #16
+	PKHTB	r10,r10,r6, ASR #20	; r10= t[1]-t[6]+8>>4
+	STR	r10,[r0, #92]		; y[6<<3] = t[1]-t[6]+8>>4
+	MOV	r10,r7, ASR #4
+	MOV	r7, r7, LSL #16
+	PKHTB	r10,r10,r7, ASR #20	; r10= t[0]-t[7]+8>>4
+	STR	r10,[r0, #108]		; y[7<<3] = t[0]-t[7]+8>>4
+	MOV	PC,r14
+ ]
+
+ [ OC_ARM_ASM_NEON
+	EXPORT	oc_idct8x8_neon
+
+; 16-bit DCT cosine constants for the NEON path (same values as the DCD
+; tables above), preceded by the rounding bias; loaded as one D-register pair.
+	ALIGN 16
+OC_IDCT_CONSTS_NEON
+	DCW	    8
+	DCW	64277 ; FB15 (C1S7)
+	DCW	60547 ; EC83 (C2S6)
+	DCW	54491 ; D4DB (C3S5)
+	DCW	46341 ; B505 (C4S4)
+	DCW	36410 ; 8E3A (C5S3)
+	DCW	25080 ; 61F8 (C6S2)
+	DCW	12785 ; 31F1 (C7S1)
+
+; oc_idct8x8_neon: 8x8 inverse DCT using NEON; dispatches sparse blocks
+; (_last_zzi <= 10) to oc_idct8x8_10_neon, otherwise does the full transform
+; entirely in registers (rows, transpose, columns).
+; NOTE(review): the "[r2 at 128]" forms below are mailing-list mangling of the
+; "@128" alignment qualifier in the original source.
+oc_idct8x8_neon
+	; r0 = ogg_int16_t *_y
+	; r1 = ogg_int16_t *_x
+	; r2 = int      _last_zzi
+	CMP	r2, #10
+	BLE	oc_idct8x8_10_neon
+oc_idct8x8_slow_neon
+	VPUSH	{D8-D15}		; Save callee-saved VFP/NEON regs.
+	MOV	r2, r1
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	; Row transforms (input is pre-transposed)
+	VLD1.64	{D16,D17,D18,D19}, [r2 at 128]!
+	VLD1.64	{D20,D21,D22,D23}, [r2 at 128]!
+	VLD1.64	{D24,D25,D26,D27}, [r2 at 128]!
+	VSUB.S16	Q1, Q8, Q12	; Q1 = x[0]-x[4]
+	VLD1.64	{D28,D29,D30,D31}, [r2 at 128]
+	VADD.S16	Q8, Q8, Q12	; Q8 = x[0]+x[4]
+	VLD1.64	{D0,D1},           [r3 at 128]
+	MOV	r12, r14
+	BL	oc_idct8x8_stage123_neon
+; Stage 4
+	VSUB.S16	Q15,Q8, Q7	; Q15 = y[7]=t[0]'-t[7]'
+	VADD.S16	Q8, Q8, Q7	; Q8  = y[0]=t[0]'+t[7]'
+	VSUB.S16	Q14,Q9, Q3	; Q14 = y[6]=t[1]'-t[6]''
+	VADD.S16	Q9, Q9, Q3	; Q9  = y[1]=t[1]'+t[6]''
+	VSUB.S16	Q13,Q10,Q5	; Q13 = y[5]=t[2]'-t[5]''
+	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
+	VTRN.16	Q14,Q15
+	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
+	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
+	; 8x8 Transpose
+	VTRN.16	Q8, Q9
+	VTRN.16	Q10,Q11
+	VTRN.16	Q12,Q13
+	VTRN.32	Q8, Q10
+	VTRN.32	Q9, Q11
+	VTRN.32	Q12,Q14
+	VTRN.32	Q13,Q15
+	VSWP	D17,D24
+	VSUB.S16	Q1, Q8, Q12	; Q1 = x[0]-x[4]
+	VSWP	D19,D26
+	VADD.S16	Q8, Q8, Q12	; Q8 = x[0]+x[4]
+	VSWP	D21,D28
+	VSWP	D23,D30
+	; Column transforms
+	BL	oc_idct8x8_stage123_neon
+	CMP	r0,r1			; In-place? Then skip the input clear.
+	; We have to put the return address back in the LR, or the branch
+	; predictor will not recognize the function return and mis-predict the
+	; entire call stack.
+	MOV	r14, r12
+; Stage 4
+	VSUB.S16	Q15,Q8, Q7	; Q15 = y[7]=t[0]'-t[7]'
+	VADD.S16	Q8, Q8, Q7	; Q8  = y[0]=t[0]'+t[7]'
+	VSUB.S16	Q14,Q9, Q3	; Q14 = y[6]=t[1]'-t[6]''
+	VADD.S16	Q9, Q9, Q3	; Q9  = y[1]=t[1]'+t[6]''
+	VSUB.S16	Q13,Q10,Q5	; Q13 = y[5]=t[2]'-t[5]''
+	VADD.S16	Q10,Q10,Q5	; Q10 = y[2]=t[2]'+t[5]''
+	VSUB.S16	Q12,Q11,Q4	; Q12 = y[4]=t[3]'-t[4]'
+	VADD.S16	Q11,Q11,Q4	; Q11 = y[3]=t[3]'+t[4]'
+	BEQ	oc_idct8x8_slow_neon_noclear
+	VMOV.I8	Q2,#0
+	VPOP	{D8-D15}
+	VMOV.I8	Q3,#0
+	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
+	VST1.64	{D4, D5, D6, D7}, [r1 at 128]!	; Clear input for next block.
+	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VST1.64	{D4, D5, D6, D7}, [r1 at 128]!
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VST1.64	{D4, D5, D6, D7}, [r1 at 128]!
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VST1.64	{D4, D5, D6, D7}, [r1 at 128]
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA	r0, {D16-D31}
+	MOV	PC, r14
+
+; Epilogue for the in-place case: round/scale and store without clearing the
+; input block.
+oc_idct8x8_slow_neon_noclear
+	VPOP	{D8-D15}
+	VRSHR.S16	Q8, Q8, #4	; Q8  = y[0]+8>>4
+	VRSHR.S16	Q9, Q9, #4	; Q9  = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA	r0, {D16-D31}
+	MOV	PC, r14
+
+; oc_idct8x8_stage123_neon: stages 1-3 of the 8-point IDCT on eight 8-element
+; vectors (Q8-Q15 in, with Q1=x[0]-x[4] and Q8=x[0]+x[4] precomputed and the
+; constants in D0/D1). The negative-as-s16 constants (C1S7, C2S6, C3S5, C5S3)
+; yield products of the form (C*x>>16)-x, corrected by the VADD/VSUB fixups.
+oc_idct8x8_stage123_neon
+; Stages 1 & 2
+	VMULL.S16	Q4, D18,D1[3]
+	VMULL.S16	Q5, D19,D1[3]
+	VMULL.S16	Q7, D30,D1[3]
+	VMULL.S16	Q6, D31,D1[3]
+	VMULL.S16	Q2, D30,D0[1]
+	VMULL.S16	Q3, D31,D0[1]
+	VSHRN.S32	D8, Q4, #16
+	VSHRN.S32	D9, Q5, #16	; Q4 = (OC_C7S1*x[1]>>16)
+	VSHRN.S32	D14,Q7, #16
+	VSHRN.S32	D15,Q6, #16	; Q7 = (OC_C7S1*x[7]>>16)
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q3, #16	; Q2 = (OC_C1S7*x[7]>>16)-x[7]
+	VSUB.S16	Q4, Q4, Q15
+	VADD.S16	Q7, Q7, Q9
+	VSUB.S16	Q4, Q4, Q2	; Q4 = t[4]
+	VMULL.S16	Q2, D18,D0[1]
+	VMULL.S16	Q9, D19,D0[1]
+	VMULL.S16	Q5, D26,D0[3]
+	VMULL.S16	Q3, D27,D0[3]
+	VMULL.S16	Q6, D22,D0[3]
+	VMULL.S16	Q12,D23,D0[3]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q9, #16	; Q2 = (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D11,Q3, #16	; Q5 = (OC_C3S5*x[5]>>16)-x[5]
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D13,Q12,#16	; Q6 = (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q7, Q7, Q2	; Q7 = t[7]
+	VSUB.S16	Q5, Q5, Q11
+	VADD.S16	Q6, Q6, Q11
+	VADD.S16	Q5, Q5, Q13
+	VADD.S16	Q6, Q6, Q13
+	VMULL.S16	Q9, D22,D1[1]
+	VMULL.S16	Q11,D23,D1[1]
+	VMULL.S16	Q15,D26,D1[1]
+	VMULL.S16	Q13,D27,D1[1]
+	VMULL.S16	Q2, D20,D1[2]
+	VMULL.S16	Q12,D21,D1[2]
+	VSHRN.S32	D18,Q9, #16
+	VSHRN.S32	D19,Q11,#16	; Q9 = (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q13,#16	; Q15= (OC_C5S3*x[5]>>16)-x[5]
+	VSHRN.S32	D4, Q2, #16
+	VSHRN.S32	D5, Q12,#16	; Q2 = (OC_C6S2*x[2]>>16)
+	VSUB.S16	Q5, Q5, Q9	; Q5 = t[5]
+	VADD.S16	Q6, Q6, Q15	; Q6 = t[6]
+	VSUB.S16	Q2, Q2, Q14
+	VMULL.S16	Q3, D28,D1[2]
+	VMULL.S16	Q11,D29,D1[2]
+	VMULL.S16	Q12,D28,D0[2]
+	VMULL.S16	Q9, D29,D0[2]
+	VMULL.S16	Q13,D20,D0[2]
+	VMULL.S16	Q15,D21,D0[2]
+	VSHRN.S32	D6, Q3, #16
+	VSHRN.S32	D7, Q11,#16	; Q3 = (OC_C6S2*x[6]>>16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q9, #16	; Q12= (OC_C2S6*x[6]>>16)-x[6]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q15,#16	; Q13= (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9, Q4, Q5	; Q9 = t[4]-t[5]
+	VSUB.S16	Q11,Q7, Q6	; Q11= t[7]-t[6]
+	VADD.S16	Q3, Q3, Q10
+	VADD.S16	Q4, Q4, Q5	; Q4 = t[4]'=t[4]+t[5]
+	VADD.S16	Q7, Q7, Q6	; Q7 = t[7]'=t[7]+t[6]
+	VSUB.S16	Q2, Q2, Q12	; Q2 = t[2]
+	VADD.S16	Q3, Q3, Q13	; Q3 = t[3]
+	VMULL.S16	Q12,D16,D1[0]
+	VMULL.S16	Q13,D17,D1[0]
+	VMULL.S16	Q14,D2, D1[0]
+	VMULL.S16	Q15,D3, D1[0]
+	VMULL.S16	Q5, D18,D1[0]
+	VMULL.S16	Q6, D22,D1[0]
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q13,#16
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q15,#16
+	VMULL.S16	Q13,D19,D1[0]
+	VMULL.S16	Q15,D23,D1[0]
+	VADD.S16	Q8, Q8, Q12	; Q8 = t[0]
+	VADD.S16	Q1, Q1, Q14	; Q1 = t[1]
+	VSHRN.S32	D10,Q5, #16
+	VSHRN.S32	D12,Q6, #16
+	VSHRN.S32	D11,Q13,#16
+	VSHRN.S32	D13,Q15,#16
+	VADD.S16	Q5, Q5, Q9	; Q5 = t[5]'=OC_C4S4*(t[4]-t[5])>>16
+	VADD.S16	Q6, Q6, Q11	; Q6 = t[6]'=OC_C4S4*(t[7]-t[6])>>16
+; Stage 3
+	VSUB.S16	Q11,Q8, Q3	; Q11 = t[3]''=t[0]-t[3]
+	VADD.S16	Q8, Q8, Q3	; Q8  = t[0]''=t[0]+t[3]
+	VADD.S16	Q9, Q1, Q2	; Q9  = t[1]''=t[1]+t[2]
+	VADD.S16	Q3, Q6, Q5	; Q3  = t[6]''=t[6]'+t[5]'
+	VSUB.S16	Q10,Q1, Q2	; Q10 = t[2]''=t[1]-t[2]
+	VSUB.S16	Q5, Q6, Q5	; Q5  = t[5]''=t[6]'-t[5]'
+	MOV	PC, r14
+
+oc_idct8x8_10_neon
+	; r0 = ogg_int16_t *_y (output coefficients; may be the same buffer as _x)
+	; r1 = ogg_int16_t *_x (input coefficients; only the low-frequency corner
+	;       of the block is loaded below, so coefficients outside the first 10
+	;       in zig-zag order are presumably zero -- TODO confirm with callers)
+	ADR	r3, OC_IDCT_CONSTS_NEON
+	VLD1.64	{D0,D1},	[r3 at 128]
+	MOV	r2, r1
+	; Row transforms (input is pre-transposed)
+; Stage 1
+	VLD1.64	{D16,D17,D18,D19},[r2 at 128]!
+	MOV	r12, #16	; Row stride, also used to step the clears below.
+	VMULL.S16	Q15,D16,D1[0]	; Q15= OC_C4S4*x[0]-(x[0]<<16)
+	VLD1.64	{D17},	[r2 at 64], r12
+	VMULL.S16	Q2,	D18,D0[1]	; Q2 = OC_C1S7*x[1]-(x[1]<<16)
+	VLD1.64	{D19},	[r2 at 64]
+	VMULL.S16	Q14,D17,D0[2]	; Q14= OC_C2S6*x[2]-(x[2]<<16)
+	VMULL.S16	Q3,	D19,D0[3]	; Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VMULL.S16	Q13,D19,D1[1]	; Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q12,D18,D1[3]	; Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1,	D17,D1[2]	; Q1 = OC_C6S2*x[2]
+	VSHRN.S32	D30,Q15,#16	; D30= t[0]-x[0]
+	VSHRN.S32	D4,	Q2,	#16	; D4 = t[7]-x[1]
+	VSHRN.S32	D31,Q14,#16	; D31= t[3]-x[2]
+	VSHRN.S32	D6,	Q3,	#16	; D6 = t[6]-x[3]
+	VSHRN.S32	D7,	Q13,#16	; D7 = -t[5]-x[3]
+	VSHRN.S32	D5,	Q12,#16	; D5 = t[4]
+	VSHRN.S32	D2,	Q1,	#16	; D2 = t[2]
+	VADD.S16	D4,	D4,	D18	; D4 = t[7]
+	VADD.S16	D6,	D6,	D19	; D6 = t[6]
+	VADD.S16	D7,	D7,	D19	; D7 = -t[5]
+	VADD.S16	Q15,Q15,Q8	; D30= t[0]
+					; D31= t[3]
+; Stages 2 & 3
+	VSUB.S16	Q12,Q2,	Q3	; D24= t[7]-t[6]
+					; D25= t[4]'=t[4]+t[5]
+	VADD.S16	Q13,Q2,	Q3	; D26= t[7]'=t[7]+t[6]
+					; D27= t[4]-t[5]
+	VMULL.S16	Q11,D24,D1[0]	; Q11= OC_C4S4*(t[7]-t[6])
+					;       -(t[7]-t[6]<<16)
+	VMULL.S16	Q14,D27,D1[0]	; Q14= OC_C4S4*(t[4]-t[5])
+					;       -(t[4]-t[5]<<16)
+	VADD.S16	D16,D30,D31	; D16= t[0]'=t[0]+t[3]
+	VSUB.S16	D17,D30,D2	; D17= t[2]'=t[0]-t[2]
+	VADD.S16	D18,D30,D2	; D18= t[1]'=t[0]+t[2]
+	VSHRN.S32	D22,Q11,#16	; D22= (OC_C4S4*(t[7]-t[6])>>16)
+					;       -(t[7]-t[6])
+	VSHRN.S32	D23,Q14,#16	; D23= (OC_C4S4*(t[4]-t[5])>>16)
+					;       -(t[4]-t[5])
+	VSUB.S16	D19,D30,D31	; D19= t[3]'=t[0]-t[3]
+	VADD.S16	D22,D22,D24	; D22= t[6]'=OC_C4S4*(t[7]-t[6])>>16
+	VADD.S16	D23,D23,D27	; D23= t[5]'=OC_C4S4*(t[4]-t[5])>>16
+	VSUB.S16	D27,D22,D23	; D27= t[5]''=t[6]'-t[5]'
+	VADD.S16	D24,D22,D23	; D24= t[6]''=t[6]'+t[5]'
+; Stage 4
+	VSUB.S16	Q11,Q8,	Q13	; D22= y[7]=t[0]'-t[7]'
+					; D23= y[5]=t[2]'-t[5]''
+	VSUB.S16	Q10,Q9,	Q12	; D20= y[6]=t[1]'-t[6]''
+					; D21= y[4]=t[3]'-t[4]'
+	VADD.S16	Q8,	Q8,	Q13	; D16= y[0]=t[0]'+t[7]'
+					; D17= y[2]=t[2]'+t[5]''
+	VADD.S16	Q9,	Q9,	Q12	; D18= y[1]=t[1]'+t[6]''
+					; D19= y[3]=t[3]'+t[4]'
+	; 8x4 transpose
+	VTRN.16	Q10,Q11	; Q10= c5c4a5a4 c7c6a7a6
+			; Q11= d5d4b5b4 d7d6b7b6
+	VTRN.16	Q8,	Q9	; Q8 = c3c2a3a2 c1c0a1a0
+			; Q9 = d3d2b3b2 d1d0b1b0
+	VSWP	D20,D21	; Q10= c7c6a7a6 c5c4a5a4
+	VSWP	D22,D23	; Q11= d7d6b7b6 d5d4b5b4
+	VUZP.32	Q9,	Q11	; Q9 = b7b6b5b4 b3b2b1b0
+			; Q11= d7d6d5d4 d3d2d1d0
+	VMULL.S16	Q15,D18,D0[1]
+	VMULL.S16	Q13,D22,D1[1]
+	VUZP.32	Q8,	Q10	; Q8 = a7a6a5a4 a3a2a1a0
+			; Q10= c7c6c5c4 c3c2c1c0
+	; Column transforms
+; Stages 1, 2, & 3
+	VMULL.S16	Q14,D19,D0[1]	; Q14:Q15= OC_C1S7*x[1]-(x[1]<<16)
+	VMULL.S16	Q12,D23,D1[1]	; Q12:Q13= OC_C5S3*x[3]-(x[3]<<16)
+	VMULL.S16	Q3,	D22,D0[3]
+	VMULL.S16	Q2,	D23,D0[3]	; Q2:Q3 = OC_C3S5*x[3]-(x[3]<<16)
+	VSHRN.S32	D30,Q15,#16
+	VSHRN.S32	D31,Q14,#16	; Q15= (OC_C1S7*x[1]>>16)-x[1]
+	VSHRN.S32	D26,Q13,#16
+	VSHRN.S32	D27,Q12,#16	; Q13= (OC_C5S3*x[3]>>16)-x[3]
+	VSHRN.S32	D28,Q3,	#16
+	VSHRN.S32	D29,Q2,	#16	; Q14= (OC_C3S5*x[3]>>16)-x[3]
+	VADD.S16	Q15,Q15,Q9	; Q15= t[7]
+	VADD.S16	Q13,Q13,Q11	; Q13= -t[5]
+	VADD.S16	Q14,Q14,Q11	; Q14= t[6]
+	VMULL.S16	Q12,D18,D1[3]
+	VMULL.S16	Q2,	D19,D1[3]	; Q2:Q12= OC_C7S1*x[1]
+	VMULL.S16	Q1,	D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	; Q11:Q1 = OC_C4S4*x[0]-(x[0]<<16)
+	VMULL.S16	Q3,	D20,D0[2]
+	VMULL.S16	Q9,	D21,D0[2]	; Q9:Q3 = OC_C2S6*x[2]-(x[2]<<16)
+	VSHRN.S32	D24,Q12,#16
+	VSHRN.S32	D25,Q2,	#16	; Q12= t[4]
+	VMULL.S16	Q2,	D20,D1[2]
+	VSHRN.S32	D2,	Q1,	#16
+	VSHRN.S32	D3,	Q11,#16	; Q1 = (OC_C4S4*x[0]>>16)-x[0]
+	VMULL.S16	Q11,D21,D1[2]	; Q2:Q11= OC_C6S2*x[2]
+	VSHRN.S32	D6,	Q3,	#16
+	VSHRN.S32	D7,	Q9,	#16	; Q3 = (OC_C2S6*x[2]>>16)-x[2]
+	VSUB.S16	Q9,	Q15,Q14	; Q9 = t[7]-t[6]
+	VADD.S16	Q15,Q15,Q14	; Q15= t[7]'=t[7]+t[6]
+	VSHRN.S32	D4,	Q2,	#16
+	VSHRN.S32	D5,	Q11,#16	; Q2 = t[2]
+	VADD.S16	Q1,	Q1,	Q8	; Q1 = t[0]
+	VADD.S16	Q8,	Q12,Q13	; Q8 = t[4]-t[5]
+	VADD.S16	Q3,	Q3,	Q10	; Q3 = t[3]
+	VMULL.S16	Q10,D16,D1[0]
+	VMULL.S16	Q11,D17,D1[0]	; Q11:Q10= OC_C4S4*(t[4]-t[5])
+					;           -(t[4]-t[5]<<16)
+	VSUB.S16	Q12,Q12,Q13	; Q12= t[4]'=t[4]+t[5]
+	VMULL.S16	Q14,D18,D1[0]
+	VMULL.S16	Q13,D19,D1[0]	; Q13:Q14= OC_C4S4*(t[7]-t[6])
+					;           -(t[7]-t[6]<<16)
+	VSHRN.S32	D20,Q10,#16
+	VSHRN.S32	D21,Q11,#16	; Q10= (OC_C4S4*(t[4]-t[5])>>16)
+					;       -(t[4]-t[5])
+	VADD.S16	Q11,Q1,	Q3	; Q11= t[0]'=t[0]+t[3]
+	VSUB.S16	Q3,	Q1,	Q3	; Q3 = t[3]'=t[0]-t[3]
+	VSHRN.S32	D28,Q14,#16
+	VSHRN.S32	D29,Q13,#16	; Q14= (OC_C4S4*(t[7]-t[6])>>16)
+					;       -(t[7]-t[6])
+	VADD.S16	Q10,Q10,Q8	; Q10=t[5]'
+	VADD.S16	Q14,Q14,Q9	; Q14=t[6]'
+	VSUB.S16	Q13,Q14,Q10	; Q13=t[5]''=t[6]'-t[5]'
+	VADD.S16	Q14,Q14,Q10	; Q14=t[6]''=t[6]'+t[5]'
+	VADD.S16	Q10,Q1,	Q2	; Q10= t[1]'=t[0]+t[2]
+	VSUB.S16	Q2,	Q1,	Q2	; Q2 = t[2]'=t[0]-t[2]
+; Stage 4
+	CMP	r0, r1	; If _y==_x, we must not clear the input below.
+	VADD.S16	Q8,	Q11,Q15	; Q8 = y[0]=t[0]'+t[7]'
+	VADD.S16	Q9,	Q10,Q14	; Q9 = y[1]=t[1]'+t[6]''
+	VSUB.S16	Q15,Q11,Q15	; Q15 = y[7]=t[0]'-t[7]'
+	VSUB.S16	Q14,Q10,Q14	; Q14 = y[6]=t[1]'-t[6]''
+	VADD.S16	Q10,Q2,	Q13	; Q10 = y[2]=t[2]'+t[5]''
+	VADD.S16	Q11,Q3,	Q12	; Q11 = y[3]=t[3]'+t[4]'
+	VSUB.S16	Q12,Q3,	Q12	; Q12 = y[4]=t[3]'-t[4]'
+	VSUB.S16	Q13,Q2,	Q13	; Q13 = y[5]=t[2]'-t[5]''
+	BEQ	oc_idct8x8_10_neon_noclear
+	; Clear the input rows we read (interleaved with the final scaling).
+	VMOV.I8	D2, #0
+	VRSHR.S16	Q8,	Q8,	#4	; Q8 = y[0]+8>>4
+	VST1.64	{D2}, [r1 at 64], r12
+	VRSHR.S16	Q9,	Q9,	#4	; Q9 = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VST1.64	{D2}, [r1 at 64], r12
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VST1.64	{D2}, [r1 at 64], r12
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VST1.64	{D2}, [r1 at 64]
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA	r0, {D16-D31}	; Store the whole transformed block at _y.
+	MOV	PC, r14
+
+oc_idct8x8_10_neon_noclear
+	; Tail used when the input buffer is also the output buffer (r0==r1):
+	; apply the final rounding shift and store, without clearing the input.
+	VRSHR.S16	Q8,	Q8,	#4	; Q8 = y[0]+8>>4
+	VRSHR.S16	Q9,	Q9,	#4	; Q9 = y[1]+8>>4
+	VRSHR.S16	Q10,Q10,#4	; Q10 = y[2]+8>>4
+	VRSHR.S16	Q11,Q11,#4	; Q11 = y[3]+8>>4
+	VRSHR.S16	Q12,Q12,#4	; Q12 = y[4]+8>>4
+	VRSHR.S16	Q13,Q13,#4	; Q13 = y[5]+8>>4
+	VRSHR.S16	Q14,Q14,#4	; Q14 = y[6]+8>>4
+	VRSHR.S16	Q15,Q15,#4	; Q15 = y[7]+8>>4
+	VSTMIA	r0, {D16-D31}	; Store the whole transformed block at _y.
+	MOV	PC, r14
+ ]
+
+ END
Added: experimental/derf/theora-ptalarbvorm/lib/arm/armint.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armint.h (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armint.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,117 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: x86int.h 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#if !defined(_arm_armint_H)
+# define _arm_armint_H (1)
+# include "../internal.h"
+
+# if defined(OC_ARM_ASM)
+
+# if defined(__ARMEB__)
+# error "Big-endian configurations are not supported by the ARM asm. " \
+ "Reconfigure with --disable-asm or undefine OC_ARM_ASM."
+# endif
+
+# define oc_state_accel_init oc_state_accel_init_arm
+/*This function is implemented entirely in asm, so it's helpful to pull out all
+   of the things that depend on structure offsets.
+  We reuse the function pointer with the wrong prototype, though.*/
+/*The flattened argument list below must stay in sync with the
+   oc_loop_filter_frag_rows_arm_func typedef declared further down.*/
+# define oc_state_loop_filter_frag_rows(_state,_bv,_refi,_pli, \
+ _fragy0,_fragy_end) \
+ ((oc_loop_filter_frag_rows_arm_func) \
+ (_state)->opt_vtable.state_loop_filter_frag_rows)( \
+ (_state)->ref_frame_data[(_refi)],(_state)->ref_ystride[(_pli)], \
+ (_bv), \
+ (_state)->frags, \
+ (_state)->fplanes[(_pli)].froffset \
+ +(_fragy0)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \
+ (_state)->fplanes[(_pli)].froffset \
+ +(_fragy_end)*(ptrdiff_t)(_state)->fplanes[(_pli)].nhfrags, \
+ (_state)->fplanes[(_pli)].froffset, \
+ (_state)->fplanes[(_pli)].froffset+(_state)->fplanes[(_pli)].nfrags, \
+ (_state)->frag_buf_offs, \
+ (_state)->fplanes[(_pli)].nhfrags)
+/*For everything else the default vtable macros are fine.*/
+# define OC_STATE_USE_VTABLE (1)
+# endif
+
+# include "../state.h"
+# include "armcpu.h"
+
+# if defined(OC_ARM_ASM)
+typedef void (*oc_loop_filter_frag_rows_arm_func)(
+ unsigned char *_ref_frame_data,int _ystride,signed char _bv[256],
+ const oc_fragment *_frags,ptrdiff_t _fragi0,ptrdiff_t _fragi0_end,
+ ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+void oc_state_accel_init_arm(oc_theora_state *_state);
+void oc_frag_copy_list_arm(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_arm(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_arm(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_arm(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_arm(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_loop_filter_frag_rows_arm(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+# if defined(OC_ARM_ASM_EDSP)
+void oc_frag_copy_list_edsp(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+
+# if defined(OC_ARM_ASM_MEDIA)
+void oc_frag_recon_intra_v6(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_v6(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_v6(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_v6(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_loop_filter_init_v6(signed char *_bv,int _flimit);
+void oc_loop_filter_frag_rows_v6(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+
+# if defined(OC_ARM_ASM_NEON)
+void oc_frag_copy_list_neon(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
+void oc_frag_recon_intra_neon(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue);
+void oc_frag_recon_inter_neon(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_neon(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_idct8x8_neon(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_loop_filter_init_neon(signed char *_bv,int _flimit);
+void oc_loop_filter_frag_rows_neon(unsigned char *_ref_frame_data,
+ int _ystride,signed char *_bv,const oc_fragment *_frags,ptrdiff_t _fragi0,
+ ptrdiff_t _fragi0_end,ptrdiff_t _fragi_top,ptrdiff_t _fragi_bot,
+ const ptrdiff_t *_frag_buf_offs,int _nhfrags);
+# endif
+# endif
+# endif
+# endif
+
+#endif
Copied: experimental/derf/theora-ptalarbvorm/lib/arm/armloop.s (from rev 17378, branches/theorarm-merge-branch/lib/arm/ARMfilter.s)
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armloop.s (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armloop.s 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,662 @@
+;********************************************************************
+;* *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+;* *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;* *
+;********************************************************************
+; Original implementation:
+; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: cpu.h 17344 2010-07-21 01:42:18Z tterribe $
+;********************************************************************
+
+ AREA |.text|, CODE, READONLY
+
+ GET armopts.s
+
+ EXPORT oc_loop_filter_frag_rows_arm
+
+; Which bit this is depends on the order of packing within a bitfield.
+; Hopefully that doesn't change among any of the relevant compilers.
+OC_FRAG_CODED_FLAG * 1
+
+ ; Vanilla ARM v4 version
+loop_filter_h_arm
+	; Filters one 8-row vertical edge: reads pixels at offsets -2..1 of each
+	; row and rewrites the two pixels adjacent to the edge.
+	; r0 = unsigned char *_pix
+	; r1 = int _ystride
+	; r2 = int *_bv (points at the center of the bounding-value table; the
+	;      caller added 127 so it can be indexed by a signed filter value)
+	; preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8		; Loop counter: 8 rows.
+	MOV	r6, #255
+lfh_arm_lp
+	LDRB	r3, [r0, #-2]		; r3 = _pix[0]
+	LDRB	r12,[r0, #1]		; r12= _pix[3]
+	LDRB	r4, [r0, #-1]		; r4 = _pix[1]
+	LDRB	r5, [r0]		; r5 = _pix[2]
+	SUB	r3, r3, r12		; r3 = _pix[0]-_pix[3]
+	ADD	r3, r3, #4		; r3 = _pix[0]-_pix[3]+4
+	SUB	r12,r5, r4		; r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	; r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3		; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3		; r12= R
+	LDRSB	r12,[r2, r12]		; r12= lflim(R,L) from the table
+	; Stall (2 on Xscale)
+	ADDS	r4, r4, r12		; r4 = _pix[1]+lflim(R,L)
+	CMPGT	r6, r4			; Clamp the stored byte to [0,255]:
+	EORLT	r4, r6, r4, ASR #32	;  255 if >255, low byte 0 if <0.
+	SUBS	r5, r5, r12		; r5 = _pix[2]-lflim(R,L)
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32	; Clamp r5 the same way.
+	STRB	r4, [r0, #-1]
+	STRB	r5, [r0], r1		; Advance to the next row.
+	SUBS	r14,r14,#1
+	BGT	lfh_arm_lp
+	SUB	r0, r0, r1, LSL #3	; Restore _pix (undo 8 row strides).
+	LDMFD	r13!,{r3-r6,PC}
+
+loop_filter_v_arm
+	; Filters one 8-column horizontal edge: reads the pixels at row offsets
+	; -2..1 of each column and rewrites the two rows adjacent to the edge.
+	; r0 = unsigned char *_pix
+	; r1 = int _ystride
+	; r2 = int *_bv (points at the center of the bounding-value table; the
+	;      caller added 127 so it can be indexed by a signed filter value)
+	; preserves r0-r3
+	STMFD	r13!,{r3-r6,r14}
+	MOV	r14,#8		; Loop counter: 8 columns.
+	MOV	r6, #255
+lfv_arm_lp
+	LDRB	r3, [r0, -r1, LSL #1]	; r3 = _pix[0]
+	LDRB	r12,[r0, r1]		; r12= _pix[3]
+	LDRB	r4, [r0, -r1]		; r4 = _pix[1]
+	LDRB	r5, [r0]		; r5 = _pix[2]
+	SUB	r3, r3, r12		; r3 = _pix[0]-_pix[3]
+	ADD	r3, r3, #4		; r3 = _pix[0]-_pix[3]+4
+	SUB	r12,r5, r4		; r12= _pix[2]-_pix[1]
+	ADD	r12,r12,r12,LSL #1	; r12= 3*(_pix[2]-_pix[1])
+	ADD	r12,r12,r3		; r12= _pix[0]-_pix[3]+3*(_pix[2]-_pix[1])+4
+	MOV	r12,r12,ASR #3		; r12= R
+	LDRSB	r12,[r2, r12]		; r12= lflim(R,L) from the table
+	; Stall (2 on Xscale)
+	ADDS	r4, r4, r12		; r4 = _pix[1]+lflim(R,L)
+	CMPGT	r6, r4			; Clamp the stored byte to [0,255]:
+	EORLT	r4, r6, r4, ASR #32	;  255 if >255, low byte 0 if <0.
+	SUBS	r5, r5, r12		; r5 = _pix[2]-lflim(R,L)
+	CMPGT	r6, r5
+	EORLT	r5, r6, r5, ASR #32	; Clamp r5 the same way.
+	STRB	r4, [r0, -r1]
+	STRB	r5, [r0], #1		; Advance to the next column.
+	SUBS	r14,r14,#1
+	BGT	lfv_arm_lp
+	SUB	r0, r0, #8		; Restore _pix (undo 8 column steps).
+	LDMFD	r13!,{r3-r6,PC}
+
+oc_loop_filter_frag_rows_arm
+	; Applies the loop filter to the coded fragments in rows
+	; [_fragy0,_fragy_end) (the caller has flattened those to fragment
+	; indices); filters each coded fragment's left and top edges.
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	; (r4-r9 are passed on the stack and loaded below from the caller's
+	;  frame via r12.)
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	ADD	r2, r2, #127	; _bv += 127 (center the table so it can be
+				;  indexed by a signed filter value)
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_arm_end	;   bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_arm_end	;		  bail
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; _fragi_bot -= _nhfrags;
+oslffri_arm_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_arm_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_arm_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_arm	;   filter our left edge
+	CMP	r4, r6		; if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_arm	;   filter our top edge
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_arm	;   then filter the shared edge for it
+	CMP	r10,r7		; if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_arm
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		; r4 = fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+oslffri_arm_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_arm_uncoded
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_arm_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_arm_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+
+ [ OC_ARM_ASM_MEDIA
+ EXPORT oc_loop_filter_init_v6
+ EXPORT oc_loop_filter_frag_rows_v6
+
+oc_loop_filter_init_v6
+	; Stores the SIMD constant ll=255-2*L replicated into all four bytes of
+	; *(int *)_bv; the v6 filters use it with UQADD8/UQSUB8 to compute
+	; lflim() without branches.
+	; r0 = _bv
+	; r1 = _flimit (=L from the spec)
+	MVN	r1, r1, LSL #1		; r1 = <0xFFFFFF|255-2*L>
+	AND	r1, r1, #255		; r1 = ll=r1&0xFF
+	ORR	r1, r1, r1, LSL #8	; r1 = <ll|ll>
+	PKHBT	r1, r1, r1, LSL #16	; r1 = <ll|ll|ll|ll>
+	STR	r1, [r0]
+	MOV	PC,r14
+
+; We could use the same strategy as the v filter below, but that would require
+; 40 instructions to load the data and transpose it into columns and another
+; 32 to write out the results at the end, plus the 52 instructions to do the
+; filtering itself.
+; This is slightly less, and less code, even assuming we could have shared the
+; 52 instructions in the middle with the other function.
+; It executes slightly fewer instructions than the ARMv6 approach David Conrad
+; proposed for FFmpeg, but not by much:
+; http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2010-February/083141.html
+; His is a lot less code, though, because it only does two rows at once instead
+; of four.
+loop_filter_h_v6
+	; Filters one 8-row vertical edge, four rows at a time via the shared
+	; core below.
+	; r0 = unsigned char *_pix
+	; r1 = int _ystride
+	; r2 = int _ll
+	; preserves r0-r3
+	STMFD	r13!,{r4-r11,r14}
+	LDR	r12,=0x10003		; r12= packed <1|3> multiplier for SMLAD
+	BL	loop_filter_h_core_v6	; Rows 0-3.
+	ADD	r0, r0, r1, LSL #2
+	BL	loop_filter_h_core_v6	; Rows 4-7.
+	SUB	r0, r0, r1, LSL #2
+	LDMFD	r13!,{r4-r11,PC}
+
+loop_filter_h_core_v6
+	; Filters four rows (named p, q, r, s below) of a vertical edge at once
+	; using the ARMv6 packed-SIMD instructions.
+	; r0 = unsigned char *_pix
+	; r1 = int _ystride
+	; r2 = int _ll
+	; r12= 0x10003
+	; Preserves r0-r3, r12; Clobbers r4-r11.
+	LDR	r4,[r0, #-2]!		; r4 = <p3|p2|p1|p0>
+	; Single issue
+	LDR	r5,[r0, r1]!		; r5 = <q3|q2|q1|q0>
+	UXTB16	r6, r4, ROR #16		; r6 = <p0|p2>
+	UXTB16	r4, r4, ROR #8		; r4 = <p3|p1>
+	UXTB16	r7, r5, ROR #16		; r7 = <q0|q2>
+	UXTB16	r5, r5, ROR #8		; r5 = <q3|q1>
+	PKHBT	r8, r4, r5, LSL #16	; r8 = <__|q1|__|p1>
+	PKHBT	r9, r6, r7, LSL #16	; r9 = <__|q2|__|p2>
+	SSUB16	r6, r4, r6		; r6 = <p3-p0|p1-p2>
+	SMLAD	r6, r6, r12,r12		; r6 = <????|(p3-p0)+3*(p1-p2)+3>
+	SSUB16	r7, r5, r7		; r7 = <q3-q0|q1-q2>
+	SMLAD	r7, r7, r12,r12		; r7 = <????|(q0-q3)+3*(q2-q1)+4>
+	LDR	r4,[r0, r1]!		; r4 = <r3|r2|r1|r0>
+	MOV	r6, r6, ASR #3		; r6 = <??????|(p3-p0)+3*(p1-p2)+3>>3>
+	LDR	r5,[r0, r1]!		; r5 = <s3|s2|s1|s0>
+	PKHBT	r11,r6, r7, LSL #13	; r11= <??|-R_q|??|-R_p>
+	UXTB16	r6, r4, ROR #16		; r6 = <r0|r2>
+	UXTB16	r11,r11			; r11= <__|-R_q|__|-R_p>
+	UXTB16	r4, r4, ROR #8		; r4 = <r3|r1>
+	UXTB16	r7, r5, ROR #16		; r7 = <s0|s2>
+	PKHBT	r10,r6, r7, LSL #16	; r10= <__|s2|__|r2>
+	SSUB16	r6, r4, r6		; r6 = <r3-r0|r1-r2>
+	UXTB16	r5, r5, ROR #8		; r5 = <s3|s1>
+	SMLAD	r6, r6, r12,r12		; r6 = <????|(r3-r0)+3*(r2-r1)+3>
+	SSUB16	r7, r5, r7		; r7 = <s3-s0|s1-s2>
+	SMLAD	r7, r7, r12,r12		; r7 = <????|(s0-s3)+3*(s2-s1)+4>
+	ORR	r9, r9, r10, LSL #8	; r9 = <s2|q2|r2|p2>
+	MOV	r6, r6, ASR #3		; r6 = <??????|(r0-r3)+3*(r2-r1)+4>>3>
+	PKHBT	r10,r4, r5, LSL #16	; r10= <__|s1|__|r1>
+	PKHBT	r6, r6, r7, LSL #13	; r6 = <??|-R_s|??|-R_r>
+	ORR	r8, r8, r10, LSL #8	; r8 = <s1|q1|r1|p1>
+	UXTB16	r6, r6			; r6 = <__|-R_s|__|-R_r>
+	MOV	r10,#0
+	ORR	r6, r11,r6, LSL #8	; r6 = <-R_s|-R_q|-R_r|-R_p>
+	; Single issue
+	; There's no min, max or abs instruction.
+	; SSUB8 and SEL will work for abs, and we can do all the rest with
+	; unsigned saturated adds, which means the GE flags are still all
+	; set when we're done computing lflim(abs(R_i),L).
+	; This allows us to both add and subtract, and split the results by
+	; the original sign of R_i.
+	SSUB8	r7, r10,r6
+	; Single issue
+	SEL	r7, r7, r6		; r7 = abs(R_i)
+	; Single issue
+	UQADD8	r4, r7, r2		; r4 = 255-max(2*L-abs(R_i),0)
+	; Single issue
+	UQADD8	r7, r7, r4
+	; Single issue
+	UQSUB8	r7, r7, r4		; r7 = min(abs(R_i),max(2*L-abs(R_i),0))
+	; Single issue
+	UQSUB8	r4, r8, r7
+	UQADD8	r5, r9, r7
+	UQADD8	r8, r8, r7
+	UQSUB8	r9, r9, r7
+	SEL	r8, r8, r4		; r8 = p1+lflim(R_i,L)
+	SEL	r9, r9, r5		; r9 = p2-lflim(R_i,L)
+	; Scatter the filtered bytes back out, one row at a time.
+	MOV	r5, r9, LSR #24		; r5 = s2
+	STRB	r5, [r0,#2]!
+	MOV	r4, r8, LSR #24		; r4 = s1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #8		; r5 = r2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #8		; r4 = r1
+	STRB	r4, [r0,#-1]
+	MOV	r5, r9, LSR #16		; r5 = q2
+	STRB	r5, [r0,-r1]!
+	MOV	r4, r8, LSR #16		; r4 = q1
+	STRB	r4, [r0,#-1]
+	; Single issue
+	STRB	r9, [r0,-r1]!
+	; Single issue
+	STRB	r8, [r0,#-1]
+	MOV	PC,r14
+
+; This uses the same strategy as the MMXEXT version for x86, except that UHADD8
+; computes (a+b>>1) instead of (a+b+1>>1) like PAVGB.
+; This works just as well, with the following procedure for computing the
+; filter value, f:
+; u = ~UHADD8(p1,~p2);
+; v = UHADD8(~p1,p2);
+; m = v-u;
+; a = m^UHADD8(m^p0,m^~p3);
+; f = UHADD8(UHADD8(a,u1),v1);
+; where f = 127+R, with R in [-127,128] defined as in the spec.
+; This is exactly the same amount of arithmetic as the version that uses PAVGB
+; as the basic operator.
+; It executes about 2/3 the number of instructions of David Conrad's approach,
+; but requires more code, because it does all eight columns at once, instead
+; of four at a time.
+loop_filter_v_v6
+	; Filters a horizontal edge across all eight columns at once.
+	; p0-p3 below are the four rows for columns 0-3 and p4-p7 the same
+	; rows for columns 4-7 (each pN is a packed group of 4 pixels).
+	; r0 = unsigned char *_pix
+	; r1 = int _ystride
+	; r2 = int _ll
+	; preserves r0-r11
+	STMFD	r13!,{r4-r11,r14}
+	LDRD	r6, [r0, -r1]!		; r7, r6 = <p5|p1>
+	LDRD	r4, [r0, -r1]		; r5, r4 = <p4|p0>
+	LDRD	r8, [r0, r1]!		; r9, r8 = <p6|p2>
+	MVN	r14,r6			; r14= ~p1
+	LDRD	r10,[r0, r1]		; r11,r10= <p7|p3>
+	; Filter the first four columns.
+	MVN	r12,r8			; r12= ~p2
+	UHADD8	r14,r14,r8		; r14= v1=~p1+p2>>1
+	UHADD8	r12,r12,r6		; r12= p1+~p2>>1
+	MVN	r10, r10		; r10=~p3
+	MVN	r12,r12			; r12= u1=~p1+p2+1>>1
+	SSUB8	r14,r14,r12		; r14= m1=v1-u1
+	; Single issue
+	EOR	r4, r4, r14		; r4 = m1^p0
+	EOR	r10,r10,r14		; r10= m1^~p3
+	UHADD8	r4, r4, r10		; r4 = (m1^p0)+(m1^~p3)>>1
+	; Single issue
+	EOR	r4, r4, r14		; r4 = a1=m1^((m1^p0)+(m1^~p3)>>1)
+	SADD8	r14,r14,r12		; r14= v1=m1+u1
+	UHADD8	r4, r4, r12		; r4 = a1+u1>>1
+	MVN	r12,r9			; r12= ~p6
+	UHADD8	r4, r4, r14		; r4 = f1=(a1+u1>>1)+v1>>1
+	; Filter the second four columns.
+	MVN	r14,r7			; r14= ~p5
+	UHADD8	r12,r12,r7		; r12= p5+~p6>>1
+	UHADD8	r14,r14,r9		; r14= v2=~p5+p6>>1
+	MVN	r12,r12			; r12= u2=~p5+p6+1>>1
+	MVN	r11,r11			; r11=~p7
+	SSUB8	r10,r14,r12		; r10= m2=v2-u2
+	; Single issue
+	EOR	r5, r5, r10		; r5 = m2^p4
+	EOR	r11,r11,r10		; r11= m2^~p7
+	UHADD8	r5, r5, r11		; r5 = (m2^p4)+(m2^~p7)>>1
+	; Single issue
+	EOR	r5, r5, r10		; r5 = a2=m2^((m2^p4)+(m2^~p7)>>1)
+	; Single issue
+	UHADD8	r5, r5, r12		; r5 = a2+u2>>1
+	LDR	r12,=0x7F7F7F7F		; r12 = {127}x4
+	UHADD8	r5, r5, r14		; r5 = f2=(a2+u2>>1)+v2>>1
+	; Now split f[i] by sign.
+	; There's no min or max instruction.
+	; We could use SSUB8 and SEL, but this is just as many instructions and
+	; dual issues more (for v7 without NEON).
+	UQSUB8	r10,r4, r12		; r10= R_i>0?R_i:0
+	UQSUB8	r4, r12,r4		; r4 = R_i<0?-R_i:0
+	UQADD8	r11,r10,r2		; r11= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r14,r4, r2		; r14= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r10,r10,r11
+	UQADD8	r4, r4, r14
+	UQSUB8	r10,r10,r11		; r10= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQSUB8	r4, r4, r14		; r4 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQSUB8	r11,r5, r12		; r11= R_i>0?R_i:0
+	UQADD8	r6, r6, r10
+	UQSUB8	r8, r8, r10
+	UQSUB8	r5, r12,r5		; r5 = R_i<0?-R_i:0
+	UQSUB8	r6, r6, r4		; r6 = p1+lflim(R_i,L)
+	UQADD8	r8, r8, r4		; r8 = p2-lflim(R_i,L)
+	UQADD8	r10,r11,r2		; r10= 255-max(2*L-abs(R_i<0),0)
+	UQADD8	r14,r5, r2		; r14= 255-max(2*L-abs(R_i>0),0)
+	UQADD8	r11,r11,r10
+	UQADD8	r5, r5, r14
+	UQSUB8	r11,r11,r10		; r11= min(abs(R_i<0),max(2*L-abs(R_i<0),0))
+	UQSUB8	r5, r5, r14		; r5 = min(abs(R_i>0),max(2*L-abs(R_i>0),0))
+	UQADD8	r7, r7, r11
+	UQSUB8	r9, r9, r11
+	UQSUB8	r7, r7, r5		; r7 = p5+lflim(R_i,L)
+	STRD	r6, [r0, -r1]		; [p5:p1] = [r7: r6]
+	UQADD8	r9, r9, r5		; r9 = p6-lflim(R_i,L)
+	STRD	r8, [r0]		; [p6:p2] = [r9: r8]
+	LDMFD	r13!,{r4-r11,PC}
+
+oc_loop_filter_frag_rows_v6
+	; ARMv6 version of oc_loop_filter_frag_rows_arm; identical control
+	; flow, but passes the packed ll constant instead of a table pointer.
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	; (r4-r9 are passed on the stack and loaded below from the caller's
+	;  frame via r12.)
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	LDR	r2, [r2]	; ll = *(int *)_bv
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_v6_end	;   bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_v6_end	;		  bail
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; _fragi_bot -= _nhfrags;
+oslffri_v6_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_v6_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_v6_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_v6	;   filter our left edge
+	CMP	r4, r6		; if (fragi0>_fragi_top)
+	BLGT	loop_filter_v_v6	;   filter our top edge
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_v6	;   then filter the shared edge for it
+	CMP	r10,r7		; if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_v6
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		; r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+oslffri_v6_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_v6_uncoded
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_v6_lp2
+	MOV	r4, r10		; r4 = fragi0 += nhfrags
+	CMP	r4, r5
+	BLT	oslffri_v6_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+ ]
+
+ [ OC_ARM_ASM_NEON
+ EXPORT oc_loop_filter_init_neon
+ EXPORT oc_loop_filter_frag_rows_neon
+
+oc_loop_filter_init_neon
+	; Stores 2*_flimit broadcast across the eight U16 lanes of Q15 into the
+	; 16 bytes at _bv; the NEON filters reload it from there.
+	; r0 = _bv
+	; r1 = _flimit (=L from the spec)
+	MOV	r1, r1, LSL #1	; r1 = 2*L
+	VDUP.S16	Q15, r1	; Q15= 2L in U16s
+	VST1.64	{D30,D31}, [r0 at 128]
+	MOV	PC,r14
+
+loop_filter_h_neon
+	; Filters one 8-row vertical edge with NEON, all eight rows at once.
+	; r0 = unsigned char *_pix
+	; r1 = int _ystride
+	; r2 = int *_bv
+	; preserves r0-r3
+	; We assume Q15= 2*L in U16s
+	;                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, #2
+	; Doing a 2-element structure load saves doing two VTRN's below, at the
+	;  cost of using two more slower single-lane loads vs. the faster
+	;  all-lane loads.
+	; It's less code this way, though, and benches a hair faster, but it
+	;  leaves D2 and D4 swapped.
+	VLD2.16	{D0[],D2[]},  [r12], r1	; D0 = ____________1100	2,1
+					; D2 = ____________3322
+	VLD2.16	{D4[],D6[]},  [r12], r1	; D4 = ____________5544	2,1
+					; D6 = ____________7766
+	VLD2.16	{D0[1],D2[1]},[r12], r1	; D0 = ________99881100	3,1
+					; D2 = ________BBAA3322
+	VLD2.16	{D4[1],D6[1]},[r12], r1	; D4 = ________DDCC5544	3,1
+					; D6 = ________FFEE7766
+	VLD2.16	{D0[2],D2[2]},[r12], r1	; D0 = ____GGHH99881100	3,1
+					; D2 = ____JJIIBBAA3322
+	VLD2.16	{D4[2],D6[2]},[r12], r1	; D4 = ____KKLLDDCC5544	3,1
+					; D6 = ____NNMMFFEE7766
+	VLD2.16	{D0[3],D2[3]},[r12], r1	; D0 = PPOOGGHH99881100	3,1
+					; D2 = RRQQJJIIBBAA3322
+	VLD2.16	{D4[3],D6[3]},[r12], r1	; D4 = TTSSKKLLDDCC5544	3,1
+					; D6 = VVUUNNMMFFEE7766
+	VTRN.8	D0, D4	; D0 = SSOOKKGGCC884400 D4 = TTPPLLHHDD995511	1,1
+	VTRN.8	D2, D6	; D2 = UUQQMMIIEEAA6622 D6 = VVRRNNJJFFBB7733	1,1
+	VSUBL.U8	Q0, D0, D6	; Q0 = 00 - 33 in S16s		1,3
+	VSUBL.U8	Q8, D2, D4	; Q8 = 22 - 11 in S16s		1,3
+	; Stall
+	VADD.S16	Q0, Q0, Q8	;				1,3
+	SUB	r12, r0, #1
+	; Stall
+	VADD.S16	Q0, Q0, Q8	;				1,3
+	; Stall x2
+	VADD.S16	Q0, Q0, Q8	; Q0 = [0-3]+3*[2-1]		1,3
+	; Stall x2
+	VRSHR.S16	Q0, Q0, #3	; Q0 = f = ([0-3]+3*[2-1]+4)>>3	1,4
+	; We want to do
+	; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	;   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	; So we've reduced the left and right hand terms to be the same, except
+	; for a negation.
+	; Stall x3
+	VABS.S16	Q9, Q0		; Q9 = |f| in U16s		1,4
+	VSHR.S16	Q0, Q0, #15	; Q0 = -1 or 0 according to sign	1,3
+	; Stall x2
+	VQSUB.U16	Q10,Q15,Q9	; Q10= MAX(2L-|f|,0) in U16s	1,4
+	VMOVL.U8	Q1, D2		; Q1 = __UU__QQ__MM__II__EE__AA__66__22	2,3
+	; Stall x2
+	VMIN.U16	Q9, Q10,Q9	; Q9 = MIN(|f|,MAX(2L-|f|))	1,4
+	; Now we need to correct for the sign of f.
+	; For negative elements of Q0, we want to subtract the appropriate
+	; element of Q9. For positive elements we want to add them. No NEON
+	; instruction exists to do this, so we need to negate the negative
+	; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
+	; Stall x3
+	VADD.S16	Q9, Q9, Q0	;				1,3
+	; Stall x2
+	VEOR.S16	Q9, Q9, Q0	; Q9 = real value of f		1,3
+	; Bah. No VRSBW.U8
+	; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q2, Q9, D4	; Q2 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11	1,3
+	VSUB.S16	Q1, Q1, Q9	; Q1 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22	1,3
+	VQMOVUN.S16	D4, Q2		; D4 = TTPPLLHHDD995511		1,1
+	VQMOVUN.S16	D2, Q1		; D2 = UUQQMMIIEEAA6622		1,1
+	VTRN.8	D4, D2	; D4 = QQPPIIHHAA992211 D2 = UUTTMMLLEEDD6655	1,1
+	VST1.16	{D4[0]}, [r12], r1
+	VST1.16	{D2[0]}, [r12], r1
+	VST1.16	{D4[1]}, [r12], r1
+	VST1.16	{D2[1]}, [r12], r1
+	VST1.16	{D4[2]}, [r12], r1
+	VST1.16	{D2[2]}, [r12], r1
+	VST1.16	{D4[3]}, [r12], r1
+	VST1.16	{D2[3]}, [r12], r1
+	MOV	PC,r14
+
+loop_filter_v_neon
+	; Filters one 8-column horizontal edge with NEON: reads the rows at
+	; offsets -2..+1 around r0 and rewrites the middle two.
+	; r0 = unsigned char *_pix
+	; r1 = int _ystride
+	; r2 = int *_bv
+	; preserves r0-r3
+	; We assume Q15= 2*L in U16s
+	;                    My best guesses at cycle counts (and latency)--vvv
+	SUB	r12,r0, r1, LSL #1
+	VLD1.64	{D0}, [r12 at 64], r1	; D0 = SSOOKKGGCC884400	2,1
+	VLD1.64	{D2}, [r12 at 64], r1	; D2 = TTPPLLHHDD995511	2,1
+	VLD1.64	{D4}, [r12 at 64], r1	; D4 = UUQQMMIIEEAA6622	2,1
+	VLD1.64	{D6}, [r12 at 64], r1	; D6 = VVRRNNJJFFBB7733	2,1
+	VSUBL.U8	Q8, D4, D2	; Q8 = 22 - 11 in S16s		1,3
+	VSUBL.U8	Q0, D0, D6	; Q0 = 00 - 33 in S16s		1,3
+	; Stall
+	VADD.S16	Q0, Q0, Q8	;				1,3
+	SUB	r12, r0, r1
+	; Stall
+	VADD.S16	Q0, Q0, Q8	;				1,3
+	; Stall x2
+	VADD.S16	Q0, Q0, Q8	; Q0 = [0-3]+3*[2-1]		1,3
+	; Stall x2
+	VRSHR.S16	Q0, Q0, #3	; Q0 = f = ([0-3]+3*[2-1]+4)>>3	1,4
+	; We want to do
+	; f = CLAMP(MIN(-2L-f,0), f, MAX(2L-f,0))
+	;   = ((f >= 0) ? MIN( f ,MAX(2L- f ,0)) : MAX( f , MIN(-2L- f ,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) : MAX(-|f|, MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|,-MIN(-2L+|f|,0)))
+	;   = ((f >= 0) ? MIN(|f|,MAX(2L-|f|,0)) :-MIN( |f|, MAX( 2L-|f|,0)))
+	; So we've reduced the left and right hand terms to be the same, except
+	; for a negation.
+	; Stall x3
+	VABS.S16	Q9, Q0		; Q9 = |f| in U16s		1,4
+	VSHR.S16	Q0, Q0, #15	; Q0 = -1 or 0 according to sign	1,3
+	; Stall x2
+	VQSUB.U16	Q10,Q15,Q9	; Q10= MAX(2L-|f|,0) in U16s	1,4
+	VMOVL.U8	Q2, D4		; Q2 = __UU__QQ__MM__II__EE__AA__66__22	2,3
+	; Stall x2
+	VMIN.U16	Q9, Q10,Q9	; Q9 = MIN(|f|,MAX(2L-|f|))	1,4
+	; Now we need to correct for the sign of f.
+	; For negative elements of Q0, we want to subtract the appropriate
+	; element of Q9. For positive elements we want to add them. No NEON
+	; instruction exists to do this, so we need to negate the negative
+	; elements, and we can then just add them. a-b = a-(1+!b) = a-1+!b
+	; Stall x3
+	VADD.S16	Q9, Q9, Q0	;				1,3
+	; Stall x2
+	VEOR.S16	Q9, Q9, Q0	; Q9 = real value of f		1,3
+	; Bah. No VRSBW.U8
+	; Stall (just 1 as Q9 not needed to second pipeline stage. I think.)
+	VADDW.U8	Q1, Q9, D2	; Q1 = xxTTxxPPxxLLxxHHxxDDxx99xx55xx11	1,3
+	VSUB.S16	Q2, Q2, Q9	; Q2 = xxUUxxQQxxMMxxIIxxEExxAAxx66xx22	1,3
+	VQMOVUN.S16	D2, Q1		; D2 = TTPPLLHHDD995511		1,1
+	VQMOVUN.S16	D4, Q2		; D4 = UUQQMMIIEEAA6622		1,1
+	VST1.64	{D2}, [r12 at 64], r1
+	VST1.64	{D4}, [r12 at 64], r1
+	MOV	PC,r14
+
+oc_loop_filter_frag_rows_neon
+	; NEON version of oc_loop_filter_frag_rows_arm; identical control flow,
+	; but loads the 2*L limit into Q15 once for the whole call.
+	; r0 = _ref_frame_data
+	; r1 = _ystride
+	; r2 = _bv
+	; r3 = _frags
+	; r4 = _fragi0
+	; r5 = _fragi0_end
+	; r6 = _fragi_top
+	; r7 = _fragi_bot
+	; r8 = _frag_buf_offs
+	; r9 = _nhfrags
+	; (r4-r9 are passed on the stack and loaded below from the caller's
+	;  frame via r12.)
+	MOV	r12,r13
+	STMFD	r13!,{r0,r4-r11,r14}
+	LDMFD	r12,{r4-r9}
+	CMP	r4, r5		; if(_fragi0>=_fragi0_end)
+	BGE	oslffri_neon_end; bail
+	SUBS	r9, r9, #1	; r9 = _nhfrags-1	if (r9<=0)
+	BLE	oslffri_neon_end	;		  bail
+	VLD1.64	{D30,D31}, [r2 at 128]	; Q15= 2L in U16s
+	ADD	r3, r3, r4, LSL #2	; r3 = &_frags[fragi]
+	ADD	r8, r8, r4, LSL #2	; r8 = &_frag_buf_offs[fragi]
+	SUB	r7, r7, r9	; _fragi_bot -= _nhfrags;
+oslffri_neon_lp1
+	MOV	r10,r4		; r10= fragi = _fragi0
+	ADD	r11,r4, r9	; r11= fragi_end-1=fragi+_nhfrags-1
+oslffri_neon_lp2
+	LDR	r14,[r3], #4	; r14= _frags[fragi]	_frags++
+	LDR	r0, [r13]	; r0 = _ref_frame_data
+	LDR	r12,[r8], #4	; r12= _frag_buf_offs[fragi]   _frag_buf_offs++
+	TST	r14,#OC_FRAG_CODED_FLAG
+	BEQ	oslffri_neon_uncoded
+	CMP	r10,r4		; if (fragi>_fragi0)
+	ADD	r0, r0, r12	; r0 = _ref_frame_data + _frag_buf_offs[fragi]
+	BLGT	loop_filter_h_neon	;   filter our left edge
+	CMP	r4, r6		; if (_fragi0>_fragi_top)
+	BLGT	loop_filter_v_neon	;   filter our top edge
+	CMP	r10,r11		; if(fragi+1<fragi_end)===(fragi<fragi_end-1)
+	LDRLT	r12,[r3]	; r12 = _frags[fragi+1]
+	ADD	r0, r0, #8
+	ADD	r10,r10,#1	; r10 = fragi+1;
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG	; && _frags[fragi+1].coded==0
+	BLLT	loop_filter_h_neon	;   then filter the shared edge for it
+	CMP	r10,r7		; if (fragi<_fragi_bot)
+	LDRLT	r12,[r3, r9, LSL #2]	; r12 = _frags[fragi+1+_nhfrags-1]
+	SUB	r0, r0, #8
+	ADD	r0, r0, r1, LSL #3
+	ANDLT	r12,r12,#OC_FRAG_CODED_FLAG
+	CMPLT	r12,#OC_FRAG_CODED_FLAG
+	BLLT	loop_filter_v_neon
+	CMP	r10,r11		; while(fragi<=fragi_end-1)
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+oslffri_neon_end
+	LDMFD	r13!,{r0,r4-r11,PC}
+oslffri_neon_uncoded
+	ADD	r10,r10,#1
+	CMP	r10,r11
+	BLE	oslffri_neon_lp2
+	MOV	r4, r10		; r4 = _fragi0 += _nhfrags
+	CMP	r4, r5
+	BLT	oslffri_neon_lp1
+	LDMFD	r13!,{r0,r4-r11,PC}
+ ]
+
+ END
Copied: experimental/derf/theora-ptalarbvorm/lib/arm/armopts.s.in (from rev 17378, branches/theorarm-merge-branch/lib/arm/ARMoptions.s)
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armopts.s.in (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armopts.s.in 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,39 @@
+;********************************************************************
+;* *
+;* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+;* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+;* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+;* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+;* *
+;* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+;* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+;* *
+;********************************************************************
+; Original implementation:
+; Copyright (C) 2009 Robin Watts for Pinknoise Productions Ltd
+; last mod: $Id: cpu.h 17344 2010-07-21 01:42:18Z tterribe $
+;********************************************************************
+
+; Set the following to 1 if we have EDSP instructions
+; (LDRD/STRD, etc., ARMv5E and later).
+OC_ARM_ASM_EDSP * @HAVE_ARM_ASM_EDSP@
+
+; Set the following to 1 if we have ARMv6 media instructions.
+OC_ARM_ASM_MEDIA * @HAVE_ARM_ASM_MEDIA@
+
+; Set the following to 1 if we have NEON (some ARMv7)
+OC_ARM_ASM_NEON * @HAVE_ARM_ASM_NEON@
+
+; Set the following to 1 if LDR/STR can work on unaligned addresses
+; This is assumed to be true for ARMv6 and later code
+OC_ARM_CAN_UNALIGN * 0
+
+; Large unaligned loads and stores are often configured to cause an exception.
+; They cause an 8 cycle stall when they cross a 128-bit (load) or 64-bit (store)
+; boundary, so it's usually a bad idea to use them anyway if they can be
+; avoided.
+
+; Set the following to 1 if LDRD/STRD can work on unaligned addresses
+OC_ARM_CAN_UNALIGN_LDRD * 0
+
+ END
Added: experimental/derf/theora-ptalarbvorm/lib/arm/armstate.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/arm/armstate.c (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/arm/armstate.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,95 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2010 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ function:
+ last mod: $Id: x86state.c 17344 2010-07-21 01:42:18Z tterribe $
+
+ ********************************************************************/
+#include "armint.h"
+
+#if defined(OC_ARM_ASM)
+
+# if defined(OC_ARM_ASM_NEON)
+/*This table has been modified from OC_FZIG_ZAG by baking an 8x8 transpose into
+ the destination.*/
+static const unsigned char OC_FZIG_ZAG_NEON[128]={
+ 0, 8, 1, 2, 9,16,24,17,
+ 10, 3, 4,11,18,25,32,40,
+ 33,26,19,12, 5, 6,13,20,
+ 27,34,41,48,56,49,42,35,
+ 28,21,14, 7,15,22,29,36,
+ 43,50,57,58,51,44,37,30,
+ 23,31,38,45,52,59,60,53,
+ 46,39,47,54,61,62,55,63,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64,
+ 64,64,64,64,64,64,64,64
+};
+# endif
+
+void oc_state_accel_init_arm(oc_theora_state *_state){
+ oc_state_accel_init_c(_state);
+ _state->cpu_flags=oc_cpu_flags_get();
+# if defined(OC_STATE_USE_VTABLE)
+ _state->opt_vtable.frag_copy_list=oc_frag_copy_list_arm;
+ _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_arm;
+ _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_arm;
+ _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_arm;
+ _state->opt_vtable.idct8x8=oc_idct8x8_arm;
+ /*Note: We _must_ set this function pointer, because the macro in armint.h
+ calls it with different arguments, so the C version will segfault.*/
+ _state->opt_vtable.state_loop_filter_frag_rows=
+ (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_arm;
+# endif
+# if defined(OC_ARM_ASM_EDSP)
+ if(_state->cpu_flags&OC_CPU_ARM_EDSP){
+# if defined(OC_STATE_USE_VTABLE)
+ _state->opt_vtable.frag_copy_list=oc_frag_copy_list_edsp;
+# endif
+ }
+# if defined(OC_ARM_ASM_MEDIA)
+ if(_state->cpu_flags&OC_CPU_ARM_MEDIA){
+# if defined(OC_STATE_USE_VTABLE)
+ _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_v6;
+ _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_v6;
+ _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_v6;
+ _state->opt_vtable.idct8x8=oc_idct8x8_v6;
+ _state->opt_vtable.loop_filter_init=oc_loop_filter_init_v6;
+ _state->opt_vtable.state_loop_filter_frag_rows=
+ (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_v6;
+# endif
+ }
+# if defined(OC_ARM_ASM_NEON)
+ if(_state->cpu_flags&OC_CPU_ARM_NEON){
+# if defined(OC_STATE_USE_VTABLE)
+ _state->opt_vtable.frag_copy_list=oc_frag_copy_list_neon;
+ _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_neon;
+ _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_neon;
+ _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_neon;
+ _state->opt_vtable.loop_filter_init=oc_loop_filter_init_neon;
+ _state->opt_vtable.state_loop_filter_frag_rows=
+ (oc_state_loop_filter_frag_rows_func)oc_loop_filter_frag_rows_neon;
+ _state->opt_vtable.idct8x8=oc_idct8x8_neon;
+# endif
+ _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_NEON;
+ }
+# endif
+# endif
+# endif
+}
+
+#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/bitpack.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/bitpack.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/bitpack.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -32,15 +32,18 @@
const unsigned char *stop;
oc_pb_window window;
int available;
+ unsigned shift;
stop=_b->stop;
ptr=_b->ptr;
window=_b->window;
available=_b->bits;
- while(available<=OC_PB_WINDOW_SIZE-8&&ptr<stop){
- available+=8;
- window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
+ shift=OC_PB_WINDOW_SIZE-available;
+ while(7<shift&&ptr<stop){
+ shift-=8;
+ window|=(oc_pb_window)*ptr++<<shift;
}
_b->ptr=ptr;
+ available=OC_PB_WINDOW_SIZE-shift;
if(_bits>available){
if(ptr>=stop){
_b->eof=1;
@@ -67,7 +70,7 @@
}
/*Here we assume that 0<=_bits&&_bits<=32.*/
-long oc_pack_read(oc_pack_buf *_b,int _bits){
+long oc_pack_read_c(oc_pack_buf *_b,int _bits){
oc_pb_window window;
int available;
long result;
@@ -87,7 +90,7 @@
return result;
}
-int oc_pack_read1(oc_pack_buf *_b){
+int oc_pack_read1_c(oc_pack_buf *_b){
oc_pb_window window;
int available;
int result;
Modified: experimental/derf/theora-ptalarbvorm/lib/bitpack.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/bitpack.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/bitpack.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -26,6 +26,21 @@
+/*Custom bitpacker implementations.*/
+# if defined(OC_ARM_ASM)
+# include "arm/armbits.h"
+# endif
+
+# if !defined(oc_pack_read)
+# define oc_pack_read oc_pack_read_c
+# endif
+# if !defined(oc_pack_read1)
+# define oc_pack_read1 oc_pack_read1_c
+# endif
+# if !defined(oc_huff_token_decode)
+# define oc_huff_token_decode oc_huff_token_decode_c
+# endif
+
# define OC_PB_WINDOW_SIZE ((int)sizeof(oc_pb_window)*CHAR_BIT)
/*This is meant to be a large, positive constant that can still be efficiently
loaded as an immediate (on platforms like ARM, for example).
@@ -46,8 +61,8 @@
int oc_pack_look1(oc_pack_buf *_b);
void oc_pack_adv1(oc_pack_buf *_b);
/*Here we assume 0<=_bits&&_bits<=32.*/
-long oc_pack_read(oc_pack_buf *_b,int _bits);
-int oc_pack_read1(oc_pack_buf *_b);
+long oc_pack_read_c(oc_pack_buf *_b,int _bits);
+int oc_pack_read1_c(oc_pack_buf *_b);
/* returns -1 for read beyond EOF, or the number of whole bytes available */
long oc_pack_bytes_left(oc_pack_buf *_b);
Modified: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xdec.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -61,7 +61,7 @@
/*The TI compiler refuses to pipeline this if we put it in an if(coded)
block.
We can do the loads unconditionally, which helps move them earlier.
- We do the store unconditionally too, because if we use a condtional
+ We do the store unconditionally too, because if we use a conditional
store, the compiler propagates the condition back to the operations
the store depended on, presumably to reduce cache pressure by
eliminating dead loads.
Modified: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xfrag.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xfrag.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xfrag.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -43,6 +43,40 @@
#undef OC_ITER
}
+void oc_frag_copy_list_c64x(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+ ptrdiff_t fragii;
+ /*9 cycles per iteration.*/
+ for(fragii=0;fragii<_nfragis;fragii++){
+ const unsigned char *restrict src;
+ const unsigned char *restrict s2;
+ unsigned char *restrict dst;
+ unsigned char *restrict d2;
+ ptrdiff_t frag_buf_off;
+ frag_buf_off=_frag_buf_offs[_fragis[fragii]];
+ dst=_dst_frame+frag_buf_off;
+ src=_src_frame+frag_buf_off;
+ d2=dst+_ystride;
+ s2=src+_ystride;
+#define OC_ITER() \
+ do{ \
+ _amem8(dst)=_amem8_const(src); \
+ dst+=2*_ystride; \
+ src+=2*_ystride; \
+ _amem8(d2)=_amem8_const(s2); \
+ d2+=2*_ystride; \
+ s2+=2*_ystride; \
+ } \
+ while(0)
+ OC_ITER();
+ OC_ITER();
+ OC_ITER();
+ OC_ITER();
+#undef OC_ITER
+ }
+}
+
/*34 cycles.*/
void oc_frag_recon_intra_c64x(unsigned char *_dst,int _ystride,
const ogg_int16_t _residue[64]){
@@ -130,7 +164,7 @@
}
void oc_state_frag_recon_c64x(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
unsigned char *dst;
ptrdiff_t frag_buf_off;
int ystride;
@@ -138,25 +172,28 @@
/*Apply the inverse transform.*/
/*Special case only having a DC component.*/
if(_last_zzi<2){
- ogg_int16_t p;
+ int p;
+ long long ll;
int ci;
/*We round this dequant product (and not any of the others) because there's
no iDCT rounding.*/
- p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
- /*LOOP VECTORIZES.*/
- for(ci=0;ci<64;ci++)_dct_coeffs[ci]=p;
+ p=_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5;
+ ll=_itoll(_pack2(p,p),_pack2(p,p));
+ for(ci=0;ci<64;ci+=4)_amem8(_dct_coeffs+64+ci)=ll;
}
else{
/*First, dequantize the DC coefficient.*/
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
- oc_idct8x8_c64x(_dct_coeffs,_last_zzi);
+ oc_idct8x8_c64x(_dct_coeffs+64,_dct_coeffs,_last_zzi);
}
/*Fill in the target buffer.*/
frag_buf_off=_state->frag_buf_offs[_fragi];
mb_mode=_state->frags[_fragi].mb_mode;
ystride=_state->ref_ystride[_pli];
dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
- if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_c64x(dst,ystride,_dct_coeffs);
+ if(mb_mode==OC_MODE_INTRA){
+ oc_frag_recon_intra_c64x(dst,ystride,_dct_coeffs+64);
+ }
else{
const unsigned char *ref;
int mvoffsets[2];
@@ -166,54 +203,12 @@
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
_state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
oc_frag_recon_inter2_c64x(dst,ref+mvoffsets[0],ref+mvoffsets[1],
- ystride,_dct_coeffs);
+ ystride,_dct_coeffs+64);
}
- else oc_frag_recon_inter_c64x(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
+ else oc_frag_recon_inter_c64x(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
}
}
-void oc_state_frag_copy_list_c64x(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli){
- const ptrdiff_t *frag_buf_offs;
- const unsigned char *src_frame_data;
- unsigned char *dst_frame_data;
- ptrdiff_t fragii;
- int ystride;
- dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
- src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
- ystride=_state->ref_ystride[_pli];
- frag_buf_offs=_state->frag_buf_offs;
- /*9 cycles per iteration.*/
- for(fragii=0;fragii<_nfragis;fragii++){
- const unsigned char *restrict src;
- const unsigned char *restrict s2;
- unsigned char *restrict dst;
- unsigned char *restrict d2;
- ptrdiff_t frag_buf_off;
- frag_buf_off=frag_buf_offs[_fragis[fragii]];
- dst=dst_frame_data+frag_buf_off;
- src=src_frame_data+frag_buf_off;
- d2=dst+ystride;
- s2=src+ystride;
-#define OC_ITER() \
- do{ \
- _amem8(dst)=_amem8_const(src); \
- dst+=2*ystride; \
- src+=2*ystride; \
- _amem8(d2)=_amem8_const(s2); \
- d2+=2*ystride; \
- s2+=2*ystride; \
- } \
- while(0)
- OC_ITER();
- OC_ITER();
- OC_ITER();
- OC_ITER();
-#undef OC_ITER
- }
-}
-
/*46 cycles.*/
static void loop_filter_h(unsigned char *restrict _pix,int _ystride,int _ll){
int p0;
@@ -394,9 +389,16 @@
_amem8(_pix)=_itoll(p6,p2);
}
+void oc_loop_filter_init_c64x(signed char _bv[256],int _flimit){
+ int ll;
+ ll=_flimit<<1;
+ ll=_pack2(ll,ll);
+ ll=~_spacku4(ll,ll);
+ *((int *)_bv)=ll;
+}
-void oc_state_loop_filter_frag_rows_c64x(const oc_theora_state *_state,int *_bv,
- int _refi,int _pli,int _fragy0,int _fragy_end){
+void oc_state_loop_filter_frag_rows_c64x(const oc_theora_state *_state,
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
const oc_fragment_plane *fplane;
const oc_fragment *frags;
const ptrdiff_t *frag_buf_offs;
@@ -413,14 +415,12 @@
fragi_top=fplane->froffset;
fragi_bot=fragi_top+fplane->nfrags;
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
- fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+ fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
ystride=_state->ref_ystride[_pli];
frags=_state->frags;
frag_buf_offs=_state->frag_buf_offs;
ref_frame_data=_state->ref_frame_data[_refi];
- ll=_state->loop_filter_limits[_state->qis[0]]<<1;
- ll=_pack2(ll,ll);
- ll=~_spacku4(ll,ll);
+ ll=*((int *)_bv);
/*The following loops are constructed somewhat non-intuitively on purpose.
The main idea is: if a block boundary has at least one coded fragment on
it, the filter is applied to it.
Modified: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xidct.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xidct.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xidct.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -296,8 +296,8 @@
} \
while(0)
-/*179 cycles.*/
-static void oc_idct8x8_slow_c64x(ogg_int16_t _y[64]){
+/*196 cycles.*/
+static void oc_idct8x8_slow_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
ogg_int16_t w[64];
int x0;
int x1;
@@ -318,7 +318,13 @@
int i;
/*Transform rows of x into columns of w.*/
for(i=0;i<8;i+=2){
- OC_IDCT8x2_LOAD8(_y+i*8);
+ OC_IDCT8x2_LOAD8(_x+i*8);
+ if(_x!=_y){
+ _amem8(_x+i*8)=0LL;
+ _amem8(_x+i*8+4)=0LL;
+ _amem8(_x+i*8+8)=0LL;
+ _amem8(_x+i*8+12)=0LL;
+ }
OC_IDCT8x2();
OC_IDCT8x2_STORET(w+i);
}
@@ -330,8 +336,8 @@
}
}
-/*107 cycles.*/
-static void oc_idct8x8_10_c64x(ogg_int16_t _y[64]){
+/*106 cycles.*/
+static void oc_idct8x8_10_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
ogg_int16_t w[64];
int t0;
int t1;
@@ -347,10 +353,16 @@
int x3;
int i;
/*Transform rows of x into columns of w.*/
- OC_IDCT8x2_LOAD4(_y);
+ OC_IDCT8x2_LOAD4(_x);
OC_IDCT8x2_4();
OC_IDCT8x2_STORET(w);
- OC_IDCT8x2_LOAD2(_y+16);
+ OC_IDCT8x2_LOAD2(_x+16);
+ if(_x!=_y){
+ _amem8(_x)=0LL;
+ _amem8(_x+8)=0LL;
+ _amem4(_x+16)=0;
+ _amem4(_x+24)=0;
+ }
OC_IDCT8x2_2();
OC_IDCT8x2_STORET(w+2);
/*Transform rows of w into columns of y.*/
@@ -361,8 +373,13 @@
}
}
-/*88 cycles.*/
-static void oc_idct8x8_3_c64x(ogg_int16_t _y[64]){
+#if 0
+/*This used to compile to something faster (88 cycles), but no longer, and I'm
+ not sure what changed to cause this.
+ In any case, it's barely an advantage over the 10-coefficient version, and is
+ now hardly worth the icache space.*/
+/*95 cycles.*/
+static inline void oc_idct8x8_3_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64]){
ogg_int16_t w[64];
int t0;
int t1;
@@ -377,10 +394,14 @@
int i;
/*Transform rows of x into rows of w.*/
for(i=0;i<2;i+=2){
- OC_IDCT8x2_LOAD2(_y+i*8);
+ OC_IDCT8x2_LOAD2(_x+i*8);
OC_IDCT8x2_2();
OC_IDCT8x2_STORE(w+i*8);
}
+ if(_x!=_y){
+ _amem4(_x)=0;
+ _amem4(_x+8)=0;
+ }
/*Transform columns of w into columns of y.*/
for(i=0;i<8;i+=2){
OC_IDCT8x2_LOAD2T(w+i);
@@ -388,12 +409,13 @@
OC_IDCT8x2_ROUND_STORET(_y+i);
}
}
+#endif
/*Performs an inverse 8x8 Type-II DCT transform.
The input is assumed to be scaled by a factor of 4 relative to orthonormal
version of the transform.*/
-void oc_idct8x8_c64x(ogg_int16_t _y[64],int _last_zzi){
- if(_last_zzi<3)oc_idct8x8_3_c64x(_y);
- else if(_last_zzi<10)oc_idct8x8_10_c64x(_y);
- else oc_idct8x8_slow_c64x(_y);
+void oc_idct8x8_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
+ /*if(_last_zzi<=3)oc_idct8x8_3_c64x(_y,_x);
+ else*/ if(_last_zzi<=10)oc_idct8x8_10_c64x(_y,_x);
+ else oc_idct8x8_slow_c64x(_y,_x);
}
Modified: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xint.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xint.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xint.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -23,16 +23,21 @@
# define oc_state_accel_init oc_state_accel_init_c64x
# define oc_frag_copy(_state,_dst,_src,_ystride) \
oc_frag_copy_c64x(_dst,_src,_ystride)
+# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs) \
+ oc_frag_copy_list_c64x(_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs)
# define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
oc_frag_recon_intra_c64x(_dst,_dst_ystride,_residue)
# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
oc_frag_recon_inter_c64x(_dst,_src,_ystride,_residue)
# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
oc_frag_recon_inter2_c64x(_dst,_src1,_src2,_ystride,_residue)
-# define oc_idct8x8(_state,_y,_last_zzi) \
- oc_idct8x8_c64x(_y,_last_zzi)
+# define oc_idct8x8(_state,_y,_x,_last_zzi) \
+ oc_idct8x8_c64x(_y,_x,_last_zzi)
# define oc_state_frag_recon oc_state_frag_recon_c64x
-# define oc_state_frag_copy_list oc_state_frag_copy_list_c64x
+# define oc_loop_filter_init(_state,_bv,_flimit) \
+ oc_loop_filter_init_c64x(_bv,_flimit)
# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_c64x
# define oc_restore_fpu(_state) do{}while(0)
# endif
@@ -43,19 +48,20 @@
void oc_frag_copy_c64x(unsigned char *_dst,
const unsigned char *_src,int _ystride);
+void oc_frag_copy_list_c64x(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
void oc_frag_recon_intra_c64x(unsigned char *_dst,int _ystride,
const ogg_int16_t *_residue);
void oc_frag_recon_inter_c64x(unsigned char *_dst,
const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
void oc_frag_recon_inter2_c64x(unsigned char *_dst,const unsigned char *_src1,
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
-void oc_idct8x8_c64x(ogg_int16_t _y[64],int _last_zzi);
+void oc_idct8x8_c64x(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
void oc_state_frag_recon_c64x(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-void oc_state_frag_copy_list_c64x(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_c64x(signed char _bv[256],int _flimit);
void oc_state_loop_filter_frag_rows_c64x(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/c64x/c64xstate.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/c64x/c64xstate.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/c64x/c64xstate.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -20,7 +20,7 @@
#if defined(OC_C64X_ASM)
void oc_state_accel_init_c64x(oc_theora_state *_state){
- _state->cpu_flags=0;
+ oc_state_accel_init_c(_state);
# if defined(OC_STATE_USE_VTABLE)
_state->opt_vtable.frag_copy=oc_frag_copy_c64x;
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c64x;
@@ -28,12 +28,12 @@
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c64x;
_state->opt_vtable.idct8x8=oc_idct8x8_c64x;
_state->opt_vtable.state_frag_recon=oc_state_frag_recon_c64x;
- _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_c64x;
+ _state->opt_vtable.frag_copy_list=oc_frag_copy_list_c64x;
+ _state->opt_vtable.loop_filter_init=oc_loop_filter_init_c64x;
_state->opt_vtable.state_loop_filter_frag_rows=
oc_state_loop_filter_frag_rows_c64x;
_state->opt_vtable.restore_fpu=oc_restore_fpu_c;
# endif
- _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG;
}
#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/decint.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/decint.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/decint.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -77,7 +77,24 @@
struct oc_dec_pipeline_state{
- int bounding_values[256];
+ /*Decoded DCT coefficients.
+ These are placed here instead of on the stack so that they can persist
+ between blocks, which makes clearing them back to zero much faster when
+ only a few non-zero coefficients were decoded.
+ It requires at least 65 elements because the zig-zag index array uses the
+ 65th element as a dumping ground for out-of-range indices to protect us
+ from buffer overflow.
+ We make it fully twice as large so that the second half can serve as the
+ reconstruction buffer, which saves passing another parameter to all the
+ acceleration functions.
+ It also solves problems with 16-byte alignment for NEON on ARM.
+ gcc (as of 4.2.1) only seems to be able to give stack variables 8-byte
+ alignment, and silently produces incorrect results if you ask for 16.
+ Finally, keeping it off the stack means there's less likely to be a data
+ hazard between the NEON co-processor and the regular ARM core, which avoids
+ unnecessary stalls.*/
+ OC_ALIGN16(ogg_int16_t dct_coeffs[128]);
+ OC_ALIGN16(signed char bounding_values[256]);
ptrdiff_t ti[3][64];
ptrdiff_t ebi[3][64];
ptrdiff_t eob_runs[3][64];
@@ -97,66 +114,66 @@
struct th_dec_ctx{
/*Shared encoder/decoder state.*/
- oc_theora_state state;
+ oc_theora_state state;
/*Whether or not packets are ready to be emitted.
This takes on negative values while there are remaining header packets to
be emitted, reaches 0 when the codec is ready for input, and goes to 1
when a frame has been processed and a data packet is ready.*/
- int packet_state;
+ int packet_state;
/*Buffer in which to assemble packets.*/
- oc_pack_buf opb;
+ oc_pack_buf opb;
/*Huffman decode trees.*/
- ogg_int16_t *huff_tables[TH_NHUFFMAN_TABLES];
+ ogg_int16_t *huff_tables[TH_NHUFFMAN_TABLES];
/*The index of the first token in each plane for each coefficient.*/
- ptrdiff_t ti0[3][64];
+ ptrdiff_t ti0[3][64];
/*The number of outstanding EOB runs at the start of each coefficient in each
plane.*/
- ptrdiff_t eob_runs[3][64];
+ ptrdiff_t eob_runs[3][64];
/*The DCT token lists.*/
- unsigned char *dct_tokens;
+ unsigned char *dct_tokens;
/*The extra bits associated with DCT tokens.*/
- unsigned char *extra_bits;
+ unsigned char *extra_bits;
/*The number of dct tokens unpacked so far.*/
- int dct_tokens_count;
+ int dct_tokens_count;
/*The out-of-loop post-processing level.*/
- int pp_level;
+ int pp_level;
/*The DC scale used for out-of-loop deblocking.*/
- int pp_dc_scale[64];
+ int pp_dc_scale[64];
/*The sharpen modifier used for out-of-loop deringing.*/
- int pp_sharp_mod[64];
+ int pp_sharp_mod[64];
/*The DC quantization index of each block.*/
- unsigned char *dc_qis;
+ unsigned char *dc_qis;
/*The variance of each block.*/
- int *variances;
+ int *variances;
/*The storage for the post-processed frame buffer.*/
- unsigned char *pp_frame_data;
+ unsigned char *pp_frame_data;
 /*Whether or not the post-processed frame buffer has space for chroma.*/
- int pp_frame_state;
+ int pp_frame_state;
/*The buffer used for the post-processed frame.
Note that this is _not_ guaranteed to have the same strides and offsets as
the reference frame buffers.*/
- th_ycbcr_buffer pp_frame_buf;
+ th_ycbcr_buffer pp_frame_buf;
/*The striped decode callback function.*/
- th_stripe_callback stripe_cb;
- oc_dec_pipeline_state pipe;
+ th_stripe_callback stripe_cb;
+ oc_dec_pipeline_state pipe;
# if defined(OC_DEC_USE_VTABLE)
/*Table for decoder acceleration functions.*/
- oc_dec_opt_vtable opt_vtable;
+ oc_dec_opt_vtable opt_vtable;
# endif
# if defined(HAVE_CAIRO)
/*Output metrics for debugging.*/
- int telemetry;
- int telemetry_mbmode;
- int telemetry_mv;
- int telemetry_qi;
- int telemetry_bits;
- int telemetry_frame_bytes;
- int telemetry_coding_bytes;
- int telemetry_mode_bytes;
- int telemetry_mv_bytes;
- int telemetry_qi_bytes;
- int telemetry_dc_bytes;
- unsigned char *telemetry_frame_data;
+ int telemetry;
+ int telemetry_mbmode;
+ int telemetry_mv;
+ int telemetry_qi;
+ int telemetry_bits;
+ int telemetry_frame_bytes;
+ int telemetry_coding_bytes;
+ int telemetry_mode_bytes;
+ int telemetry_mv_bytes;
+ int telemetry_qi_bytes;
+ int telemetry_dc_bytes;
+ unsigned char *telemetry_frame_data;
# endif
};
Modified: experimental/derf/theora-ptalarbvorm/lib/decode.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/decode.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/decode.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -726,11 +726,12 @@
frags=_dec->state.frags;
for(mbi=0;mbi<nmbs;mbi++){
if(mb_modes[mbi]!=OC_MODE_INVALID){
- int bi;
/*Check for a coded luma block in this macro block.*/
- for(bi=0;bi<4&&!frags[mb_maps[mbi][0][bi]].coded;bi++);
- /*We found one, decode a mode.*/
- if(bi<4){
+ if(frags[mb_maps[mbi][0][0]].coded
+ ||frags[mb_maps[mbi][0][1]].coded
+ ||frags[mb_maps[mbi][0][2]].coded
+ ||frags[mb_maps[mbi][0][3]].coded){
+ /*We found one, decode a mode.*/
mb_modes[mbi]=alphabet[oc_huff_token_decode(&_dec->opb,mode_tree)];
}
/*There were none: INTER_NOMV is forced.*/
@@ -1335,9 +1336,11 @@
oc_dec_pipeline_state *_pipe){
const ptrdiff_t *coded_fragis;
const ptrdiff_t *uncoded_fragis;
+ int flimit;
int pli;
int qii;
int qti;
+ int zzi;
/*If chroma is sub-sampled in the vertical direction, we have to decode two
super block rows of Y' for each super block row of Cb and Cr.*/
_pipe->mcu_nvfrags=4<<!(_dec->state.info.pixel_fmt&2);
@@ -1369,8 +1372,9 @@
/*Set the previous DC predictor to 0 for all color planes and frame types.*/
memset(_pipe->pred_last,0,sizeof(_pipe->pred_last));
/*Initialize the bounding value array for the loop filter.*/
- _pipe->loop_filter=!oc_state_loop_filter_init(&_dec->state,
- _pipe->bounding_values);
+ flimit=_dec->state.loop_filter_limits[_dec->state.qis[0]];
+ _pipe->loop_filter=flimit!=0;
+ if(flimit!=0)oc_loop_filter_init(&_dec->state,_pipe->bounding_values,flimit);
/*Initialize any buffers needed for post-processing.
We also save the current post-processing level, to guard against the user
changing it from a callback.*/
@@ -1383,6 +1387,8 @@
_dec->state.ref_frame_bufs[_dec->state.ref_frame_idx[OC_FRAME_SELF]],
sizeof(_dec->pp_frame_buf[0])*3);
}
+ /*Clear down the DCT coefficient buffer for the first block.*/
+ for(zzi=0;zzi<64;zzi++)_pipe->dct_coeffs[zzi]=0;
}
/*Undo the DC prediction in a single plane of an MCU (one or two super block
@@ -1532,16 +1538,11 @@
eob_runs=_pipe->eob_runs[_pli];
for(qti=0;qti<2;qti++)dc_quant[qti]=_pipe->dequant[_pli][0][qti][0];
for(fragii=0;fragii<ncoded_fragis;fragii++){
- /*This array is made one element larger because the zig-zag index array
- uses the final element as a dumping ground for out-of-range indices
- to protect us from buffer overflow.*/
- OC_ALIGN16(ogg_int16_t dct_coeffs[65]);
const ogg_uint16_t *ac_quant;
ptrdiff_t fragi;
int last_zzi;
int zzi;
fragi=coded_fragis[fragii];
- for(zzi=0;zzi<64;zzi++)dct_coeffs[zzi]=0;
qti=frags[fragi].mb_mode!=OC_MODE_INTRA;
ac_quant=_pipe->dequant[_pli][frags[fragi].qii][qti];
/*Decode the AC coefficients.*/
@@ -1578,18 +1579,19 @@
eob_runs[zzi]=eob;
ti[zzi]=lti;
zzi+=rlen;
- dct_coeffs[dct_fzig_zag[zzi]]=(ogg_int16_t)(coeff*(int)ac_quant[zzi]);
+ _pipe->dct_coeffs[dct_fzig_zag[zzi]]=
+ (ogg_int16_t)(coeff*(int)ac_quant[zzi]);
zzi+=!eob;
}
}
/*TODO: zzi should be exactly 64 here.
If it's not, we should report some kind of warning.*/
zzi=OC_MINI(zzi,64);
- dct_coeffs[0]=(ogg_int16_t)frags[fragi].dc;
+ _pipe->dct_coeffs[0]=(ogg_int16_t)frags[fragi].dc;
/*last_zzi is always initialized.
If your compiler thinks otherwise, it is dumb.*/
oc_state_frag_recon(&_dec->state,fragi,_pli,
- dct_coeffs,last_zzi,dc_quant[qti]);
+ _pipe->dct_coeffs,last_zzi,dc_quant[qti]);
}
_pipe->coded_fragis[_pli]+=ncoded_fragis;
/*Right now the reconstructed MCU has only the coded blocks in it.*/
@@ -1603,9 +1605,14 @@
code, and the hard case (high bitrate, high resolution) is handled
correctly.*/
/*Copy the uncoded blocks from the previous reference frame.*/
- _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
- oc_state_frag_copy_list(&_dec->state,_pipe->uncoded_fragis[_pli],
- _pipe->nuncoded_fragis[_pli],OC_FRAME_SELF,OC_FRAME_PREV,_pli);
+ if(_pipe->nuncoded_fragis[_pli]>0){
+ _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
+ oc_frag_copy_list(&_dec->state,
+ _dec->state.ref_frame_data[_dec->state.ref_frame_idx[OC_FRAME_SELF]],
+ _dec->state.ref_frame_data[_dec->state.ref_frame_idx[OC_FRAME_PREV]],
+ _dec->state.ref_ystride[_pli],_pipe->uncoded_fragis[_pli],
+ _pipe->nuncoded_fragis[_pli],_dec->state.frag_buf_offs);
+ }
}
/*Filter a horizontal block edge.*/
Modified: experimental/derf/theora-ptalarbvorm/lib/encint.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/encint.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/encint.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -31,6 +31,9 @@
typedef struct oc_enc_opt_data oc_enc_opt_data;
typedef struct oc_mb_enc_info oc_mb_enc_info;
typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser;
+typedef struct oc_fr_state oc_fr_state;
+typedef struct oc_qii_state oc_qii_state;
+typedef struct oc_enc_pipeline_state oc_enc_pipeline_state;
typedef struct oc_mode_rd oc_mode_rd;
typedef struct oc_iir_filter oc_iir_filter;
typedef struct oc_frame_metrics oc_frame_metrics;
@@ -384,6 +387,90 @@
+/*State to track coded block flags and their bit cost.
+ We use opportunity cost to measure the bits required to code or skip the next
+ block, using the cheaper of the cost to code it fully or partially, so long
+ as both are possible.*/
+struct oc_fr_state{
+ /*The number of bits required for the coded block flags so far this frame.*/
+ ptrdiff_t bits;
+ /*The length of the current run for the partial super block flag, not
+ including the current super block.*/
+ unsigned sb_partial_count:16;
+ /*The length of the current run for the full super block flag, not
+ including the current super block.*/
+ unsigned sb_full_count:16;
+ /*The length of the coded block flag run when the current super block
+ started.*/
+ unsigned b_coded_count_prev:6;
+ /*The coded block flag when the current super block started.*/
+ signed int b_coded_prev:2;
+ /*The length of the current coded block flag run.*/
+ unsigned b_coded_count:6;
+ /*The current coded block flag.*/
+ signed int b_coded:2;
+ /*The number of blocks processed in the current super block.*/
+ unsigned b_count:5;
+ /*Whether or not it is cheaper to code the current super block partially,
+ even if it could still be coded fully.*/
+ unsigned sb_prefer_partial:1;
+ /*Whether the last super block was coded partially.*/
+ signed int sb_partial:2;
+ /*The number of bits required for the flags for the current super block.*/
+ unsigned sb_bits:6;
+ /*Whether the last non-partial super block was coded fully.*/
+ signed int sb_full:2;
+};
+
+
+
+struct oc_qii_state{
+ ptrdiff_t bits;
+ unsigned qi01_count:14;
+ signed int qi01:2;
+ unsigned qi12_count:14;
+ signed int qi12:2;
+};
+
+
+
+/*Temporary encoder state for the analysis pipeline.*/
+struct oc_enc_pipeline_state{
+ /*DCT coefficient storage.
+ This is kept off the stack because a) gcc can't align things on the stack
+ reliably on ARM, and b) it avoids (unintentional) data hazards between
+ ARM and NEON code.*/
+ OC_ALIGN16(ogg_int16_t dct_data[128]);
+ OC_ALIGN16(signed char bounding_values[256]);
+ oc_fr_state fr[3];
+ oc_qii_state qs[3];
+ /*Skip SSD storage for the current MCU in each plane.*/
+ unsigned *skip_ssd[3];
+ /*Coded/uncoded fragment lists for each plane for the current MCU.*/
+ ptrdiff_t *coded_fragis[3];
+ ptrdiff_t *uncoded_fragis[3];
+ ptrdiff_t ncoded_fragis[3];
+ ptrdiff_t nuncoded_fragis[3];
+ /*The starting fragment for the current MCU in each plane.*/
+ ptrdiff_t froffset[3];
+ /*The starting row for the current MCU in each plane.*/
+ int fragy0[3];
+ /*The ending row for the current MCU in each plane.*/
+ int fragy_end[3];
+ /*The starting superblock for the current MCU in each plane.*/
+ unsigned sbi0[3];
+ /*The ending superblock for the current MCU in each plane.*/
+ unsigned sbi_end[3];
+ /*The number of tokens for zzi=1 for each color plane.*/
+ int ndct_tokens1[3];
+ /*The outstanding eob_run count for zzi=1 for each color plane.*/
+ int eob_run1[3];
+ /*Whether or not the loop filter is enabled.*/
+ int loop_filter;
+};
+
+
+
/*Statistics used to estimate R-D cost of a block in a given coding mode.
See modedec.h for more details.*/
struct oc_mode_rd{
@@ -565,6 +652,8 @@
size_t mv_bits[2];
/*The mode scheme chooser for estimating mode coding costs.*/
oc_mode_scheme_chooser chooser;
+ /*Temporary encoder state for the analysis pipeline.*/
+ oc_enc_pipeline_state pipe;
/*The number of vertical super blocks in an MCU.*/
int mcu_nvsbs;
/*The SSD error for skipping each fragment in the current MCU.*/
Modified: experimental/derf/theora-ptalarbvorm/lib/fragment.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/fragment.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/fragment.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -26,6 +26,26 @@
}
}
+/*Copies the fragments specified by the lists of fragment indices from one
+ frame to another.
+ _dst_frame: The reference frame to copy to.
+ _src_frame: The reference frame to copy from.
+ _ystride: The row stride of the reference frames.
+ _fragis: A pointer to a list of fragment indices.
+ _nfragis: The number of fragment indices to copy.
+ _frag_buf_offs: The offsets of fragments in the reference frames.*/
+void oc_frag_copy_list_c(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+ ptrdiff_t fragii;
+ for(fragii=0;fragii<_nfragis;fragii++){
+ ptrdiff_t frag_buf_off;
+ frag_buf_off=_frag_buf_offs[_fragis[fragii]];
+ oc_frag_copy_c(_dst_frame+frag_buf_off,
+ _src_frame+frag_buf_off,_ystride);
+ }
+}
+
void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride,
const ogg_int16_t _residue[64]){
int i;
Modified: experimental/derf/theora-ptalarbvorm/lib/huffdec.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/huffdec.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/huffdec.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -491,22 +491,22 @@
for(;;){
n=_tree[node];
if(n>available){
- int shift;
- shift=OC_PB_WINDOW_SIZE-8-available;
+ unsigned shift;
+ shift=OC_PB_WINDOW_SIZE-available;
do{
/*We don't bother setting eof because we won't check for it after we've
started decoding DCT tokens.*/
if(ptr>=stop){
- available=OC_LOTS_OF_BITS;
+ shift=-OC_LOTS_OF_BITS;
break;
}
- available+=8;
+ shift-=8;
window|=(oc_pb_window)*ptr++<<shift;
- shift-=8;
}
- while(shift>=0);
+ while(shift>=8);
/*Note: We never request more than 24 bits, so there's no need to fill in
the last partial byte here.*/
+ available=OC_PB_WINDOW_SIZE-shift;
}
bits=window>>OC_PB_WINDOW_SIZE-n;
node=_tree[node+1+bits];
Modified: experimental/derf/theora-ptalarbvorm/lib/huffdec.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/huffdec.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/huffdec.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -27,6 +27,6 @@
int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES],
const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]);
void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]);
-int oc_huff_token_decode(oc_pack_buf *_opb,const ogg_int16_t *_node);
+int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_node);
#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/idct.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/idct.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/idct.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -231,18 +231,18 @@
_y: The buffer to store the result in.
This may be the same as _x.
_x: The input coefficients.*/
-static void oc_idct8x8_3(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
- const ogg_int16_t *in;
- ogg_int16_t *end;
- ogg_int16_t *out;
- ogg_int16_t w[64];
+static void oc_idct8x8_3(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+ ogg_int16_t w[64];
+ int i;
/*Transform rows of x into columns of w.*/
idct8_2(w,_x);
idct8_1(w+1,_x+8);
/*Transform rows of w into columns of y.*/
- for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_2(out,in);
+ for(i=0;i<8;i++)idct8_2(_y+i,w+i*8);
/*Adjust for the scale factor.*/
- for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
+ for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
+ /*Clear input data for next block (decoder only).*/
+ if(_x!=_y)_x[0]=_x[1]=_x[8]=0;
}
/*Performs an inverse 8x8 Type-II DCT transform.
@@ -260,20 +260,20 @@
_y: The buffer to store the result in.
This may be the same as _x.
_x: The input coefficients.*/
-static void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
- const ogg_int16_t *in;
- ogg_int16_t *end;
- ogg_int16_t *out;
- ogg_int16_t w[64];
+static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+ ogg_int16_t w[64];
+ int i;
/*Transform rows of x into columns of w.*/
idct8_4(w,_x);
idct8_3(w+1,_x+8);
idct8_2(w+2,_x+16);
idct8_1(w+3,_x+24);
/*Transform rows of w into columns of y.*/
- for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_4(out,in);
+ for(i=0;i<8;i++)idct8_4(_y+i,w+i*8);
/*Adjust for the scale factor.*/
- for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
+ for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
+ /*Clear input data for next block (decoder only).*/
+ if(_x!=_y)_x[0]=_x[1]=_x[2]=_x[3]=_x[8]=_x[9]=_x[10]=_x[16]=_x[17]=_x[24]=0;
}
/*Performs an inverse 8x8 Type-II DCT transform.
@@ -282,23 +282,22 @@
_y: The buffer to store the result in.
This may be the same as _x.
_x: The input coefficients.*/
-static void oc_idct8x8_slow(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
- const ogg_int16_t *in;
- ogg_int16_t *end;
- ogg_int16_t *out;
- ogg_int16_t w[64];
+static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+ ogg_int16_t w[64];
+ int i;
/*Transform rows of x into columns of w.*/
- for(in=_x,out=w,end=out+8;out<end;in+=8,out++)idct8(out,in);
+ for(i=0;i<8;i++)idct8(w+i,_x+i*8);
/*Transform rows of w into columns of y.*/
- for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8(out,in);
+ for(i=0;i<8;i++)idct8(_y+i,w+i*8);
/*Adjust for the scale factor.*/
- for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
+ for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
+ if(_x!=_y)for(i=0;i<64;i++)_x[i]=0;
}
/*Performs an inverse 8x8 Type-II DCT transform.
The input is assumed to be scaled by a factor of 4 relative to orthonormal
version of the transform.*/
-void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi){
+void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
/*_last_zzi is subtly different from an actual count of the number of
coefficients we decoded for this block.
It contains the value of zzi BEFORE the final token in the block was
@@ -324,7 +323,7 @@
gets.
Needless to say we inherited this approach from VP3.*/
/*Then perform the iDCT.*/
- if(_last_zzi<3)oc_idct8x8_3(_y,_y);
- else if(_last_zzi<10)oc_idct8x8_10(_y,_y);
- else oc_idct8x8_slow(_y,_y);
+ if(_last_zzi<=3)oc_idct8x8_3(_y,_x);
+ else if(_last_zzi<=10)oc_idct8x8_10(_y,_x);
+ else oc_idct8x8_slow(_y,_x);
}
Modified: experimental/derf/theora-ptalarbvorm/lib/internal.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/internal.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/internal.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -38,14 +38,21 @@
# endif
# endif
-/*Some assembly constructs require aligned operands.*/
-# if defined(OC_X86_ASM)
+/*Some assembly constructs require aligned operands.
+ The following macros are _only_ intended for structure member declarations.
+ Although they will sometimes work on stack variables, gcc will often silently
+ ignore them.
+ A separate set of macros could be made for manual stack alignment, but we
+ don't actually require it anywhere.*/
+# if defined(OC_X86_ASM)||defined(OC_ARM_ASM)
# if defined(__GNUC__)
# define OC_ALIGN8(expr) expr __attribute__((aligned(8)))
# define OC_ALIGN16(expr) expr __attribute__((aligned(16)))
# elif defined(_MSC_VER)
# define OC_ALIGN8(expr) __declspec (align(8)) expr
# define OC_ALIGN16(expr) __declspec (align(16)) expr
+# else
+# error "Alignment macros required for this platform."
# endif
# endif
# if !defined(OC_ALIGN8)
Modified: experimental/derf/theora-ptalarbvorm/lib/state.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/state.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/state.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -663,12 +663,13 @@
_state->cpu_flags=0;
#if defined(OC_STATE_USE_VTABLE)
_state->opt_vtable.frag_copy=oc_frag_copy_c;
+ _state->opt_vtable.frag_copy_list=oc_frag_copy_list_c;
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c;
_state->opt_vtable.idct8x8=oc_idct8x8_c;
_state->opt_vtable.state_frag_recon=oc_state_frag_recon_c;
- _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_c;
+ _state->opt_vtable.loop_filter_init=oc_loop_filter_init_c;
_state->opt_vtable.state_loop_filter_frag_rows=
oc_state_loop_filter_frag_rows_c;
_state->opt_vtable.restore_fpu=oc_restore_fpu_c;
@@ -930,7 +931,7 @@
}
void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
unsigned char *dst;
ptrdiff_t frag_buf_off;
int ystride;
@@ -944,19 +945,21 @@
no iDCT rounding.*/
p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
/*LOOP VECTORIZES.*/
- for(ci=0;ci<64;ci++)_dct_coeffs[ci]=p;
+ for(ci=0;ci<64;ci++)_dct_coeffs[64+ci]=p;
}
else{
/*First, dequantize the DC coefficient.*/
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
- oc_idct8x8(_state,_dct_coeffs,_last_zzi);
+ oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
}
/*Fill in the target buffer.*/
frag_buf_off=_state->frag_buf_offs[_fragi];
mb_mode=_state->frags[_fragi].mb_mode;
ystride=_state->ref_ystride[_pli];
dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
- if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs);
+ if(mb_mode==OC_MODE_INTRA){
+ oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs+64);
+ }
else{
const unsigned char *ref;
int mvoffsets[2];
@@ -966,40 +969,15 @@
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
_state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
oc_frag_recon_inter2(_state,
- dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,_dct_coeffs);
+ dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,_dct_coeffs+64);
}
- else oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,_dct_coeffs);
+ else{
+ oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
+ }
}
}
-/*Copies the fragments specified by the lists of fragment indices from one
- frame to another.
- _fragis: A pointer to a list of fragment indices.
- _nfragis: The number of fragment indices to copy.
- _dst_frame: The reference frame to copy to.
- _src_frame: The reference frame to copy from.
- _pli: The color plane the fragments lie in.*/
-void oc_state_frag_copy_list_c(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli){
- const ptrdiff_t *frag_buf_offs;
- const unsigned char *src_frame_data;
- unsigned char *dst_frame_data;
- ptrdiff_t fragii;
- int ystride;
- dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
- src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
- ystride=_state->ref_ystride[_pli];
- frag_buf_offs=_state->frag_buf_offs;
- for(fragii=0;fragii<_nfragis;fragii++){
- ptrdiff_t frag_buf_off;
- frag_buf_off=frag_buf_offs[_fragis[fragii]];
- oc_frag_copy(_state,dst_frame_data+frag_buf_off,
- src_frame_data+frag_buf_off,ystride);
- }
-}
-
-static void loop_filter_h(unsigned char *_pix,int _ystride,int *_bv){
+static void loop_filter_h(unsigned char *_pix,int _ystride,signed char *_bv){
int y;
_pix-=2;
for(y=0;y<8;y++){
@@ -1015,7 +993,7 @@
}
}
-static void loop_filter_v(unsigned char *_pix,int _ystride,int *_bv){
+static void loop_filter_v(unsigned char *_pix,int _ystride,signed char *_bv){
int x;
_pix-=_ystride*2;
for(x=0;x<8;x++){
@@ -1032,20 +1010,16 @@
/*Initialize the bounding values array used by the loop filter.
_bv: Storage for the array.
- Return: 0 on success, or a non-zero value if no filtering need be applied.*/
-int oc_state_loop_filter_init(oc_theora_state *_state,int _bv[256]){
- int flimit;
+ _flimit: The filter limit as defined in Section 7.10 of the spec.*/
+void oc_loop_filter_init_c(signed char _bv[256],int _flimit){
int i;
- flimit=_state->loop_filter_limits[_state->qis[0]];
- if(flimit==0)return 1;
memset(_bv,0,sizeof(_bv[0])*256);
- for(i=0;i<flimit;i++){
- if(127-i-flimit>=0)_bv[127-i-flimit]=i-flimit;
- _bv[127-i]=-i;
- _bv[127+i]=i;
- if(127+i+flimit<256)_bv[127+i+flimit]=flimit-i;
+ for(i=0;i<_flimit;i++){
+ if(127-i-_flimit>=0)_bv[127-i-_flimit]=(signed char)(i-_flimit);
+ _bv[127-i]=(signed char)(-i);
+ _bv[127+i]=(signed char)(i);
+ if(127+i+_flimit<256)_bv[127+i+_flimit]=(signed char)(_flimit-i);
}
- return 0;
}
/*Apply the loop filter to a given set of fragment rows in the given plane.
@@ -1056,8 +1030,8 @@
_pli: The color plane to filter.
_fragy0: The Y coordinate of the first fragment row to filter.
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
-void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,int *_bv,
- int _refi,int _pli,int _fragy0,int _fragy_end){
+void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,
+ signed char *_bv,int _refi,int _pli,int _fragy0,int _fragy_end){
const oc_fragment_plane *fplane;
const oc_fragment *frags;
const ptrdiff_t *frag_buf_offs;
@@ -1074,7 +1048,7 @@
fragi_top=fplane->froffset;
fragi_bot=fragi_top+fplane->nfrags;
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
- fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+ fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
ystride=_state->ref_ystride[_pli];
frags=_state->frags;
frag_buf_offs=_state->frag_buf_offs;
Modified: experimental/derf/theora-ptalarbvorm/lib/state.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/state.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/state.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -52,6 +52,9 @@
# include "x86/x86int.h"
# endif
# endif
+# if defined(OC_ARM_ASM)
+# include "arm/armint.h"
+# endif
# if defined(OC_C64X_ASM)
# include "c64x/c64xint.h"
# endif
@@ -64,6 +67,12 @@
# define oc_frag_copy(_state,_dst,_src,_ystride) \
((*(_state)->opt_vtable.frag_copy)(_dst,_src,_ystride))
# endif
+# if !defined(oc_frag_copy_list)
+# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs) \
+ ((*(_state)->opt_vtable.frag_copy_list)(_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs))
+# endif
# if !defined(oc_frag_recon_intra)
# define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
((*(_state)->opt_vtable.frag_recon_intra)(_dst,_dst_ystride,_residue))
@@ -78,8 +87,8 @@
_src1,_src2,_ystride,_residue))
# endif
# if !defined(oc_idct8x8)
-# define oc_idct8x8(_state,_y,_last_zzi) \
- ((*(_state)->opt_vtable.idct8x8)(_y,_last_zzi))
+# define oc_idct8x8(_state,_y,_x,_last_zzi) \
+ ((*(_state)->opt_vtable.idct8x8)(_y,_x,_last_zzi))
# endif
# if !defined(oc_state_frag_recon)
# define oc_state_frag_recon(_state,_fragi, \
@@ -87,11 +96,9 @@
((*(_state)->opt_vtable.state_frag_recon)(_state,_fragi, \
_pli,_dct_coeffs,_last_zzi,_dc_quant))
# endif
-# if !defined(oc_state_frag_copy_list)
-# define oc_state_frag_copy_list(_state,_fragis,_nfragis, \
- _dst_frame,_src_frame,_pli) \
- ((*(_state)->opt_vtable.state_frag_copy_list)(_state,_fragis,_nfragis, \
- _dst_frame,_src_frame,_pli))
+# if !defined(oc_loop_filter_init)
+# define oc_loop_filter_init(_state,_bv,_flimit) \
+ ((*(_state)->opt_vtable.loop_filter_init)(_bv,_flimit))
# endif
# if !defined(oc_state_loop_filter_frag_rows)
# define oc_state_loop_filter_frag_rows(_state, \
@@ -108,6 +115,12 @@
# define oc_frag_copy(_state,_dst,_src,_ystride) \
oc_frag_copy_c(_dst,_src,_ystride)
# endif
+# if !defined(oc_frag_copy_list)
+# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs) \
+ oc_frag_copy_list_c(_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs)
+# endif
# if !defined(oc_frag_recon_intra)
# define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
oc_frag_recon_intra_c(_dst,_dst_ystride,_residue)
@@ -121,13 +134,14 @@
oc_frag_recon_inter2_c(_dst,_src1,_src2,_ystride,_residue)
# endif
# if !defined(oc_idct8x8)
-# define oc_idct8x8(_state,_y,_last_zzi) oc_idct8x8_c(_y,_last_zzi)
+# define oc_idct8x8(_state,_y,_x,_last_zzi) oc_idct8x8_c(_y,_x,_last_zzi)
# endif
# if !defined(oc_state_frag_recon)
# define oc_state_frag_recon oc_state_frag_recon_c
# endif
-# if !defined(oc_state_frag_copy_list)
-# define oc_state_frag_copy_list oc_state_frag_copy_list_c
+# if !defined(oc_loop_filter_init)
+# define oc_loop_filter_init(_state,_bv,_flimit) \
+ oc_loop_filter_init_c(_bv,_flimit)
# endif
# if !defined(oc_state_loop_filter_frag_rows)
# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_c
@@ -314,25 +328,28 @@
};
+typedef void (*oc_state_loop_filter_frag_rows_func)(
+ const oc_theora_state *_state,signed char _bv[256],int _refi,int _pli,
+ int _fragy0,int _fragy_end);
/*The shared (encoder and decoder) functions that have accelerated variants.*/
struct oc_base_opt_vtable{
void (*frag_copy)(unsigned char *_dst,
const unsigned char *_src,int _ystride);
+ void (*frag_copy_list)(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
void (*frag_recon_intra)(unsigned char *_dst,int _ystride,
const ogg_int16_t _residue[64]);
void (*frag_recon_inter)(unsigned char *_dst,
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
void (*frag_recon_inter2)(unsigned char *_dst,const unsigned char *_src1,
const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
- void (*idct8x8)(ogg_int16_t _y[64],int _last_zzi);
+ void (*idct8x8)(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
void (*state_frag_recon)(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
- void (*state_frag_copy_list)(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
- void (*state_loop_filter_frag_rows)(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+ void (*loop_filter_init)(signed char _bv[256],int _flimit);
+ oc_state_loop_filter_frag_rows_func state_loop_filter_frag_rows;
void (*restore_fpu)(void);
};
@@ -463,7 +480,7 @@
int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
int _pli,int _dx,int _dy);
-int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv);
+void oc_loop_filter_init_c(signed char _bv[256],int _flimit);
void oc_state_loop_filter(oc_theora_state *_state,int _frame);
# if defined(OC_DUMP_IMAGES)
int oc_state_dump_frame(const oc_theora_state *_state,int _frame,
@@ -473,20 +490,20 @@
/*Default pure-C implementations of shared accelerated functions.*/
void oc_frag_copy_c(unsigned char *_dst,
const unsigned char *_src,int _src_ystride);
+void oc_frag_copy_list_c(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride,
const ogg_int16_t _residue[64]);
void oc_frag_recon_inter_c(unsigned char *_dst,
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
-void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi);
+void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-void oc_state_frag_copy_list_c(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
void oc_restore_fpu_c(void);
/*We need a way to call a few encoder functions without introducing a link-time
Modified: experimental/derf/theora-ptalarbvorm/lib/tokenize.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/tokenize.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/tokenize.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -754,6 +754,8 @@
int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin){
+ /*Note that gcc will not always respect this alignment.
+ In this case it doesn't matter terribly much.*/
OC_ALIGN16(ogg_int16_t coef[64]);
const unsigned char *dct_fzig_zag;
ogg_uint16_t *eob_run;
Deleted: experimental/derf/theora-ptalarbvorm/lib/x86/cpu.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/cpu.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/cpu.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -1,182 +0,0 @@
-/********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
- * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
- * *
- ********************************************************************
-
- CPU capability detection for x86 processors.
- Originally written by Rudolf Marek.
-
- function:
- last mod: $Id$
-
- ********************************************************************/
-
-#include "cpu.h"
-
-#if !defined(OC_X86_ASM)
-ogg_uint32_t oc_cpu_flags_get(void){
- return 0;
-}
-#else
-# if defined(__amd64__)||defined(__x86_64__)
-/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
- compiling with -fPIC.*/
-# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
- __asm__ __volatile__( \
- "cpuid\n\t" \
- :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
- :"a"(_op) \
- :"cc" \
- )
-# else
-/*On x86-32, not so much.*/
-# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
- __asm__ __volatile__( \
- "xchgl %%ebx,%[ebx]\n\t" \
- "cpuid\n\t" \
- "xchgl %%ebx,%[ebx]\n\t" \
- :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
- :"a"(_op) \
- :"cc" \
- )
-# endif
-
-static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
- ogg_uint32_t flags;
- /*If there isn't even MMX, give up.*/
- if(!(_edx&0x00800000))return 0;
- flags=OC_CPU_X86_MMX;
- if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
- if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
- if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
- if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3;
- if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
- if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
- return flags;
-}
-
-static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
- ogg_uint32_t flags;
- /*If there isn't even MMX, give up.*/
- if(!(_edx&0x00800000))return 0;
- flags=OC_CPU_X86_MMX;
- if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
- if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
- if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
- if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A;
- if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5;
- return flags;
-}
-
-ogg_uint32_t oc_cpu_flags_get(void){
- ogg_uint32_t flags;
- ogg_uint32_t eax;
- ogg_uint32_t ebx;
- ogg_uint32_t ecx;
- ogg_uint32_t edx;
-# if !defined(__amd64__)&&!defined(__x86_64__)
- /*Not all x86-32 chips support cpuid, so we have to check.*/
- __asm__ __volatile__(
- "pushfl\n\t"
- "pushfl\n\t"
- "popl %[a]\n\t"
- "movl %[a],%[b]\n\t"
- "xorl $0x200000,%[a]\n\t"
- "pushl %[a]\n\t"
- "popfl\n\t"
- "pushfl\n\t"
- "popl %[a]\n\t"
- "popfl\n\t"
- :[a]"=r"(eax),[b]"=r"(ebx)
- :
- :"cc"
- );
- /*No cpuid.*/
- if(eax==ebx)return 0;
-# endif
- cpuid(0,eax,ebx,ecx,edx);
- /* l e t n I e n i u n e G*/
- if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
- /* 6 8 x M T e n i u n e G*/
- ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
- int family;
- int model;
- /*Intel, Transmeta (tested with Crusoe TM5800):*/
- cpuid(1,eax,ebx,ecx,edx);
- flags=oc_parse_intel_flags(edx,ecx);
- family=(eax>>8)&0xF;
- model=(eax>>4)&0xF;
- /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
- unit, so don't use it.*/
- if(family==6&&(model==9||model==13||model==14)){
- flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
- }
- }
- /* D M A c i t n e h t u A*/
- else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
- /* C S N y b e d o e G*/
- ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
- /*AMD, Geode:*/
- cpuid(0x80000000,eax,ebx,ecx,edx);
- if(eax<0x80000001)flags=0;
- else{
- cpuid(0x80000001,eax,ebx,ecx,edx);
- flags=oc_parse_amd_flags(edx,ecx);
- }
- /*Also check for SSE.*/
- cpuid(1,eax,ebx,ecx,edx);
- flags|=oc_parse_intel_flags(edx,ecx);
- }
- /*Technically some VIA chips can be configured in the BIOS to return any
- string here the user wants.
- There is a special detection method that can be used to identify such
- processors, but in my opinion, if the user really wants to change it, they
- deserve what they get.*/
- /* s l u a H r u a t n e C*/
- else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
- /*VIA:*/
- /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
- chips (thanks to the engineers from Centaur Technology who provided it).
- These chips support Intel-like cpuid info.
- The C3-2 (Nehemiah) cores appear to, as well.*/
- cpuid(1,eax,ebx,ecx,edx);
- flags=oc_parse_intel_flags(edx,ecx);
- if(eax>=0x80000001){
- /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
- We need to check this even if the Intel test succeeds to pick up 3DNow!
- support on these processors.
- Unlike actual AMD processors, we cannot _rely_ on this info, since
- some cores (e.g., the 693 stepping of the Nehemiah) claim to support
- this function, yet return edx=0, despite the Intel test indicating
- MMX support.
- Therefore the features detected here are strictly added to those
- detected by the Intel test.*/
- /*TODO: How about earlier chips?*/
- cpuid(0x80000001,eax,ebx,ecx,edx);
- /*Note: As of the C7, this function returns Intel-style extended feature
- flags, not AMD-style.
- Currently, this only defines bits 11, 20, and 29 (0x20100800), which
- do not conflict with any of the AMD flags we inspect.
- For the remaining bits, Intel tells us, "Do not count on their value",
- but VIA assures us that they will all be zero (at least on the C7 and
- Isaiah chips).
- In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
- (0xC0C00000) for something else, we will have to add code to detect
- the model to decide when it is appropriate to inspect them.*/
- flags|=oc_parse_amd_flags(edx,ecx);
- }
- }
- else{
- /*Implement me.*/
- flags=0;
- }
- return flags;
-}
-#endif
Deleted: experimental/derf/theora-ptalarbvorm/lib/x86/cpu.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/cpu.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/cpu.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -1,36 +0,0 @@
-/********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
- * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
- * *
- ********************************************************************
- function:
- last mod: $Id$
-
- ********************************************************************/
-
-#if !defined(_x86_cpu_H)
-# define _x86_cpu_H (1)
-#include "../internal.h"
-
-#define OC_CPU_X86_MMX (1<<0)
-#define OC_CPU_X86_3DNOW (1<<1)
-#define OC_CPU_X86_3DNOWEXT (1<<2)
-#define OC_CPU_X86_MMXEXT (1<<3)
-#define OC_CPU_X86_SSE (1<<4)
-#define OC_CPU_X86_SSE2 (1<<5)
-#define OC_CPU_X86_PNI (1<<6)
-#define OC_CPU_X86_SSSE3 (1<<7)
-#define OC_CPU_X86_SSE4_1 (1<<8)
-#define OC_CPU_X86_SSE4_2 (1<<9)
-#define OC_CPU_X86_SSE4A (1<<10)
-#define OC_CPU_X86_SSE5 (1<<11)
-
-ogg_uint32_t oc_cpu_flags_get(void);
-
-#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86/mmxfrag.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/mmxfrag.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/mmxfrag.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -22,17 +22,92 @@
The iteration each instruction belongs to is marked in the comments as #i.*/
#include <stddef.h>
#include "x86int.h"
-#include "mmxfrag.h"
#if defined(OC_X86_ASM)
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
between rows.*/
+# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
+ do{ \
+ const unsigned char *src; \
+ unsigned char *dst; \
+ ptrdiff_t ystride3; \
+ src=(_src); \
+ dst=(_dst); \
+ __asm__ __volatile__( \
+ /*src+0*ystride*/ \
+ "movq (%[src]),%%mm0\n\t" \
+ /*src+1*ystride*/ \
+ "movq (%[src],%[ystride]),%%mm1\n\t" \
+ /*ystride3=ystride*3*/ \
+ "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+ /*src+2*ystride*/ \
+ "movq (%[src],%[ystride],2),%%mm2\n\t" \
+ /*src+3*ystride*/ \
+ "movq (%[src],%[ystride3]),%%mm3\n\t" \
+ /*dst+0*ystride*/ \
+ "movq %%mm0,(%[dst])\n\t" \
+ /*dst+1*ystride*/ \
+ "movq %%mm1,(%[dst],%[ystride])\n\t" \
+ /*Pointer to next 4.*/ \
+ "lea (%[src],%[ystride],4),%[src]\n\t" \
+ /*dst+2*ystride*/ \
+ "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+ /*dst+3*ystride*/ \
+ "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+ /*Pointer to next 4.*/ \
+ "lea (%[dst],%[ystride],4),%[dst]\n\t" \
+ /*src+0*ystride*/ \
+ "movq (%[src]),%%mm0\n\t" \
+ /*src+1*ystride*/ \
+ "movq (%[src],%[ystride]),%%mm1\n\t" \
+ /*src+2*ystride*/ \
+ "movq (%[src],%[ystride],2),%%mm2\n\t" \
+ /*src+3*ystride*/ \
+ "movq (%[src],%[ystride3]),%%mm3\n\t" \
+ /*dst+0*ystride*/ \
+ "movq %%mm0,(%[dst])\n\t" \
+ /*dst+1*ystride*/ \
+ "movq %%mm1,(%[dst],%[ystride])\n\t" \
+ /*dst+2*ystride*/ \
+ "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+ /*dst+3*ystride*/ \
+ "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+ :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
+ :[ystride]"r"((ptrdiff_t)(_ystride)) \
+ :"memory" \
+ ); \
+ } \
+ while(0)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+ between rows.*/
void oc_frag_copy_mmx(unsigned char *_dst,
const unsigned char *_src,int _ystride){
OC_FRAG_COPY_MMX(_dst,_src,_ystride);
}
+/*Copies the fragments specified by the lists of fragment indices from one
+ frame to another.
+ _dst_frame: The reference frame to copy to.
+ _src_frame: The reference frame to copy from.
+ _ystride: The row stride of the reference frames.
+ _fragis: A pointer to a list of fragment indices.
+ _nfragis: The number of fragment indices to copy.
+ _frag_buf_offs: The offsets of fragments in the reference frames.*/
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+ ptrdiff_t fragii;
+ for(fragii=0;fragii<_nfragis;fragii++){
+ ptrdiff_t frag_buf_off;
+ frag_buf_off=_frag_buf_offs[_fragis[fragii]];
+ OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
+ _src_frame+frag_buf_off,_ystride);
+ }
+}
+
+
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
const ogg_int16_t *_residue){
__asm__ __volatile__(
Deleted: experimental/derf/theora-ptalarbvorm/lib/x86/mmxfrag.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/mmxfrag.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/mmxfrag.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -1,64 +0,0 @@
-#if !defined(_x86_mmxfrag_H)
-# define _x86_mmxfrag_H (1)
-# include <stddef.h>
-# include "x86int.h"
-
-#if defined(OC_X86_ASM)
-
-/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
- between rows.*/
-#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
- do{ \
- const unsigned char *src; \
- unsigned char *dst; \
- ptrdiff_t ystride3; \
- src=(_src); \
- dst=(_dst); \
- __asm__ __volatile__( \
- /*src+0*ystride*/ \
- "movq (%[src]),%%mm0\n\t" \
- /*src+1*ystride*/ \
- "movq (%[src],%[ystride]),%%mm1\n\t" \
- /*ystride3=ystride*3*/ \
- "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
- /*src+2*ystride*/ \
- "movq (%[src],%[ystride],2),%%mm2\n\t" \
- /*src+3*ystride*/ \
- "movq (%[src],%[ystride3]),%%mm3\n\t" \
- /*dst+0*ystride*/ \
- "movq %%mm0,(%[dst])\n\t" \
- /*dst+1*ystride*/ \
- "movq %%mm1,(%[dst],%[ystride])\n\t" \
- /*Pointer to next 4.*/ \
- "lea (%[src],%[ystride],4),%[src]\n\t" \
- /*dst+2*ystride*/ \
- "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
- /*dst+3*ystride*/ \
- "movq %%mm3,(%[dst],%[ystride3])\n\t" \
- /*Pointer to next 4.*/ \
- "lea (%[dst],%[ystride],4),%[dst]\n\t" \
- /*src+0*ystride*/ \
- "movq (%[src]),%%mm0\n\t" \
- /*src+1*ystride*/ \
- "movq (%[src],%[ystride]),%%mm1\n\t" \
- /*src+2*ystride*/ \
- "movq (%[src],%[ystride],2),%%mm2\n\t" \
- /*src+3*ystride*/ \
- "movq (%[src],%[ystride3]),%%mm3\n\t" \
- /*dst+0*ystride*/ \
- "movq %%mm0,(%[dst])\n\t" \
- /*dst+1*ystride*/ \
- "movq %%mm1,(%[dst],%[ystride])\n\t" \
- /*dst+2*ystride*/ \
- "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
- /*dst+3*ystride*/ \
- "movq %%mm3,(%[dst],%[ystride3])\n\t" \
- :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
- :[ystride]"r"((ptrdiff_t)(_ystride)) \
- :"memory" \
- ); \
- } \
- while(0)
-
-# endif
-#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86/mmxidct.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/mmxidct.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/mmxidct.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -31,65 +31,65 @@
/*38 cycles*/
-#define OC_IDCT_BEGIN \
+#define OC_IDCT_BEGIN(_y,_x) \
"#OC_IDCT_BEGIN\n\t" \
- "movq "OC_I(3)",%%mm2\n\t" \
- "movq 0x30(%[c]),%%mm6\n\t" \
+ "movq "OC_I(3,_x)",%%mm2\n\t" \
+ "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
"movq %%mm2,%%mm4\n\t" \
- "movq "OC_J(5)",%%mm7\n\t" \
+ "movq "OC_J(5,_x)",%%mm7\n\t" \
"pmulhw %%mm6,%%mm4\n\t" \
- "movq 0x50(%[c]),%%mm1\n\t" \
+ "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
"pmulhw %%mm7,%%mm6\n\t" \
"movq %%mm1,%%mm5\n\t" \
"pmulhw %%mm2,%%mm1\n\t" \
- "movq "OC_I(1)",%%mm3\n\t" \
+ "movq "OC_I(1,_x)",%%mm3\n\t" \
"pmulhw %%mm7,%%mm5\n\t" \
- "movq 0x10(%[c]),%%mm0\n\t" \
+ "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
"paddw %%mm2,%%mm4\n\t" \
"paddw %%mm7,%%mm6\n\t" \
"paddw %%mm1,%%mm2\n\t" \
- "movq "OC_J(7)",%%mm1\n\t" \
+ "movq "OC_J(7,_x)",%%mm1\n\t" \
"paddw %%mm5,%%mm7\n\t" \
"movq %%mm0,%%mm5\n\t" \
"pmulhw %%mm3,%%mm0\n\t" \
"paddw %%mm7,%%mm4\n\t" \
"pmulhw %%mm1,%%mm5\n\t" \
- "movq 0x70(%[c]),%%mm7\n\t" \
+ "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \
"psubw %%mm2,%%mm6\n\t" \
"paddw %%mm3,%%mm0\n\t" \
"pmulhw %%mm7,%%mm3\n\t" \
- "movq "OC_I(2)",%%mm2\n\t" \
+ "movq "OC_I(2,_x)",%%mm2\n\t" \
"pmulhw %%mm1,%%mm7\n\t" \
"paddw %%mm1,%%mm5\n\t" \
"movq %%mm2,%%mm1\n\t" \
- "pmulhw 0x20(%[c]),%%mm2\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \
"psubw %%mm5,%%mm3\n\t" \
- "movq "OC_J(6)",%%mm5\n\t" \
+ "movq "OC_J(6,_x)",%%mm5\n\t" \
"paddw %%mm7,%%mm0\n\t" \
"movq %%mm5,%%mm7\n\t" \
"psubw %%mm4,%%mm0\n\t" \
- "pmulhw 0x20(%[c]),%%mm5\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
"paddw %%mm1,%%mm2\n\t" \
- "pmulhw 0x60(%[c]),%%mm1\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
"paddw %%mm4,%%mm4\n\t" \
"paddw %%mm0,%%mm4\n\t" \
"psubw %%mm6,%%mm3\n\t" \
"paddw %%mm7,%%mm5\n\t" \
"paddw %%mm6,%%mm6\n\t" \
- "pmulhw 0x60(%[c]),%%mm7\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
"paddw %%mm3,%%mm6\n\t" \
- "movq %%mm4,"OC_I(1)"\n\t" \
+ "movq %%mm4,"OC_I(1,_y)"\n\t" \
"psubw %%mm5,%%mm1\n\t" \
- "movq 0x40(%[c]),%%mm4\n\t" \
+ "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
"movq %%mm3,%%mm5\n\t" \
"pmulhw %%mm4,%%mm3\n\t" \
"paddw %%mm2,%%mm7\n\t" \
- "movq %%mm6,"OC_I(2)"\n\t" \
+ "movq %%mm6,"OC_I(2,_y)"\n\t" \
"movq %%mm0,%%mm2\n\t" \
- "movq "OC_I(0)",%%mm6\n\t" \
+ "movq "OC_I(0,_x)",%%mm6\n\t" \
"pmulhw %%mm4,%%mm0\n\t" \
"paddw %%mm3,%%mm5\n\t" \
- "movq "OC_J(4)",%%mm3\n\t" \
+ "movq "OC_J(4,_x)",%%mm3\n\t" \
"psubw %%mm1,%%mm5\n\t" \
"paddw %%mm0,%%mm2\n\t" \
"psubw %%mm3,%%mm6\n\t" \
@@ -103,18 +103,18 @@
"paddw %%mm0,%%mm6\n\t" \
"psubw %%mm2,%%mm6\n\t" \
"paddw %%mm2,%%mm2\n\t" \
- "movq "OC_I(1)",%%mm0\n\t" \
+ "movq "OC_I(1,_y)",%%mm0\n\t" \
"paddw %%mm6,%%mm2\n\t" \
"paddw %%mm3,%%mm4\n\t" \
"psubw %%mm1,%%mm2\n\t" \
"#end OC_IDCT_BEGIN\n\t" \
/*38+8=46 cycles.*/
-#define OC_ROW_IDCT \
+#define OC_ROW_IDCT(_y,_x) \
"#OC_ROW_IDCT\n" \
- OC_IDCT_BEGIN \
+ OC_IDCT_BEGIN(_y,_x) \
/*r3=D'*/ \
- "movq "OC_I(2)",%%mm3\n\t" \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
/*r4=E'=E-G*/ \
"psubw %%mm7,%%mm4\n\t" \
/*r1=H'+H'*/ \
@@ -139,7 +139,7 @@
"psubw %%mm0,%%mm7\n\t" \
"paddw %%mm0,%%mm0\n\t" \
/*Save R1.*/ \
- "movq %%mm1,"OC_I(1)"\n\t" \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
/*r0=R0=G.+C.*/ \
"paddw %%mm7,%%mm0\n\t" \
"#end OC_ROW_IDCT\n\t" \
@@ -172,11 +172,11 @@
Since r1 is free at entry, we calculate the Js first.*/
/*19 cycles.*/
-#define OC_TRANSPOSE \
+#define OC_TRANSPOSE(_y) \
"#OC_TRANSPOSE\n\t" \
"movq %%mm4,%%mm1\n\t" \
"punpcklwd %%mm5,%%mm4\n\t" \
- "movq %%mm0,"OC_I(0)"\n\t" \
+ "movq %%mm0,"OC_I(0,_y)"\n\t" \
"punpckhwd %%mm5,%%mm1\n\t" \
"movq %%mm6,%%mm0\n\t" \
"punpcklwd %%mm7,%%mm6\n\t" \
@@ -184,17 +184,17 @@
"punpckldq %%mm6,%%mm4\n\t" \
"punpckhdq %%mm6,%%mm5\n\t" \
"movq %%mm1,%%mm6\n\t" \
- "movq %%mm4,"OC_J(4)"\n\t" \
+ "movq %%mm4,"OC_J(4,_y)"\n\t" \
"punpckhwd %%mm7,%%mm0\n\t" \
- "movq %%mm5,"OC_J(5)"\n\t" \
+ "movq %%mm5,"OC_J(5,_y)"\n\t" \
"punpckhdq %%mm0,%%mm6\n\t" \
- "movq "OC_I(0)",%%mm4\n\t" \
+ "movq "OC_I(0,_y)",%%mm4\n\t" \
"punpckldq %%mm0,%%mm1\n\t" \
- "movq "OC_I(1)",%%mm5\n\t" \
+ "movq "OC_I(1,_y)",%%mm5\n\t" \
"movq %%mm4,%%mm0\n\t" \
- "movq %%mm6,"OC_J(7)"\n\t" \
+ "movq %%mm6,"OC_J(7,_y)"\n\t" \
"punpcklwd %%mm5,%%mm0\n\t" \
- "movq %%mm1,"OC_J(6)"\n\t" \
+ "movq %%mm1,"OC_J(6,_y)"\n\t" \
"punpckhwd %%mm5,%%mm4\n\t" \
"movq %%mm2,%%mm5\n\t" \
"punpcklwd %%mm3,%%mm2\n\t" \
@@ -202,20 +202,20 @@
"punpckldq %%mm2,%%mm0\n\t" \
"punpckhdq %%mm2,%%mm1\n\t" \
"movq %%mm4,%%mm2\n\t" \
- "movq %%mm0,"OC_I(0)"\n\t" \
+ "movq %%mm0,"OC_I(0,_y)"\n\t" \
"punpckhwd %%mm3,%%mm5\n\t" \
- "movq %%mm1,"OC_I(1)"\n\t" \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
"punpckhdq %%mm5,%%mm4\n\t" \
"punpckldq %%mm5,%%mm2\n\t" \
- "movq %%mm4,"OC_I(3)"\n\t" \
- "movq %%mm2,"OC_I(2)"\n\t" \
+ "movq %%mm4,"OC_I(3,_y)"\n\t" \
+ "movq %%mm2,"OC_I(2,_y)"\n\t" \
"#end OC_TRANSPOSE\n\t" \
/*38+19=57 cycles.*/
-#define OC_COLUMN_IDCT \
+#define OC_COLUMN_IDCT(_y) \
"#OC_COLUMN_IDCT\n" \
- OC_IDCT_BEGIN \
- "paddw 0x00(%[c]),%%mm2\n\t" \
+ OC_IDCT_BEGIN(_y,_y) \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
/*r1=H'+H'*/ \
"paddw %%mm1,%%mm1\n\t" \
/*r1=R1=A''+H'*/ \
@@ -227,18 +227,18 @@
/*r1=NR1*/ \
"psraw $4,%%mm1\n\t" \
/*r3=D'*/ \
- "movq "OC_I(2)",%%mm3\n\t" \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
/*r7=G+G*/ \
"paddw %%mm7,%%mm7\n\t" \
/*Store NR2 at I(2).*/ \
- "movq %%mm2,"OC_I(2)"\n\t" \
+ "movq %%mm2,"OC_I(2,_y)"\n\t" \
/*r7=G'=E+G*/ \
"paddw %%mm4,%%mm7\n\t" \
/*Store NR1 at I(1).*/ \
- "movq %%mm1,"OC_I(1)"\n\t" \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
/*r4=R4=E'-D'*/ \
"psubw %%mm3,%%mm4\n\t" \
- "paddw 0x00(%[c]),%%mm4\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
/*r3=D'+D'*/ \
"paddw %%mm3,%%mm3\n\t" \
/*r3=R3=E'+D'*/ \
@@ -249,7 +249,7 @@
"psubw %%mm5,%%mm6\n\t" \
/*r3=NR3*/ \
"psraw $4,%%mm3\n\t" \
- "paddw 0x00(%[c]),%%mm6\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
/*r5=B''+B''*/ \
"paddw %%mm5,%%mm5\n\t" \
/*r5=R5=F'+B''*/ \
@@ -257,14 +257,14 @@
/*r6=NR6*/ \
"psraw $4,%%mm6\n\t" \
/*Store NR4 at J(4).*/ \
- "movq %%mm4,"OC_J(4)"\n\t" \
+ "movq %%mm4,"OC_J(4,_y)"\n\t" \
/*r5=NR5*/ \
"psraw $4,%%mm5\n\t" \
/*Store NR3 at I(3).*/ \
- "movq %%mm3,"OC_I(3)"\n\t" \
+ "movq %%mm3,"OC_I(3,_y)"\n\t" \
/*r7=R7=G'-C'*/ \
"psubw %%mm0,%%mm7\n\t" \
- "paddw 0x00(%[c]),%%mm7\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
/*r0=C'+C'*/ \
"paddw %%mm0,%%mm0\n\t" \
/*r0=R0=G'+C'*/ \
@@ -272,113 +272,123 @@
/*r7=NR7*/ \
"psraw $4,%%mm7\n\t" \
/*Store NR6 at J(6).*/ \
- "movq %%mm6,"OC_J(6)"\n\t" \
+ "movq %%mm6,"OC_J(6,_y)"\n\t" \
/*r0=NR0*/ \
"psraw $4,%%mm0\n\t" \
/*Store NR5 at J(5).*/ \
- "movq %%mm5,"OC_J(5)"\n\t" \
+ "movq %%mm5,"OC_J(5,_y)"\n\t" \
/*Store NR7 at J(7).*/ \
- "movq %%mm7,"OC_J(7)"\n\t" \
+ "movq %%mm7,"OC_J(7,_y)"\n\t" \
/*Store NR0 at I(0).*/ \
- "movq %%mm0,"OC_I(0)"\n\t" \
+ "movq %%mm0,"OC_I(0,_y)"\n\t" \
"#end OC_COLUMN_IDCT\n\t" \
-#define OC_MID(_m,_i) OC_M2STR(_m+(_i)*8)"(%[c])"
-#define OC_C(_i) OC_MID(OC_COSINE_OFFSET,_i-1)
-#define OC_8 OC_MID(OC_EIGHT_OFFSET,0)
-
-static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64]){
+static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
/*This routine accepts an 8x8 matrix, but in partially transposed form.
Every 4x4 block is transposed.*/
__asm__ __volatile__(
-#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])"
- OC_ROW_IDCT
- OC_TRANSPOSE
+#define OC_I(_k,_y) OC_MEM_OFFS((_k*16),_y)
+#define OC_J(_k,_y) OC_MEM_OFFS(((_k-4)*16)+8,_y)
+ OC_ROW_IDCT(y,x)
+ OC_TRANSPOSE(y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) OC_M2STR((_k*16)+64)"(%[y])"
-#define OC_J(_k) OC_M2STR(((_k-4)*16)+72)"(%[y])"
- OC_ROW_IDCT
- OC_TRANSPOSE
+#define OC_I(_k,_y) OC_MEM_OFFS((_k*16)+64,_y)
+#define OC_J(_k,_y) OC_MEM_OFFS(((_k-4)*16)+72,_y)
+ OC_ROW_IDCT(y,x)
+ OC_TRANSPOSE(y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k) OC_I(_k)
- OC_COLUMN_IDCT
+#define OC_I(_k,_y) OC_MEM_OFFS((_k*16),_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT(y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])"
-#define OC_J(_k) OC_I(_k)
- OC_COLUMN_IDCT
+#define OC_I(_k,_y) OC_MEM_OFFS((_k*16)+8,_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT(y)
#undef OC_I
#undef OC_J
- :
- :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
+ :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
+ :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
+ [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
);
+ if(_x!=_y){
+ int i;
+ __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
+ for(i=0;i<4;i++){
+ __asm__ __volatile__(
+ "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
+ :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
+ );
+ }
+ }
}
/*25 cycles.*/
-#define OC_IDCT_BEGIN_10 \
+#define OC_IDCT_BEGIN_10(_y,_x) \
"#OC_IDCT_BEGIN_10\n\t" \
- "movq "OC_I(3)",%%mm2\n\t" \
+ "movq "OC_I(3,_x)",%%mm2\n\t" \
"nop\n\t" \
- "movq 0x30(%[c]),%%mm6\n\t" \
+ "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
"movq %%mm2,%%mm4\n\t" \
- "movq 0x50(%[c]),%%mm1\n\t" \
+ "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
"pmulhw %%mm6,%%mm4\n\t" \
- "movq "OC_I(1)",%%mm3\n\t" \
+ "movq "OC_I(1,_x)",%%mm3\n\t" \
"pmulhw %%mm2,%%mm1\n\t" \
- "movq 0x10(%[c]),%%mm0\n\t" \
+ "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
"paddw %%mm2,%%mm4\n\t" \
"pxor %%mm6,%%mm6\n\t" \
"paddw %%mm1,%%mm2\n\t" \
- "movq "OC_I(2)",%%mm5\n\t" \
+ "movq "OC_I(2,_x)",%%mm5\n\t" \
"pmulhw %%mm3,%%mm0\n\t" \
"movq %%mm5,%%mm1\n\t" \
"paddw %%mm3,%%mm0\n\t" \
- "pmulhw 0x70(%[c]),%%mm3\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
"psubw %%mm2,%%mm6\n\t" \
- "pmulhw 0x20(%[c]),%%mm5\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
"psubw %%mm4,%%mm0\n\t" \
- "movq "OC_I(2)",%%mm7\n\t" \
+ "movq "OC_I(2,_x)",%%mm7\n\t" \
"paddw %%mm4,%%mm4\n\t" \
"paddw %%mm5,%%mm7\n\t" \
"paddw %%mm0,%%mm4\n\t" \
- "pmulhw 0x60(%[c]),%%mm1\n\t" \
+ "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
"psubw %%mm6,%%mm3\n\t" \
- "movq %%mm4,"OC_I(1)"\n\t" \
+ "movq %%mm4,"OC_I(1,_y)"\n\t" \
"paddw %%mm6,%%mm6\n\t" \
- "movq 0x40(%[c]),%%mm4\n\t" \
+ "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
"paddw %%mm3,%%mm6\n\t" \
"movq %%mm3,%%mm5\n\t" \
"pmulhw %%mm4,%%mm3\n\t" \
- "movq %%mm6,"OC_I(2)"\n\t" \
+ "movq %%mm6,"OC_I(2,_y)"\n\t" \
"movq %%mm0,%%mm2\n\t" \
- "movq "OC_I(0)",%%mm6\n\t" \
+ "movq "OC_I(0,_x)",%%mm6\n\t" \
"pmulhw %%mm4,%%mm0\n\t" \
"paddw %%mm3,%%mm5\n\t" \
"paddw %%mm0,%%mm2\n\t" \
"psubw %%mm1,%%mm5\n\t" \
"pmulhw %%mm4,%%mm6\n\t" \
- "paddw "OC_I(0)",%%mm6\n\t" \
+ "paddw "OC_I(0,_x)",%%mm6\n\t" \
"paddw %%mm1,%%mm1\n\t" \
"movq %%mm6,%%mm4\n\t" \
"paddw %%mm5,%%mm1\n\t" \
"psubw %%mm2,%%mm6\n\t" \
"paddw %%mm2,%%mm2\n\t" \
- "movq "OC_I(1)",%%mm0\n\t" \
+ "movq "OC_I(1,_y)",%%mm0\n\t" \
"paddw %%mm6,%%mm2\n\t" \
"psubw %%mm1,%%mm2\n\t" \
"nop\n\t" \
"#end OC_IDCT_BEGIN_10\n\t" \
/*25+8=33 cycles.*/
-#define OC_ROW_IDCT_10 \
+#define OC_ROW_IDCT_10(_y,_x) \
"#OC_ROW_IDCT_10\n\t" \
- OC_IDCT_BEGIN_10 \
+ OC_IDCT_BEGIN_10(_y,_x) \
/*r3=D'*/ \
- "movq "OC_I(2)",%%mm3\n\t" \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
/*r4=E'=E-G*/ \
"psubw %%mm7,%%mm4\n\t" \
/*r1=H'+H'*/ \
@@ -403,16 +413,16 @@
"psubw %%mm0,%%mm7\n\t" \
"paddw %%mm0,%%mm0\n\t" \
/*Save R1.*/ \
- "movq %%mm1,"OC_I(1)"\n\t" \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
/*r0=R0=G'+C'*/ \
"paddw %%mm7,%%mm0\n\t" \
"#end OC_ROW_IDCT_10\n\t" \
/*25+19=44 cycles'*/
-#define OC_COLUMN_IDCT_10 \
+#define OC_COLUMN_IDCT_10(_y) \
"#OC_COLUMN_IDCT_10\n\t" \
- OC_IDCT_BEGIN_10 \
- "paddw 0x00(%[c]),%%mm2\n\t" \
+ OC_IDCT_BEGIN_10(_y,_y) \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
/*r1=H'+H'*/ \
"paddw %%mm1,%%mm1\n\t" \
/*r1=R1=A''+H'*/ \
@@ -424,18 +434,18 @@
/*r1=NR1*/ \
"psraw $4,%%mm1\n\t" \
/*r3=D'*/ \
- "movq "OC_I(2)",%%mm3\n\t" \
+ "movq "OC_I(2,_y)",%%mm3\n\t" \
/*r7=G+G*/ \
"paddw %%mm7,%%mm7\n\t" \
/*Store NR2 at I(2).*/ \
- "movq %%mm2,"OC_I(2)"\n\t" \
+ "movq %%mm2,"OC_I(2,_y)"\n\t" \
/*r7=G'=E+G*/ \
"paddw %%mm4,%%mm7\n\t" \
/*Store NR1 at I(1).*/ \
- "movq %%mm1,"OC_I(1)"\n\t" \
+ "movq %%mm1,"OC_I(1,_y)"\n\t" \
/*r4=R4=E'-D'*/ \
"psubw %%mm3,%%mm4\n\t" \
- "paddw 0x00(%[c]),%%mm4\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
/*r3=D'+D'*/ \
"paddw %%mm3,%%mm3\n\t" \
/*r3=R3=E'+D'*/ \
@@ -446,7 +456,7 @@
"psubw %%mm5,%%mm6\n\t" \
/*r3=NR3*/ \
"psraw $4,%%mm3\n\t" \
- "paddw 0x00(%[c]),%%mm6\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
/*r5=B''+B''*/ \
"paddw %%mm5,%%mm5\n\t" \
/*r5=R5=F'+B''*/ \
@@ -454,14 +464,14 @@
/*r6=NR6*/ \
"psraw $4,%%mm6\n\t" \
/*Store NR4 at J(4).*/ \
- "movq %%mm4,"OC_J(4)"\n\t" \
+ "movq %%mm4,"OC_J(4,_y)"\n\t" \
/*r5=NR5*/ \
"psraw $4,%%mm5\n\t" \
/*Store NR3 at I(3).*/ \
- "movq %%mm3,"OC_I(3)"\n\t" \
+ "movq %%mm3,"OC_I(3,_y)"\n\t" \
/*r7=R7=G'-C'*/ \
"psubw %%mm0,%%mm7\n\t" \
- "paddw 0x00(%[c]),%%mm7\n\t" \
+ "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
/*r0=C'+C'*/ \
"paddw %%mm0,%%mm0\n\t" \
/*r0=R0=G'+C'*/ \
@@ -469,46 +479,57 @@
/*r7=NR7*/ \
"psraw $4,%%mm7\n\t" \
/*Store NR6 at J(6).*/ \
- "movq %%mm6,"OC_J(6)"\n\t" \
+ "movq %%mm6,"OC_J(6,_y)"\n\t" \
/*r0=NR0*/ \
"psraw $4,%%mm0\n\t" \
/*Store NR5 at J(5).*/ \
- "movq %%mm5,"OC_J(5)"\n\t" \
+ "movq %%mm5,"OC_J(5,_y)"\n\t" \
/*Store NR7 at J(7).*/ \
- "movq %%mm7,"OC_J(7)"\n\t" \
+ "movq %%mm7,"OC_J(7,_y)"\n\t" \
/*Store NR0 at I(0).*/ \
- "movq %%mm0,"OC_I(0)"\n\t" \
+ "movq %%mm0,"OC_I(0,_y)"\n\t" \
"#end OC_COLUMN_IDCT_10\n\t" \
-static void oc_idct8x8_10_mmx(ogg_int16_t _y[64]){
+static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
__asm__ __volatile__(
-#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])"
+#define OC_I(_k,_y) OC_MEM_OFFS((_k*16),_y)
+#define OC_J(_k,_y) OC_MEM_OFFS(((_k-4)*16)+8,_y)
/*Done with dequant, descramble, and partial transpose.
Now do the iDCT itself.*/
- OC_ROW_IDCT_10
- OC_TRANSPOSE
+ OC_ROW_IDCT_10(y,x)
+ OC_TRANSPOSE(y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
-#define OC_J(_k) OC_I(_k)
- OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) OC_MEM_OFFS((_k*16),_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT_10(y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) OC_M2STR((_k*16)+8)"(%[y])"
-#define OC_J(_k) OC_I(_k)
- OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) OC_MEM_OFFS((_k*16)+8,_y)
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT_10(y)
#undef OC_I
#undef OC_J
- :
- :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
+ :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
+ :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
+ [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
);
+ if(_x!=_y){
+ __asm__ __volatile__(
+ "pxor %%mm0,%%mm0\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+ :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
+ );
+ }
}
/*Performs an inverse 8x8 Type-II DCT transform.
The input is assumed to be scaled by a factor of 4 relative to orthonormal
version of the transform.*/
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
/*_last_zzi is subtly different from an actual count of the number of
coefficients we decoded for this block.
It contains the value of zzi BEFORE the final token in the block was
@@ -534,8 +555,8 @@
gets.
Needless to say we inherited this approach from VP3.*/
/*Then perform the iDCT.*/
- if(_last_zzi<10)oc_idct8x8_10_mmx(_y);
- else oc_idct8x8_slow_mmx(_y);
+ if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x);
+ else oc_idct8x8_slow_mmx(_y,_x);
}
#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86/mmxstate.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/mmxstate.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/mmxstate.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -19,13 +19,12 @@
Originally written by Rudolf Marek.*/
#include <string.h>
#include "x86int.h"
-#include "mmxfrag.h"
#include "mmxloop.h"
#if defined(OC_X86_ASM)
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
unsigned char *dst;
ptrdiff_t frag_buf_off;
int ystride;
@@ -36,6 +35,7 @@
/*Note that this value must be unsigned, to keep the __asm__ block from
sign-extending it when it puts it in a register.*/
ogg_uint16_t p;
+ int i;
/*We round this dequant product (and not any of the others) because there's
no iDCT rounding.*/
p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
@@ -47,38 +47,30 @@
"punpcklwd %%mm0,%%mm0\n\t"
/*mm0=AAAA AAAA AAAA AAAA*/
"punpckldq %%mm0,%%mm0\n\t"
- "movq %%mm0,(%[y])\n\t"
- "movq %%mm0,8(%[y])\n\t"
- "movq %%mm0,16(%[y])\n\t"
- "movq %%mm0,24(%[y])\n\t"
- "movq %%mm0,32(%[y])\n\t"
- "movq %%mm0,40(%[y])\n\t"
- "movq %%mm0,48(%[y])\n\t"
- "movq %%mm0,56(%[y])\n\t"
- "movq %%mm0,64(%[y])\n\t"
- "movq %%mm0,72(%[y])\n\t"
- "movq %%mm0,80(%[y])\n\t"
- "movq %%mm0,88(%[y])\n\t"
- "movq %%mm0,96(%[y])\n\t"
- "movq %%mm0,104(%[y])\n\t"
- "movq %%mm0,112(%[y])\n\t"
- "movq %%mm0,120(%[y])\n\t"
:
- :[y]"r"(_dct_coeffs),[p]"r"((unsigned)p)
- :"memory"
+ :[p]"r"((unsigned)p)
);
+ for(i=0;i<4;i++){
+ __asm__ __volatile__(
+ "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t"
+ :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16)
+ );
+ }
}
else{
/*Dequantize the DC coefficient.*/
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
- oc_idct8x8(_state,_dct_coeffs,_last_zzi);
+ oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
}
/*Fill in the target buffer.*/
frag_buf_off=_state->frag_buf_offs[_fragi];
mb_mode=_state->frags[_fragi].mb_mode;
ystride=_state->ref_ystride[_pli];
dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
- if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
+ if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
else{
const unsigned char *ref;
int mvoffsets[2];
@@ -88,40 +80,17 @@
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
_state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
- _dct_coeffs);
+ _dct_coeffs+64);
}
- else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
+ else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
}
}
/*We copy these entire function to inline the actual MMX routines so that we
use only a single indirect call.*/
-/*Copies the fragments specified by the lists of fragment indices from one
- frame to another.
- _fragis: A pointer to a list of fragment indices.
- _nfragis: The number of fragment indices to copy.
- _dst_frame: The reference frame to copy to.
- _src_frame: The reference frame to copy from.
- _pli: The color plane the fragments lie in.*/
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli){
- const ptrdiff_t *frag_buf_offs;
- const unsigned char *src_frame_data;
- unsigned char *dst_frame_data;
- ptrdiff_t fragii;
- int ystride;
- dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
- src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
- ystride=_state->ref_ystride[_pli];
- frag_buf_offs=_state->frag_buf_offs;
- for(fragii=0;fragii<_nfragis;fragii++){
- ptrdiff_t frag_buf_off;
- frag_buf_off=frag_buf_offs[_fragis[fragii]];
- OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off,
- src_frame_data+frag_buf_off,ystride);
- }
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
+ memset(_bv,_flimit,8);
}
/*Apply the loop filter to a given set of fragment rows in the given plane.
@@ -133,7 +102,7 @@
_fragy0: The Y coordinate of the first fragment row to filter.
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
OC_ALIGN8(unsigned char ll[8]);
const oc_fragment_plane *fplane;
const oc_fragment *frags;
@@ -189,6 +158,10 @@
}
}
+void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){
+ memset(_bv,~(_flimit<<1),8);
+}
+
/*Apply the loop filter to a given set of fragment rows in the given plane.
The filter may be run on the bottom edge, affecting pixels in the next row of
fragments, so this row also needs to be available.
@@ -198,8 +171,7 @@
_fragy0: The Y coordinate of the first fragment row to filter.
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
- OC_ALIGN8(unsigned char ll[8]);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
const oc_fragment_plane *fplane;
const oc_fragment *frags;
const ptrdiff_t *frag_buf_offs;
@@ -210,13 +182,12 @@
ptrdiff_t fragi0_end;
int ystride;
int nhfrags;
- memset(ll,~(_state->loop_filter_limits[_state->qis[0]]<<1),sizeof(ll));
fplane=_state->fplanes+_pli;
nhfrags=fplane->nhfrags;
fragi_top=fplane->froffset;
fragi_bot=fragi_top+fplane->nfrags;
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
- fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+ fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
ystride=_state->ref_ystride[_pli];
frags=_state->frags;
frag_buf_offs=_state->frag_buf_offs;
@@ -236,16 +207,16 @@
unsigned char *ref;
ref=ref_frame_data+frag_buf_offs[fragi];
if(fragi>fragi0){
- OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,ll);
+ OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
}
if(fragi0>fragi_top){
- OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,ll);
+ OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
}
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
- OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,ll);
+ OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv);
}
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
- OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,ll);
+ OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,_bv);
}
}
fragi++;
Modified: experimental/derf/theora-ptalarbvorm/lib/x86/sse2idct.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/sse2idct.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/sse2idct.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -38,16 +38,16 @@
/*Performs the first three stages of the iDCT.
xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input
(accessed in that order).
- The remaining rows must be in %[y] at their corresponding locations.
+ The remaining rows must be in _x at their corresponding locations.
On output, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
contain rows 4 through 7.*/
-#define OC_IDCT_8x8_ABC \
+#define OC_IDCT_8x8_ABC(_x) \
"#OC_IDCT_8x8_ABC\n\t" \
/*Stage 1:*/ \
/*2-3 rotation by 6pi/16. \
xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
- "movdqa 0x20(%[c]),%%xmm1\n\t" \
- "movdqa 0x60(%[c]),%%xmm4\n\t" \
+ "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \
+ "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \
"movdqa %%xmm1,%%xmm0\n\t" \
"pmulhw %%xmm2,%%xmm1\n\t" \
"movdqa %%xmm4,%%xmm7\n\t" \
@@ -55,12 +55,12 @@
"pmulhw %%xmm2,%%xmm7\n\t" \
"pmulhw %%xmm6,%%xmm4\n\t" \
"paddw %%xmm6,%%xmm0\n\t" \
- "movdqa 0x30(%[c]),%%xmm6\n\t" \
+ "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \
"paddw %%xmm1,%%xmm2\n\t" \
"psubw %%xmm0,%%xmm7\n\t" \
"movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
"paddw %%xmm4,%%xmm2\n\t" \
- "movdqa 0x50(%[c]),%%xmm4\n\t" \
+ "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
/*5-6 rotation by 3pi/16. \
xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
@@ -73,13 +73,13 @@
"paddw %%xmm3,%%xmm4\n\t" \
"paddw %%xmm5,%%xmm3\n\t" \
"paddw %%xmm6,%%xmm3\n\t" \
- "movdqa 0x70(%[y]),%%xmm6\n\t" \
+ "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \
"paddw %%xmm5,%%xmm1\n\t" \
- "movdqa 0x10(%[y]),%%xmm5\n\t" \
+ "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \
"paddw %%xmm3,%%xmm2\n\t" \
- "movdqa 0x70(%[c]),%%xmm3\n\t" \
+ "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
"psubw %%xmm4,%%xmm1\n\t" \
- "movdqa 0x10(%[c]),%%xmm4\n\t" \
+ "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \
/*4-7 rotation by 7pi/16. \
xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
"movdqa %%xmm3,%%xmm0\n\t" \
@@ -89,12 +89,12 @@
"pmulhw %%xmm6,%%xmm4\n\t" \
"pmulhw %%xmm6,%%xmm0\n\t" \
"paddw %%xmm6,%%xmm4\n\t" \
- "movdqa 0x40(%[y]),%%xmm6\n\t" \
+ "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \
"paddw %%xmm5,%%xmm7\n\t" \
"psubw %%xmm4,%%xmm3\n\t" \
- "movdqa 0x40(%[c]),%%xmm4\n\t" \
+ "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
"paddw %%xmm7,%%xmm0\n\t" \
- "movdqa 0x00(%[y]),%%xmm7\n\t" \
+ "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \
/*0-1 butterfly. \
xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
"paddw %%xmm7,%%xmm6\n\t" \
@@ -172,15 +172,15 @@
2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
"psubw %%xmm3,%%xmm4\n\t" \
- "movdqa %%xmm4,0x40(%[y])\n\t" \
- "movdqa 0x00(%[c]),%%xmm4\n\t" \
+ "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
+ "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \
"psubw %%xmm0,%%xmm7\n\t" \
"psubw %%xmm1,%%xmm6\n\t" \
"psubw %%xmm2,%%xmm5\n\t" \
"paddw %%xmm4,%%xmm7\n\t" \
"paddw %%xmm4,%%xmm6\n\t" \
"paddw %%xmm4,%%xmm5\n\t" \
- "paddw 0x40(%[y]),%%xmm4\n\t" \
+ "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \
"paddw %%xmm0,%%xmm0\n\t" \
"paddw %%xmm1,%%xmm1\n\t" \
"paddw %%xmm2,%%xmm2\n\t" \
@@ -189,45 +189,61 @@
"paddw %%xmm6,%%xmm1\n\t" \
"psraw $4,%%xmm0\n\t" \
"paddw %%xmm5,%%xmm2\n\t" \
- "movdqa %%xmm0,0x00(%[y])\n\t" \
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \
"psraw $4,%%xmm1\n\t" \
"paddw %%xmm4,%%xmm3\n\t" \
- "movdqa %%xmm1,0x10(%[y])\n\t" \
+ "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \
"psraw $4,%%xmm2\n\t" \
- "movdqa %%xmm2,0x20(%[y])\n\t" \
+ "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \
"psraw $4,%%xmm3\n\t" \
- "movdqa %%xmm3,0x30(%[y])\n\t" \
+ "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \
"psraw $4,%%xmm4\n\t" \
- "movdqa %%xmm4,0x40(%[y])\n\t" \
+ "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
"psraw $4,%%xmm5\n\t" \
- "movdqa %%xmm5,0x50(%[y])\n\t" \
+ "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \
"psraw $4,%%xmm6\n\t" \
- "movdqa %%xmm6,0x60(%[y])\n\t" \
+ "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \
"psraw $4,%%xmm7\n\t" \
- "movdqa %%xmm7,0x70(%[y])\n\t" \
+ "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
-static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64]){
+static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
OC_ALIGN16(ogg_int16_t buf[16]);
/*This routine accepts an 8x8 matrix pre-transposed.*/
__asm__ __volatile__(
/*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
- "movdqa 0x20(%[y]),%%xmm2\n\t"
- "movdqa 0x60(%[y]),%%xmm6\n\t"
- "movdqa 0x30(%[y]),%%xmm3\n\t"
- "movdqa 0x50(%[y]),%%xmm5\n\t"
- OC_IDCT_8x8_ABC
+ "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t"
+ "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t"
+ "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t"
+ "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t"
+ OC_IDCT_8x8_ABC(x)
OC_IDCT_8x8_D
OC_TRANSPOSE_8x8
/*Clear out rows 0, 1, 4, and 7 for the first stage of the iDCT.*/
- "movdqa %%xmm7,0x70(%[y])\n\t"
- "movdqa %%xmm4,0x40(%[y])\n\t"
- "movdqa %%xmm1,0x10(%[y])\n\t"
- "movdqa %%xmm0,0x00(%[y])\n\t"
- OC_IDCT_8x8_ABC
+ "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t"
+ "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t"
+ "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t"
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t"
+ OC_IDCT_8x8_ABC(y)
OC_IDCT_8x8_D_STORE
- :[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16))
- :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
+ :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
+ [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
+ :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
+ [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
);
+ if(_x!=_y){
+ int i;
+ __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::);
+ /*Clear input data for next block (decoder only).*/
+ for(i=0;i<2;i++){
+ __asm__ __volatile__(
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+ "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+ :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
+ );
+ }
+ }
}
/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
@@ -238,28 +254,28 @@
/*Stage 1:*/ \
/*2-3 rotation by 6pi/16. \
mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
- "movq 0x60(%[c]),%%mm7\n\t" \
- "movq 0x20(%[c]),%%mm6\n\t" \
+ "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
+ "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \
"pmulhw %%mm2,%%mm6\n\t" \
"pmulhw %%mm2,%%mm7\n\t" \
- "movq 0x50(%[c]),%%mm5\n\t" \
+ "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \
"paddw %%mm6,%%mm2\n\t" \
"movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
- "movq 0x30(%[c]),%%mm2\n\t" \
+ "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \
"movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
/*5-6 rotation by 3pi/16. \
mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
"pmulhw %%mm3,%%mm5\n\t" \
"pmulhw %%mm3,%%mm2\n\t" \
- "movq 0x10(%[c]),%%mm7\n\t" \
+ "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \
"paddw %%mm3,%%mm5\n\t" \
"paddw %%mm3,%%mm2\n\t" \
- "movq 0x70(%[c]),%%mm3\n\t" \
+ "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
/*4-7 rotation by 7pi/16. \
mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
"pmulhw %%mm1,%%mm3\n\t" \
"pmulhw %%mm1,%%mm7\n\t" \
- "movq 0x40(%[c]),%%mm4\n\t" \
+ "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
"movq %%mm3,%%mm6\n\t" \
"paddw %%mm1,%%mm7\n\t" \
/*0-1 butterfly. \
@@ -319,28 +335,28 @@
/*Stage 1:*/ \
/*2-3 rotation by 6pi/16. \
xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
- "movdqa 0x60(%[c]),%%xmm7\n\t" \
- "movdqa 0x20(%[c]),%%xmm6\n\t" \
+ "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \
+ "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \
"pmulhw %%xmm2,%%xmm6\n\t" \
"pmulhw %%xmm2,%%xmm7\n\t" \
- "movdqa 0x50(%[c]),%%xmm5\n\t" \
+ "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \
"paddw %%xmm6,%%xmm2\n\t" \
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
- "movdqa 0x30(%[c]),%%xmm2\n\t" \
+ "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \
"movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
/*5-6 rotation by 3pi/16. \
xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
"pmulhw %%xmm3,%%xmm5\n\t" \
"pmulhw %%xmm3,%%xmm2\n\t" \
- "movdqa 0x10(%[c]),%%xmm7\n\t" \
+ "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \
"paddw %%xmm3,%%xmm5\n\t" \
"paddw %%xmm3,%%xmm2\n\t" \
- "movdqa 0x70(%[c]),%%xmm3\n\t" \
+ "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
/*4-7 rotation by 7pi/16. \
xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
"pmulhw %%xmm1,%%xmm3\n\t" \
"pmulhw %%xmm1,%%xmm7\n\t" \
- "movdqa 0x40(%[c]),%%xmm4\n\t" \
+ "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
"movdqa %%xmm3,%%xmm6\n\t" \
"paddw %%xmm1,%%xmm7\n\t" \
/*0-1 butterfly. \
@@ -378,27 +394,40 @@
"psubw %%xmm7,%%xmm4\n\t" \
"psubw %%xmm6,%%xmm5\n\t" \
-static void oc_idct8x8_10_sse2(ogg_int16_t _y[64]){
+static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
OC_ALIGN16(ogg_int16_t buf[16]);
/*This routine accepts an 8x8 matrix pre-transposed.*/
__asm__ __volatile__(
- "movq 0x20(%[y]),%%mm2\n\t"
- "movq 0x30(%[y]),%%mm3\n\t"
- "movq 0x10(%[y]),%%mm1\n\t"
- "movq 0x00(%[y]),%%mm0\n\t"
+ "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t"
+ "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t"
+ "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t"
+ "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t"
OC_IDCT_8x8_10_MMX
OC_TRANSPOSE_8x4_MMX2SSE
OC_IDCT_8x8_10_ABC
OC_IDCT_8x8_D_STORE
- :[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16))
- :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
+ :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
+ [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
+ :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
+ [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
);
+ if(_x!=_y){
+ /*Clear input data for next block (decoder only).*/
+ __asm__ __volatile__(
+ "pxor %%mm0,%%mm0\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
+ "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
+ :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
+ );
+ }
}
/*Performs an inverse 8x8 Type-II DCT transform.
The input is assumed to be scaled by a factor of 4 relative to orthonormal
version of the transform.*/
-void oc_idct8x8_sse2(ogg_int16_t _y[64],int _last_zzi){
+void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
/*_last_zzi is subtly different from an actual count of the number of
coefficients we decoded for this block.
It contains the value of zzi BEFORE the final token in the block was
@@ -424,8 +453,8 @@
gets.
Needless to say we inherited this approach from VP3.*/
/*Then perform the iDCT.*/
- if(_last_zzi<10)oc_idct8x8_10_sse2(_y);
- else oc_idct8x8_slow_sse2(_y);
+ if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x);
+ else oc_idct8x8_slow_sse2(_y,_x);
}
#endif
Copied: experimental/derf/theora-ptalarbvorm/lib/x86/x86cpu.c (from rev 17375, experimental/derf/theora-ptalarbvorm/lib/x86/cpu.c)
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86cpu.c (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86cpu.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,182 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ CPU capability detection for x86 processors.
+ Originally written by Rudolf Marek.
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#include "x86cpu.h"
+
+#if !defined(OC_X86_ASM)
+ogg_uint32_t oc_cpu_flags_get(void){
+ return 0;
+}
+#else
+# if defined(__amd64__)||defined(__x86_64__)
+/*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
+ compiling with -fPIC.*/
+# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+ __asm__ __volatile__( \
+ "cpuid\n\t" \
+ :[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+ :"a"(_op) \
+ :"cc" \
+ )
+# else
+/*On x86-32, not so much.*/
+# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+ __asm__ __volatile__( \
+ "xchgl %%ebx,%[ebx]\n\t" \
+ "cpuid\n\t" \
+ "xchgl %%ebx,%[ebx]\n\t" \
+ :[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
+ :"a"(_op) \
+ :"cc" \
+ )
+# endif
+
+static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+ ogg_uint32_t flags;
+ /*If there isn't even MMX, give up.*/
+ if(!(_edx&0x00800000))return 0;
+ flags=OC_CPU_X86_MMX;
+ if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
+ if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
+ if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
+ if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3;
+ if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
+ if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
+ return flags;
+}
+
+static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+ ogg_uint32_t flags;
+ /*If there isn't even MMX, give up.*/
+ if(!(_edx&0x00800000))return 0;
+ flags=OC_CPU_X86_MMX;
+ if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
+ if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
+ if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
+ if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A;
+ if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5;
+ return flags;
+}
+
+ogg_uint32_t oc_cpu_flags_get(void){
+ ogg_uint32_t flags;
+ ogg_uint32_t eax;
+ ogg_uint32_t ebx;
+ ogg_uint32_t ecx;
+ ogg_uint32_t edx;
+# if !defined(__amd64__)&&!defined(__x86_64__)
+ /*Not all x86-32 chips support cpuid, so we have to check.*/
+ __asm__ __volatile__(
+ "pushfl\n\t"
+ "pushfl\n\t"
+ "popl %[a]\n\t"
+ "movl %[a],%[b]\n\t"
+ "xorl $0x200000,%[a]\n\t"
+ "pushl %[a]\n\t"
+ "popfl\n\t"
+ "pushfl\n\t"
+ "popl %[a]\n\t"
+ "popfl\n\t"
+ :[a]"=r"(eax),[b]"=r"(ebx)
+ :
+ :"cc"
+ );
+ /*No cpuid.*/
+ if(eax==ebx)return 0;
+# endif
+ cpuid(0,eax,ebx,ecx,edx);
+ /* l e t n I e n i u n e G*/
+ if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
+ /* 6 8 x M T e n i u n e G*/
+ ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
+ int family;
+ int model;
+ /*Intel, Transmeta (tested with Crusoe TM5800):*/
+ cpuid(1,eax,ebx,ecx,edx);
+ flags=oc_parse_intel_flags(edx,ecx);
+ family=(eax>>8)&0xF;
+ model=(eax>>4)&0xF;
+ /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
+ unit, so don't use it.*/
+ if(family==6&&(model==9||model==13||model==14)){
+ flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
+ }
+ }
+ /* D M A c i t n e h t u A*/
+ else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
+ /* C S N y b e d o e G*/
+ ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
+ /*AMD, Geode:*/
+ cpuid(0x80000000,eax,ebx,ecx,edx);
+ if(eax<0x80000001)flags=0;
+ else{
+ cpuid(0x80000001,eax,ebx,ecx,edx);
+ flags=oc_parse_amd_flags(edx,ecx);
+ }
+ /*Also check for SSE.*/
+ cpuid(1,eax,ebx,ecx,edx);
+ flags|=oc_parse_intel_flags(edx,ecx);
+ }
+ /*Technically some VIA chips can be configured in the BIOS to return any
+ string here the user wants.
+ There is a special detection method that can be used to identify such
+ processors, but in my opinion, if the user really wants to change it, they
+ deserve what they get.*/
+ /* s l u a H r u a t n e C*/
+ else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
+ /*VIA:*/
+ /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
+ chips (thanks to the engineers from Centaur Technology who provided it).
+ These chips support Intel-like cpuid info.
+ The C3-2 (Nehemiah) cores appear to, as well.*/
+ cpuid(1,eax,ebx,ecx,edx);
+ flags=oc_parse_intel_flags(edx,ecx);
+ if(eax>=0x80000001){
+ /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
+ We need to check this even if the Intel test succeeds to pick up 3DNow!
+ support on these processors.
+ Unlike actual AMD processors, we cannot _rely_ on this info, since
+ some cores (e.g., the 693 stepping of the Nehemiah) claim to support
+ this function, yet return edx=0, despite the Intel test indicating
+ MMX support.
+ Therefore the features detected here are strictly added to those
+ detected by the Intel test.*/
+ /*TODO: How about earlier chips?*/
+ cpuid(0x80000001,eax,ebx,ecx,edx);
+ /*Note: As of the C7, this function returns Intel-style extended feature
+ flags, not AMD-style.
+ Currently, this only defines bits 11, 20, and 29 (0x20100800), which
+ do not conflict with any of the AMD flags we inspect.
+ For the remaining bits, Intel tells us, "Do not count on their value",
+ but VIA assures us that they will all be zero (at least on the C7 and
+ Isaiah chips).
+ In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
+ (0xC0C00000) for something else, we will have to add code to detect
+ the model to decide when it is appropriate to inspect them.*/
+ flags|=oc_parse_amd_flags(edx,ecx);
+ }
+ }
+ else{
+ /*Implement me.*/
+ flags=0;
+ }
+ return flags;
+}
+#endif
Copied: experimental/derf/theora-ptalarbvorm/lib/x86/x86cpu.h (from rev 17375, experimental/derf/theora-ptalarbvorm/lib/x86/cpu.h)
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86cpu.h (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86cpu.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,36 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#if !defined(_x86_x86cpu_H)
+# define _x86_x86cpu_H (1)
+#include "../internal.h"
+
+#define OC_CPU_X86_MMX (1<<0)
+#define OC_CPU_X86_3DNOW (1<<1)
+#define OC_CPU_X86_3DNOWEXT (1<<2)
+#define OC_CPU_X86_MMXEXT (1<<3)
+#define OC_CPU_X86_SSE (1<<4)
+#define OC_CPU_X86_SSE2 (1<<5)
+#define OC_CPU_X86_PNI (1<<6)
+#define OC_CPU_X86_SSSE3 (1<<7)
+#define OC_CPU_X86_SSE4_1 (1<<8)
+#define OC_CPU_X86_SSE4_2 (1<<9)
+#define OC_CPU_X86_SSE4A (1<<10)
+#define OC_CPU_X86_SSE5 (1<<11)
+
+ogg_uint32_t oc_cpu_flags_get(void);
+
+#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86int.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -28,16 +28,21 @@
call.*/
# define oc_frag_copy(_state,_dst,_src,_ystride) \
oc_frag_copy_mmx(_dst,_src,_ystride)
+# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs) \
+ oc_frag_copy_list_mmx(_dst_frame,_src_frame,_ystride, \
+ _fragis,_nfragis,_frag_buf_offs)
# define oc_frag_recon_intra(_state,_dst,_ystride,_residue) \
oc_frag_recon_intra_mmx(_dst,_ystride,_residue)
# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue)
# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
oc_frag_recon_inter2_mmx(_dst,_src1,_src2,_ystride,_residue)
-# define oc_idct8x8(_state,_y,_last_zzi) \
- oc_idct8x8_sse2(_y,_last_zzi)
+# define oc_idct8x8(_state,_y,_x,_last_zzi) \
+ oc_idct8x8_sse2(_y,_x,_last_zzi)
# define oc_state_frag_recon oc_state_frag_recon_mmx
-# define oc_state_frag_copy_list oc_state_frag_copy_list_mmx
+# define oc_loop_filter_init(_state,_bv,_flimit) \
+ oc_loop_filter_init_mmxext(_bv,_flimit)
# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_mmxext
# define oc_restore_fpu(_state) \
oc_restore_fpu_mmx()
@@ -47,7 +52,7 @@
# endif
# include "../state.h"
-# include "cpu.h"
+# include "x86cpu.h"
/*Converts the expression in the argument to a string.*/
#define OC_M2STR(_s) #_s
@@ -74,7 +79,7 @@
stack pointer, without allocating a separate register to point to them.*/
#define OC_ARRAY_OPERAND(_type,_ptr,_size) \
(*({ \
- struct{_type array_value__[_size];} *array_addr__=(void *)_ptr; \
+ struct{_type array_value__[(_size)];} *array_addr__=(void *)(_ptr); \
array_addr__; \
}))
@@ -84,8 +89,8 @@
stack pointer, without allocating a separate register to point to them.*/
#define OC_CONST_ARRAY_OPERAND(_type,_ptr,_size) \
(*({ \
- const struct{_type array_value__[_size];} *array_addr__= \
- (const void *)_ptr; \
+ const struct{_type array_value__[(_size)];} *array_addr__= \
+ (const void *)(_ptr); \
array_addr__; \
}))
@@ -95,23 +100,25 @@
void oc_frag_copy_mmx(unsigned char *_dst,
const unsigned char *_src,int _ystride);
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
const ogg_int16_t *_residue);
void oc_frag_recon_inter_mmx(unsigned char *_dst,
const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
-void oc_idct8x8_sse2(ogg_int16_t _y[64],int _last_zzi);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
+void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
+void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit);
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
void oc_restore_fpu_mmx(void);
#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86/x86state.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86/x86state.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86/x86state.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -67,18 +67,20 @@
# if defined(OC_STATE_USE_VTABLE)
if(_state->cpu_flags&OC_CPU_X86_MMX){
_state->opt_vtable.frag_copy=oc_frag_copy_mmx;
+ _state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
_state->opt_vtable.idct8x8=oc_idct8x8_mmx;
_state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
- _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
+ _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
_state->opt_vtable.state_loop_filter_frag_rows=
oc_state_loop_filter_frag_rows_mmx;
_state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
_state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
}
if(_state->cpu_flags&OC_CPU_X86_MMXEXT){
+ _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmxext;
_state->opt_vtable.state_loop_filter_frag_rows=
oc_state_loop_filter_frag_rows_mmxext;
}
Deleted: experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -1,192 +0,0 @@
-/********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
- * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
- * *
- ********************************************************************
-
- CPU capability detection for x86 processors.
- Originally written by Rudolf Marek.
-
- function:
- last mod: $Id$
-
- ********************************************************************/
-
-#include "cpu.h"
-
-#if !defined(OC_X86_ASM)
-ogg_uint32_t oc_cpu_flags_get(void){
- return 0;
-}
-#else
-/*Why does MSVC need this complicated rigamarole?
- At this point I honestly do not care.*/
-
-/*Visual C cpuid helper function.
- For VS2005 we could as well use the _cpuid builtin, but that wouldn't work
- for VS2003 users, so we do it in inline assembler.*/
-static void oc_cpuid_helper(ogg_uint32_t _cpu_info[4],ogg_uint32_t _op){
- _asm{
- mov eax,[_op]
- mov esi,_cpu_info
- cpuid
- mov [esi+0],eax
- mov [esi+4],ebx
- mov [esi+8],ecx
- mov [esi+12],edx
- }
-}
-
-# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
- do{ \
- ogg_uint32_t cpu_info[4]; \
- oc_cpuid_helper(cpu_info,_op); \
- (_eax)=cpu_info[0]; \
- (_ebx)=cpu_info[1]; \
- (_ecx)=cpu_info[2]; \
- (_edx)=cpu_info[3]; \
- }while(0)
-
-static void oc_detect_cpuid_helper(ogg_uint32_t *_eax,ogg_uint32_t *_ebx){
- _asm{
- pushfd
- pushfd
- pop eax
- mov ebx,eax
- xor eax,200000h
- push eax
- popfd
- pushfd
- pop eax
- popfd
- mov ecx,_eax
- mov [ecx],eax
- mov ecx,_ebx
- mov [ecx],ebx
- }
-}
-
-static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
- ogg_uint32_t flags;
- /*If there isn't even MMX, give up.*/
- if(!(_edx&0x00800000))return 0;
- flags=OC_CPU_X86_MMX;
- if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
- if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
- if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
- if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3;
- if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
- if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
- return flags;
-}
-
-static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
- ogg_uint32_t flags;
- /*If there isn't even MMX, give up.*/
- if(!(_edx&0x00800000))return 0;
- flags=OC_CPU_X86_MMX;
- if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
- if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
- if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
- if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A;
- if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5;
- return flags;
-}
-
-ogg_uint32_t oc_cpu_flags_get(void){
- ogg_uint32_t flags;
- ogg_uint32_t eax;
- ogg_uint32_t ebx;
- ogg_uint32_t ecx;
- ogg_uint32_t edx;
-# if !defined(__amd64__)&&!defined(__x86_64__)
- /*Not all x86-32 chips support cpuid, so we have to check.*/
- oc_detect_cpuid_helper(&eax,&ebx);
- /*No cpuid.*/
- if(eax==ebx)return 0;
-# endif
- cpuid(0,eax,ebx,ecx,edx);
- /* l e t n I e n i u n e G*/
- if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
- /* 6 8 x M T e n i u n e G*/
- ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
- int family;
- int model;
- /*Intel, Transmeta (tested with Crusoe TM5800):*/
- cpuid(1,eax,ebx,ecx,edx);
- flags=oc_parse_intel_flags(edx,ecx);
- family=(eax>>8)&0xF;
- model=(eax>>4)&0xF;
- /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
- unit, so don't use it.*/
- if(family==6&&(model==9||model==13||model==14)){
- flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
- }
- }
- /* D M A c i t n e h t u A*/
- else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
- /* C S N y b e d o e G*/
- ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
- /*AMD, Geode:*/
- cpuid(0x80000000,eax,ebx,ecx,edx);
- if(eax<0x80000001)flags=0;
- else{
- cpuid(0x80000001,eax,ebx,ecx,edx);
- flags=oc_parse_amd_flags(edx,ecx);
- }
- /*Also check for SSE.*/
- cpuid(1,eax,ebx,ecx,edx);
- flags|=oc_parse_intel_flags(edx,ecx);
- }
- /*Technically some VIA chips can be configured in the BIOS to return any
- string here the user wants.
- There is a special detection method that can be used to identify such
- processors, but in my opinion, if the user really wants to change it, they
- deserve what they get.*/
- /* s l u a H r u a t n e C*/
- else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
- /*VIA:*/
- /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
- chips (thanks to the engineers from Centaur Technology who provided it).
- These chips support Intel-like cpuid info.
- The C3-2 (Nehemiah) cores appear to, as well.*/
- cpuid(1,eax,ebx,ecx,edx);
- flags=oc_parse_intel_flags(edx,ecx);
- if(eax>=0x80000001){
- /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
- We need to check this even if the Intel test succeeds to pick up 3DNow!
- support on these processors.
- Unlike actual AMD processors, we cannot _rely_ on this info, since
- some cores (e.g., the 693 stepping of the Nehemiah) claim to support
- this function, yet return edx=0, despite the Intel test indicating
- MMX support.
- Therefore the features detected here are strictly added to those
- detected by the Intel test.*/
- /*TODO: How about earlier chips?*/
- cpuid(0x80000001,eax,ebx,ecx,edx);
- /*Note: As of the C7, this function returns Intel-style extended feature
- flags, not AMD-style.
- Currently, this only defines bits 11, 20, and 29 (0x20100800), which
- do not conflict with any of the AMD flags we inspect.
- For the remaining bits, Intel tells us, "Do not count on their value",
- but VIA assures us that they will all be zero (at least on the C7 and
- Isaiah chips).
- In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
- (0xC0C00000) for something else, we will have to add code to detect
- the model to decide when it is appropriate to inspect them.*/
- flags|=oc_parse_amd_flags(edx,ecx);
- }
- }
- else{
- /*Implement me.*/
- flags=0;
- }
- return flags;
-}
-#endif
Deleted: experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -1,36 +0,0 @@
-/********************************************************************
- * *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
- * *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
- * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
- * *
- ********************************************************************
- function:
- last mod: $Id$
-
- ********************************************************************/
-
-#if !defined(_x86_cpu_H)
-# define _x86_cpu_H (1)
-#include "../internal.h"
-
-#define OC_CPU_X86_MMX (1<<0)
-#define OC_CPU_X86_3DNOW (1<<1)
-#define OC_CPU_X86_3DNOWEXT (1<<2)
-#define OC_CPU_X86_MMXEXT (1<<3)
-#define OC_CPU_X86_SSE (1<<4)
-#define OC_CPU_X86_SSE2 (1<<5)
-#define OC_CPU_X86_PNI (1<<6)
-#define OC_CPU_X86_SSSE3 (1<<7)
-#define OC_CPU_X86_SSE4_1 (1<<8)
-#define OC_CPU_X86_SSE4_2 (1<<9)
-#define OC_CPU_X86_SSE4A (1<<10)
-#define OC_CPU_X86_SSE5 (1<<11)
-
-ogg_uint32_t oc_cpu_flags_get(void);
-
-#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxfrag.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxfrag.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxfrag.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -22,12 +22,63 @@
The iteration each instruction belongs to is marked in the comments as #i.*/
#include <stddef.h>
#include "x86int.h"
-#include "mmxfrag.h"
#if defined(OC_X86_ASM)
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
between rows.*/
+# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
+ do{ \
+ const unsigned char *src; \
+ unsigned char *dst; \
+ src=(_src); \
+ dst=(_dst); \
+ __asm mov SRC,src \
+ __asm mov DST,dst \
+ __asm mov YSTRIDE,_ystride \
+ /*src+0*ystride*/ \
+ __asm movq mm0,[SRC] \
+ /*src+1*ystride*/ \
+ __asm movq mm1,[SRC+YSTRIDE] \
+ /*ystride3=ystride*3*/ \
+ __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
+ /*src+2*ystride*/ \
+ __asm movq mm2,[SRC+YSTRIDE*2] \
+ /*src+3*ystride*/ \
+ __asm movq mm3,[SRC+YSTRIDE3] \
+ /*dst+0*ystride*/ \
+ __asm movq [DST],mm0 \
+ /*dst+1*ystride*/ \
+ __asm movq [DST+YSTRIDE],mm1 \
+ /*Pointer to next 4.*/ \
+ __asm lea SRC,[SRC+YSTRIDE*4] \
+ /*dst+2*ystride*/ \
+ __asm movq [DST+YSTRIDE*2],mm2 \
+ /*dst+3*ystride*/ \
+ __asm movq [DST+YSTRIDE3],mm3 \
+ /*Pointer to next 4.*/ \
+ __asm lea DST,[DST+YSTRIDE*4] \
+ /*src+0*ystride*/ \
+ __asm movq mm0,[SRC] \
+ /*src+1*ystride*/ \
+ __asm movq mm1,[SRC+YSTRIDE] \
+ /*src+2*ystride*/ \
+ __asm movq mm2,[SRC+YSTRIDE*2] \
+ /*src+3*ystride*/ \
+ __asm movq mm3,[SRC+YSTRIDE3] \
+ /*dst+0*ystride*/ \
+ __asm movq [DST],mm0 \
+ /*dst+1*ystride*/ \
+ __asm movq [DST+YSTRIDE],mm1 \
+ /*dst+2*ystride*/ \
+ __asm movq [DST+YSTRIDE*2],mm2 \
+ /*dst+3*ystride*/ \
+ __asm movq [DST+YSTRIDE3],mm3 \
+ } \
+ while(0)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+ between rows.*/
void oc_frag_copy_mmx(unsigned char *_dst,
const unsigned char *_src,int _ystride){
#define SRC edx
@@ -41,6 +92,34 @@
#undef YSTRIDE3
}
+/*Copies the fragments specified by the lists of fragment indices from one
+ frame to another.
+ _dst_frame: The reference frame to copy to.
+ _src_frame: The reference frame to copy from.
+ _ystride: The row stride of the reference frames.
+ _fragis: A pointer to a list of fragment indices.
+ _nfragis: The number of fragment indices to copy.
+ _frag_buf_offs: The offsets of fragments in the reference frames.*/
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+ ptrdiff_t fragii;
+ for(fragii=0;fragii<_nfragis;fragii++){
+ ptrdiff_t frag_buf_off;
+ frag_buf_off=_frag_buf_offs[_fragis[fragii]];
+#define SRC edx
+#define DST eax
+#define YSTRIDE ecx
+#define YSTRIDE3 edi
+ OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
+ _src_frame+frag_buf_off,_ystride);
+#undef SRC
+#undef DST
+#undef YSTRIDE
+#undef YSTRIDE3
+ }
+}
+
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
const ogg_int16_t *_residue){
__asm{
Deleted: experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxfrag.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxfrag.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxfrag.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -1,61 +0,0 @@
-#if !defined(_x86_vc_mmxfrag_H)
-# define _x86_vc_mmxfrag_H (1)
-# include <stddef.h>
-# include "x86int.h"
-
-#if defined(OC_X86_ASM)
-
-/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
- between rows.*/
-#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
- do{ \
- const unsigned char *src; \
- unsigned char *dst; \
- src=(_src); \
- dst=(_dst); \
- __asm mov SRC,src \
- __asm mov DST,dst \
- __asm mov YSTRIDE,_ystride \
- /*src+0*ystride*/ \
- __asm movq mm0,[SRC] \
- /*src+1*ystride*/ \
- __asm movq mm1,[SRC+YSTRIDE] \
- /*ystride3=ystride*3*/ \
- __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
- /*src+2*ystride*/ \
- __asm movq mm2,[SRC+YSTRIDE*2] \
- /*src+3*ystride*/ \
- __asm movq mm3,[SRC+YSTRIDE3] \
- /*dst+0*ystride*/ \
- __asm movq [DST],mm0 \
- /*dst+1*ystride*/ \
- __asm movq [DST+YSTRIDE],mm1 \
- /*Pointer to next 4.*/ \
- __asm lea SRC,[SRC+YSTRIDE*4] \
- /*dst+2*ystride*/ \
- __asm movq [DST+YSTRIDE*2],mm2 \
- /*dst+3*ystride*/ \
- __asm movq [DST+YSTRIDE3],mm3 \
- /*Pointer to next 4.*/ \
- __asm lea DST,[DST+YSTRIDE*4] \
- /*src+0*ystride*/ \
- __asm movq mm0,[SRC] \
- /*src+1*ystride*/ \
- __asm movq mm1,[SRC+YSTRIDE] \
- /*src+2*ystride*/ \
- __asm movq mm2,[SRC+YSTRIDE*2] \
- /*src+3*ystride*/ \
- __asm movq mm3,[SRC+YSTRIDE3] \
- /*dst+0*ystride*/ \
- __asm movq [DST],mm0 \
- /*dst+1*ystride*/ \
- __asm movq [DST+YSTRIDE],mm1 \
- /*dst+2*ystride*/ \
- __asm movq [DST+YSTRIDE*2],mm2 \
- /*dst+3*ystride*/ \
- __asm movq [DST+YSTRIDE3],mm3 \
- } \
- while(0)
-
-# endif
-#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxidct.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxidct.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxidct.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -24,15 +24,16 @@
/*These are offsets into the table of constants below.*/
/*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
-#define OC_COSINE_OFFSET (0)
+#define OC_COSINE_OFFSET (8)
/*A row of 8's.*/
-#define OC_EIGHT_OFFSET (56)
+#define OC_EIGHT_OFFSET (0)
/*A table of constants used by the MMX routines.*/
static const __declspec(align(16))ogg_uint16_t
- OC_IDCT_CONSTS[(7+1)*4]={
+ OC_IDCT_CONSTS[(1+7)*4]={
+ 8, 8, 8, 8,
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
(ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
@@ -47,27 +48,26 @@
(ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
- 8, 8, 8, 8
};
/*38 cycles*/
-#define OC_IDCT_BEGIN __asm{ \
- __asm movq mm2,OC_I(3) \
+#define OC_IDCT_BEGIN(_y,_x) __asm{ \
+ __asm movq mm2,OC_I(3,_x) \
__asm movq mm6,OC_C(3) \
__asm movq mm4,mm2 \
- __asm movq mm7,OC_J(5) \
+ __asm movq mm7,OC_J(5,_x) \
__asm pmulhw mm4,mm6 \
__asm movq mm1,OC_C(5) \
__asm pmulhw mm6,mm7 \
__asm movq mm5,mm1 \
__asm pmulhw mm1,mm2 \
- __asm movq mm3,OC_I(1) \
+ __asm movq mm3,OC_I(1,_x) \
__asm pmulhw mm5,mm7 \
__asm movq mm0,OC_C(1) \
__asm paddw mm4,mm2 \
__asm paddw mm6,mm7 \
__asm paddw mm2,mm1 \
- __asm movq mm1,OC_J(7) \
+ __asm movq mm1,OC_J(7,_x) \
__asm paddw mm7,mm5 \
__asm movq mm5,mm0 \
__asm pmulhw mm0,mm3 \
@@ -77,13 +77,13 @@
__asm psubw mm6,mm2 \
__asm paddw mm0,mm3 \
__asm pmulhw mm3,mm7 \
- __asm movq mm2,OC_I(2) \
+ __asm movq mm2,OC_I(2,_x) \
__asm pmulhw mm7,mm1 \
__asm paddw mm5,mm1 \
__asm movq mm1,mm2 \
__asm pmulhw mm2,OC_C(2) \
__asm psubw mm3,mm5 \
- __asm movq mm5,OC_J(6) \
+ __asm movq mm5,OC_J(6,_x) \
__asm paddw mm0,mm7 \
__asm movq mm7,mm5 \
__asm psubw mm0,mm4 \
@@ -97,18 +97,18 @@
__asm paddw mm6,mm6 \
__asm pmulhw mm7,OC_C(6) \
__asm paddw mm6,mm3 \
- __asm movq OC_I(1),mm4 \
+ __asm movq OC_I(1,_y),mm4 \
__asm psubw mm1,mm5 \
__asm movq mm4,OC_C(4) \
__asm movq mm5,mm3 \
__asm pmulhw mm3,mm4 \
__asm paddw mm7,mm2 \
- __asm movq OC_I(2),mm6 \
+ __asm movq OC_I(2,_y),mm6 \
__asm movq mm2,mm0 \
- __asm movq mm6,OC_I(0) \
+ __asm movq mm6,OC_I(0,_x) \
__asm pmulhw mm0,mm4 \
__asm paddw mm5,mm3 \
- __asm movq mm3,OC_J(4) \
+ __asm movq mm3,OC_J(4,_x) \
__asm psubw mm5,mm1 \
__asm paddw mm2,mm0 \
__asm psubw mm6,mm3 \
@@ -122,17 +122,17 @@
__asm paddw mm6,mm0 \
__asm psubw mm6,mm2 \
__asm paddw mm2,mm2 \
- __asm movq mm0,OC_I(1) \
+ __asm movq mm0,OC_I(1,_y) \
__asm paddw mm2,mm6 \
__asm paddw mm4,mm3 \
__asm psubw mm2,mm1 \
}
/*38+8=46 cycles.*/
-#define OC_ROW_IDCT __asm{ \
- OC_IDCT_BEGIN \
+#define OC_ROW_IDCT(_y,_x) __asm{ \
+ OC_IDCT_BEGIN(_y,_x) \
/*r3=D'*/ \
- __asm movq mm3,OC_I(2) \
+ __asm movq mm3,OC_I(2,_y) \
/*r4=E'=E-G*/ \
__asm psubw mm4,mm7 \
/*r1=H'+H'*/ \
@@ -157,7 +157,7 @@
__asm psubw mm7,mm0 \
__asm paddw mm0,mm0 \
/*Save R1.*/ \
- __asm movq OC_I(1),mm1 \
+ __asm movq OC_I(1,_y),mm1 \
/*r0=R0=G.+C.*/ \
__asm paddw mm0,mm7 \
}
@@ -190,10 +190,10 @@
Since r1 is free at entry, we calculate the Js first.*/
/*19 cycles.*/
-#define OC_TRANSPOSE __asm{ \
+#define OC_TRANSPOSE(_y) __asm{ \
__asm movq mm1,mm4 \
__asm punpcklwd mm4,mm5 \
- __asm movq OC_I(0),mm0 \
+ __asm movq OC_I(0,_y),mm0 \
__asm punpckhwd mm1,mm5 \
__asm movq mm0,mm6 \
__asm punpcklwd mm6,mm7 \
@@ -201,17 +201,17 @@
__asm punpckldq mm4,mm6 \
__asm punpckhdq mm5,mm6 \
__asm movq mm6,mm1 \
- __asm movq OC_J(4),mm4 \
+ __asm movq OC_J(4,_y),mm4 \
__asm punpckhwd mm0,mm7 \
- __asm movq OC_J(5),mm5 \
+ __asm movq OC_J(5,_y),mm5 \
__asm punpckhdq mm6,mm0 \
- __asm movq mm4,OC_I(0) \
+ __asm movq mm4,OC_I(0,_y) \
__asm punpckldq mm1,mm0 \
- __asm movq mm5,OC_I(1) \
+ __asm movq mm5,OC_I(1,_y) \
__asm movq mm0,mm4 \
- __asm movq OC_J(7),mm6 \
+ __asm movq OC_J(7,_y),mm6 \
__asm punpcklwd mm0,mm5 \
- __asm movq OC_J(6),mm1 \
+ __asm movq OC_J(6,_y),mm1 \
__asm punpckhwd mm4,mm5 \
__asm movq mm5,mm2 \
__asm punpcklwd mm2,mm3 \
@@ -219,18 +219,18 @@
__asm punpckldq mm0,mm2 \
__asm punpckhdq mm1,mm2 \
__asm movq mm2,mm4 \
- __asm movq OC_I(0),mm0 \
+ __asm movq OC_I(0,_y),mm0 \
__asm punpckhwd mm5,mm3 \
- __asm movq OC_I(1),mm1 \
+ __asm movq OC_I(1,_y),mm1 \
__asm punpckhdq mm4,mm5 \
__asm punpckldq mm2,mm5 \
- __asm movq OC_I(3),mm4 \
- __asm movq OC_I(2),mm2 \
+ __asm movq OC_I(3,_y),mm4 \
+ __asm movq OC_I(2,_y),mm2 \
}
/*38+19=57 cycles.*/
-#define OC_COLUMN_IDCT __asm{ \
- OC_IDCT_BEGIN \
+#define OC_COLUMN_IDCT(_y) __asm{ \
+ OC_IDCT_BEGIN(_y,_y) \
__asm paddw mm2,OC_8 \
/*r1=H'+H'*/ \
__asm paddw mm1,mm1 \
@@ -243,15 +243,15 @@
/*r1=NR1*/ \
__asm psraw mm1,4 \
/*r3=D'*/ \
- __asm movq mm3,OC_I(2) \
+ __asm movq mm3,OC_I(2,_y) \
/*r7=G+G*/ \
__asm paddw mm7,mm7 \
/*Store NR2 at I(2).*/ \
- __asm movq OC_I(2),mm2 \
+ __asm movq OC_I(2,_y),mm2 \
/*r7=G'=E+G*/ \
__asm paddw mm7,mm4 \
/*Store NR1 at I(1).*/ \
- __asm movq OC_I(1),mm1 \
+ __asm movq OC_I(1,_y),mm1 \
/*r4=R4=E'-D'*/ \
__asm psubw mm4,mm3 \
__asm paddw mm4,OC_8 \
@@ -273,11 +273,11 @@
/*r6=NR6*/ \
__asm psraw mm6,4 \
/*Store NR4 at J(4).*/ \
- __asm movq OC_J(4),mm4 \
+ __asm movq OC_J(4,_y),mm4 \
/*r5=NR5*/ \
__asm psraw mm5,4 \
/*Store NR3 at I(3).*/ \
- __asm movq OC_I(3),mm3 \
+ __asm movq OC_I(3,_y),mm3 \
/*r7=R7=G'-C'*/ \
__asm psubw mm7,mm0 \
__asm paddw mm7,OC_8 \
@@ -288,71 +288,90 @@
/*r7=NR7*/ \
__asm psraw mm7,4 \
/*Store NR6 at J(6).*/ \
- __asm movq OC_J(6),mm6 \
+ __asm movq OC_J(6,_y),mm6 \
/*r0=NR0*/ \
__asm psraw mm0,4 \
/*Store NR5 at J(5).*/ \
- __asm movq OC_J(5),mm5 \
+ __asm movq OC_J(5,_y),mm5 \
/*Store NR7 at J(7).*/ \
- __asm movq OC_J(7),mm7 \
+ __asm movq OC_J(7,_y),mm7 \
/*Store NR0 at I(0).*/ \
- __asm movq OC_I(0),mm0 \
+ __asm movq OC_I(0,_y),mm0 \
}
#define OC_MID(_m,_i) [CONSTS+_m+(_i)*8]
#define OC_C(_i) OC_MID(OC_COSINE_OFFSET,_i-1)
#define OC_8 OC_MID(OC_EIGHT_OFFSET,0)
-static void oc_idct8x8_slow(ogg_int16_t _y[64]){
+static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
+ int i;
/*This routine accepts an 8x8 matrix, but in partially transposed form.
Every 4x4 block is transposed.*/
__asm{
#define CONSTS eax
#define Y edx
+#define X ecx
mov CONSTS,offset OC_IDCT_CONSTS
mov Y,_y
-#define OC_I(_k) [Y+_k*16]
-#define OC_J(_k) [Y+(_k-4)*16+8]
- OC_ROW_IDCT
- OC_TRANSPOSE
+ mov X,_x
+#define OC_I(_k,_y) [(_y)+_k*16]
+#define OC_J(_k,_y) [(_y)+(_k-4)*16+8]
+ OC_ROW_IDCT(Y,X)
+ OC_TRANSPOSE(Y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) [Y+(_k*16)+64]
-#define OC_J(_k) [Y+(_k-4)*16+72]
- OC_ROW_IDCT
- OC_TRANSPOSE
+#define OC_I(_k,_y) [(_y)+(_k*16)+64]
+#define OC_J(_k,_y) [(_y)+(_k-4)*16+72]
+ OC_ROW_IDCT(Y,X)
+ OC_TRANSPOSE(Y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) [Y+_k*16]
-#define OC_J(_k) OC_I(_k)
- OC_COLUMN_IDCT
+#define OC_I(_k,_y) [(_y)+_k*16]
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT(Y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) [Y+_k*16+8]
-#define OC_J(_k) OC_I(_k)
- OC_COLUMN_IDCT
+#define OC_I(_k,_y) [(_y)+_k*16+8]
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT(Y)
#undef OC_I
#undef OC_J
#undef CONSTS
#undef Y
+#undef X
}
+ if(_x!=_y){
+ int i;
+ __asm pxor mm0,mm0;
+ for(i=0;i<4;i++){
+#define X ecx
+ ogg_int16_t *x;
+ x=_x+16*i;
+ __asm{
+ mov X,x
+ movq [X+0x00],mm0
+ movq [X+0x08],mm0
+ movq [X+0x10],mm0
+ movq [X+0x18],mm0
+ }
+#undef X
+ }
+ }
}
/*25 cycles.*/
-#define OC_IDCT_BEGIN_10 __asm{ \
- __asm movq mm2,OC_I(3) \
+#define OC_IDCT_BEGIN_10(_y,_x) __asm{ \
+ __asm movq mm2,OC_I(3,_x) \
__asm nop \
__asm movq mm6,OC_C(3) \
__asm movq mm4,mm2 \
__asm movq mm1,OC_C(5) \
__asm pmulhw mm4,mm6 \
- __asm movq mm3,OC_I(1) \
+ __asm movq mm3,OC_I(1,_x) \
__asm pmulhw mm1,mm2 \
__asm movq mm0,OC_C(1) \
__asm paddw mm4,mm2 \
__asm pxor mm6,mm6 \
__asm paddw mm2,mm1 \
- __asm movq mm5,OC_I(2) \
+ __asm movq mm5,OC_I(2,_x) \
__asm pmulhw mm0,mm3 \
__asm movq mm1,mm5 \
__asm paddw mm0,mm3 \
@@ -360,43 +379,43 @@
__asm psubw mm6,mm2 \
__asm pmulhw mm5,OC_C(2) \
__asm psubw mm0,mm4 \
- __asm movq mm7,OC_I(2) \
+ __asm movq mm7,OC_I(2,_x) \
__asm paddw mm4,mm4 \
__asm paddw mm7,mm5 \
__asm paddw mm4,mm0 \
__asm pmulhw mm1,OC_C(6) \
__asm psubw mm3,mm6 \
- __asm movq OC_I(1),mm4 \
+ __asm movq OC_I(1,_y),mm4 \
__asm paddw mm6,mm6 \
__asm movq mm4,OC_C(4) \
__asm paddw mm6,mm3 \
__asm movq mm5,mm3 \
__asm pmulhw mm3,mm4 \
- __asm movq OC_I(2),mm6 \
+ __asm movq OC_I(2,_y),mm6 \
__asm movq mm2,mm0 \
- __asm movq mm6,OC_I(0) \
+ __asm movq mm6,OC_I(0,_x) \
__asm pmulhw mm0,mm4 \
__asm paddw mm5,mm3 \
__asm paddw mm2,mm0 \
__asm psubw mm5,mm1 \
__asm pmulhw mm6,mm4 \
- __asm paddw mm6,OC_I(0) \
+ __asm paddw mm6,OC_I(0,_x) \
__asm paddw mm1,mm1 \
__asm movq mm4,mm6 \
__asm paddw mm1,mm5 \
__asm psubw mm6,mm2 \
__asm paddw mm2,mm2 \
- __asm movq mm0,OC_I(1) \
+ __asm movq mm0,OC_I(1,_y) \
__asm paddw mm2,mm6 \
__asm psubw mm2,mm1 \
__asm nop \
}
/*25+8=33 cycles.*/
-#define OC_ROW_IDCT_10 __asm{ \
- OC_IDCT_BEGIN_10 \
+#define OC_ROW_IDCT_10(_y,_x) __asm{ \
+ OC_IDCT_BEGIN_10(_y,_x) \
/*r3=D'*/ \
- __asm movq mm3,OC_I(2) \
+ __asm movq mm3,OC_I(2,_y) \
/*r4=E'=E-G*/ \
__asm psubw mm4,mm7 \
/*r1=H'+H'*/ \
@@ -421,14 +440,14 @@
__asm psubw mm7,mm0 \
__asm paddw mm0,mm0 \
/*Save R1.*/ \
- __asm movq OC_I(1),mm1 \
+ __asm movq OC_I(1,_y),mm1 \
/*r0=R0=G'+C'*/ \
__asm paddw mm0,mm7 \
}
/*25+19=44 cycles'*/
-#define OC_COLUMN_IDCT_10 __asm{ \
- OC_IDCT_BEGIN_10 \
+#define OC_COLUMN_IDCT_10(_y) __asm{ \
+ OC_IDCT_BEGIN_10(_y,_y) \
__asm paddw mm2,OC_8 \
/*r1=H'+H'*/ \
__asm paddw mm1,mm1 \
@@ -441,15 +460,15 @@
/*r1=NR1*/ \
__asm psraw mm1,4 \
/*r3=D'*/ \
- __asm movq mm3,OC_I(2) \
+ __asm movq mm3,OC_I(2,_y) \
/*r7=G+G*/ \
__asm paddw mm7,mm7 \
/*Store NR2 at I(2).*/ \
- __asm movq OC_I(2),mm2 \
+ __asm movq OC_I(2,_y),mm2 \
/*r7=G'=E+G*/ \
__asm paddw mm7,mm4 \
/*Store NR1 at I(1).*/ \
- __asm movq OC_I(1),mm1 \
+ __asm movq OC_I(1,_y),mm1 \
/*r4=R4=E'-D'*/ \
__asm psubw mm4,mm3 \
__asm paddw mm4,OC_8 \
@@ -471,11 +490,11 @@
/*r6=NR6*/ \
__asm psraw mm6,4 \
/*Store NR4 at J(4).*/ \
- __asm movq OC_J(4),mm4 \
+ __asm movq OC_J(4,_y),mm4 \
/*r5=NR5*/ \
__asm psraw mm5,4 \
/*Store NR3 at I(3).*/ \
- __asm movq OC_I(3),mm3 \
+ __asm movq OC_I(3,_y),mm3 \
/*r7=R7=G'-C'*/ \
__asm psubw mm7,mm0 \
__asm paddw mm7,OC_8 \
@@ -486,50 +505,65 @@
/*r7=NR7*/ \
__asm psraw mm7,4 \
/*Store NR6 at J(6).*/ \
- __asm movq OC_J(6),mm6 \
+ __asm movq OC_J(6,_y),mm6 \
/*r0=NR0*/ \
__asm psraw mm0,4 \
/*Store NR5 at J(5).*/ \
- __asm movq OC_J(5),mm5 \
+ __asm movq OC_J(5,_y),mm5 \
/*Store NR7 at J(7).*/ \
- __asm movq OC_J(7),mm7 \
+ __asm movq OC_J(7,_y),mm7 \
/*Store NR0 at I(0).*/ \
- __asm movq OC_I(0),mm0 \
+ __asm movq OC_I(0,_y),mm0 \
}
-static void oc_idct8x8_10(ogg_int16_t _y[64]){
+static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
__asm{
#define CONSTS eax
#define Y edx
+#define X ecx
mov CONSTS,offset OC_IDCT_CONSTS
mov Y,_y
-#define OC_I(_k) [Y+_k*16]
-#define OC_J(_k) [Y+(_k-4)*16+8]
+ mov X,_x
+#define OC_I(_k,_y) [(_y)+_k*16]
+#define OC_J(_k,_y) [(_y)+(_k-4)*16+8]
/*Done with dequant, descramble, and partial transpose.
Now do the iDCT itself.*/
- OC_ROW_IDCT_10
- OC_TRANSPOSE
+ OC_ROW_IDCT_10(Y,X)
+ OC_TRANSPOSE(Y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) [Y+_k*16]
-#define OC_J(_k) OC_I(_k)
- OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) [(_y)+_k*16]
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT_10(Y)
#undef OC_I
#undef OC_J
-#define OC_I(_k) [Y+_k*16+8]
-#define OC_J(_k) OC_I(_k)
- OC_COLUMN_IDCT_10
+#define OC_I(_k,_y) [(_y)+_k*16+8]
+#define OC_J(_k,_y) OC_I(_k,_y)
+ OC_COLUMN_IDCT_10(Y)
#undef OC_I
#undef OC_J
#undef CONSTS
#undef Y
+#undef X
}
+ if(_x!=_y){
+#define X ecx
+ __asm{
+ pxor mm0,mm0;
+ mov X,_x
+ movq [X+0x00],mm0
+ movq [X+0x10],mm0
+ movq [X+0x20],mm0
+ movq [X+0x30],mm0
+ }
+#undef X
+ }
}
/*Performs an inverse 8x8 Type-II DCT transform.
The input is assumed to be scaled by a factor of 4 relative to orthonormal
version of the transform.*/
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
/*_last_zzi is subtly different from an actual count of the number of
coefficients we decoded for this block.
It contains the value of zzi BEFORE the final token in the block was
@@ -555,8 +589,8 @@
gets.
Needless to say we inherited this approach from VP3.*/
/*Perform the iDCT.*/
- if(_last_zzi<10)oc_idct8x8_10(_y);
- else oc_idct8x8_slow(_y);
+ if(_last_zzi<=10)oc_idct8x8_10(_y,_x);
+ else oc_idct8x8_slow(_y,_x);
}
#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxstate.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxstate.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/mmxstate.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -19,13 +19,12 @@
Originally written by Rudolf Marek.*/
#include <string.h>
#include "x86int.h"
-#include "mmxfrag.h"
#include "mmxloop.h"
#if defined(OC_X86_ASM)
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
unsigned char *dst;
ptrdiff_t frag_buf_off;
int ystride;
@@ -45,6 +44,7 @@
#define P ecx
mov Y,_dct_coeffs
movzx P,p
+ lea Y,[Y+128]
/*mm0=0000 0000 0000 AAAA*/
movd mm0,P
/*mm0=0000 0000 AAAA AAAA*/
@@ -74,14 +74,14 @@
else{
/*Dequantize the DC coefficient.*/
_dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
- oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
+ oc_idct8x8_mmx(_dct_coeffs+64,_dct_coeffs,_last_zzi);
}
/*Fill in the target buffer.*/
frag_buf_off=_state->frag_buf_offs[_fragi];
mb_mode=_state->frags[_fragi].mb_mode;
ystride=_state->ref_ystride[_pli];
dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
- if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
+ if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
else{
const unsigned char *ref;
int mvoffsets[2];
@@ -91,48 +91,17 @@
if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
_state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
- _dct_coeffs);
+ _dct_coeffs+64);
}
- else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
+ else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
}
}
/*We copy these entire function to inline the actual MMX routines so that we
use only a single indirect call.*/
-/*Copies the fragments specified by the lists of fragment indices from one
- frame to another.
- _fragis: A pointer to a list of fragment indices.
- _nfragis: The number of fragment indices to copy.
- _dst_frame: The reference frame to copy to.
- _src_frame: The reference frame to copy from.
- _pli: The color plane the fragments lie in.*/
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli){
- const ptrdiff_t *frag_buf_offs;
- const unsigned char *src_frame_data;
- unsigned char *dst_frame_data;
- ptrdiff_t fragii;
- int ystride;
- dst_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_dst_frame]];
- src_frame_data=_state->ref_frame_data[_state->ref_frame_idx[_src_frame]];
- ystride=_state->ref_ystride[_pli];
- frag_buf_offs=_state->frag_buf_offs;
- for(fragii=0;fragii<_nfragis;fragii++){
- ptrdiff_t frag_buf_off;
- frag_buf_off=frag_buf_offs[_fragis[fragii]];
-#define SRC edx
-#define DST eax
-#define YSTRIDE ecx
-#define YSTRIDE3 edi
- OC_FRAG_COPY_MMX(dst_frame_data+frag_buf_off,
- src_frame_data+frag_buf_off,ystride);
-#undef SRC
-#undef DST
-#undef YSTRIDE
-#undef YSTRIDE3
- }
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
+ memset(_bv,~(_flimit<<1),8);
}
/*Apply the loop filter to a given set of fragment rows in the given plane.
@@ -144,8 +113,7 @@
_fragy0: The Y coordinate of the first fragment row to filter.
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
- OC_ALIGN8(unsigned char ll[8]);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
const oc_fragment_plane *fplane;
const oc_fragment *frags;
const ptrdiff_t *frag_buf_offs;
@@ -156,13 +124,12 @@
ptrdiff_t fragi0_end;
int ystride;
int nhfrags;
- memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
fplane=_state->fplanes+_pli;
nhfrags=fplane->nhfrags;
fragi_top=fplane->froffset;
fragi_bot=fragi_top+fplane->nfrags;
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
- fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
+ fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
ystride=_state->ref_ystride[_pli];
frags=_state->frags;
frag_buf_offs=_state->frag_buf_offs;
@@ -187,13 +154,13 @@
#define LL edx
#define D esi
#define D_WORD si
- if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,ll);
- if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,ll);
+ if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,_bv);
+ if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,_bv);
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
- OC_LOOP_FILTER_H_MMX(ref+8,ystride,ll);
+ OC_LOOP_FILTER_H_MMX(ref+8,ystride,_bv);
}
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
- OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,ll);
+ OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,_bv);
}
#undef PIX
#undef YSTRIDE3
Copied: experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86cpu.c (from rev 17379, experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.c)
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86cpu.c (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86cpu.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,192 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+
+ CPU capability detection for x86 processors.
+ Originally written by Rudolf Marek.
+
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#include "x86cpu.h"
+
+#if !defined(OC_X86_ASM)
+ogg_uint32_t oc_cpu_flags_get(void){
+ return 0;
+}
+#else
+/*Why does MSVC need this complicated rigamarole?
+ At this point I honestly do not care.*/
+
+/*Visual C cpuid helper function.
+ For VS2005 we could as well use the _cpuid builtin, but that wouldn't work
+ for VS2003 users, so we do it in inline assembler.*/
+static void oc_cpuid_helper(ogg_uint32_t _cpu_info[4],ogg_uint32_t _op){
+ _asm{
+ mov eax,[_op]
+ mov esi,_cpu_info
+ cpuid
+ mov [esi+0],eax
+ mov [esi+4],ebx
+ mov [esi+8],ecx
+ mov [esi+12],edx
+ }
+}
+
+# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
+ do{ \
+ ogg_uint32_t cpu_info[4]; \
+ oc_cpuid_helper(cpu_info,_op); \
+ (_eax)=cpu_info[0]; \
+ (_ebx)=cpu_info[1]; \
+ (_ecx)=cpu_info[2]; \
+ (_edx)=cpu_info[3]; \
+ }while(0)
+
+static void oc_detect_cpuid_helper(ogg_uint32_t *_eax,ogg_uint32_t *_ebx){
+ _asm{
+ pushfd
+ pushfd
+ pop eax
+ mov ebx,eax
+ xor eax,200000h
+ push eax
+ popfd
+ pushfd
+ pop eax
+ popfd
+ mov ecx,_eax
+ mov [ecx],eax
+ mov ecx,_ebx
+ mov [ecx],ebx
+ }
+}
+
+static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+ ogg_uint32_t flags;
+ /*If there isn't even MMX, give up.*/
+ if(!(_edx&0x00800000))return 0;
+ flags=OC_CPU_X86_MMX;
+ if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
+ if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
+ if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
+ if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3;
+ if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
+ if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
+ return flags;
+}
+
+static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+ ogg_uint32_t flags;
+ /*If there isn't even MMX, give up.*/
+ if(!(_edx&0x00800000))return 0;
+ flags=OC_CPU_X86_MMX;
+ if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
+ if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
+ if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
+ if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A;
+ if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5;
+ return flags;
+}
+
+ogg_uint32_t oc_cpu_flags_get(void){
+ ogg_uint32_t flags;
+ ogg_uint32_t eax;
+ ogg_uint32_t ebx;
+ ogg_uint32_t ecx;
+ ogg_uint32_t edx;
+# if !defined(__amd64__)&&!defined(__x86_64__)
+ /*Not all x86-32 chips support cpuid, so we have to check.*/
+ oc_detect_cpuid_helper(&eax,&ebx);
+ /*No cpuid.*/
+ if(eax==ebx)return 0;
+# endif
+ cpuid(0,eax,ebx,ecx,edx);
+ /* l e t n I e n i u n e G*/
+ if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
+ /* 6 8 x M T e n i u n e G*/
+ ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
+ int family;
+ int model;
+ /*Intel, Transmeta (tested with Crusoe TM5800):*/
+ cpuid(1,eax,ebx,ecx,edx);
+ flags=oc_parse_intel_flags(edx,ecx);
+ family=(eax>>8)&0xF;
+ model=(eax>>4)&0xF;
+ /*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
+ unit, so don't use it.*/
+ if(family==6&&(model==9||model==13||model==14)){
+ flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
+ }
+ }
+ /* D M A c i t n e h t u A*/
+ else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
+ /* C S N y b e d o e G*/
+ ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
+ /*AMD, Geode:*/
+ cpuid(0x80000000,eax,ebx,ecx,edx);
+ if(eax<0x80000001)flags=0;
+ else{
+ cpuid(0x80000001,eax,ebx,ecx,edx);
+ flags=oc_parse_amd_flags(edx,ecx);
+ }
+ /*Also check for SSE.*/
+ cpuid(1,eax,ebx,ecx,edx);
+ flags|=oc_parse_intel_flags(edx,ecx);
+ }
+ /*Technically some VIA chips can be configured in the BIOS to return any
+ string here the user wants.
+ There is a special detection method that can be used to identify such
+ processors, but in my opinion, if the user really wants to change it, they
+ deserve what they get.*/
+ /* s l u a H r u a t n e C*/
+ else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
+ /*VIA:*/
+ /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
+ chips (thanks to the engineers from Centaur Technology who provided it).
+ These chips support Intel-like cpuid info.
+ The C3-2 (Nehemiah) cores appear to, as well.*/
+ cpuid(1,eax,ebx,ecx,edx);
+ flags=oc_parse_intel_flags(edx,ecx);
+ if(eax>=0x80000001){
+ /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
+ We need to check this even if the Intel test succeeds to pick up 3DNow!
+ support on these processors.
+ Unlike actual AMD processors, we cannot _rely_ on this info, since
+ some cores (e.g., the 693 stepping of the Nehemiah) claim to support
+ this function, yet return edx=0, despite the Intel test indicating
+ MMX support.
+ Therefore the features detected here are strictly added to those
+ detected by the Intel test.*/
+ /*TODO: How about earlier chips?*/
+ cpuid(0x80000001,eax,ebx,ecx,edx);
+ /*Note: As of the C7, this function returns Intel-style extended feature
+ flags, not AMD-style.
+ Currently, this only defines bits 11, 20, and 29 (0x20100800), which
+ do not conflict with any of the AMD flags we inspect.
+ For the remaining bits, Intel tells us, "Do not count on their value",
+ but VIA assures us that they will all be zero (at least on the C7 and
+ Isaiah chips).
+ In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
+ (0xC0C00000) for something else, we will have to add code to detect
+ the model to decide when it is appropriate to inspect them.*/
+ flags|=oc_parse_amd_flags(edx,ecx);
+ }
+ }
+ else{
+ /*Implement me.*/
+ flags=0;
+ }
+ return flags;
+}
+#endif
Copied: experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86cpu.h (from rev 17379, experimental/derf/theora-ptalarbvorm/lib/x86_vc/cpu.h)
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86cpu.h (rev 0)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86cpu.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,36 @@
+/********************************************************************
+ * *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
+ * *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ * *
+ ********************************************************************
+ function:
+ last mod: $Id$
+
+ ********************************************************************/
+
+#if !defined(_x86_vc_x86cpu_H)
+# define _x86_vc_x86cpu_H (1)
+#include "../internal.h"
+
+#define OC_CPU_X86_MMX (1<<0)
+#define OC_CPU_X86_3DNOW (1<<1)
+#define OC_CPU_X86_3DNOWEXT (1<<2)
+#define OC_CPU_X86_MMXEXT (1<<3)
+#define OC_CPU_X86_SSE (1<<4)
+#define OC_CPU_X86_SSE2 (1<<5)
+#define OC_CPU_X86_PNI (1<<6)
+#define OC_CPU_X86_SSSE3 (1<<7)
+#define OC_CPU_X86_SSE4_1 (1<<8)
+#define OC_CPU_X86_SSE4_2 (1<<9)
+#define OC_CPU_X86_SSE4A (1<<10)
+#define OC_CPU_X86_SSE5 (1<<11)
+
+ogg_uint32_t oc_cpu_flags_get(void);
+
+#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86enc.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86enc.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86enc.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -18,10 +18,10 @@
#if defined(OC_X86_ASM)
-void oc_enc_vtable_init_x86(oc_enc_ctx *_enc){
+void oc_enc_accel_init_x86(oc_enc_ctx *_enc){
ogg_uint32_t cpu_flags;
cpu_flags=_enc->state.cpu_flags;
- oc_enc_vtable_init_c(_enc);
+ oc_enc_accel_init_c(_enc);
if(cpu_flags&OC_CPU_X86_MMX){
_enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
_enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
Modified: experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86enc.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86enc.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86enc.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -17,10 +17,14 @@
#if !defined(_x86_vc_x86enc_H)
# define _x86_vc_x86enc_H (1)
+# include "x86int.h"
+# if defined(OC_X86_ASM)
+# define oc_enc_accel_init oc_enc_accel_init_x86
+# define OC_ENC_USE_VTABLE (1)
+# endif
# include "../encint.h"
-# include "x86int.h"
-void oc_enc_vtable_init_x86(oc_enc_ctx *_enc);
+void oc_enc_accel_init_x86(oc_enc_ctx *_enc);
unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
const unsigned char *_ref,int _ystride);
Modified: experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86int.h
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86int.h 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86int.h 2010-09-21 21:53:48 UTC (rev 17410)
@@ -18,26 +18,32 @@
#if !defined(_x86_vc_x86int_H)
# define _x86_vc_x86int_H (1)
# include "../internal.h"
-# include "cpu.h"
+# if defined(OC_X86_ASM)
+# define oc_state_accel_init oc_state_accel_init_x86
+# define OC_STATE_USE_VTABLE (1)
+# endif
+# include "../state.h"
+# include "x86cpu.h"
-void oc_state_vtable_init_x86(oc_theora_state *_state);
+void oc_state_accel_init_x86(oc_theora_state *_state);
void oc_frag_copy_mmx(unsigned char *_dst,
const unsigned char *_src,int _ystride);
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
const ogg_int16_t *_residue);
void oc_frag_recon_inter_mmx(unsigned char *_dst,
const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
-void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
- const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
- int _dst_frame,int _src_frame,int _pli);
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
+void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
- int _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
+ signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
void oc_restore_fpu_mmx(void);
#endif
Modified: experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86state.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86state.c 2010-09-21 19:04:31 UTC (rev 17409)
+++ experimental/derf/theora-ptalarbvorm/lib/x86_vc/x86state.c 2010-09-21 21:53:48 UTC (rev 17410)
@@ -40,21 +40,22 @@
64,64,64,64,64,64,64,64,
};
-void oc_state_vtable_init_x86(oc_theora_state *_state){
+void oc_state_accel_init_x86(oc_theora_state *_state){
_state->cpu_flags=oc_cpu_flags_get();
if(_state->cpu_flags&OC_CPU_X86_MMX){
_state->opt_vtable.frag_copy=oc_frag_copy_mmx;
+ _state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
_state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
_state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
_state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
_state->opt_vtable.idct8x8=oc_idct8x8_mmx;
_state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
- _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
+ _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
_state->opt_vtable.state_loop_filter_frag_rows=
oc_state_loop_filter_frag_rows_mmx;
_state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
_state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
}
- else oc_state_vtable_init_c(_state);
+ else oc_state_accel_init_c(_state);
}
#endif
Added: experimental/derf/theora-ptalarbvorm/m4/as-gcc-inline-assembly.m4
===================================================================
--- experimental/derf/theora-ptalarbvorm/m4/as-gcc-inline-assembly.m4 (rev 0)
+++ experimental/derf/theora-ptalarbvorm/m4/as-gcc-inline-assembly.m4 2010-09-21 21:53:48 UTC (rev 17410)
@@ -0,0 +1,106 @@
+dnl as-gcc-inline-assembly.m4 0.1.0
+
+dnl autostars m4 macro for detection of gcc inline assembly
+
+dnl David Schleef <ds at schleef.org>
+
+dnl $Id$
+
+dnl AS_GCC_INLINE_ASSEMBLY(ACTION-IF-ACCEPTED, [ACTION-IF-NOT-ACCEPTED])
+dnl Tries to compile a trivial gcc-style inline assembly statement.
+dnl Runs ACTION-IF-ACCEPTED if the compiler supports gcc-style inline
+dnl assembly, and ACTION-IF-NOT-ACCEPTED otherwise.
+
+AC_DEFUN([AS_GCC_INLINE_ASSEMBLY],
+[
+ AC_MSG_CHECKING([if compiler supports gcc-style inline assembly])
+
+ AC_TRY_COMPILE([], [
+#ifdef __GNUC_MINOR__
+#if (__GNUC__ * 1000 + __GNUC_MINOR__) < 3004
+#error GCC before 3.4 has critical bugs compiling inline assembly
+#endif
+#endif
+__asm__ (""::) ], [flag_ok=yes], [flag_ok=no])
+
+ if test "X$flag_ok" = Xyes ; then
+ $1
+ true
+ else
+ $2
+ true
+ fi
+ AC_MSG_RESULT([$flag_ok])
+])
+
+AC_DEFUN([AC_TRY_ASSEMBLE],
+[ac_c_ext=$ac_ext
+ ac_ext=${ac_s_ext-s}
+ cat > conftest.$ac_ext <<EOF
+ .file "configure"
+[$1]
+EOF
+if AC_TRY_EVAL(ac_compile); then
+ ac_ext=$ac_c_ext
+ ifelse([$2], , :, [ $2
+ rm -rf conftest*])
+else
+ echo "configure: failed program was:" >&AC_FD_CC
+ cat conftest.$ac_ext >&AC_FD_CC
+ ac_ext=$ac_c_ext
+ifelse([$3], , , [ rm -rf conftest*
+ $3
+])dnl
+fi
+rm -rf conftest*])
+
+
+AC_DEFUN([AS_ASM_ARM_NEON],
+[
+ AC_MSG_CHECKING([if assembler supports NEON instructions on ARM])
+
+ AC_TRY_ASSEMBLE([vorr d0,d0,d0], [flag_ok=yes], [flag_ok=no])
+
+ if test "X$flag_ok" = Xyes ; then
+ $1
+ true
+ else
+ $2
+ true
+ fi
+ AC_MSG_RESULT([$flag_ok])
+])
+
+
+AC_DEFUN([AS_ASM_ARM_MEDIA],
+[
+ AC_MSG_CHECKING([if assembler supports ARMv6 media instructions on ARM])
+
+ AC_TRY_ASSEMBLE([shadd8 r3,r3,r3], [flag_ok=yes], [flag_ok=no])
+
+ if test "X$flag_ok" = Xyes ; then
+ $1
+ true
+ else
+ $2
+ true
+ fi
+ AC_MSG_RESULT([$flag_ok])
+])
+
+
+AC_DEFUN([AS_ASM_ARM_EDSP],
+[
+ AC_MSG_CHECKING([if assembler supports EDSP instructions on ARM])
+
+ AC_TRY_ASSEMBLE([qadd r3,r3,r3], [flag_ok=yes], [flag_ok=no])
+
+ if test "X$flag_ok" = Xyes ; then
+ $1
+ true
+ else
+ $2
+ true
+ fi
+ AC_MSG_RESULT([$flag_ok])
+])
More information about the commits
mailing list