[xiph-commits] r15953 - in branches/theora-thusnelda: . lib lib/dec lib/dec/x86 lib/enc lib/enc/x86

tterribe at svn.xiph.org tterribe at svn.xiph.org
Sun Apr 26 07:30:18 PDT 2009


Author: tterribe
Date: 2009-04-26 07:30:15 -0700 (Sun, 26 Apr 2009)
New Revision: 15953

Added:
   branches/theora-thusnelda/lib/dec/x86/mmxfrag.h
   branches/theora-thusnelda/lib/dec/x86/mmxloop.h
   branches/theora-thusnelda/lib/enc/encfrag.c
   branches/theora-thusnelda/lib/enc/huffenc.c
   branches/theora-thusnelda/lib/enc/huffenc.h
   branches/theora-thusnelda/lib/enc/x86/mmxenc.c
   branches/theora-thusnelda/lib/enc/x86/mmxencfrag.c
   branches/theora-thusnelda/lib/enc/x86/mmxfdct.c
   branches/theora-thusnelda/lib/enc/x86/sse2fdct.c
   branches/theora-thusnelda/lib/enc/x86/x86enc.c
   branches/theora-thusnelda/lib/enc/x86/x86enc.h
Removed:
   branches/theora-thusnelda/lib/dec/idct.h
   branches/theora-thusnelda/lib/enc/dsp.c
   branches/theora-thusnelda/lib/enc/dsp.h
   branches/theora-thusnelda/lib/enc/encoder_huffman.c
   branches/theora-thusnelda/lib/enc/encoder_idct.c
   branches/theora-thusnelda/lib/enc/reconstruct.c
   branches/theora-thusnelda/lib/enc/x86/dct_decode_mmx.c
   branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c
   branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c
   branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c
   branches/theora-thusnelda/lib/enc/x86/fdct_sse2.c
   branches/theora-thusnelda/lib/enc/x86/idct_mmx.c
   branches/theora-thusnelda/lib/enc/x86/recon_mmx.c
Modified:
   branches/theora-thusnelda/configure.ac
   branches/theora-thusnelda/lib/Makefile.am
   branches/theora-thusnelda/lib/cpu.c
   branches/theora-thusnelda/lib/dec/apiwrapper.c
   branches/theora-thusnelda/lib/dec/decapiwrapper.c
   branches/theora-thusnelda/lib/dec/decinfo.c
   branches/theora-thusnelda/lib/dec/decint.h
   branches/theora-thusnelda/lib/dec/decode.c
   branches/theora-thusnelda/lib/dec/dequant.c
   branches/theora-thusnelda/lib/dec/fragment.c
   branches/theora-thusnelda/lib/dec/huffdec.c
   branches/theora-thusnelda/lib/dec/huffdec.h
   branches/theora-thusnelda/lib/dec/huffman.h
   branches/theora-thusnelda/lib/dec/idct.c
   branches/theora-thusnelda/lib/dec/internal.c
   branches/theora-thusnelda/lib/dec/quant.c
   branches/theora-thusnelda/lib/dec/state.c
   branches/theora-thusnelda/lib/dec/x86/mmxfrag.c
   branches/theora-thusnelda/lib/dec/x86/mmxidct.c
   branches/theora-thusnelda/lib/dec/x86/mmxstate.c
   branches/theora-thusnelda/lib/dec/x86/x86int.h
   branches/theora-thusnelda/lib/dec/x86/x86state.c
   branches/theora-thusnelda/lib/enc/codec_internal.h
   branches/theora-thusnelda/lib/enc/dct.c
   branches/theora-thusnelda/lib/enc/dct_decode.c
   branches/theora-thusnelda/lib/enc/dct_encode.c
   branches/theora-thusnelda/lib/enc/encapiwrapper.c
   branches/theora-thusnelda/lib/enc/encode.c
   branches/theora-thusnelda/lib/enc/encoder_quant.c
   branches/theora-thusnelda/lib/enc/encoder_toplevel.c
   branches/theora-thusnelda/lib/enc/mcenc.c
   branches/theora-thusnelda/lib/enc/mode.c
   branches/theora-thusnelda/lib/enc/x86/dsp_sse2.c
   branches/theora-thusnelda/lib/internal.h
Log:
Oh hai. I rewrote all ur asm.
Includes an MMX implementation of the new fDCT for x86-32.
Overall decoder speed-ups are 4.7% for x86-32 and 3.8% for x86-64.
Encoder speed-up is 3.1% for x86-64; x86-32 speeds aren't comparable because
 the fDCT has changed (though overall speed-up over using the C version of the
 new fDCT is 6.1%, which gives some perspective).
Also, the encoder output with asm enabled should now match that of
 --disable-asm (which was a slight speed regression, but other gains more than
 made up for it).
There is still room for improvement (notably quantization and dequantization),
 but this version is unequivocally better, so it's time to check it in.

Also in this commit, the long involved process of getting the encoder to share
 code with the decoder has begun; this breaks all of the other build systems.
Lots more of this in the future.
Huffman table encoding and decoding has been redone, a VFR rate-control
 accounting bug was fixed and TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE was
 actually implemented (maybe xiphmont just forgot to check that part of r15914
 in?).


Modified: branches/theora-thusnelda/configure.ac
===================================================================
--- branches/theora-thusnelda/configure.ac	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/configure.ac	2009-04-26 14:30:15 UTC (rev 15953)
@@ -186,7 +186,7 @@
   i[[3456]]86)
     cpu_x86_32=yes 
     cpu_optimization="32 bit x86"
-    AC_DEFINE([USE_ASM], [],  [make use of asm optimization])
+    AC_DEFINE([OC_X86_ASM], [],  [make use of x86 asm optimization])
     if test "x$target_vendor" = "xapple"; then
       THEORA_LDFLAGS="$THEORA_LDFLAGS  -Wl,-read_only_relocs,suppress"
     fi
@@ -194,7 +194,8 @@
   x86_64)
     cpu_x86_64=yes
     cpu_optimization="64 bit x86"
-    AC_DEFINE([USE_ASM], [],  [make use of asm optimization])  
+    AC_DEFINE([OC_X86_ASM], [],  [make use of x86 asm optimization])
+    AC_DEFINE([OC_X86_64_ASM], [],  [make use of x86_64 asm optimization])
     ;;
   esac
 else

Modified: branches/theora-thusnelda/lib/Makefile.am
===================================================================
--- branches/theora-thusnelda/lib/Makefile.am	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/Makefile.am	2009-04-26 14:30:15 UTC (rev 15953)
@@ -3,16 +3,13 @@
 
 EXTRA_DIST = \
 	cpu.c \
-        enc/x86/dct_decode_mmx.c \
-        enc/x86/dsp_mmx.c \
-        enc/x86/dsp_mmxext.c \
-        enc/x86/recon_mmx.c \
         enc/x86/fdct_mmx.c \
         enc/x86/fdct_sse2.c \
-        enc/x86/idct_mmx.c \
+        enc/x86/mmxenc.c \
+        enc/x86/mmxencfrag.c \
+        enc/x86/x86enc.c \
         enc/x86_32_vs/dsp_mmx.c \
         enc/x86_32_vs/fdct_mmx.c \
-        enc/x86_32_vs/recon_mmx.c \
         enc/dct_encode.c \
         enc/encode.c \
         enc/encoder_toplevel.c \
@@ -25,43 +22,79 @@
 	enc/encapiwrapper.c \
 	enc/encoder_disabled.c
 else
-encoder_sources = \
+encoder_uniq_x86_sources = \
+	enc/x86/mmxenc.c \
+	enc/x86/mmxencfrag.c \
+	enc/x86/mmxfdct.c \
+	enc/x86/x86enc.c
+
+encoder_uniq_x86_64_sources = \
+	enc/x86/sse2fdct.c
+
+encoder_shared_x86_sources = \
+	dec/x86/mmxfrag.c \
+	dec/x86/mmxidct.c
+
+encoder_shared_x86_64_sources =
+
+if CPU_x86_64
+encoder_uniq_arch_sources = \
+ $(encoder_uniq_x86_sources) \
+ $(encoder_uniq_x86_64_sources)
+encoder_shared_arch_sources = \
+ $(encoder_shared_x86_sources) \
+ $(encoder_shared_x86_64_sources)
+else
+if CPU_x86_32
+encoder_uniq_arch_sources = $(encoder_uniq_x86_sources)
+encoder_shared_arch_sources = $(encoder_shared_x86_sources)
+else
+encoder_uniq_arch_sources =
+encoder_shared_arch_sources =
+endif
+endif
+
+encoder_uniq_sources = \
 	enc/dct.c \
 	enc/dct_decode.c \
 	enc/dct_encode.c \
-	enc/dsp.c \
+	enc/encfrag.c \
 	enc/encapiwrapper.c \
 	enc/encode.c \
-	enc/encoder_huffman.c \
-	enc/encoder_idct.c \
 	enc/encoder_toplevel.c \
 	enc/encoder_quant.c \
 	enc/frarray.c \
 	enc/frinit.c \
+	enc/huffenc.c \
 	enc/mathops.c \
 	enc/mcenc.c \
 	enc/mode.c \
-	enc/reconstruct.c
+	$(encoder_uniq_arch_sources)
 
-encoder_x86_sources = \
-	enc/x86/dct_decode_mmx.c \
-	enc/x86/dsp_mmx.c \
-	enc/x86/dsp_mmxext.c \
-	enc/x86/recon_mmx.c \
-	enc/x86/idct_mmx.c \
-	enc/x86/fdct_mmx.c \
-	enc/x86/fdct_sse2.c
+encoder_sources = \
+	dec/fragment.c \
+	dec/idct.c \
+	dec/internal.c \
+	$(encoder_shared_arch_sources) \
+	$(encoder_uniq_sources)
 
+endif
+
+decoder_x86_sources = \
+	dec/x86/mmxidct.c \
+	dec/x86/mmxfrag.c \
+	dec/x86/mmxstate.c \
+	dec/x86/x86state.c
 if CPU_x86_64
-encoder_arch_sources = $(encoder_x86_sources)
+decoder_arch_sources = $(decoder_x86_sources)
 else
 if CPU_x86_32
-encoder_arch_sources = $(encoder_x86_sources)
+decoder_arch_sources = $(decoder_x86_sources)
+else
+decoder_arch_sources =
 endif
 endif
 
-endif
-
 decoder_sources = \
 	dec/apiwrapper.c \
 	dec/bitpack.c \
@@ -75,28 +108,13 @@
 	dec/info.c \
 	dec/internal.c \
 	dec/quant.c \
-	dec/state.c
+	dec/state.c \
+	$(decoder_arch_sources)
 
-decoder_x86_sources = \
-	dec/x86/mmxidct.c \
-	dec/x86/mmxfrag.c \
-	dec/x86/mmxstate.c \
-	dec/x86/x86state.c
-if CPU_x86_64
-decoder_arch_sources = $(decoder_x86_sources)
-else
-if CPU_x86_32
-decoder_arch_sources = $(decoder_x86_sources)
-else
-decoder_arch_sources =
-endif
-endif
-
 noinst_HEADERS = \
 	cpu.h \
 	internal.h \
 	enc/codec_internal.h \
-	enc/dsp.h \
 	enc/encoder_huffman.h \
 	enc/encoder_lookup.h \
 	enc/enquant.h \
@@ -112,33 +130,28 @@
 	dec/dequant.h \
 	dec/huffdec.h \
 	dec/huffman.h \
-	dec/idct.h \
 	dec/ocintrin.h \
 	dec/quant.h \
 	dec/x86/x86int.h
 
 libtheoradec_la_SOURCES = \
-	$(decoder_arch_sources) \
 	$(decoder_sources) \
-  Version_script-dec
+	Version_script-dec
 libtheoradec_la_LDFLAGS = \
   -version-info @THDEC_LIB_CURRENT@:@THDEC_LIB_REVISION@:@THDEC_LIB_AGE@ \
   @THEORADEC_LDFLAGS@ @CAIRO_LIBS@
 
 libtheoraenc_la_SOURCES = \
-	$(encoder_arch_sources) \
 	$(encoder_sources) \
-  Version_script-enc
+	Version_script-enc
 libtheoraenc_la_LDFLAGS = \
   -version-info @THENC_LIB_CURRENT@:@THENC_LIB_REVISION@:@THENC_LIB_AGE@ \
   @THEORAENC_LDFLAGS@ $(OGG_LIBS)
 
 libtheora_la_SOURCES = \
-	$(decoder_arch_sources) \
 	$(decoder_sources) \
-	$(encoder_arch_sources) \
-	$(encoder_sources) \
-  Version_script
+	$(encoder_uniq_sources) \
+	Version_script
 libtheora_la_LDFLAGS = \
   -version-info @TH_LIB_CURRENT@:@TH_LIB_REVISION@:@TH_LIB_AGE@ \
   @THEORA_LDFLAGS@ @CAIRO_LIBS@ $(OGG_LIBS)

Modified: branches/theora-thusnelda/lib/cpu.c
===================================================================
--- branches/theora-thusnelda/lib/cpu.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/cpu.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -20,7 +20,7 @@
 
 #include "cpu.h"
 
-#if !defined(USE_ASM)
+#if !defined(OC_X86_ASM)
 static ogg_uint32_t oc_cpu_flags_get(void){
   return 0;
 }

Modified: branches/theora-thusnelda/lib/dec/apiwrapper.c
===================================================================
--- branches/theora-thusnelda/lib/dec/apiwrapper.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/apiwrapper.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -47,10 +47,10 @@
 void theora_clear(theora_state *_th){
   /*Provide compatibility with mixed encoder and decoder shared lib versions.*/
   if(_th->internal_decode!=NULL){
-    (*((oc_state_dispatch_vtbl *)_th->internal_decode)->clear)(_th);
+    (*((oc_state_dispatch_vtable *)_th->internal_decode)->clear)(_th);
   }
   if(_th->internal_encode!=NULL){
-    (*((oc_state_dispatch_vtbl *)_th->internal_encode)->clear)(_th);
+    (*((oc_state_dispatch_vtable *)_th->internal_encode)->clear)(_th);
   }
   if(_th->i!=NULL)theora_info_clear(_th->i);
   memset(_th,0,sizeof(*_th));
@@ -59,11 +59,11 @@
 int theora_control(theora_state *_th,int _req,void *_buf,size_t _buf_sz){
   /*Provide compatibility with mixed encoder and decoder shared lib versions.*/
   if(_th->internal_decode!=NULL){
-    return (*((oc_state_dispatch_vtbl *)_th->internal_decode)->control)(_th,
+    return (*((oc_state_dispatch_vtable *)_th->internal_decode)->control)(_th,
      _req,_buf,_buf_sz);
   }
   else if(_th->internal_encode!=NULL){
-    return (*((oc_state_dispatch_vtbl *)_th->internal_encode)->control)(_th,
+    return (*((oc_state_dispatch_vtable *)_th->internal_encode)->control)(_th,
      _req,_buf,_buf_sz);
   }
   else return TH_EINVAL;
@@ -72,11 +72,11 @@
 ogg_int64_t theora_granule_frame(theora_state *_th,ogg_int64_t _gp){
   /*Provide compatibility with mixed encoder and decoder shared lib versions.*/
   if(_th->internal_decode!=NULL){
-    return (*((oc_state_dispatch_vtbl *)_th->internal_decode)->granule_frame)(
+    return (*((oc_state_dispatch_vtable *)_th->internal_decode)->granule_frame)(
      _th,_gp);
   }
   else if(_th->internal_encode!=NULL){
-    return (*((oc_state_dispatch_vtbl *)_th->internal_encode)->granule_frame)(
+    return (*((oc_state_dispatch_vtable *)_th->internal_encode)->granule_frame)(
      _th,_gp);
   }
   else return -1;
@@ -85,11 +85,11 @@
 double theora_granule_time(theora_state *_th, ogg_int64_t _gp){
   /*Provide compatibility with mixed encoder and decoder shared lib versions.*/
   if(_th->internal_decode!=NULL){
-    return (*((oc_state_dispatch_vtbl *)_th->internal_decode)->granule_time)(
+    return (*((oc_state_dispatch_vtable *)_th->internal_decode)->granule_time)(
      _th,_gp);
   }
   else if(_th->internal_encode!=NULL){
-    return (*((oc_state_dispatch_vtbl *)_th->internal_encode)->granule_time)(
+    return (*((oc_state_dispatch_vtable *)_th->internal_encode)->granule_time)(
      _th,_gp);
   }
   else return -1;

Modified: branches/theora-thusnelda/lib/dec/decapiwrapper.c
===================================================================
--- branches/theora-thusnelda/lib/dec/decapiwrapper.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/decapiwrapper.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -52,7 +52,7 @@
   return th_granule_time(((th_api_wrapper *)_td->i->codec_setup)->decode,_gp);
 }
 
-static const oc_state_dispatch_vtbl OC_DEC_DISPATCH_VTBL={
+static const oc_state_dispatch_vtable OC_DEC_DISPATCH_VTBL={
   (oc_state_clear_func)theora_decode_clear,
   (oc_state_control_func)theora_decode_control,
   (oc_state_granule_frame_func)theora_decode_granule_frame,

Modified: branches/theora-thusnelda/lib/dec/decinfo.c
===================================================================
--- branches/theora-thusnelda/lib/dec/decinfo.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/decinfo.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -116,8 +116,8 @@
   _tc->vendor[len]='\0';
   /*Read the user comments.*/
   _tc->comments=(int)oc_unpack_length(_opb);
-  if(_tc->comments<0||_tc->comments>(LONG_MAX>>2)||
-   ((long)_tc->comments<<2)>_opb->storage-theorapackB_bytes(_opb)){
+  len=_tc->comments;
+  if(len<0||len>(LONG_MAX>>2)||len<<2>_opb->storage-theorapackB_bytes(_opb)){
     _tc->comments=0;
     return TH_EBADHEADER;
   }

Modified: branches/theora-thusnelda/lib/dec/decint.h
===================================================================
--- branches/theora-thusnelda/lib/dec/decint.h	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/decint.h	2009-04-26 14:30:15 UTC (rev 15953)
@@ -25,7 +25,6 @@
 typedef struct th_setup_info oc_setup_info;
 typedef struct th_dec_ctx    oc_dec_ctx;
 
-# include "idct.h"
 # include "huffdec.h"
 # include "dequant.h"
 

Modified: branches/theora-thusnelda/lib/dec/decode.c
===================================================================
--- branches/theora-thusnelda/lib/dec/decode.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/decode.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -175,10 +175,10 @@
     int qsum;
     qsum=0;
     for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
-      qsum+=_dec->state.dequant_tables[qti][pli][qi][18]+
-       _dec->state.dequant_tables[qti][pli][qi][19]+
-       _dec->state.dequant_tables[qti][pli][qi][26]+
-       _dec->state.dequant_tables[qti][pli][qi][27]<<(pli==0);
+      qsum+=_dec->state.dequant_tables[qti][pli][qi][12]+
+       _dec->state.dequant_tables[qti][pli][qi][17]+
+       _dec->state.dequant_tables[qti][pli][qi][18]+
+       _dec->state.dequant_tables[qti][pli][qi][24]<<(pli==0);
     }
     _dec->pp_sharp_mod[qi]=-(qsum>>11);
   }
@@ -1397,7 +1397,7 @@
   coded_fragi_end+=_pipe->ncoded_fragis[_pli];
   for(;coded_fragi<coded_fragi_end;coded_fragi++){
     oc_fragment    *frag;
-    oc_quant_table *iquants;
+    oc_quant_table *quants;
     /*This array is made one bigger than necessary so that an invalid zero
        run cannot cause a buffer overflow.
       The inverse zig-zag mapping sends all out of range indices to the last
@@ -1432,11 +1432,11 @@
       If it's not, we should report some kind of warning.*/
     zzi=OC_MINI(zzi,64);
     dct_coeffs[0]=(ogg_int16_t)frag->dc;
-    iquants=_dec->state.dequant_tables[frag->mbmode!=OC_MODE_INTRA][_pli];
+    quants=_dec->state.dequant_tables[frag->mbmode!=OC_MODE_INTRA][_pli];
     /*last_zzi is always initialized.
       If your compiler thinks otherwise, it is dumb.*/
     oc_state_frag_recon(&_dec->state,frag,_pli,dct_coeffs,last_zzi,zzi,
-     iquants[_dec->state.qis[0]][0],iquants[frag->qi]);
+     quants[_dec->state.qis[0]][0],quants[frag->qi]);
   }
   _pipe->coded_fragis[_pli]=coded_fragi;
   /*Right now the reconstructed MCU has only the coded blocks in it.*/
@@ -1451,7 +1451,7 @@
      correctly.*/
   /*Copy the uncoded blocks from the previous reference frame.*/
   _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
-  oc_state_frag_copy(&_dec->state,_pipe->uncoded_fragis[_pli],
+  oc_state_frag_copy_list(&_dec->state,_pipe->uncoded_fragis[_pli],
    _pipe->nuncoded_fragis[_pli],OC_FRAME_SELF,OC_FRAME_PREV,_pli);
 }
 
@@ -2031,6 +2031,10 @@
          pipe.fragy_end[pli]-edelay<<frag_shift);
       }
       if(_dec->stripe_cb.stripe_decoded!=NULL){
+        /*The callback might want to use the FPU, so let's make sure they can.
+          We violate all kinds of ABI restrictions by not doing this until
+           now, but none of them actually matter.*/
+        oc_restore_fpu(&_dec->state);
         /*Make the callback, ensuring we flip the sense of the "start" and
            "end" of the available region upside down.*/
         (*_dec->stripe_cb.stripe_decoded)(_dec->stripe_cb.ctx,stripe_buf,
@@ -2057,6 +2061,7 @@
     /*Don't dump images for dropped frames.*/
     oc_state_dump_frame(&_dec->state,OC_FRAME_SELF,"dec");
 #endif
+    oc_restore_fpu(&_dec->state);
     return 0;
   }
   else{

Modified: branches/theora-thusnelda/lib/dec/dequant.c
===================================================================
--- branches/theora-thusnelda/lib/dec/dequant.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/dequant.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -21,8 +21,7 @@
 #include "dequant.h"
 #include "decint.h"
 
-int oc_quant_params_unpack(oggpack_buffer *_opb,
- th_quant_info *_qinfo){
+int oc_quant_params_unpack(oggpack_buffer *_opb,th_quant_info *_qinfo){
   th_quant_base *base_mats;
   long           val;
   int            nbase_mats;
@@ -127,7 +126,6 @@
     }
     while(qri-->0);
   }
-
   _ogg_free(base_mats);
   return 0;
 }

Modified: branches/theora-thusnelda/lib/dec/fragment.c
===================================================================
--- branches/theora-thusnelda/lib/dec/fragment.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/fragment.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -14,74 +14,79 @@
     last mod: $Id$
 
  ********************************************************************/
-
+#include <string.h>
 #include "../internal.h"
 
+void oc_frag_copy(const oc_theora_state *_state,unsigned char *_dst,
+ const unsigned char *_src,int _ystride){
+  (*_state->opt_vtable.frag_copy)(_dst,_src,_ystride);
+}
+
+void oc_frag_copy_c(unsigned char *_dst,const unsigned char *_src,int _ystride){
+  int i;
+  for(i=8;i-->0;){
+    memcpy(_dst,_src,8*sizeof(*_dst));
+    _dst+=_ystride;
+    _src+=_ystride;
+  }
+}
+
 void oc_frag_recon_intra(const oc_theora_state *_state,unsigned char *_dst,
- int _dst_ystride,const ogg_int16_t *_residue){
-  _state->opt_vtable.frag_recon_intra(_dst,_dst_ystride,_residue);
+ int _ystride,const ogg_int16_t _residue[64]){
+  _state->opt_vtable.frag_recon_intra(_dst,_ystride,_residue);
 }
 
-void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride,
- const ogg_int16_t *_residue){
+void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride,
+ const ogg_int16_t _residue[64]){
   int i;
   for(i=0;i<8;i++){
     int j;
-    for(j=0;j<8;j++){
-      int res;
-      res=*_residue++;
-      _dst[j]=OC_CLAMP255(res+128);
-    }
-    _dst+=_dst_ystride;
+    for(j=0;j<8;j++)_dst[j]=OC_CLAMP255(_residue[i*8+j]+128);
+    _dst+=_ystride;
   }
 }
 
 void oc_frag_recon_inter(const oc_theora_state *_state,unsigned char *_dst,
- int _dst_ystride,const unsigned char *_src,int _src_ystride,
- const ogg_int16_t *_residue){
-  _state->opt_vtable.frag_recon_inter(_dst,_dst_ystride,_src,_src_ystride,
-   _residue);
+ const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
+  _state->opt_vtable.frag_recon_inter(_dst,_src,_ystride,_residue);
 }
 
-void oc_frag_recon_inter_c(unsigned char *_dst,int _dst_ystride,
- const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue){
+void oc_frag_recon_inter_c(unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
   int i;
   for(i=0;i<8;i++){
     int j;
-    for(j=0;j<8;j++){
-      int res;
-      res=*_residue++;
-      _dst[j]=OC_CLAMP255(res+_src[j]);
-    }
-    _dst+=_dst_ystride;
-    _src+=_src_ystride;
+    for(j=0;j<8;j++)_dst[j]=OC_CLAMP255(_residue[i*8+j]+_src[j]);
+    _dst+=_ystride;
+    _src+=_ystride;
   }
 }
 
 void oc_frag_recon_inter2(const oc_theora_state *_state,unsigned char *_dst,
- int _dst_ystride,const unsigned char *_src1,int _src1_ystride,
- const unsigned char *_src2,int _src2_ystride,const ogg_int16_t *_residue){
-  _state->opt_vtable.frag_recon_inter2(_dst,_dst_ystride,_src1,_src1_ystride,
-   _src2,_src2_ystride,_residue);
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride,
+ const ogg_int16_t _residue[64]){
+  _state->opt_vtable.frag_recon_inter2(_dst,_src1,_src2,_ystride,_residue);
 }
 
-void oc_frag_recon_inter2_c(unsigned char *_dst,int _dst_ystride,
- const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
- int _src2_ystride,const ogg_int16_t *_residue){
+void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]){
   int i;
   for(i=0;i<8;i++){
     int j;
-    for(j=0;j<8;j++){
-      int res;
-      res=*_residue++;
-      _dst[j]=OC_CLAMP255(res+((int)_src1[j]+_src2[j]>>1));
-    }
-    _dst+=_dst_ystride;
-    _src1+=_src1_ystride;
-    _src2+=_src2_ystride;
+    for(j=0;j<8;j++)_dst[j]=OC_CLAMP255(_residue[i*8+j]+(_src1[j]+_src2[j]>>1));
+    _dst+=_ystride;
+    _src1+=_ystride;
+    _src2+=_ystride;
   }
 }
 
+void oc_restore_fpu(const oc_theora_state *_state){
+  _state->opt_vtable.restore_fpu();
+}
+
+void oc_restore_fpu_c(void){}
+
+
 /*Computes the predicted DC value for the given fragment.
   This requires that the fully decoded DC values be available for the left,
    upper-left, upper, and upper-right fragments (if they exist).

Modified: branches/theora-thusnelda/lib/dec/huffdec.c
===================================================================
--- branches/theora-thusnelda/lib/dec/huffdec.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/huffdec.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -131,40 +131,42 @@
 }
 
 /*Unpacks a sub-tree from the given buffer.
-  _opb:    The buffer to unpack from.
-  _binode: The location to store a pointer to the sub-tree in.
-  _depth:  The current depth of the tree.
-           This is used to prevent infinite recursion.
+  _opb:      The buffer to unpack from.
+  _binodes:  The nodes to store the sub-tree in.
+  _nbinodes: The number of nodes available for the sub-tree.
   Return: 0 on success, or a negative value on error.*/
 static int oc_huff_tree_unpack(oggpack_buffer *_opb,
- oc_huff_node **_binode,int _depth){
+ oc_huff_node *_binodes,int _nbinodes){
   oc_huff_node *binode;
   long          bits;
-  /*Prevent infinite recursion.*/
-  if(++_depth>32)return TH_EBADHEADER;
+  int           nused;
+  if(_nbinodes<1)return TH_EBADHEADER;
+  binode=_binodes;
+  nused=1;
   if(theorapackB_read1(_opb,&bits)<0)return TH_EBADHEADER;
   /*Read an internal node:*/
   if(!bits){
     int ret;
-    binode=oc_huff_node_alloc(1);
-    binode->depth=(unsigned char)(_depth>1);
-    ret=oc_huff_tree_unpack(_opb,binode->nodes,_depth);
-    if(ret>=0)ret=oc_huff_tree_unpack(_opb,binode->nodes+1,_depth);
-    if(ret<0){
-      oc_huff_tree_free(binode);
-      *_binode=NULL;
-      return ret;
+    binode->nbits=1;
+    binode->depth=1;
+    binode->nodes[0]=_binodes+nused;
+    ret=oc_huff_tree_unpack(_opb,_binodes+nused,_nbinodes-nused);
+    if(ret>=0){
+      nused+=ret;
+      binode->nodes[1]=_binodes+nused;
+      ret=oc_huff_tree_unpack(_opb,_binodes+nused,_nbinodes-nused);
     }
+    if(ret<0)return ret;
+    nused+=ret;
   }
   /*Read a leaf node:*/
   else{
     if(theorapackB_read(_opb,OC_NDCT_TOKEN_BITS,&bits)<0)return TH_EBADHEADER;
-    binode=oc_huff_node_alloc(0);
-    binode->depth=(unsigned char)(_depth>1);
+    binode->nbits=0;
+    binode->depth=1;
     binode->token=(unsigned char)bits;
   }
-  *_binode=binode;
-  return 0;
+  return nused;
 }
 
 /*Finds the depth of shortest branch of the given sub-tree.
@@ -197,6 +199,28 @@
   }
 }
 
+/*Makes a copy of the given Huffman tree.
+  _node: The Huffman tree to copy.
+  Return: The copy of the Huffman tree.*/
+static oc_huff_node *oc_huff_tree_copy(const oc_huff_node *_node){
+  oc_huff_node *ret;
+  ret=oc_huff_node_alloc(_node->nbits);
+  ret->depth=_node->depth;
+  if(_node->nbits){
+    int nchildren;
+    int i;
+    int inext;
+    nchildren=1<<_node->nbits;
+    for(i=0;i<nchildren;){
+      ret->nodes[i]=oc_huff_tree_copy(_node->nodes[i]);
+      inext=i+(1<<_node->nbits-ret->nodes[i]->depth);
+      while(++i<inext)ret->nodes[i]=ret->nodes[i-1];
+    }
+  }
+  else ret->token=_node->token;
+  return ret;
+}
+
 static oc_huff_node *oc_huff_tree_collapse(oc_huff_node *_binode);
 
 /*Fills the given nodes table with all the children in the sub-tree at the
@@ -224,7 +248,6 @@
     _level--;
     oc_huff_node_fill(_nodes,_binode->nodes[0],_level,_depth);
     oc_huff_node_fill(_nodes+(1<<_level),_binode->nodes[1],_level,_depth);
-    oc_huff_node_free(_binode);
   }
 }
 
@@ -248,35 +271,13 @@
   }
   while(occupancy>loccupancy&&occupancy>=1<<OC_MAXI(depth-OC_HUFF_SLUSH,0));
   depth--;
-  if(depth<=1)return _binode;
+  if(depth<=1)return oc_huff_tree_copy(_binode);
   root=oc_huff_node_alloc(depth);
   root->depth=_binode->depth;
   oc_huff_node_fill(root->nodes,_binode,depth,depth);
   return root;
 }
 
-/*Makes a copy of the given Huffman tree.
-  _node: The Huffman tree to copy.
-  Return: The copy of the Huffman tree.*/
-static oc_huff_node *oc_huff_tree_copy(const oc_huff_node *_node){
-  oc_huff_node *ret;
-  ret=oc_huff_node_alloc(_node->nbits);
-  ret->depth=_node->depth;
-  if(_node->nbits){
-    int nchildren;
-    int i;
-    int inext;
-    nchildren=1<<_node->nbits;
-    for(i=0;i<nchildren;){
-      ret->nodes[i]=oc_huff_tree_copy(_node->nodes[i]);
-      inext=i+(1<<_node->nbits-ret->nodes[i]->depth);
-      while(++i<inext)ret->nodes[i]=ret->nodes[i-1];
-    }
-  }
-  else ret->token=_node->token;
-  return ret;
-}
-
 /*Unpacks a set of Huffman trees, and reduces them to a collapsed
    representation.
   _opb:   The buffer to unpack the trees from.
@@ -286,10 +287,12 @@
  oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]){
   int i;
   for(i=0;i<TH_NHUFFMAN_TABLES;i++){
-    int ret;
-    ret=oc_huff_tree_unpack(_opb,_nodes+i,0);
+    oc_huff_node nodes[63];
+    int          ret;
+    /*Unpack the full tree into a temporary buffer.*/
+    ret=oc_huff_tree_unpack(_opb,nodes,63);
     if(ret<0)return ret;
-    _nodes[i]=oc_huff_tree_collapse(_nodes[i]);
+    _nodes[i]=oc_huff_tree_collapse(nodes);
   }
   return 0;
 }

Modified: branches/theora-thusnelda/lib/dec/huffdec.h
===================================================================
--- branches/theora-thusnelda/lib/dec/huffdec.h	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/huffdec.h	2009-04-26 14:30:15 UTC (rev 15953)
@@ -75,7 +75,7 @@
     The ACTUAL size of this array is 1<<nbits, despite what the declaration
      below claims.
     The exception is that for leaf nodes the size is 0.*/
-  oc_huff_node  *nodes[1];
+  oc_huff_node  *nodes[2];
 };
 
 

Modified: branches/theora-thusnelda/lib/dec/huffman.h
===================================================================
--- branches/theora-thusnelda/lib/dec/huffman.h	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/huffman.h	2009-04-26 14:30:15 UTC (rev 15953)
@@ -65,6 +65,6 @@
 #define OC_NDCT_RUN_MAX          (32)
 #define OC_NDCT_RUN_CAT1A_MAX    (28)
 
-extern const int OC_DCT_TOKEN_EXTRA_BITS[TH_NDCT_TOKENS];
+extern const unsigned char OC_DCT_TOKEN_EXTRA_BITS[TH_NDCT_TOKENS];
 
 #endif

Modified: branches/theora-thusnelda/lib/dec/idct.c
===================================================================
--- branches/theora-thusnelda/lib/dec/idct.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/idct.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -16,9 +16,8 @@
  ********************************************************************/
 
 #include <string.h>
-#include <ogg/ogg.h>
+#include "../internal.h"
 #include "dct.h"
-#include "idct.h"
 
 /*Performs an inverse 8 point Type-II DCT transform.
   The output is scaled by a factor of 2 relative to the orthonormal version of
@@ -220,19 +219,29 @@
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
    version of the transform.
+  All coefficients but the first 3 in zig-zag scan order are assumed to be 0:
+   x  x  0  0  0  0  0  0
+   x  0  0  0  0  0  0  0
+   0  0  0  0  0  0  0  0
+   0  0  0  0  0  0  0  0
+   0  0  0  0  0  0  0  0
+   0  0  0  0  0  0  0  0
+   0  0  0  0  0  0  0  0
+   0  0  0  0  0  0  0  0
   _y: The buffer to store the result in.
       This may be the same as _x.
-  _x: The input coefficients. */
-void oc_idct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  _x: The input coefficients.*/
+static void oc_idct8x8_3(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
   const ogg_int16_t *in;
   ogg_int16_t       *end;
   ogg_int16_t       *out;
   ogg_int16_t        w[64];
   /*Transform rows of x into columns of w.*/
-  for(in=_x,out=w,end=out+8;out<end;in+=8,out++)idct8(out,in);
+  idct8_2(w,_x);
+  idct8_1(w+1,_x+8);
   /*Transform rows of w into columns of y.*/
-  for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8(out,in);
-  /*Adjust for scale factor.*/
+  for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_2(out,in);
+  /*Adjust for the scale factor.*/
   for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
 }
 
@@ -250,8 +259,8 @@
    0  0  0  0  0  0  0  0
   _y: The buffer to store the result in.
       This may be the same as _x.
-  _x: The input coefficients. */
-void oc_idct8x8_10_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  _x: The input coefficients.*/
+static void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
   const ogg_int16_t *in;
   ogg_int16_t       *end;
   ogg_int16_t       *out;
@@ -263,6 +272,99 @@
   idct8_1(w+3,_x+24);
   /*Transform rows of w into columns of y.*/
   for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_4(out,in);
-  /*Adjust for scale factor.*/
+  /*Adjust for the scale factor.*/
   for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
 }
+
+/*Performs an inverse 8x8 Type-II DCT transform.
+  The input is assumed to be scaled by a factor of 4 relative to the
+   orthonormal version of the transform.
+  _y: The buffer to store the result in.
+      This may be the same as _x.
+  _x: The input coefficients.*/
+static void oc_idct8x8_slow(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  const ogg_int16_t *in;
+  ogg_int16_t       *end;
+  ogg_int16_t       *out;
+  ogg_int16_t        w[64];
+  /*Transform rows of x into columns of w.*/
+  for(in=_x,out=w,end=out+8;out<end;in+=8,out++)idct8(out,in);
+  /*Transform rows of w into columns of y.*/
+  for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8(out,in);
+  /*Adjust for the scale factor.*/
+  for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
+}
+
+void oc_dequant_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
+ const ogg_int16_t _x[64],int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
+ const ogg_uint16_t _ac_quant[64]){
+  (*_state->opt_vtable.dequant_idct8x8)(_y,_x,_last_zzi,_ncoefs,
+   _dc_quant,_ac_quant);
+}
+
+/*Dequantizes a block of coefficients and performs an inverse 8x8 Type-II DCT.
+  The dequantized coefficients are assumed to be scaled by a factor of 4
+   relative to the orthonormal version of the transform.
+  _y: The buffer to store the result in.
+      This must not be the same as _x.
+  _x: The quantized input coefficients, indexed in zig-zag order.*/
+void oc_dequant_idct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64],
+ int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
+ const ogg_uint16_t _ac_quant[64]){
+  int ci;
+  /*_last_zzi is subtly different from an actual count of the number of
+     coefficients we decoded for this block.
+    It contains the value of zzi BEFORE the final token in the block was
+     decoded.
+    In most cases this is an EOB token (the continuation of an EOB run from a
+     previous block counts), and so this is the same as the coefficient count.
+    However, in the case that the last token was NOT an EOB token, but filled
+     the block up with exactly 64 coefficients, _last_zzi will be less than 64.
+    Provided the last token was not a pure zero run, the minimum value it can
+     be is 46, and so that doesn't affect any of the cases in this routine.
+    However, if the last token WAS a pure zero run of length 63, then _last_zzi
+     will be 1 while the number of coefficients decoded is 64.
+    Thus, we will trigger the following special case, where the real
+     coefficient count would not.
+    Note also that a zero run of length 64 will give _last_zzi a value of 0,
+     but we still process the DC coefficient, which might have a non-zero value
+     due to DC prediction.
+    Although convoluted, this is arguably the correct behavior: it allows us to
+     dequantize fewer coefficients and use a smaller transform when the block
+     ends with a long zero run instead of a normal EOB token.
+    It could be smarter... multiple separate zero runs at the end of a block
+     will fool it, but an encoder that generates these really deserves what it
+     gets.
+    Needless to say we inherited this approach from VP3.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    ogg_int16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_int16_t)(_x[0]*(ogg_int32_t)_dc_quant+15>>5);
+    /*LOOP VECTORIZES.*/
+    for(ci=0;ci<64;ci++)_y[ci]=p;
+  }
+  else{
+    int zzi;
+    /*First, dequantize the coefficients.*/
+    _y[0]=(ogg_int16_t)(_x[0]*(int)_dc_quant);
+    for(zzi=1;zzi<_ncoefs;zzi++){
+      _y[OC_FZIG_ZAG[zzi]]=(ogg_int16_t)(_x[zzi]*(int)_ac_quant[zzi]);
+    }
+    /*Then, fill in the remainder of the coefficients with 0's, and perform
+       the iDCT.*/
+    if(_last_zzi<3){
+      for(;zzi<3;zzi++)_y[OC_FZIG_ZAG[zzi]]=0;
+      oc_idct8x8_3(_y,_y);
+    }
+    else if(_last_zzi<10){
+      for(;zzi<10;zzi++)_y[OC_FZIG_ZAG[zzi]]=0;
+      oc_idct8x8_10(_y,_y);
+    }
+    else{
+      for(;zzi<64;zzi++)_y[OC_FZIG_ZAG[zzi]]=0;
+      oc_idct8x8_slow(_y,_y);
+    }
+  }
+}

Deleted: branches/theora-thusnelda/lib/dec/idct.h
===================================================================
--- branches/theora-thusnelda/lib/dec/idct.h	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/idct.h	2009-04-26 14:30:15 UTC (rev 15953)
@@ -1,26 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
- *                                                                  *
- ********************************************************************
-
-  function:
-    last mod: $Id$
-
- ********************************************************************/
-
-/*Inverse DCT transforms.*/
-#include <ogg/ogg.h>
-#if !defined(_idct_H)
-# define _idct_H (1)
-
-void oc_idct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
-void oc_idct8x8_10_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
-
-#endif

Modified: branches/theora-thusnelda/lib/dec/internal.c
===================================================================
--- branches/theora-thusnelda/lib/dec/internal.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/internal.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -19,7 +19,6 @@
 #include <limits.h>
 #include <string.h>
 #include "../internal.h"
-#include "idct.h"
 
 
 
@@ -100,7 +99,7 @@
   Each DCT token has some fixed number of additional bits (possibly 0) stored
    after the token itself, containing, for example, coefficient magnitude,
    sign bits, etc.*/
-const int OC_DCT_TOKEN_EXTRA_BITS[TH_NDCT_TOKENS]={
+const unsigned char OC_DCT_TOKEN_EXTRA_BITS[TH_NDCT_TOKENS]={
   0,0,0,2,3,4,12,3,6,
   0,0,0,0,
   1,1,1,1,2,3,4,5,6,10,

Modified: branches/theora-thusnelda/lib/dec/quant.c
===================================================================
--- branches/theora-thusnelda/lib/dec/quant.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/quant.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -54,7 +54,6 @@
       ogg_uint32_t    q;
       int             qi_start;
       int             qi_end;
-      int             ci;
       qtables=_dequant[qti][pli];
       memcpy(base,_qinfo->qi_ranges[qti][pli].base_matrices[qri],
        sizeof(base));
@@ -64,6 +63,8 @@
       /*Iterate over quality indicies in this range.*/
       for(;;){
         ogg_uint32_t  qfac;
+        int           zzi;
+        int           ci;
         /*In the original VP3.2 code, the rounding offset and the size of the
            dead zone around 0 were controlled by a "sharpness" parameter.
           The size of our dead zone is now controlled by the per-coefficient
@@ -81,10 +82,10 @@
         q=OC_CLAMPI(OC_DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
         qtables[qi][0]=(ogg_uint16_t)q;
         /*Now scale AC coefficients from the proper table.*/
-        for(ci=1;ci<64;ci++){
-          q=((ogg_uint32_t)_qinfo->ac_scale[qi]*base[ci]/100)<<2;
+        for(zzi=1;zzi<64;zzi++){
+          q=((ogg_uint32_t)_qinfo->ac_scale[qi]*base[OC_FZIG_ZAG[zzi]]/100)<<2;
           q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
-          qtables[qi][ci]=(ogg_uint16_t)q;
+          qtables[qi][zzi]=(ogg_uint16_t)q;
         }
         if(++qi>=qi_end)break;
         /*Interpolate the next base matrix.*/

Modified: branches/theora-thusnelda/lib/dec/state.c
===================================================================
--- branches/theora-thusnelda/lib/dec/state.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/state.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -18,8 +18,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "../internal.h"
-#include "idct.h"
-#if defined(USE_ASM)
+#if defined(OC_X86_ASM)
 #if defined(_MSC_VER)
 # include "x86_vc/x86int.h"
 #else
@@ -31,12 +30,6 @@
 # include "png.h"
 #endif
 
-void oc_restore_fpu(const oc_theora_state *_state){
-  _state->opt_vtable.restore_fpu();
-}
-
-void oc_restore_fpu_c(void){}
-
 /*Returns the fragment index of the top-left block in a macro block.
   This can be used to test whether or not the whole macro block is coded.
   _sb:    The super block.
@@ -527,10 +520,13 @@
 
 
 void oc_state_vtable_init_c(oc_theora_state *_state){
+  _state->opt_vtable.frag_copy=oc_frag_copy_c;
+  _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
   _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
   _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_c;
-  _state->opt_vtable.state_frag_copy=oc_state_frag_copy_c;
+  _state->opt_vtable.dequant_idct8x8=oc_dequant_idct8x8_c;
   _state->opt_vtable.state_frag_recon=oc_state_frag_recon_c;
+  _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_c;
   _state->opt_vtable.state_loop_filter_frag_rows=
    oc_state_loop_filter_frag_rows_c;
   _state->opt_vtable.restore_fpu=oc_restore_fpu_c;
@@ -538,7 +534,7 @@
 
 /*Initialize the accelerated function pointers.*/
 void oc_state_vtable_init(oc_theora_state *_state){
-#if defined(USE_ASM)
+#if defined(OC_X86_ASM)
   oc_state_vtable_init_x86(_state);
 #else
   oc_state_vtable_init_c(_state);
@@ -733,7 +729,7 @@
   _ystride: The Y stride in the buffer the motion vector points into.
   _pli:     The color plane index.
   Return: The number of offsets returned: 1 or 2.*/
-int oc_state_get_mv_offsets(oc_theora_state *_state,int _offsets[2],
+int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
  int _dx,int _dy,int _ystride,int _pli){
   /*Here is a brief description of how Theora handles motion vectors:
     Motion vector components are specified to half-pixel accuracy in
@@ -834,96 +830,42 @@
 #endif
 }
 
-void oc_state_frag_recon(oc_theora_state *_state,oc_fragment *_frag,
+void oc_state_frag_recon(const oc_theora_state *_state,oc_fragment *_frag,
  int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
+ ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]){
   _state->opt_vtable.state_frag_recon(_state,_frag,_pli,_dct_coeffs,
-   _last_zzi,_ncoefs,_dc_iquant,_ac_iquant);
+   _last_zzi,_ncoefs,_dc_quant,_ac_quant);
 }
 
-void oc_state_frag_recon_c(oc_theora_state *_state,oc_fragment *_frag,
+void oc_state_frag_recon_c(const oc_theora_state *_state,oc_fragment *_frag,
  int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_iquant, const ogg_uint16_t _ac_iquant[64]){
-  ogg_int16_t dct_buf[64];
+ ogg_uint16_t _dc_quant, const ogg_uint16_t _ac_quant[64]){
   ogg_int16_t res_buf[64];
   int dst_framei;
-  int dst_ystride;
-  int zzi;
-  int ci;
-  /*_last_zzi is subtly different from an actual count of the number of
-     coefficients we decoded for this block.
-    It contains the value of zzi BEFORE the final token in the block was
-     decoded.
-    In most cases this is an EOB token (the continuation of an EOB run from a
-     previous block counts), and so this is the same as the coefficient count.
-    However, in the case that the last token was NOT an EOB token, but filled
-     the block up with exactly 64 coefficients, _last_zzi will be less than 64.
-    Provided the last token was not a pure zero run, the minimum value it can
-     be is 46, and so that doesn't affect any of the cases in this routine.
-    However, if the last token WAS a pure zero run of length 63, then _last_zzi
-     will be 1 while the number of coefficients decoded is 64.
-    Thus, we will trigger the following special case, where the real
-     coefficient count would not.
-    Note also that a zero run of length 64 will give _last_zzi a value of 0,
-     but we still process the DC coefficient, which might have a non-zero value
-     due to DC prediction.
-    Although convoluted, this is arguably the correct behavior: it allows us to
-     dequantize fewer coefficients and use a smaller transform when the block
-     ends with a long zero run instead of a normal EOB token.
-    It could be smarter... multiple separate zero runs at the end of a block
-     will fool it, but an encoder that generates these really deserves what it
-     gets.
-    Needless to say we inherited this approach from VP3.*/
-  /*Special case only having a DC component.*/
-  if(_last_zzi<2){
-    ogg_int16_t p;
-    /*Why is the iquant product rounded in this case and no others?
-      Who knows.*/
-    p=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant+15>>5);
-    /*LOOP VECTORIZES.*/
-    for(ci=0;ci<64;ci++)res_buf[ci]=p;
-  }
-  else{
-    /*First, dequantize the coefficients.*/
-    dct_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
-    for(zzi=1;zzi<_ncoefs;zzi++){
-      int ci;
-      ci=OC_FZIG_ZAG[zzi];
-      dct_buf[ci]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]*_ac_iquant[ci]);
-    }
-    /*Then, fill in the remainder of the coefficients with 0's, and perform
-       the iDCT.*/
-    if(_last_zzi<10){
-      for(;zzi<10;zzi++)dct_buf[OC_FZIG_ZAG[zzi]]=0;
-      oc_idct8x8_10_c(res_buf,dct_buf);
-    }
-    else{
-      for(;zzi<64;zzi++)dct_buf[OC_FZIG_ZAG[zzi]]=0;
-      oc_idct8x8_c(res_buf,dct_buf);
-    }
-  }
+  int ystride;
+  /*Dequantize and apply the inverse transform.*/
+  oc_dequant_idct8x8(_state,res_buf,_dct_coeffs,
+   _last_zzi,_ncoefs,_dc_quant,_ac_quant);
   /*Fill in the target buffer.*/
   dst_framei=_state->ref_frame_idx[OC_FRAME_SELF];
-  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].stride;
+  ystride=_state->ref_frame_bufs[dst_framei][_pli].stride;
   /*For now ystride values in all ref frames assumed to be equal.*/
   if(_frag->mbmode==OC_MODE_INTRA){
-    oc_frag_recon_intra(_state,_frag->buffer[dst_framei],dst_ystride,res_buf);
+    oc_frag_recon_intra(_state,_frag->buffer[dst_framei],ystride,res_buf);
   }
   else{
     int ref_framei;
-    int ref_ystride;
     int mvoffsets[2];
     ref_framei=_state->ref_frame_idx[OC_FRAME_FOR_MODE[_frag->mbmode]];
-    ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].stride;
-    if(oc_state_get_mv_offsets(_state,mvoffsets,_frag->mv[0],_frag->mv[1],
-     ref_ystride,_pli)>1){
-      oc_frag_recon_inter2(_state,_frag->buffer[dst_framei],dst_ystride,
-       _frag->buffer[ref_framei]+mvoffsets[0],ref_ystride,
-       _frag->buffer[ref_framei]+mvoffsets[1],ref_ystride,res_buf);
+    if(oc_state_get_mv_offsets(_state,mvoffsets,
+     _frag->mv[0],_frag->mv[1],ystride,_pli)>1){
+      oc_frag_recon_inter2(_state,_frag->buffer[dst_framei],
+       _frag->buffer[ref_framei]+mvoffsets[0],
+       _frag->buffer[ref_framei]+mvoffsets[1],ystride,res_buf);
     }
     else{
-      oc_frag_recon_inter(_state,_frag->buffer[dst_framei],dst_ystride,
-       _frag->buffer[ref_framei]+mvoffsets[0],ref_ystride,res_buf);
+      oc_frag_recon_inter(_state,_frag->buffer[dst_framei],
+       _frag->buffer[ref_framei]+mvoffsets[0],ystride,res_buf);
     }
   }
   oc_restore_fpu(_state);
@@ -936,38 +878,28 @@
   _dst_frame: The reference frame to copy to.
   _src_frame: The reference frame to copy from.
   _pli:       The color plane the fragments lie in.*/
-void oc_state_frag_copy(const oc_theora_state *_state,const int *_fragis,
+void oc_state_frag_copy_list(const oc_theora_state *_state,const int *_fragis,
  int _nfragis,int _dst_frame,int _src_frame,int _pli){
-  _state->opt_vtable.state_frag_copy(_state,_fragis,_nfragis,_dst_frame,
+  _state->opt_vtable.state_frag_copy_list(_state,_fragis,_nfragis,_dst_frame,
    _src_frame,_pli);
 }
 
-void oc_state_frag_copy_c(const oc_theora_state *_state,const int *_fragis,
- int _nfragis,int _dst_frame,int _src_frame,int _pli){
+void oc_state_frag_copy_list_c(const oc_theora_state *_state,
+ const int *_fragis,int _nfragis,int _dst_frame,int _src_frame,int _pli){
   const int *fragi;
   const int *fragi_end;
   int        dst_framei;
-  int        dst_ystride;
   int        src_framei;
-  int        src_ystride;
+  int        ystride;
   dst_framei=_state->ref_frame_idx[_dst_frame];
   src_framei=_state->ref_frame_idx[_src_frame];
-  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].stride;
-  src_ystride=_state->ref_frame_bufs[src_framei][_pli].stride;
+  ystride=_state->ref_frame_bufs[dst_framei][_pli].stride;
   fragi_end=_fragis+_nfragis;
   for(fragi=_fragis;fragi<fragi_end;fragi++){
     oc_fragment   *frag;
-    unsigned char *dst;
-    unsigned char *src;
-    int            j;
     frag=_state->frags+*fragi;
-    dst=frag->buffer[dst_framei];
-    src=frag->buffer[src_framei];
-    for(j=0;j<8;j++){
-      memcpy(dst,src,sizeof(dst[0])*8);
-      dst+=dst_ystride;
-      src+=src_ystride;
-    }
+    oc_frag_copy(_state,frag->buffer[dst_framei],
+     frag->buffer[src_framei],ystride);
   }
 }
 
@@ -1029,22 +961,22 @@
   _pli:       The color plane to filter.
   _fragy0:    The Y coordinate of the first fragment row to filter.
   _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
-void oc_state_loop_filter_frag_rows(oc_theora_state *_state,int *_bv,
+void oc_state_loop_filter_frag_rows(const oc_theora_state *_state,int *_bv,
  int _refi,int _pli,int _fragy0,int _fragy_end){
   _state->opt_vtable.state_loop_filter_frag_rows(_state,_bv,_refi,_pli,
    _fragy0,_fragy_end);
 }
 
-void oc_state_loop_filter_frag_rows_c(oc_theora_state *_state,int *_bv,
+void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,int *_bv,
  int _refi,int _pli,int _fragy0,int _fragy_end){
-  th_img_plane      *iplane;
-  oc_fragment_plane *fplane;
-  oc_fragment       *frag_top;
-  oc_fragment       *frag0;
-  oc_fragment       *frag;
-  oc_fragment       *frag_end;
-  oc_fragment       *frag0_end;
-  oc_fragment       *frag_bot;
+  const th_img_plane      *iplane;
+  const oc_fragment_plane *fplane;
+  oc_fragment             *frag_top;
+  oc_fragment             *frag0;
+  oc_fragment             *frag;
+  oc_fragment             *frag_end;
+  oc_fragment             *frag0_end;
+  oc_fragment             *frag_bot;
   _bv+=127;
   iplane=_state->ref_frame_bufs[_refi]+_pli;
   fplane=_state->fplanes+_pli;

Modified: branches/theora-thusnelda/lib/dec/x86/mmxfrag.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/mmxfrag.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/x86/mmxfrag.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -20,12 +20,20 @@
   Additional optimization by Nils Pipenbrinck.
   Note: Loops are unrolled for best performance.
   The iteration each instruction belongs to is marked in the comments as #i.*/
+#include <stddef.h>
 #include "x86int.h"
-#include <stddef.h>
+#include "mmxfrag.h"
 
-#if defined(USE_ASM)
+#if defined(OC_X86_ASM)
 
-void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+   between rows.*/
+void oc_frag_copy_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride){
+  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
+}
+
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
  const ogg_int16_t *_residue){
   __asm__ __volatile__(
     /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
@@ -67,9 +75,9 @@
     /*#0 Write row.*/
     "movq %%mm1,(%[dst])\n\t"
     /*#1 Write row.*/
-    "movq %%mm3,(%[dst],%[dst_ystride])\n\t"
+    "movq %%mm3,(%[dst],%[ystride])\n\t"
     /*#2 Write row.*/
-    "movq %%mm5,(%[dst],%[dst_ystride],2)\n\t"
+    "movq %%mm5,(%[dst],%[ystride],2)\n\t"
     /*#3 Load low residue.*/
     "movq 6*8(%[residue]),%%mm1\n\t"
     /*#3 Load high residue.*/
@@ -101,11 +109,11 @@
     /*#5 Pack to byte.*/
     "packuswb %%mm6,%%mm5\n\t"
     /*#3 Write row.*/
-    "movq %%mm1,(%[dst],%[dst_ystride3])\n\t"
+    "movq %%mm1,(%[dst],%[ystride3])\n\t"
     /*#4 Write row.*/
     "movq %%mm3,(%[dst4])\n\t"
     /*#5 Write row.*/
-    "movq %%mm5,(%[dst4],%[dst_ystride])\n\t"
+    "movq %%mm5,(%[dst4],%[ystride])\n\t"
     /*#6 Load low residue.*/
     "movq 12*8(%[residue]),%%mm1\n\t"
     /*#6 Load high residue.*/
@@ -127,21 +135,21 @@
     /*#7 Pack to byte.*/
     "packuswb %%mm4,%%mm3\n\t"
     /*#6 Write row.*/
-    "movq %%mm1,(%[dst4],%[dst_ystride],2)\n\t"
+    "movq %%mm1,(%[dst4],%[ystride],2)\n\t"
     /*#7 Write row.*/
-    "movq %%mm3,(%[dst4],%[dst_ystride3])\n\t"
+    "movq %%mm3,(%[dst4],%[ystride3])\n\t"
     :
     :[residue]"r"(_residue),
      [dst]"r"(_dst),
-     [dst4]"r"(_dst+(_dst_ystride<<2)),
-     [dst_ystride]"r"((ptrdiff_t)_dst_ystride),
-     [dst_ystride3]"r"((ptrdiff_t)_dst_ystride*3)
+     [dst4]"r"(_dst+(_ystride<<2)),
+     [ystride]"r"((ptrdiff_t)_ystride),
+     [ystride3]"r"((ptrdiff_t)_ystride*3)
     :"memory"
   );
 }
 
-void oc_frag_recon_inter_mmx(unsigned char *_dst,int _dst_ystride,
- const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue){
+void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue){
   int i;
   /*Zero mm0.*/
   __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
@@ -150,7 +158,7 @@
       /*#0 Load source.*/
       "movq (%[src]),%%mm3\n\t"
       /*#1 Load source.*/
-      "movq (%[src],%[src_ystride]),%%mm7\n\t"
+      "movq (%[src],%[ystride]),%%mm7\n\t"
       /*#0 Get copy of src.*/
       "movq %%mm3,%%mm4\n\t"
       /*#0 Expand high source.*/
@@ -178,29 +186,23 @@
       /*#1 Pack final row pixels.*/
       "packuswb %%mm2,%%mm7\n\t"
       /*Advance src.*/
-      "lea (%[src],%[src_ystride],2),%[src]\n\t"
+      "lea (%[src],%[ystride],2),%[src]\n\t"
       /*#0 Write row.*/
       "movq %%mm3,(%[dst])\n\t"
       /*#1 Write row.*/
-      "movq %%mm7,(%[dst],%[dst_ystride])\n\t"
+      "movq %%mm7,(%[dst],%[ystride])\n\t"
       /*Advance dst.*/
-      "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
+      "lea (%[dst],%[ystride],2),%[dst]\n\t"
       :[residue]"+r"(_residue),[dst]"+r"(_dst),[src]"+r"(_src)
-      :[dst_ystride]"r"((ptrdiff_t)_dst_ystride),
-       [src_ystride]"r"((ptrdiff_t)_src_ystride)
+      :[ystride]"r"((ptrdiff_t)_ystride)
       :"memory"
     );
   }
 }
 
-void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
- const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
- int _src2_ystride,const ogg_int16_t *_residue){
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
   int i;
-  /*NOTE: This assumes that
-     _dst_ystride==_src1_ystride&&_dst_ystride==_src2_ystride.
-    This is currently always the case, but a slower fallback version will need
-     to be written if it ever is not.*/
   /*Zero mm7.*/
   __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
   for(i=4;i-->0;){
@@ -278,8 +280,8 @@
       /*Advance dest ptr.*/
       "lea (%[dst],%[ystride],2),%[dst]\n\t"
      :[dst]"+r"(_dst),[residue]"+r"(_residue),
-      [src1]"+r"(_src1),[src2]"+r"(_src2)
-     :[ystride]"r"((ptrdiff_t)_dst_ystride)
+      [src1]"+%r"(_src1),[src2]"+r"(_src2)
+     :[ystride]"r"((ptrdiff_t)_ystride)
      :"memory"
     );
   }

Added: branches/theora-thusnelda/lib/dec/x86/mmxfrag.h
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/mmxfrag.h	                        (rev 0)
+++ branches/theora-thusnelda/lib/dec/x86/mmxfrag.h	2009-04-26 14:30:15 UTC (rev 15953)
@@ -0,0 +1,70 @@
+#if !defined(_x86_mmxfrag_H)
+# define _x86_mmxfrag_H (1)
+# include <stddef.h>
+# include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+   between rows.
+  The asm advances its source and destination pointers after the first four
+   rows, so they are local copies bound as read-write operands: GCC does not
+   allow an asm statement to modify its input-only operands.
+  _dst:     The destination block.
+  _src:     The source block.
+  _ystride: The offset in bytes between rows, for both _src and _dst.*/
+#define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
+  do{ \
+    const unsigned char *src; \
+    unsigned char       *dst; \
+    ptrdiff_t            ystride3; \
+    src=(_src); \
+    dst=(_dst); \
+    __asm__ __volatile__( \
+      /*src+0*ystride*/ \
+      "movq (%[src]),%%mm0\n\t" \
+      /*src+1*ystride*/ \
+      "movq (%[src],%[ystride]),%%mm1\n\t" \
+      /*ystride3=ystride*3*/ \
+      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+      /*src+2*ystride*/ \
+      "movq (%[src],%[ystride],2),%%mm2\n\t" \
+      /*src+3*ystride*/ \
+      "movq (%[src],%[ystride3]),%%mm3\n\t" \
+      /*dst+0*ystride*/ \
+      "movq %%mm0,(%[dst])\n\t" \
+      /*dst+1*ystride*/ \
+      "movq %%mm1,(%[dst],%[ystride])\n\t" \
+      /*Pointer to next 4.*/ \
+      "lea (%[src],%[ystride],4),%[src]\n\t" \
+      /*dst+2*ystride*/ \
+      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+      /*dst+3*ystride*/ \
+      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+      /*Pointer to next 4.*/ \
+      "lea (%[dst],%[ystride],4),%[dst]\n\t" \
+      /*src+0*ystride*/ \
+      "movq (%[src]),%%mm0\n\t" \
+      /*src+1*ystride*/ \
+      "movq (%[src],%[ystride]),%%mm1\n\t" \
+      /*src+2*ystride*/ \
+      "movq (%[src],%[ystride],2),%%mm2\n\t" \
+      /*src+3*ystride*/ \
+      "movq (%[src],%[ystride3]),%%mm3\n\t" \
+      /*dst+0*ystride*/ \
+      "movq %%mm0,(%[dst])\n\t" \
+      /*dst+1*ystride*/ \
+      "movq %%mm1,(%[dst],%[ystride])\n\t" \
+      /*dst+2*ystride*/ \
+      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
+      /*dst+3*ystride*/ \
+      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
+      :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
+      :[ystride]"r"((ptrdiff_t)(_ystride)) \
+      :"memory" \
+    ); \
+  } \
+  while(0)
+
+# endif
+#endif

Modified: branches/theora-thusnelda/lib/dec/x86/mmxidct.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/mmxidct.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/x86/mmxidct.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -17,14 +17,11 @@
 
 /*MMX acceleration of Theora's iDCT.
   Originally written by Rudolf Marek, based on code from On2's VP3.*/
-#include <ogg/ogg.h>
+#include "x86int.h"
 #include "../dct.h"
-#include "../idct.h"
 
-#include "x86int.h"
+#if defined(OC_X86_ASM)
 
-#if defined(USE_ASM)
-
 /*These are offsets into the table of constants below.*/
 /*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
 #define OC_COSINE_OFFSET (0)
@@ -194,7 +191,7 @@
   J(7) = h3 g3 f3 e3
 
   I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
-  J(4) J(5) J(6) J(7) is the transpose of r4   r5 r6 r7.
+  J(4) J(5) J(6) J(7) is the transpose of r4  r5  r6 r7.
 
   Since r1 is free at entry, we calculate the Js first.*/
 /*19 cycles.*/
@@ -313,9 +310,9 @@
 #define OC_C(_i)      OC_MID(OC_COSINE_OFFSET,_i-1)
 #define OC_8          OC_MID(OC_EIGHT_OFFSET,0)
 
-void oc_idct8x8_mmx(ogg_int16_t _y[64]){
-  /*This routine accepts an 8x8 matrix, but in transposed form.
-    Every 4x4 submatrix is transposed.*/
+static void oc_idct8x8_slow(ogg_int16_t _y[64]){
+  /*This routine accepts an 8x8 matrix, but in partially transposed form.
+    Every 4x4 block is transposed.*/
   __asm__ __volatile__(
 #define OC_I(_k)      OC_M2STR((_k*16))"(%[y])"
 #define OC_J(_k)      OC_M2STR(((_k-4)*16)+8)"(%[y])"
@@ -507,7 +504,7 @@
  "movq %%mm0,"OC_I(0)"\n\t" \
  "#end OC_COLUMN_IDCT_10\n\t" \
 
-void oc_idct8x8_10_mmx(ogg_int16_t _y[64]){
+static void oc_idct8x8_10(ogg_int16_t _y[64]){
   __asm__ __volatile__(
 #define OC_I(_k) OC_M2STR((_k*16))"(%[y])"
 #define OC_J(_k) OC_M2STR(((_k-4)*16)+8)"(%[y])"
@@ -532,4 +529,127 @@
     :[y]"r"(_y),[c]"r"(OC_IDCT_CONSTS)
   );
 }
+
+/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
+   each quadrant of the destination.*/
+static const unsigned char OC_FZIG_ZAG_MMX[64]={
+   0, 8, 1, 2, 9,16,24,17,
+  10, 3,32,11,18,25, 4,12,
+   5,26,19,40,33,34,41,48,
+  27, 6,13,20,28,21,14, 7,
+  56,49,42,35,43,50,57,36,
+  15,22,29,30,23,44,37,58,
+  51,59,38,45,52,31,60,53,
+  46,39,47,54,61,62,55,63
+};
+
+/*Dequantizes a block of coefficients and performs an inverse 8x8 Type-II DCT.
+  The dequantized coefficients are assumed to be scaled by a factor of 4
+   relative to the orthonormal version of the transform.
+  _y: The buffer to store the result in.
+      This must not be the same as _x.
+  _x: The quantized input coefficients, indexed in zig-zag order.*/
+void oc_dequant_idct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64],
+ int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
+ const ogg_uint16_t _ac_quant[64]){
+  int ci;
+  /*_last_zzi is subtly different from an actual count of the number of
+     coefficients we decoded for this block.
+    It contains the value of zzi BEFORE the final token in the block was
+     decoded.
+    In most cases this is an EOB token (the continuation of an EOB run from a
+     previous block counts), and so this is the same as the coefficient count.
+    However, in the case that the last token was NOT an EOB token, but filled
+     the block up with exactly 64 coefficients, _last_zzi will be less than 64.
+    Provided the last token was not a pure zero run, the minimum value it can
+     be is 46, and so that doesn't affect any of the cases in this routine.
+    However, if the last token WAS a pure zero run of length 63, then _last_zzi
+     will be 1 while the number of coefficients decoded is 64.
+    Thus, we will trigger the following special case, where the real
+     coefficient count would not.
+    Note also that a zero run of length 64 will give _last_zzi a value of 0,
+     but we still process the DC coefficient, which might have a non-zero value
+     due to DC prediction.
+    Although convoluted, this is arguably the correct behavior: it allows us to
+     dequantize fewer coefficients and use a smaller transform when the block
+     ends with a long zero run instead of a normal EOB token.
+    It could be smarter... multiple separate zero runs at the end of a block
+     will fool it, but an encoder that generates these really deserves what it
+     gets.
+    Needless to say we inherited this approach from VP3.*/
+  /*Special case only having a DC component.*/
+  if(_last_zzi<2){
+    /*Note that this value must be unsigned, to keep the __asm__ block from
+       sign-extending it when it puts it in a register.*/
+    ogg_uint16_t p;
+    /*We round this dequant product (and not any of the others) because there's
+       no iDCT rounding.*/
+    p=(ogg_int16_t)(_x[0]*(ogg_int32_t)_dc_quant+15>>5);
+    /*Fill _y with p.*/
+    __asm__ __volatile__(
+      /*mm0=0000 0000 0000 AAAA*/
+      "movd %[p],%%mm0\n\t"
+      /*mm0=0000 0000 AAAA AAAA*/
+      "punpcklwd %%mm0,%%mm0\n\t"
+      /*mm0=AAAA AAAA AAAA AAAA*/
+      "punpckldq %%mm0,%%mm0\n\t"
+      "movq %%mm0,(%[y])\n\t"
+      "movq %%mm0,8(%[y])\n\t"
+      "movq %%mm0,16(%[y])\n\t"
+      "movq %%mm0,24(%[y])\n\t"
+      "movq %%mm0,32(%[y])\n\t"
+      "movq %%mm0,40(%[y])\n\t"
+      "movq %%mm0,48(%[y])\n\t"
+      "movq %%mm0,56(%[y])\n\t"
+      "movq %%mm0,64(%[y])\n\t"
+      "movq %%mm0,72(%[y])\n\t"
+      "movq %%mm0,80(%[y])\n\t"
+      "movq %%mm0,88(%[y])\n\t"
+      "movq %%mm0,96(%[y])\n\t"
+      "movq %%mm0,104(%[y])\n\t"
+      "movq %%mm0,112(%[y])\n\t"
+      "movq %%mm0,120(%[y])\n\t"
+      :
+      :[y]"r"(_y),[p]"r"((unsigned)p)
+      :"memory"
+    );
+  }
+  else{
+    int zzi;
+    /*First zero the buffer.*/
+    /*On K7, etc., this could be replaced with movntq and sfence.*/
+    __asm__ __volatile__(
+      "pxor %%mm0,%%mm0\n\t"
+      "movq %%mm0,(%[y])\n\t"
+      "movq %%mm0,8(%[y])\n\t"
+      "movq %%mm0,16(%[y])\n\t"
+      "movq %%mm0,24(%[y])\n\t"
+      "movq %%mm0,32(%[y])\n\t"
+      "movq %%mm0,40(%[y])\n\t"
+      "movq %%mm0,48(%[y])\n\t"
+      "movq %%mm0,56(%[y])\n\t"
+      "movq %%mm0,64(%[y])\n\t"
+      "movq %%mm0,72(%[y])\n\t"
+      "movq %%mm0,80(%[y])\n\t"
+      "movq %%mm0,88(%[y])\n\t"
+      "movq %%mm0,96(%[y])\n\t"
+      "movq %%mm0,104(%[y])\n\t"
+      "movq %%mm0,112(%[y])\n\t"
+      "movq %%mm0,120(%[y])\n\t"
+      :
+      :[y]"r"(_y)
+      :"memory"
+    );
+    /*Dequantize the coefficients.*/
+    _y[0]=(ogg_int16_t)(_x[0]*(int)_dc_quant);
+    for(zzi=1;zzi<_ncoefs;zzi++){
+      _y[OC_FZIG_ZAG_MMX[zzi]]=(ogg_int16_t)(_x[zzi]*(int)_ac_quant[zzi]);
+    }
+    /*Then perform the iDCT.*/
+    if(_last_zzi<10)oc_idct8x8_10(_y);
+    else oc_idct8x8_slow(_y);
+  }
+}
+
+
 #endif

Added: branches/theora-thusnelda/lib/dec/x86/mmxloop.h
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/mmxloop.h	                        (rev 0)
+++ branches/theora-thusnelda/lib/dec/x86/mmxloop.h	2009-04-26 14:30:15 UTC (rev 15953)
@@ -0,0 +1,213 @@
+#if !defined(_x86_mmxloop_H)
+# define _x86_mmxloop_H (1)
+# include <stddef.h>
+# include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...,d7}.
+  On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
+   mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0, mm3...mm7 are clobbered.*/
+#define OC_LOOP_FILTER8_MMX \
+ "#OC_LOOP_FILTER8_MMX\n\t" \
+ /*mm7=0*/ \
+ "pxor %%mm7,%%mm7\n\t" \
+ /*mm6:mm0={a0,...,a7}*/ \
+ "movq %%mm0,%%mm6\n\t" \
+ "punpcklbw %%mm7,%%mm0\n\t" \
+ "punpckhbw %%mm7,%%mm6\n\t" \
+ /*mm5:mm3={d0,...,d7}*/ \
+ "movq %%mm3,%%mm5\n\t" \
+ "punpcklbw %%mm7,%%mm3\n\t" \
+ "punpckhbw %%mm7,%%mm5\n\t" \
+ /*mm6:mm0={a0-d0,...,a7-d7}*/ \
+ "psubw %%mm3,%%mm0\n\t" \
+ "psubw %%mm5,%%mm6\n\t" \
+ /*mm3:mm1={b0,...,b7}*/ \
+ "movq %%mm1,%%mm3\n\t" \
+ "punpcklbw %%mm7,%%mm1\n\t" \
+ "movq %%mm2,%%mm4\n\t" \
+ "punpckhbw %%mm7,%%mm3\n\t" \
+ /*mm5:mm4={c0,...,c7}*/ \
+ "movq %%mm2,%%mm5\n\t" \
+ "punpcklbw %%mm7,%%mm4\n\t" \
+ "punpckhbw %%mm7,%%mm5\n\t" \
+ /*mm7={3}x4 \
+   mm5:mm4={c0-b0,...,c7-b7}*/ \
+ "pcmpeqw %%mm7,%%mm7\n\t" \
+ "psubw %%mm1,%%mm4\n\t" \
+ "psrlw $14,%%mm7\n\t" \
+ "psubw %%mm3,%%mm5\n\t" \
+ /*Scale by 3.*/ \
+ "pmullw %%mm7,%%mm4\n\t" \
+ "pmullw %%mm7,%%mm5\n\t" \
+ /*mm7={4}x4 \
+   mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
+ "psrlw $1,%%mm7\n\t" \
+ "paddw %%mm0,%%mm4\n\t" \
+ "psllw $2,%%mm7\n\t" \
+ "movq (%[ll]),%%mm0\n\t" /*mm0=8 bytes loaded from [ll] (used as L below)*/ \
+ "paddw %%mm6,%%mm5\n\t" \
+ /*R_i has the range [-127,128], so we compute -R_i instead. \
+   mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
+ "psubw %%mm7,%%mm4\n\t" \
+ "psubw %%mm7,%%mm5\n\t" \
+ "psraw $3,%%mm4\n\t" \
+ "psraw $3,%%mm5\n\t" \
+ "pcmpeqb %%mm7,%%mm7\n\t" \
+ "packsswb %%mm5,%%mm4\n\t" \
+ "pxor %%mm6,%%mm6\n\t" \
+ "pxor %%mm7,%%mm4\n\t" \
+ "packuswb %%mm3,%%mm1\n\t" \
+ /*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \
+ /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
+    we have to split things by sign (the other option is to work in 16 bits, \
+    but working in 8 bits gives much better parallelism). \
+   We compute abs(R_i), but save a mask of which terms were negative in mm6. \
+   Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
+   Finally, we split mm4 into positive and negative pieces using the mask in \
+    mm6, and add and subtract them as appropriate.*/ \
+ /*mm4=abs(-R_i)*/ \
+ /*mm7=255-2*L*/ \
+ "pcmpgtb %%mm4,%%mm6\n\t" \
+ "psubb %%mm0,%%mm7\n\t" \
+ "pxor %%mm6,%%mm4\n\t" \
+ "psubb %%mm0,%%mm7\n\t" \
+ "psubb %%mm6,%%mm4\n\t" \
+ /*mm7=255-max(2*L-abs(R_i),0)*/ \
+ "paddusb %%mm4,%%mm7\n\t" \
+ /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
+ "paddusb %%mm7,%%mm4\n\t" \
+ "psubusb %%mm7,%%mm4\n\t" \
+ /*Now split mm4 by the original sign of -R_i.*/ \
+ "movq %%mm4,%%mm5\n\t" \
+ "pand %%mm6,%%mm4\n\t" \
+ "pandn %%mm5,%%mm6\n\t" \
+ /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
+ /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
+ "paddusb %%mm4,%%mm1\n\t" \
+ "psubusb %%mm4,%%mm2\n\t" \
+ "psubusb %%mm6,%%mm1\n\t" \
+ "paddusb %%mm6,%%mm2\n\t" \
+
+#define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \
+  do{ /*Filters 8 pixels across the edge above row _pix (rows -1 and 0).*/ \
+    ptrdiff_t ystride3; \
+    __asm__ __volatile__( \
+      /*mm0={a0,...,a7}: the row two above the edge (_pix-2*_ystride)*/ \
+      "movq (%[pix]),%%mm0\n\t" \
+      /*ystride3=_ystride*3*/ \
+      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+      /*mm3={d0,...,d7}: the row one below the edge (_pix+_ystride)*/ \
+      "movq (%[pix],%[ystride3]),%%mm3\n\t" \
+      /*mm1={b0,...,b7}: the row just above the edge (_pix-_ystride)*/ \
+      "movq (%[pix],%[ystride]),%%mm1\n\t" \
+      /*mm2={c0,...,c7}: the row just below the edge (_pix)*/ \
+      "movq (%[pix],%[ystride],2),%%mm2\n\t" \
+      OC_LOOP_FILTER8_MMX \
+      /*Write the two filtered rows (b and c) back out.*/ \
+      "movq %%mm1,(%[pix],%[ystride])\n\t" \
+      "movq %%mm2,(%[pix],%[ystride],2)\n\t" \
+      :[ystride3]"=&r"(ystride3) \
+      :[pix]"r"((_pix)-(_ystride)*2),[ystride]"r"((ptrdiff_t)(_ystride)), \
+       [ll]"r"(_ll) \
+      :"memory" \
+    ); \
+  } \
+  while(0)
+
+#define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \
+  do{ /*Filters 8 rows across the edge left of _pix (columns -1 and 0).*/ \
+    unsigned char *pix__=(_pix)-2; /*Advanced by the asm: in/out operand.*/ \
+    ptrdiff_t ystride3,d; \
+    __asm__ __volatile__( \
+      /*x x x x d0 c0 b0 a0*/ \
+      "movd (%[pix]),%%mm0\n\t" \
+      /*x x x x d1 c1 b1 a1*/ \
+      "movd (%[pix],%[ystride]),%%mm1\n\t" \
+      /*ystride3=_ystride*3*/ \
+      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
+      /*x x x x d2 c2 b2 a2*/ \
+      "movd (%[pix],%[ystride],2),%%mm2\n\t" \
+      /*x x x x d3 c3 b3 a3*/ \
+      "lea (%[pix],%[ystride],4),%[d]\n\t" \
+      "movd (%[pix],%[ystride3]),%%mm3\n\t" \
+      /*x x x x d4 c4 b4 a4*/ \
+      "movd (%[d]),%%mm4\n\t" \
+      /*x x x x d5 c5 b5 a5*/ \
+      "movd (%[d],%[ystride]),%%mm5\n\t" \
+      /*x x x x d6 c6 b6 a6*/ \
+      "movd (%[d],%[ystride],2),%%mm6\n\t" \
+      /*x x x x d7 c7 b7 a7*/ \
+      "movd (%[d],%[ystride3]),%%mm7\n\t" \
+      /*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \
+      "punpcklbw %%mm1,%%mm0\n\t" \
+      /*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \
+      "punpcklbw %%mm3,%%mm2\n\t" \
+      /*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \
+      "movq %%mm0,%%mm3\n\t" \
+      /*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \
+      "punpcklwd %%mm2,%%mm0\n\t" \
+      /*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \
+      "punpckhwd %%mm2,%%mm3\n\t" \
+      /*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \
+      "movq %%mm0,%%mm1\n\t" \
+      /*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \
+      "punpcklbw %%mm5,%%mm4\n\t" \
+      /*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \
+      "punpcklbw %%mm7,%%mm6\n\t" \
+      /*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \
+      "movq %%mm4,%%mm5\n\t" \
+      /*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \
+      "punpcklwd %%mm6,%%mm4\n\t" \
+      /*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \
+      "punpckhwd %%mm6,%%mm5\n\t" \
+      /*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \
+      "movq %%mm3,%%mm2\n\t" \
+      /*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \
+      "punpckldq %%mm4,%%mm0\n\t" \
+      /*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \
+      "punpckhdq %%mm4,%%mm1\n\t" \
+      /*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \
+      "punpckldq %%mm5,%%mm2\n\t" \
+      /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
+      "punpckhdq %%mm5,%%mm3\n\t" \
+      OC_LOOP_FILTER8_MMX \
+      /*mm0={b0+R_0'',...,b7+R_7''}*/ \
+      "movq %%mm1,%%mm0\n\t" \
+      /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
+      "punpcklbw %%mm2,%%mm1\n\t" \
+      /*mm0={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \
+      "punpckhbw %%mm2,%%mm0\n\t" \
+      /*[d]=c1 b1 c0 b0*/ \
+      "movd %%mm1,%[d]\n\t" \
+      "movw %w[d],1(%[pix])\n\t" \
+      "psrlq $32,%%mm1\n\t" \
+      "shr $16,%[d]\n\t" \
+      "movw %w[d],1(%[pix],%[ystride])\n\t" \
+      /*[d]=c3 b3 c2 b2*/ \
+      "movd %%mm1,%[d]\n\t" \
+      "movw %w[d],1(%[pix],%[ystride],2)\n\t" \
+      "shr $16,%[d]\n\t" \
+      "movw %w[d],1(%[pix],%[ystride3])\n\t" \
+      "lea (%[pix],%[ystride],4),%[pix]\n\t" \
+      /*[d]=c5 b5 c4 b4*/ \
+      "movd %%mm0,%[d]\n\t" \
+      "movw %w[d],1(%[pix])\n\t" \
+      "psrlq $32,%%mm0\n\t" \
+      "shr $16,%[d]\n\t" \
+      "movw %w[d],1(%[pix],%[ystride])\n\t" \
+      /*[d]=c7 b7 c6 b6*/ \
+      "movd %%mm0,%[d]\n\t" \
+      "movw %w[d],1(%[pix],%[ystride],2)\n\t" \
+      "shr $16,%[d]\n\t" \
+      "movw %w[d],1(%[pix],%[ystride3])\n\t" \
+      :[pix]"+r"(pix__),[ystride3]"=&r"(ystride3),[d]"=&r"(d) \
+      :[ystride]"r"((ptrdiff_t)(_ystride)),[ll]"r"(_ll) \
+      :"memory" \
+    ); \
+  } \
+  while(0)
+
+# endif
+#endif

Modified: branches/theora-thusnelda/lib/dec/x86/mmxstate.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/mmxstate.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/x86/mmxstate.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -17,160 +17,49 @@
 
 /*MMX acceleration of complete fragment reconstruction algorithm.
   Originally written by Rudolf Marek.*/
+#include <string.h>
 #include "x86int.h"
-#include "../../internal.h"
-#include <stddef.h>
+#include "mmxfrag.h"
+#include "mmxloop.h"
 
-#if defined(USE_ASM)
+#if defined(OC_X86_ASM)
 
-static const __attribute__((aligned(8),used)) int OC_FZIG_ZAGMMX[64]={
-   0, 8, 1, 2, 9,16,24,17,
-  10, 3,32,11,18,25, 4,12,
-   5,26,19,40,33,34,41,48,
-  27, 6,13,20,28,21,14, 7,
-  56,49,42,35,43,50,57,36,
-  15,22,29,30,23,44,37,58,
-  51,59,38,45,52,31,60,53,
-  46,39,47,54,61,62,55,63
-};
-
-
-
-void oc_state_frag_recon_mmx(oc_theora_state *_state,oc_fragment *_frag,
+void oc_state_frag_recon_mmx(const oc_theora_state *_state,oc_fragment *_frag,
  int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
+ ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]){
   ogg_int16_t  __attribute__((aligned(8))) res_buf[64];
   int dst_framei;
-  int dst_ystride;
-  int zzi;
-  /*_last_zzi is subtly different from an actual count of the number of
-     coefficients we decoded for this block.
-    It contains the value of zzi BEFORE the final token in the block was
-     decoded.
-    In most cases this is an EOB token (the continuation of an EOB run from a
-     previous block counts), and so this is the same as the coefficient count.
-    However, in the case that the last token was NOT an EOB token, but filled
-     the block up with exactly 64 coefficients, _last_zzi will be less than 64.
-    Provided the last token was not a pure zero run, the minimum value it can
-     be is 46, and so that doesn't affect any of the cases in this routine.
-    However, if the last token WAS a pure zero run of length 63, then _last_zzi
-     will be 1 while the number of coefficients decoded is 64.
-    Thus, we will trigger the following special case, where the real
-     coefficient count would not.
-    Note also that a zero run of length 64 will give _last_zzi a value of 0,
-     but we still process the DC coefficient, which might have a non-zero value
-     due to DC prediction.
-    Although convoluted, this is arguably the correct behavior: it allows us to
-     dequantize fewer coefficients and use a smaller transform when the block
-     ends with a long zero run instead of a normal EOB token.
-    It could be smarter... multiple separate zero runs at the end of a block
-     will fool it, but an encoder that generates these really deserves what it
-     gets.
-    Needless to say we inherited this approach from VP3.*/
-  /*Special case only having a DC component.*/
-  if(_last_zzi<2){
-    ogg_uint16_t p;
-    /*Why is the iquant product rounded in this case and no others?
-      Who knows.*/
-    p=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant+15>>5);
-    /*Fill res_buf with p.*/
-    __asm__ __volatile__(
-      /*mm0=0000 0000 0000 AAAA*/
-      "movd %[p],%%mm0\n\t"
-      /*mm1=0000 0000 0000 AAAA*/
-      "movd %[p],%%mm1\n\t"
-      /*mm0=0000 0000 AAAA 0000*/
-      "pslld $16,%%mm0\n\t"
-      /*mm0=0000 0000 AAAA AAAA*/
-      "por %%mm1,%%mm0\n\t"
-      /*mm0=AAAA AAAA AAAA AAAA*/
-      "punpcklwd %%mm0,%%mm0\n\t"
-      "movq %%mm0,(%[res_buf])\n\t"
-      "movq %%mm0,8(%[res_buf])\n\t"
-      "movq %%mm0,16(%[res_buf])\n\t"
-      "movq %%mm0,24(%[res_buf])\n\t"
-      "movq %%mm0,32(%[res_buf])\n\t"
-      "movq %%mm0,40(%[res_buf])\n\t"
-      "movq %%mm0,48(%[res_buf])\n\t"
-      "movq %%mm0,56(%[res_buf])\n\t"
-      "movq %%mm0,64(%[res_buf])\n\t"
-      "movq %%mm0,72(%[res_buf])\n\t"
-      "movq %%mm0,80(%[res_buf])\n\t"
-      "movq %%mm0,88(%[res_buf])\n\t"
-      "movq %%mm0,96(%[res_buf])\n\t"
-      "movq %%mm0,104(%[res_buf])\n\t"
-      "movq %%mm0,112(%[res_buf])\n\t"
-      "movq %%mm0,120(%[res_buf])\n\t"
-      :
-      :[res_buf]"r"(res_buf),[p]"r"((unsigned)p)
-      :"memory"
-    );
-  }
-  else{
-    /*Then, fill in the remainder of the coefficients with 0's, and perform
-       the iDCT.*/
-    /*First zero the buffer.*/
-    /*On K7, etc., this could be replaced with movntq and sfence.*/
-    __asm__ __volatile__(
-      "pxor %%mm0,%%mm0\n\t"
-      "movq %%mm0,(%[res_buf])\n\t"
-      "movq %%mm0,8(%[res_buf])\n\t"
-      "movq %%mm0,16(%[res_buf])\n\t"
-      "movq %%mm0,24(%[res_buf])\n\t"
-      "movq %%mm0,32(%[res_buf])\n\t"
-      "movq %%mm0,40(%[res_buf])\n\t"
-      "movq %%mm0,48(%[res_buf])\n\t"
-      "movq %%mm0,56(%[res_buf])\n\t"
-      "movq %%mm0,64(%[res_buf])\n\t"
-      "movq %%mm0,72(%[res_buf])\n\t"
-      "movq %%mm0,80(%[res_buf])\n\t"
-      "movq %%mm0,88(%[res_buf])\n\t"
-      "movq %%mm0,96(%[res_buf])\n\t"
-      "movq %%mm0,104(%[res_buf])\n\t"
-      "movq %%mm0,112(%[res_buf])\n\t"
-      "movq %%mm0,120(%[res_buf])\n\t"
-      :
-      :[res_buf]"r"(res_buf)
-      :"memory"
-    );
-    res_buf[0]=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant);
-    /*This is planned to be rewritten in MMX.*/
-    for(zzi=1;zzi<_ncoefs;zzi++){
-      int ci;
-      ci=OC_FZIG_ZAG[zzi];
-      res_buf[OC_FZIG_ZAGMMX[zzi]]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]*
-       _ac_iquant[ci]);
-    }
-    if(_last_zzi<10)oc_idct8x8_10_mmx(res_buf);
-    else oc_idct8x8_mmx(res_buf);
-  }
+  int ystride;
+  /*Dequantize and apply the inverse transform.*/
+  oc_dequant_idct8x8_mmx(res_buf,_dct_coeffs,
+   _last_zzi,_ncoefs,_dc_quant,_ac_quant);
   /*Fill in the target buffer.*/
   dst_framei=_state->ref_frame_idx[OC_FRAME_SELF];
-  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].stride;
+  ystride=_state->ref_frame_bufs[dst_framei][_pli].stride;
   /*For now ystride values in all ref frames assumed to be equal.*/
   if(_frag->mbmode==OC_MODE_INTRA){
-    oc_frag_recon_intra_mmx(_frag->buffer[dst_framei],dst_ystride,res_buf);
+    oc_frag_recon_intra_mmx(_frag->buffer[dst_framei],ystride,res_buf);
   }
   else{
     int ref_framei;
-    int ref_ystride;
     int mvoffsets[2];
     ref_framei=_state->ref_frame_idx[OC_FRAME_FOR_MODE[_frag->mbmode]];
-    ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].stride;
-    if(oc_state_get_mv_offsets(_state,mvoffsets,_frag->mv[0],_frag->mv[1],
-     ref_ystride,_pli)>1){
-      oc_frag_recon_inter2_mmx(_frag->buffer[dst_framei],dst_ystride,
-       _frag->buffer[ref_framei]+mvoffsets[0],ref_ystride,
-       _frag->buffer[ref_framei]+mvoffsets[1],ref_ystride,res_buf);
+    if(oc_state_get_mv_offsets(_state,mvoffsets,
+     _frag->mv[0],_frag->mv[1],ystride,_pli)>1){
+      oc_frag_recon_inter2_mmx(_frag->buffer[dst_framei],
+       _frag->buffer[ref_framei]+mvoffsets[0],
+       _frag->buffer[ref_framei]+mvoffsets[1],ystride,res_buf);
     }
     else{
-      oc_frag_recon_inter_mmx(_frag->buffer[dst_framei],dst_ystride,
-       _frag->buffer[ref_framei]+mvoffsets[0],ref_ystride,res_buf);
+      oc_frag_recon_inter_mmx(_frag->buffer[dst_framei],
+       _frag->buffer[ref_framei]+mvoffsets[0],ystride,res_buf);
     }
   }
-  oc_restore_fpu(_state);
 }
 
+/*We copy these entire functions to inline the actual MMX routines so that we
+   use only a single indirect call.*/
+
 /*Copies the fragments specified by the lists of fragment indices from one
    frame to another.
   _fragis:    A pointer to a list of fragment indices.
@@ -178,419 +67,24 @@
   _dst_frame: The reference frame to copy to.
   _src_frame: The reference frame to copy from.
   _pli:       The color plane the fragments lie in.*/
-void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
- int _nfragis,int _dst_frame,int _src_frame,int _pli){
+void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
+ const int *_fragis,int _nfragis,int _dst_frame,int _src_frame,int _pli){
   const int *fragi;
   const int *fragi_end;
   int        dst_framei;
-  ptrdiff_t  dst_ystride;
   int        src_framei;
-  ptrdiff_t  src_ystride;
+  int        ystride;
   dst_framei=_state->ref_frame_idx[_dst_frame];
   src_framei=_state->ref_frame_idx[_src_frame];
-  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].stride;
-  src_ystride=_state->ref_frame_bufs[src_framei][_pli].stride;
+  ystride=_state->ref_frame_bufs[dst_framei][_pli].stride;
   fragi_end=_fragis+_nfragis;
   for(fragi=_fragis;fragi<fragi_end;fragi++){
-    oc_fragment   *frag;
-    unsigned char *dst;
-    unsigned char *src;
-    ptrdiff_t      s;
+    oc_fragment *frag;
     frag=_state->frags+*fragi;
-    dst=frag->buffer[dst_framei];
-    src=frag->buffer[src_framei];
-    __asm__ __volatile__(
-      /*src+0*src_ystride*/
-      "movq (%[src]),%%mm0\n\t"
-      /*s=src_ystride*3*/
-      "lea (%[src_ystride],%[src_ystride],2),%[s]\n\t"
-      /*src+1*src_ystride*/
-      "movq (%[src],%[src_ystride]),%%mm1\n\t"
-      /*src+2*src_ystride*/
-      "movq (%[src],%[src_ystride],2),%%mm2\n\t"
-      /*src+3*src_ystride*/
-      "movq (%[src],%[s]),%%mm3\n\t"
-      /*dst+0*dst_ystride*/
-      "movq %%mm0,(%[dst])\n\t"
-      /*s=dst_ystride*3*/
-      "lea (%[dst_ystride],%[dst_ystride],2),%[s]\n\t"
-      /*dst+1*dst_ystride*/
-      "movq %%mm1,(%[dst],%[dst_ystride])\n\t"
-      /*Pointer to next 4.*/
-      "lea (%[src],%[src_ystride],4),%[src]\n\t"
-      /*dst+2*dst_ystride*/
-      "movq %%mm2,(%[dst],%[dst_ystride],2)\n\t"
-      /*dst+3*dst_ystride*/
-      "movq %%mm3,(%[dst],%[s])\n\t"
-      /*Pointer to next 4.*/
-      "lea (%[dst],%[dst_ystride],4),%[dst]\n\t"
-      /*src+0*src_ystride*/
-      "movq (%[src]),%%mm0\n\t"
-      /*s=src_ystride*3*/
-      "lea (%[src_ystride],%[src_ystride],2),%[s]\n\t"
-      /*src+1*src_ystride*/
-      "movq (%[src],%[src_ystride]),%%mm1\n\t"
-      /*src+2*src_ystride*/
-      "movq (%[src],%[src_ystride],2),%%mm2\n\t"
-      /*src+3*src_ystride*/
-      "movq (%[src],%[s]),%%mm3\n\t"
-      /*dst+0*dst_ystride*/
-      "movq %%mm0,(%[dst])\n\t"
-      /*s=dst_ystride*3*/
-      "lea (%[dst_ystride],%[dst_ystride],2),%[s]\n\t"
-      /*dst+1*dst_ystride*/
-      "movq %%mm1,(%[dst],%[dst_ystride])\n\t"
-      /*dst+2*dst_ystride*/
-      "movq %%mm2,(%[dst],%[dst_ystride],2)\n\t"
-      /*dst+3*dst_ystride*/
-      "movq %%mm3,(%[dst],%[s])\n\t"
-      :[s]"=&r"(s)
-      :[dst]"r"(dst),[src]"r"(src),[dst_ystride]"r"(dst_ystride),
-       [src_ystride]"r"(src_ystride)
-      :"memory"
-    );
+    OC_FRAG_COPY_MMX(frag->buffer[dst_framei],frag->buffer[src_framei],ystride);
   }
-  /*This needs to be removed when decode specific functions are implemented:*/
-  __asm__ __volatile__("emms\n\t");
 }
 
-static void loop_filter_v(unsigned char *_pix,int _ystride,
- const ogg_int16_t *_ll){
-  ptrdiff_t s;
-  _pix-=_ystride*2;
-  __asm__ __volatile__(
-    /*mm0=0*/
-    "pxor %%mm0,%%mm0\n\t"
-    /*s=_ystride*3*/
-    "lea (%[ystride],%[ystride],2),%[s]\n\t"
-    /*mm7=_pix[0...8]*/
-    "movq (%[pix]),%%mm7\n\t"
-    /*mm4=_pix[0...8+_ystride*3]*/
-    "movq (%[pix],%[s]),%%mm4\n\t"
-    /*mm6=_pix[0...8]*/
-    "movq %%mm7,%%mm6\n\t"
-    /*Expand unsigned _pix[0...3] to 16 bits.*/
-    "punpcklbw %%mm0,%%mm6\n\t"
-    "movq %%mm4,%%mm5\n\t"
-    /*Expand unsigned _pix[4...8] to 16 bits.*/
-    "punpckhbw %%mm0,%%mm7\n\t"
-    /*Expand other arrays too.*/
-    "punpcklbw %%mm0,%%mm4\n\t"
-    "punpckhbw %%mm0,%%mm5\n\t"
-    /*mm7:mm6=_p[0...8]-_p[0...8+_ystride*3]:*/
-    "psubw %%mm4,%%mm6\n\t"
-    "psubw %%mm5,%%mm7\n\t"
-    /*mm5=mm4=_pix[0...8+_ystride]*/
-    "movq (%[pix],%[ystride]),%%mm4\n\t"
-    /*mm1=mm3=mm2=_pix[0..8]+_ystride*2]*/
-    "movq (%[pix],%[ystride],2),%%mm2\n\t"
-    "movq %%mm4,%%mm5\n\t"
-    "movq %%mm2,%%mm3\n\t"
-    "movq %%mm2,%%mm1\n\t"
-    /*Expand these arrays.*/
-    "punpckhbw %%mm0,%%mm5\n\t"
-    "punpcklbw %%mm0,%%mm4\n\t"
-    "punpckhbw %%mm0,%%mm3\n\t"
-    "punpcklbw %%mm0,%%mm2\n\t"
-    /*mm0=3 3 3 3
-      mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
-    "pcmpeqw %%mm0,%%mm0\n\t"
-    "psubw %%mm5,%%mm3\n\t"
-    "psrlw $14,%%mm0\n\t"
-    "psubw %%mm4,%%mm2\n\t"
-    /*Scale by 3.*/
-    "pmullw %%mm0,%%mm3\n\t"
-    "pmullw %%mm0,%%mm2\n\t"
-    /*mm0=4 4 4 4
-      f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
-       3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/
-    "psrlw $1,%%mm0\n\t"
-    "paddw %%mm7,%%mm3\n\t"
-    "psllw $2,%%mm0\n\t"
-    "paddw %%mm6,%%mm2\n\t"
-    /*Add 4.*/
-    "paddw %%mm0,%%mm3\n\t"
-    "paddw %%mm0,%%mm2\n\t"
-    /*"Divide" by 8.*/
-    "psraw $3,%%mm3\n\t"
-    "psraw $3,%%mm2\n\t"
-    /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the sepc.*/
-    /*Free up mm5.*/
-    "packuswb %%mm5,%%mm4\n\t"
-    /*mm0=L L L L*/
-    "movq (%[ll]),%%mm0\n\t"
-    /*if(R_i<-2L||R_i>2L)R_i=0:*/
-    "movq %%mm2,%%mm5\n\t"
-    "pxor %%mm6,%%mm6\n\t"
-    "movq %%mm0,%%mm7\n\t"
-    "psubw %%mm0,%%mm6\n\t"
-    "psllw $1,%%mm7\n\t"
-    "psllw $1,%%mm6\n\t"
-    /*mm2==R_3 R_2 R_1 R_0*/
-    /*mm5==R_3 R_2 R_1 R_0*/
-    /*mm6==-2L -2L -2L -2L*/
-    /*mm7==2L 2L 2L 2L*/
-    "pcmpgtw %%mm2,%%mm7\n\t"
-    "pcmpgtw %%mm6,%%mm5\n\t"
-    "pand %%mm7,%%mm2\n\t"
-    "movq %%mm0,%%mm7\n\t"
-    "pand %%mm5,%%mm2\n\t"
-    "psllw $1,%%mm7\n\t"
-    "movq %%mm3,%%mm5\n\t"
-    /*mm3==R_7 R_6 R_5 R_4*/
-    /*mm5==R_7 R_6 R_5 R_4*/
-    /*mm6==-2L -2L -2L -2L*/
-    /*mm7==2L 2L 2L 2L*/
-    "pcmpgtw %%mm3,%%mm7\n\t"
-    "pcmpgtw %%mm6,%%mm5\n\t"
-    "pand %%mm7,%%mm3\n\t"
-    "movq %%mm0,%%mm7\n\t"
-    "pand %%mm5,%%mm3\n\t"
-    /*if(R_i<-L)R_i'=R_i+2L;
-      if(R_i>L)R_i'=R_i-2L;
-      if(R_i<-L||R_i>L)R_i=-R_i':*/
-    "psraw $1,%%mm6\n\t"
-    "movq %%mm2,%%mm5\n\t"
-    "psllw $1,%%mm7\n\t"
-    /*mm2==R_3 R_2 R_1 R_0*/
-    /*mm5==R_3 R_2 R_1 R_0*/
-    /*mm6==-L -L -L -L*/
-    /*mm0==L L L L*/
-    /*mm5=R_i>L?FF:00*/
-    "pcmpgtw %%mm0,%%mm5\n\t"
-    /*mm6=-L>R_i?FF:00*/
-    "pcmpgtw %%mm2,%%mm6\n\t"
-    /*mm7=R_i>L?2L:0*/
-    "pand %%mm5,%%mm7\n\t"
-    /*mm2=R_i>L?R_i-2L:R_i*/
-    "psubw %%mm7,%%mm2\n\t"
-    "movq %%mm0,%%mm7\n\t"
-    /*mm5=-L>R_i||R_i>L*/
-    "por %%mm6,%%mm5\n\t"
-    "psllw $1,%%mm7\n\t"
-    /*mm7=-L>R_i?2L:0*/
-    "pand %%mm6,%%mm7\n\t"
-    "pxor %%mm6,%%mm6\n\t"
-    /*mm2=-L>R_i?R_i+2L:R_i*/
-    "paddw %%mm7,%%mm2\n\t"
-    "psubw %%mm0,%%mm6\n\t"
-    /*mm5=-L>R_i||R_i>L?-R_i':0*/
-    "pand %%mm2,%%mm5\n\t"
-    "movq %%mm0,%%mm7\n\t"
-    /*mm2=-L>R_i||R_i>L?0:R_i*/
-    "psubw %%mm5,%%mm2\n\t"
-    "psllw $1,%%mm7\n\t"
-    /*mm2=-L>R_i||R_i>L?-R_i':R_i*/
-    "psubw %%mm5,%%mm2\n\t"
-    "movq %%mm3,%%mm5\n\t"
-    /*mm3==R_7 R_6 R_5 R_4*/
-    /*mm5==R_7 R_6 R_5 R_4*/
-    /*mm6==-L -L -L -L*/
-    /*mm0==L L L L*/
-    /*mm6=-L>R_i?FF:00*/
-    "pcmpgtw %%mm3,%%mm6\n\t"
-    /*mm5=R_i>L?FF:00*/
-    "pcmpgtw %%mm0,%%mm5\n\t"
-    /*mm7=R_i>L?2L:0*/
-    "pand %%mm5,%%mm7\n\t"
-    /*mm2=R_i>L?R_i-2L:R_i*/
-    "psubw %%mm7,%%mm3\n\t"
-    "psllw $1,%%mm0\n\t"
-    /*mm5=-L>R_i||R_i>L*/
-    "por %%mm6,%%mm5\n\t"
-    /*mm0=-L>R_i?2L:0*/
-    "pand %%mm6,%%mm0\n\t"
-    /*mm3=-L>R_i?R_i+2L:R_i*/
-    "paddw %%mm0,%%mm3\n\t"
-    /*mm5=-L>R_i||R_i>L?-R_i':0*/
-    "pand %%mm3,%%mm5\n\t"
-    /*mm2=-L>R_i||R_i>L?0:R_i*/
-    "psubw %%mm5,%%mm3\n\t"
-    /*mm2=-L>R_i||R_i>L?-R_i':R_i*/
-    "psubw %%mm5,%%mm3\n\t"
-    /*Unfortunately, there's no unsigned byte+signed byte with unsigned
-       saturation op code, so we have to promote things back 16 bits.*/
-    "pxor %%mm0,%%mm0\n\t"
-    "movq %%mm4,%%mm5\n\t"
-    "punpcklbw %%mm0,%%mm4\n\t"
-    "punpckhbw %%mm0,%%mm5\n\t"
-    "movq %%mm1,%%mm6\n\t"
-    "punpcklbw %%mm0,%%mm1\n\t"
-    "punpckhbw %%mm0,%%mm6\n\t"
-    /*_pix[0...8+_ystride]+=R_i*/
-    "paddw %%mm2,%%mm4\n\t"
-    "paddw %%mm3,%%mm5\n\t"
-    /*_pix[0...8+_ystride*2]-=R_i*/
-    "psubw %%mm2,%%mm1\n\t"
-    "psubw %%mm3,%%mm6\n\t"
-    "packuswb %%mm5,%%mm4\n\t"
-    "packuswb %%mm6,%%mm1\n\t"
-    /*Write it back out.*/
-    "movq %%mm4,(%[pix],%[ystride])\n\t"
-    "movq %%mm1,(%[pix],%[ystride],2)\n\t"
-    :[s]"=&r"(s)
-    :[pix]"r"(_pix),[ystride]"r"((ptrdiff_t)_ystride),[ll]"r"(_ll)
-    :"memory"
-  );
-}
-
-/*This code implements the bulk of loop_filter_h().
-  Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all
-   four p0's to one register we must transpose the values in four mmx regs.
-  When half is done we repeat this for the rest.*/
-static void loop_filter_h4(unsigned char *_pix,ptrdiff_t _ystride,
- const ogg_int16_t *_ll){
-  ptrdiff_t s;
-  /*d doesn't technically need to be 64-bit on x86-64, but making it so will
-     help avoid partial register stalls.*/
-  ptrdiff_t d;
-  __asm__ __volatile__(
-    /*x x x x 3 2 1 0*/
-    "movd (%[pix]),%%mm0\n\t"
-    /*s=_ystride*3*/
-    "lea (%[ystride],%[ystride],2),%[s]\n\t"
-    /*x x x x 7 6 5 4*/
-    "movd (%[pix],%[ystride]),%%mm1\n\t"
-    /*x x x x B A 9 8*/
-    "movd (%[pix],%[ystride],2),%%mm2\n\t"
-    /*x x x x F E D C*/
-    "movd (%[pix],%[s]),%%mm3\n\t"
-    /*mm0=7 3 6 2 5 1 4 0*/
-    "punpcklbw %%mm1,%%mm0\n\t"
-    /*mm2=F B E A D 9 C 8*/
-    "punpcklbw %%mm3,%%mm2\n\t"
-    /*mm1=7 3 6 2 5 1 4 0*/
-    "movq %%mm0,%%mm1\n\t"
-    /*mm0=F B 7 3 E A 6 2*/
-    "punpckhwd %%mm2,%%mm0\n\t"
-    /*mm1=D 9 5 1 C 8 4 0*/
-    "punpcklwd %%mm2,%%mm1\n\t"
-    "pxor %%mm7,%%mm7\n\t"
-    /*mm5=D 9 5 1 C 8 4 0*/
-    "movq %%mm1,%%mm5\n\t"
-    /*mm1=x C x 8 x 4 x 0==pix[0]*/
-    "punpcklbw %%mm7,%%mm1\n\t"
-    /*mm5=x D x 9 x 5 x 1==pix[1]*/
-    "punpckhbw %%mm7,%%mm5\n\t"
-    /*mm3=F B 7 3 E A 6 2*/
-    "movq %%mm0,%%mm3\n\t"
-    /*mm0=x E x A x 6 x 2==pix[2]*/
-    "punpcklbw %%mm7,%%mm0\n\t"
-    /*mm3=x F x B x 7 x 3==pix[3]*/
-    "punpckhbw %%mm7,%%mm3\n\t"
-    /*mm1=mm1-mm3==pix[0]-pix[3]*/
-    "psubw %%mm3,%%mm1\n\t"
-    /*Save a copy of pix[2] for later.*/
-    "movq %%mm0,%%mm4\n\t"
-    /*mm2=3 3 3 3
-      mm0=mm0-mm5==pix[2]-pix[1]*/
-    "pcmpeqw %%mm2,%%mm2\n\t"
-    "psubw %%mm5,%%mm0\n\t"
-    "psrlw $14,%%mm2\n\t"
-    /*Scale by 3.*/
-    "pmullw %%mm2,%%mm0\n\t"
-    /*mm2=4 4 4 4
-      f=mm1==_pix[0]-_pix[3]+ 3*(_pix[2]-_pix[1])*/
-    "psrlw $1,%%mm2\n\t"
-    "paddw %%mm1,%%mm0\n\t"
-    "psllw $2,%%mm2\n\t"
-    /*Add 4.*/
-    "paddw %%mm2,%%mm0\n\t"
-    /*"Divide" by 8, producing the residuals R_i.*/
-    "psraw $3,%%mm0\n\t"
-    /*Now compute lflim of mm0 cf. Section 7.10 of the sepc.*/
-    /*mm6=L L L L*/
-    "movq (%[ll]),%%mm6\n\t"
-    /*if(R_i<-2L||R_i>2L)R_i=0:*/
-    "movq %%mm0,%%mm1\n\t"
-    "pxor %%mm2,%%mm2\n\t"
-    "movq %%mm6,%%mm3\n\t"
-    "psubw %%mm6,%%mm2\n\t"
-    "psllw $1,%%mm3\n\t"
-    "psllw $1,%%mm2\n\t"
-    /*mm0==R_3 R_2 R_1 R_0*/
-    /*mm1==R_3 R_2 R_1 R_0*/
-    /*mm2==-2L -2L -2L -2L*/
-    /*mm3==2L 2L 2L 2L*/
-    "pcmpgtw %%mm0,%%mm3\n\t"
-    "pcmpgtw %%mm2,%%mm1\n\t"
-    "pand %%mm3,%%mm0\n\t"
-    "pand %%mm1,%%mm0\n\t"
-    /*if(R_i<-L)R_i'=R_i+2L;
-      if(R_i>L)R_i'=R_i-2L;
-      if(R_i<-L||R_i>L)R_i=-R_i':*/
-    "psraw $1,%%mm2\n\t"
-    "movq %%mm0,%%mm1\n\t"
-    "movq %%mm6,%%mm3\n\t"
-    /*mm0==R_3 R_2 R_1 R_0*/
-    /*mm1==R_3 R_2 R_1 R_0*/
-    /*mm2==-L -L -L -L*/
-    /*mm6==L L L L*/
-    /*mm2=-L>R_i?FF:00*/
-    "pcmpgtw %%mm0,%%mm2\n\t"
-    /*mm1=R_i>L?FF:00*/
-    "pcmpgtw %%mm6,%%mm1\n\t"
-    /*mm3=2L 2L 2L 2L*/
-    "psllw $1,%%mm3\n\t"
-    /*mm6=2L 2L 2L 2L*/
-    "psllw $1,%%mm6\n\t"
-    /*mm3=R_i>L?2L:0*/
-    "pand %%mm1,%%mm3\n\t"
-    /*mm6=-L>R_i?2L:0*/
-    "pand %%mm2,%%mm6\n\t"
-    /*mm0=R_i>L?R_i-2L:R_i*/
-    "psubw %%mm3,%%mm0\n\t"
-    /*mm1=-L>R_i||R_i>L*/
-    "por %%mm2,%%mm1\n\t"
-    /*mm0=-L>R_i?R_i+2L:R_i*/
-    "paddw %%mm6,%%mm0\n\t"
-    /*mm1=-L>R_i||R_i>L?R_i':0*/
-    "pand %%mm0,%%mm1\n\t"
-    /*mm0=-L>R_i||R_i>L?0:R_i*/
-    "psubw %%mm1,%%mm0\n\t"
-    /*mm0=-L>R_i||R_i>L?-R_i':R_i*/
-    "psubw %%mm1,%%mm0\n\t"
-    /*_pix[1]+=R_i;*/
-    "paddw %%mm0,%%mm5\n\t"
-    /*_pix[2]-=R_i;*/
-    "psubw %%mm0,%%mm4\n\t"
-    /*mm5=x x x x D 9 5 1*/
-    "packuswb %%mm7,%%mm5\n\t"
-    /*mm4=x x x x E A 6 2*/
-    "packuswb %%mm7,%%mm4\n\t"
-    /*mm5=E D A 9 6 5 2 1*/
-    "punpcklbw %%mm4,%%mm5\n\t"
-    /*d=6 5 2 1*/
-    "movd %%mm5,%[d]\n\t"
-    "movw %w[d],1(%[pix])\n\t"
-    /*Why is there such a big stall here?*/
-    "psrlq $32,%%mm5\n\t"
-    "shr $16,%[d]\n\t"
-    "movw %w[d],1(%[pix],%[ystride])\n\t"
-    /*d=E D A 9*/
-    "movd %%mm5,%[d]\n\t"
-    "movw %w[d],1(%[pix],%[ystride],2)\n\t"
-    "shr $16,%[d]\n\t"
-    "movw %w[d],1(%[pix],%[s])\n\t"
-    :[s]"=&r"(s),[d]"=&r"(d),
-     [pix]"+r"(_pix),[ystride]"+r"(_ystride),[ll]"+r"(_ll)
-    :
-    :"memory"
-  );
-}
-
-static void loop_filter_h(unsigned char *_pix,int _ystride,
- const ogg_int16_t *_ll){
-  _pix-=2;
-  loop_filter_h4(_pix,_ystride,_ll);
-  loop_filter_h4(_pix+(_ystride<<2),_ystride,_ll);
-}
-
-/*We copy the whole function because the MMX routines will be inlined 4 times,
-   and we can do just a single emms call at the end this way.
-  We also do not use the _bv lookup table, instead computing the values that
-   would lie in it on the fly.*/
-
 /*Apply the loop filter to a given set of fragment rows in the given plane.
   The filter may be run on the bottom edge, affecting pixels in the next row of
    fragments, so this row also needs to be available.
@@ -599,19 +93,18 @@
   _pli:       The color plane to filter.
   _fragy0:    The Y coordinate of the first fragment row to filter.
   _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
-void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,
+void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,int *_bv,
  int _refi,int _pli,int _fragy0,int _fragy_end){
-  ogg_int16_t __attribute__((aligned(8)))  ll[4];
-  th_img_plane                            *iplane;
-  oc_fragment_plane                       *fplane;
-  oc_fragment                             *frag_top;
-  oc_fragment                             *frag0;
-  oc_fragment                             *frag;
-  oc_fragment                             *frag_end;
-  oc_fragment                             *frag0_end;
-  oc_fragment                             *frag_bot;
-  ll[0]=ll[1]=ll[2]=ll[3]=
-   (ogg_int16_t)_state->loop_filter_limits[_state->qis[0]];
+  unsigned char OC_ALIGN8  ll[8];
+  const th_img_plane      *iplane;
+  const oc_fragment_plane *fplane;
+  oc_fragment             *frag_top;
+  oc_fragment             *frag0;
+  oc_fragment             *frag;
+  oc_fragment             *frag_end;
+  oc_fragment             *frag0_end;
+  oc_fragment             *frag_bot;
+  memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
   iplane=_state->ref_frame_bufs[_refi]+_pli;
   fplane=_state->fplanes+_pli;
   /*The following loops are constructed somewhat non-intuitively on purpose.
@@ -629,16 +122,16 @@
     while(frag<frag_end){
       if(frag->coded){
         if(frag>frag0){
-          loop_filter_h(frag->buffer[_refi],iplane->stride,ll);
+          OC_LOOP_FILTER_H_MMX(frag->buffer[_refi],iplane->stride,ll);
         }
         if(frag0>frag_top){
-          loop_filter_v(frag->buffer[_refi],iplane->stride,ll);
+          OC_LOOP_FILTER_V_MMX(frag->buffer[_refi],iplane->stride,ll);
         }
         if(frag+1<frag_end&&!(frag+1)->coded){
-          loop_filter_h(frag->buffer[_refi]+8,iplane->stride,ll);
+          OC_LOOP_FILTER_H_MMX(frag->buffer[_refi]+8,iplane->stride,ll);
         }
         if(frag+fplane->nhfrags<frag_bot&&!(frag+fplane->nhfrags)->coded){
-          loop_filter_v((frag+fplane->nhfrags)->buffer[_refi],
+          OC_LOOP_FILTER_V_MMX((frag+fplane->nhfrags)->buffer[_refi],
            iplane->stride,ll);
         }
       }
@@ -646,8 +139,6 @@
     }
     frag0+=fplane->nhfrags;
   }
-  /*This needs to be removed when decode specific functions are implemented:*/
-  __asm__ __volatile__("emms\n\t");
 }
 
 #endif

Modified: branches/theora-thusnelda/lib/dec/x86/x86int.h
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/x86int.h	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/x86/x86int.h	2009-04-26 14:30:15 UTC (rev 15953)
@@ -21,22 +21,24 @@
 
 void oc_state_vtable_init_x86(oc_theora_state *_state);
 
-void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
+void oc_frag_copy_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride);
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
  const ogg_int16_t *_residue);
-void oc_frag_recon_inter_mmx(unsigned char *_dst,int _dst_ystride,
- const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue);
-void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
- const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
- int _src2_ystride,const ogg_int16_t *_residue);
-void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
- int _nfragis,int _dst_frame,int _src_frame,int _pli);
-void oc_state_frag_recon_mmx(oc_theora_state *_state,oc_fragment *_frag,
+void oc_frag_recon_inter_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
+void oc_dequant_idct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64],
+ int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
+ const ogg_uint16_t _ac_quant[64]);
+void oc_state_frag_recon_mmx(const oc_theora_state *_state,oc_fragment *_frag,
  int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
+ ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
+void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
+ const int *_fragis,int _nfragis,int _dst_frame,int _src_frame,int _pli);
+void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end);
 void oc_restore_fpu_mmx(void);
-void oc_idct8x8_mmx(ogg_int16_t _y[64]);
-void oc_idct8x8_10_mmx(ogg_int16_t _y[64]);
-void oc_fill_idct_constants_mmx(void);
-void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,
- int _refi,int _pli,int _fragy0,int _fragy_end);
+
 #endif

Modified: branches/theora-thusnelda/lib/dec/x86/x86state.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/x86state.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/dec/x86/x86state.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -17,18 +17,20 @@
 
 #include "x86int.h"
 
-#if defined(USE_ASM)
+#if defined(OC_X86_ASM)
 
 #include "../../cpu.c"
 
 void oc_state_vtable_init_x86(oc_theora_state *_state){
   _state->cpu_flags=oc_cpu_flags_get();
   if(_state->cpu_flags&OC_CPU_X86_MMX){
+    _state->opt_vtable.frag_copy=oc_frag_copy_mmx;
     _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
     _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
     _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
-    _state->opt_vtable.state_frag_copy=oc_state_frag_copy_mmx;
+    _state->opt_vtable.dequant_idct8x8=oc_dequant_idct8x8_mmx;
     _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
+    _state->opt_vtable.state_frag_copy_list=oc_state_frag_copy_list_mmx;
     _state->opt_vtable.state_loop_filter_frag_rows=
      oc_state_loop_filter_frag_rows_mmx;
     _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;

Modified: branches/theora-thusnelda/lib/enc/codec_internal.h
===================================================================
--- branches/theora-thusnelda/lib/enc/codec_internal.h	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/codec_internal.h	2009-04-26 14:30:15 UTC (rev 15953)
@@ -27,30 +27,51 @@
 #include "theora/theora.h"
 #include "../internal.h"
 #include "encoder_huffman.h"
+#include "huffenc.h"
 #include "../dec/ocintrin.h"
-typedef struct CP_INSTANCE CP_INSTANCE;
-#include "dsp.h"
 
-#define theora_read(x,y,z) ( oggpackB_read(x,y,z) )
 
-#define CURRENT_ENCODE_VERSION   1
-#define HUGE_ERROR              (1<<28)  /*  Out of range test value */
 
+typedef struct oc_enc_opt_vtable oc_enc_opt_vtable;
+typedef struct CP_INSTANCE       CP_INSTANCE;
+
+
+struct oc_enc_opt_vtable{
+  unsigned (*frag_sad)(const unsigned char *_src,
+   const unsigned char *_ref,int _ystride);
+  unsigned (*frag_sad_thresh)(const unsigned char *_src,
+   const unsigned char *_ref,int _ystride,unsigned _thresh);
+  unsigned (*frag_sad2_thresh)(const unsigned char *_src,
+   const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+   unsigned _thresh);
+  void     (*frag_sub)(ogg_int16_t _diff[64],const unsigned char *_src,
+   const unsigned char *_ref,int _ystride);
+  void     (*frag_sub_128)(ogg_int16_t _diff[64],
+   const unsigned char *_src,int _ystride);
+  void     (*frag_copy)(unsigned char *_dst,
+   const unsigned char *_src,int _ystride);
+  void     (*frag_copy2)(unsigned char *_dst,
+   const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+  void     (*frag_recon_intra)(unsigned char *_dst,int _ystride,
+   const ogg_int16_t _residue[64]);
+  void     (*frag_recon_inter)(unsigned char *_dst,
+   const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
+  void     (*fdct8x8)(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+  void     (*dequant_idct8x8)(ogg_int16_t _y[64],const ogg_int16_t _x[64],
+   int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
+   const ogg_uint16_t _ac_quant[64]);
+  void     (*enc_loop_filter)(CP_INSTANCE *cpi,int _flimit);
+  void     (*restore_fpu)(void);
+};
+
+
+void oc_enc_vtable_init(CP_INSTANCE *_cpi);
+
 /* Baseline dct height and width. */
 #define BLOCK_HEIGHT_WIDTH          8
 #define HFRAGPIXELS                 8
 #define VFRAGPIXELS                 8
 
-/* Blocks on INTRA/INTER Y/U/V planes */
-enum BlockMode {
-  BLOCK_Y,
-  BLOCK_U,
-  BLOCK_V,
-  BLOCK_INTER_Y,
-  BLOCK_INTER_U,
-  BLOCK_INTER_V
-};
-
 /* Baseline dct block size */
 #define BLOCK_SIZE              (BLOCK_HEIGHT_WIDTH * BLOCK_HEIGHT_WIDTH)
 
@@ -58,8 +79,6 @@
 #define UMV_BORDER              16
 #define STRIDE_EXTRA            (UMV_BORDER * 2)
 
-#define Q_TABLE_SIZE            64
-
 #define KEY_FRAME               0
 #define DELTA_FRAME             1
 
@@ -68,14 +87,6 @@
 #define MODE_METHODS            8
 #define MODE_METHOD_BITS        3
 
-/* Different key frame types/methods */
-#define DCT_KEY_FRAME           0
-
-#define KEY_FRAME_CONTEXT       5
-
-/* Number of search sites for a 4-step search (at pixel accuracy) */
-#define MAX_SEARCH_SITES       33
-
 #define MAX_MV_EXTENT 31  /* Max search distance in half pixel increments */
 
 /** block coding modes */
@@ -159,7 +170,7 @@
   int m[16]; // hilbert order: only 4 for luma, but 16 for U/V (to match f) */
 } superblock_t;
 
-typedef ogg_int16_t    quant_table[64]; 
+typedef ogg_int16_t    quant_table[64];
 typedef quant_table    quant_tables[64]; /* [zigzag][qi] */
 
 #include "enquant.h"
@@ -207,156 +218,150 @@
   /*This structure must be first.
     It contains entry points accessed by the decoder library's API wrapper, and
      is the only assumption that library makes about our internal format.*/
-  oc_state_dispatch_vtbl dispatch_vtbl;
+  oc_state_dispatch_vtable  dispatch_vtbl;
 
-  theora_info      info;
+  theora_info               info;
 
   /* ogg bitpacker for use in packet coding, other API state */
-  oggpack_buffer   *oggbuffer;
+  oggpack_buffer           *oggbuffer;
   /*The number of duplicates to produce for the next frame.*/
-  int               dup_count;
+  int                       dup_count;
   /*The number of duplicates remaining to be emitted for the current frame.*/
-  int               nqueued_dups;
+  int                       nqueued_dups;
 
-  unsigned char   *frame;
-  unsigned char   *recon;
-  unsigned char   *golden;
-  unsigned char   *lastrecon;
-  ogg_uint32_t     frame_size;
+  unsigned char            *frame;
+  unsigned char            *recon;
+  unsigned char            *golden;
+  unsigned char            *lastrecon;
+  ogg_uint32_t              frame_size;
 
-  /* SuperBlock, MacroBLock and Fragment Information */
-  unsigned char   *frag_coded;
-  ogg_uint32_t    *frag_buffer_index;
-  ogg_int16_t     *frag_dc;
-  ogg_int16_t     *frag_dc_tmp;
+  /*Superblock, macroblock and fragment Information.*/
+  unsigned char            *frag_coded;
+  ogg_uint32_t             *frag_buffer_index;
+  ogg_int16_t              *frag_dc;
+  ogg_int16_t              *frag_dc_tmp;
 
-  macroblock_t    *macro;
-  superblock_t    *super[3];
+  macroblock_t             *macro;
+  superblock_t             *super[3];
 
-  ogg_uint32_t     frag_h[3];
-  ogg_uint32_t     frag_v[3];
-  ogg_uint32_t     frag_n[3];
-  ogg_uint32_t     frag_total;
+  ogg_uint32_t              frag_h[3];
+  ogg_uint32_t              frag_v[3];
+  ogg_uint32_t              frag_n[3];
+  ogg_uint32_t              frag_total;
 
-  ogg_uint32_t     macro_h;
-  ogg_uint32_t     macro_v;
-  ogg_uint32_t     macro_total;
+  ogg_uint32_t              macro_h;
+  ogg_uint32_t              macro_v;
+  ogg_uint32_t              macro_total;
   
-  ogg_uint32_t     super_h[3];
-  ogg_uint32_t     super_v[3];
-  ogg_uint32_t     super_n[3];
-  ogg_uint32_t     super_total;
+  ogg_uint32_t              super_h[3];
+  ogg_uint32_t              super_v[3];
+  ogg_uint32_t              super_n[3];
+  ogg_uint32_t              super_total;
 
-  ogg_uint32_t     stride[3]; // stride of image and recon planes, accounting for borders
-  ogg_uint32_t     offset[3]; // data offset of first coded pixel in plane
+  /*Stride of image and recon planes, accounting for borders.*/
+  ogg_uint32_t              stride[3];
+  /*Data offset of first coded pixel in plane.*/
+  ogg_uint32_t              offset[3];
 
   /*********************************************************************/
   /* state and stats */
-  
-  int              HeadersWritten;
-  ogg_uint32_t     LastKeyFrame;
-  ogg_int64_t      CurrentFrame;
-  unsigned char    FrameType;
-  int              readyflag;
-  int              packetflag;
-  int              doneflag;
-  int              first_inter_frame;
 
-  int              huffchoice[2][2][2]; /* [key/inter][dc/ac][luma/chroma] */
+  int                       HeadersWritten;
+  ogg_uint32_t              LastKeyFrame;
+  ogg_int64_t               CurrentFrame;
+  unsigned char             FrameType;
+  int                       readyflag;
+  int                       packetflag;
+  int                       doneflag;
+  int                       first_inter_frame;
 
-  ogg_uint32_t     dc_bits[2][DC_HUFF_CHOICES];
-  ogg_uint32_t     ac1_bits[2][AC_HUFF_CHOICES];
-  ogg_uint32_t     acN_bits[2][AC_HUFF_CHOICES];
-  ogg_uint32_t     MVBits_0; /* count of bits used by MV coding mode 0 */
-  ogg_uint32_t     MVBits_1; /* count of bits used by MV coding mode 1 */
-  oc_mode_scheme_chooser chooser;
+  /*Indexed via [key/inter][dc/ac][luma/chroma].*/
+  int                       huffchoice[2][2][2];
 
+  ogg_uint32_t              dc_bits[2][DC_HUFF_CHOICES];
+  ogg_uint32_t              ac1_bits[2][AC_HUFF_CHOICES];
+  ogg_uint32_t              acN_bits[2][AC_HUFF_CHOICES];
+  /*Count of bits used by MV coding mode 0.*/
+  ogg_uint32_t              MVBits_0;
+  /*Count of bits used by MV coding mode 1.*/
+  ogg_uint32_t              MVBits_1;
+  oc_mode_scheme_chooser    chooser;
+
   /*********************************************************************/
   /* Token Buffers */
-  int             *fr_partial;
-  unsigned char   *fr_partial_bits;
-  int             *fr_full;
-  unsigned char   *fr_full_bits;
-  ogg_int16_t     *fr_block;
-  unsigned char   *fr_block_bits;
-  int              fr_partial_count;
-  int              fr_full_count;
-  int              fr_block_count;
+  int                      *fr_partial;
+  unsigned char            *fr_partial_bits;
+  int                      *fr_full;
+  unsigned char            *fr_full_bits;
+  ogg_int16_t              *fr_block;
+  unsigned char            *fr_block_bits;
+  int                       fr_partial_count;
+  int                       fr_full_count;
+  int                       fr_block_count;
 
 
-  int              stack_offset;
-  unsigned char   *dct_token_storage;
-  ogg_uint16_t    *dct_token_eb_storage;
-  unsigned char   *dct_token[64];
-  ogg_uint16_t    *dct_token_eb[64];
+  int                       stack_offset;
+  unsigned char            *dct_token_storage;
+  ogg_uint16_t             *dct_token_eb_storage;
+  unsigned char            *dct_token[64];
+  ogg_uint16_t             *dct_token_eb[64];
 
-  ogg_int32_t      dct_token_count[64];
-  ogg_int32_t      dct_token_ycount[64];
+  ogg_int32_t               dct_token_count[64];
+  ogg_int32_t               dct_token_ycount[64];
 
-  int              eob_run[64];
-  int              eob_pre[64];
-  int              eob_ypre[64];
+  int                       eob_run[64];
+  int                       eob_pre[64];
+  int                       eob_ypre[64];
 
   /********************************************************************/
   /* Fragment SAD->bitrate estimation tracking metrics */
-  long             rho_count[65]; 
+  long                      rho_count[65]; 
 
 #ifdef COLLECT_METRICS
-  long             rho_postop; 
-  int             *frag_mbi;
-  int             *frag_sad;
-  int             *dct_token_frag_storage;
-  int             *dct_token_frag[64];
-  int             *dct_eob_fi_storage;
-  int             *dct_eob_fi_stack[64];
-  int              dct_eob_fi_count[64];
-  ogg_int64_t     dist_dist[3][8];
-  ogg_int64_t     dist_bits[3][8];
+  long                      rho_postop;
+  int                      *frag_mbi;
+  int                      *frag_sad;
+  int                      *dct_token_frag_storage;
+  int                      *dct_token_frag[64];
+  int                      *dct_eob_fi_storage;
+  int                      *dct_eob_fi_stack[64];
+  int                       dct_eob_fi_count[64];
+  ogg_int64_t               dist_dist[3][8];
+  ogg_int64_t               dist_bits[3][8];
 #endif
 
   /********************************************************************/
   /* Setup */
-  int              keyframe_granule_shift;
-  int              lambda;
-  int              BaseQ;
-  int              MinQ;
-  int              GoldenFrameEnabled;
-  int              InterPrediction;
-  int              MotionCompensation;
+  int                     keyframe_granule_shift;
+  int                     lambda;
+  int                     BaseQ;
+  int                     MinQ;
+  int                     GoldenFrameEnabled;
+  int                     InterPrediction;
+  int                     MotionCompensation;
 
   /* hufftables and quant setup ****************************************/
 
-  HUFF_ENTRY      *HuffRoot_VP3x[NUM_HUFF_TABLES];
-  ogg_uint32_t    *HuffCodeArray_VP3x[NUM_HUFF_TABLES];
-  unsigned char   *HuffCodeLengthArray_VP3x[NUM_HUFF_TABLES];
-  const unsigned char *ExtraBitLengths_VP3x;
+  th_huff_code            huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
 
-  th_quant_info    quant_info;
-  quant_tables     quant_tables[2][3];
-  oc_iquant_tables iquant_tables[2][3];
+  th_quant_info           quant_info;
+  quant_tables            quant_tables[2][3];
+  oc_iquant_tables        iquant_tables[2][3];
   /*An "average" quantizer for each quantizer type (INTRA or INTER) and QI
      value.
     This is used to paramterize the rate control decisions.
     They are kept in the log domain to simplify later processing.
     Keep in mind these are DCT domain quantizers, and so are scaled by an
      additional factor of 4 from the pixel domain.*/
-  ogg_int64_t      log_qavg[2][64];
+  ogg_int64_t             log_qavg[2][64];
   /*The buffer state used to drive rate control.*/
-  oc_rc_state      rc;
-  DspFunctions     dsp;  /* Selected functions for this platform */
-
+  oc_rc_state             rc;
+  /*Table for encoder acceleration functions.*/
+  oc_enc_opt_vtable       opt_vtable;
 };
 
-#define clamp255(x) ((unsigned char)((((x)<0)-1) & ((x) | -((x)>255))))
-
-extern void IDct1( const ogg_int16_t *InputData,
-                   const ogg_int16_t *QuantMatrix,
-                   ogg_int16_t *OutputData );
-
 extern void ReconRefFrames (CP_INSTANCE *cpi);
 
-extern void fdct_short ( ogg_int16_t *InputData, ogg_int16_t *OutputData );
-
 typedef struct {
   int coeff;
   int count; /* -1 indicates no token, ie, midst of an EOB run */
@@ -386,10 +391,6 @@
 extern void dct_tokenize_mark_ac_chroma (CP_INSTANCE *cpi);
 
 extern void InitQTables( CP_INSTANCE *cpi );
-extern void InitHuffmanSet( CP_INSTANCE *cpi );
-extern void ClearHuffmanSet( CP_INSTANCE *cpi );
-extern void WriteHuffmanTrees(HUFF_ENTRY *HuffRoot[NUM_HUFF_TABLES],
-                              oggpack_buffer *opb);
 
 extern void WriteFrameHeader( CP_INSTANCE *cpi) ;
 
@@ -453,4 +454,53 @@
 extern void DumpMetrics(CP_INSTANCE *cpi);
 #endif
 
+/*Encoder-specific accelerated functions.*/
+void oc_enc_frag_sub(const CP_INSTANCE *_cpi,ogg_int16_t _diff[64],
+ const unsigned char *_src,const unsigned char *_ref,int _ystride);
+void oc_enc_frag_sub_128(const CP_INSTANCE *_cpi,ogg_int16_t _diff[64],
+ const unsigned char *_src,int _ystride);
+unsigned oc_enc_frag_sad(const CP_INSTANCE *_cpi,const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_sad_thresh(const CP_INSTANCE *_cpi,
+ const unsigned char *_src,const unsigned char *_ref,int _ystride,
+ unsigned _thresh);
+unsigned oc_enc_frag_sad2_thresh(const CP_INSTANCE *_cpi,
+ const unsigned char *_src,const unsigned char *_ref1,
+ const unsigned char *_ref2,int _ystride,unsigned _thresh);
+void oc_enc_frag_copy(const CP_INSTANCE *_cpi,unsigned char *_dst,
+ const unsigned char *_src,int _ystride);
+void oc_enc_frag_copy2(const CP_INSTANCE *_cpi,unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+void oc_enc_frag_recon_intra(const CP_INSTANCE *_cpi,
+ unsigned char *_dst,int _ystride,const ogg_int16_t _residue[64]);
+void oc_enc_frag_recon_inter(const CP_INSTANCE *_cpi,unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
+void oc_enc_fdct8x8(const CP_INSTANCE *_cpi,ogg_int16_t _y[64],
+ const ogg_int16_t _x[64]);
+void oc_enc_dequant_idct8x8(const CP_INSTANCE *_cpi,ogg_int16_t _y[64],
+ const ogg_int16_t _x[64],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
+void oc_enc_loop_filter(CP_INSTANCE *_cpi,int _flimit);
+void oc_enc_restore_fpu(const CP_INSTANCE *_cpi);
+
+/*Default pure-C implementations.*/
+void oc_enc_vtable_init_c(CP_INSTANCE *_cpi);
+
+void oc_enc_frag_sub_c(ogg_int16_t _diff[64],
+ const unsigned char *_src,const unsigned char *_ref,int _ystride);
+void oc_enc_frag_sub_128_c(ogg_int16_t _diff[64],
+ const unsigned char *_src,int _ystride);
+void oc_enc_frag_copy2_c(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+unsigned oc_enc_frag_sad_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh);
+unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh);
+void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_enc_loop_filter_c(CP_INSTANCE *_cpi,int _flimit);
+void oc_enc_restore_fpu_c(void);
+
 #endif /* ENCODER_INTERNAL_H */

Modified: branches/theora-thusnelda/lib/enc/dct.c
===================================================================
--- branches/theora-thusnelda/lib/enc/dct.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/dct.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -16,21 +16,10 @@
  ********************************************************************/
 
 #include "codec_internal.h"
-#include "dsp.h"
-#include "../cpu.h"
+#include "../dec/dct.h"
 
 
 
-#define OC_C1S7 (64277)
-#define OC_C2S6 (60547)
-#define OC_C3S5 (54491)
-#define OC_C4S4 (46341)
-#define OC_C5S3 (36410)
-#define OC_C6S2 (25080)
-#define OC_C7S1 (12785)
-
-
-
 /*Performs a forward 8 point Type-II DCT transform.
   The output is scaled by a factor of 2 from the orthonormal version of the
    transform.
@@ -39,7 +28,7 @@
        block).
   _x: The input coefficients.
       The first 8 entries are used (e.g., from a row of an 8x8 block).*/
-static void oc_fdct8(const ogg_int16_t _x[8],ogg_int16_t *_y){
+static void oc_fdct8(ogg_int16_t *_y,const ogg_int16_t _x[8]){
   int t0;
   int t1;
   int t2;
@@ -54,17 +43,17 @@
   int v;
   /*Stage 1:*/
   /*0-7 butterfly.*/
-  t0=_x[0]+(int)_x[7];
-  t7=_x[0]-(int)_x[7];
+  t0=_x[0<<3]+(int)_x[7<<3];
+  t7=_x[0<<3]-(int)_x[7<<3];
   /*1-6 butterfly.*/
-  t1=_x[1]+(int)_x[6];
-  t6=_x[1]-(int)_x[6];
+  t1=_x[1<<3]+(int)_x[6<<3];
+  t6=_x[1<<3]-(int)_x[6<<3];
   /*2-5 butterfly.*/
-  t2=_x[2]+(int)_x[5];
-  t5=_x[2]-(int)_x[5];
+  t2=_x[2<<3]+(int)_x[5<<3];
+  t5=_x[2<<3]-(int)_x[5<<3];
   /*3-4 butterfly.*/
-  t3=_x[3]+(int)_x[4];
-  t4=_x[3]-(int)_x[4];
+  t3=_x[3<<3]+(int)_x[4<<3];
+  t4=_x[3<<3]-(int)_x[4<<3];
   /*Stage 2:*/
   /*0-3 butterfly.*/
   r=t0+t3;
@@ -111,35 +100,40 @@
   s=(27146*t1+0xB500>>16)+t1+(t1!=0);
   u=r+s>>1;
   v=r-u;
-  _y[0<<3]=u;
-  _y[4<<3]=v;
+  _y[0]=u;
+  _y[4]=v;
   /*3-2 rotation by 6pi/16*/
   u=(OC_C6S2*t2+OC_C2S6*t3+0x6CB7>>16)+(t3!=0);
   s=(OC_C6S2*u>>16)-t2;
   v=(s*21600+0x2800>>18)+s+(s!=0);
-  _y[2<<3]=u;
-  _y[6<<3]=v;
+  _y[2]=u;
+  _y[6]=v;
   /*6-5 rotation by 3pi/16*/
   u=(OC_C5S3*t6+OC_C3S5*t5+0x0E3D>>16)+(t5!=0);
   s=t6-(OC_C5S3*u>>16);
   v=(s*26568+0x3400>>17)+s+(s!=0);
-  _y[5<<3]=u;
-  _y[3<<3]=v;
+  _y[5]=u;
+  _y[3]=v;
   /*7-4 rotation by 7pi/16*/
   u=(OC_C7S1*t4+OC_C1S7*t7+0x7B1B>>16)+(t7!=0);
   s=(OC_C7S1*u>>16)-t4;
   v=(s*20539+0x3000>>20)+s+(s!=0);
-  _y[1<<3]=u;
-  _y[7<<3]=v;
+  _y[1]=u;
+  _y[7]=v;
 }
 
+void oc_enc_fdct8x8(const CP_INSTANCE *_cpi,ogg_int16_t _y[64],
+ const ogg_int16_t _x[64]){
+  (*_cpi->opt_vtable.fdct8x8)(_y,_x);
+}
+
 /*Performs a forward 8x8 Type-II DCT transform.
   The output is scaled by a factor of 4 relative to the orthonormal version
    of the transform.
   _y: The buffer to store the result in.
       This may be the same as _x.
   _x: The input coefficients. */
-static void oc_fdct8x8_c(const ogg_int16_t _x[64],ogg_int16_t _y[64]){
+void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
   const ogg_int16_t *in;
   ogg_int16_t       *end;
   ogg_int16_t       *out;
@@ -147,34 +141,18 @@
   int                i;
   /*Add two extra bits of working precision to improve accuracy; any more and
      we could overflow.*/
-  for(i=0;i<64;i++)w[i>>3|(i&7)<<3]=_x[i]<<2;
+  for(i=0;i<64;i++)w[i]=_x[i]<<2;
   /*These biases correct for some systematic error that remains in the full
      fDCT->iDCT round trip.*/
   w[0]+=(w[0]!=0)+1;
-  w[1]--;
-  w[8]++;
-  /*Transform rows of x into columns of w.*/
-  for(in=w,out=_y,end=out+8;out<end;in+=8,out++)oc_fdct8(in,out);
-  /*Transform rows of w into columns of y.*/
-  for(in=_y,out=w,end=out+8;out<end;in+=8,out++)oc_fdct8(in,out);
+  w[1]++;
+  w[8]--;
+  /*Transform columns of w into rows of _y.*/
+  for(in=w,out=_y,end=out+64;out<end;in++,out+=8)oc_fdct8(out,in);
+  /*Transform columns of _y into rows of w.*/
+  for(in=_y,out=w,end=out+64;out<end;in++,out+=8)oc_fdct8(out,in);
   /*Round the result back to the external working precision (which is still
      scaled by four relative to the orthogonal result).
     TODO: We should just update the external working precision.*/
-  for(i=0;i<64;i++)_y[i>>3|(i&7)<<3]=w[i]+2>>2;
+  for(i=0;i<64;i++)_y[i]=w[i]+2>>2;
 }
-
-
-void dsp_dct_init(DspFunctions *_funcs,ogg_uint32_t _cpu_flags){
-  _funcs->fdct_short=oc_fdct8x8_c;
-  dsp_dct_decode_init(_funcs,_cpu_flags);
-  dsp_idct_init(_funcs,_cpu_flags);
-#if defined(USE_ASM)
-  /*TODO: Need to write an MMX version.*/
-  if(_cpu_flags&OC_CPU_X86_MMX){
-    dsp_mmx_fdct_init(_funcs);
-  }
-  if(_cpu_flags&OC_CPU_X86_SSE2){
-    dsp_sse2_fdct_init(_funcs);
-  }
-#endif
-}

Modified: branches/theora-thusnelda/lib/enc/dct_decode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/dct_decode.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/dct_decode.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -148,8 +148,8 @@
 
     FiltVal = *(BoundingValuePtr+((FiltVal + 4) >> 3));
 
-    PixelPtr[1] = clamp255(PixelPtr[1] + FiltVal);
-    PixelPtr[2] = clamp255(PixelPtr[2] - FiltVal);
+    PixelPtr[1] = OC_CLAMP255(PixelPtr[1] + FiltVal);
+    PixelPtr[2] = OC_CLAMP255(PixelPtr[2] - FiltVal);
 
     PixelPtr += LineLength;
   }
@@ -170,14 +170,14 @@
 
     FiltVal = *(BoundingValuePtr+((FiltVal + 4) >> 3));
 
-    PixelPtr[LineLength] = clamp255(PixelPtr[LineLength] + FiltVal);
-    PixelPtr[2 * LineLength] = clamp255(PixelPtr[2*LineLength] - FiltVal);
+    PixelPtr[LineLength] = OC_CLAMP255(PixelPtr[LineLength] + FiltVal);
+    PixelPtr[2 * LineLength] = OC_CLAMP255(PixelPtr[2*LineLength] - FiltVal);
 
     PixelPtr ++;
   }
 }
 
-static void LoopFilter__c(CP_INSTANCE *cpi, int FLimit){
+void oc_enc_loop_filter_c(CP_INSTANCE *cpi, int FLimit){
 
   int j;
   ogg_int16_t BoundingValues[256];
@@ -198,18 +198,18 @@
       ogg_uint32_t *bp_left = bp;
       ogg_uint32_t *bp_right = bp + h;
       while(bp<bp_right){
-	if(cp[0]){
-	  if(bp>bp_left)
-	    loop_filter_h(&cpi->lastrecon[bp[0]],stride,bvp);
-	  if(bp_left>bp_begin)
-	    loop_filter_v(&cpi->lastrecon[bp[0]],stride,bvp);
-	  if(bp+1<bp_right && !cp[1])
-	    loop_filter_h(&cpi->lastrecon[bp[0]]+8,stride,bvp);
-	  if(bp+h<bp_end && !cp[h])
-	    loop_filter_v(&cpi->lastrecon[bp[h]],stride,bvp);
-	}
-	bp++;
-	cp++;
+        if(cp[0]){
+          if(bp>bp_left)
+            loop_filter_h(&cpi->lastrecon[bp[0]],stride,bvp);
+          if(bp_left>bp_begin)
+            loop_filter_v(&cpi->lastrecon[bp[0]],stride,bvp);
+          if(bp+1<bp_right && !cp[1])
+            loop_filter_h(&cpi->lastrecon[bp[0]]+8,stride,bvp);
+          if(bp+h<bp_end && !cp[h])
+            loop_filter_v(&cpi->lastrecon[bp[h]],stride,bvp);
+        }
+        bp++;
+        cp++;
       }
     }
   }
@@ -222,7 +222,7 @@
   cpi->lastrecon=cpi->recon;
   cpi->recon=temp;
   /* Apply a loop filter to edge pixels of updated blocks */
-  dsp_LoopFilter(cpi->dsp, cpi, cpi->quant_info.loop_filter_limits[cpi->BaseQ] /* temp */);
+  oc_enc_loop_filter(cpi,cpi->quant_info.loop_filter_limits[cpi->BaseQ]);
   /* We may need to update the UMV border */
   UpdateUMVBorder(cpi, cpi->lastrecon);
   /*Swap back.*/
@@ -230,13 +230,3 @@
   cpi->lastrecon=cpi->recon;
   cpi->recon=temp;
 }
-
-void dsp_dct_decode_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)
-{
-  funcs->LoopFilter = LoopFilter__c;
-#if defined(USE_ASM)
-  if (cpu_flags & OC_CPU_X86_MMX) {
-    dsp_mmx_dct_decode_init(funcs);
-  }
-#endif
-}

Modified: branches/theora-thusnelda/lib/enc/dct_encode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/dct_encode.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/dct_encode.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -18,7 +18,6 @@
 #include <stdlib.h>
 #include <string.h>
 #include "codec_internal.h"
-#include "dsp.h"
 #include "quant_lookup.h"
 
 static void make_eobrun_token(int run, int *token, int *eb){
@@ -227,7 +226,7 @@
 /* only counts bits */
 static int tokencost(CP_INSTANCE *cpi, int huff, int coeff, int token){
   huff += acoffset[coeff];
-  return cpi->HuffCodeLengthArray_VP3x[huff][token] + cpi->ExtraBitLengths_VP3x[token];
+  return cpi->huff_codes[huff][token].nbits+OC_DCT_TOKEN_EXTRA_BITS[token];
 }
 
 void tokenlog_rollback(CP_INSTANCE *cpi, token_checkpoint_t *stack,int n){
@@ -248,17 +247,17 @@
     /* DC */
     int i;
     for ( i = 0; i < DC_HUFF_CHOICES; i++)
-      cpi->dc_bits[chroma][i] += cpi->HuffCodeLengthArray_VP3x[i][token];
+      cpi->dc_bits[chroma][i] += cpi->huff_codes[i][token].nbits;
   }else if (coeff == 1){
     /* AC == 1*/
     int i,offset = acoffset[1]+AC_HUFF_OFFSET;
     for ( i = 0; i < AC_HUFF_CHOICES; i++)
-      cpi->ac1_bits[chroma][i] += cpi->HuffCodeLengthArray_VP3x[offset+i][token];
+      cpi->ac1_bits[chroma][i] += cpi->huff_codes[offset+i][token].nbits;
   }else{
     /* AC > 1*/
     int i,offset = acoffset[coeff]+AC_HUFF_OFFSET;
     for ( i = 0; i < AC_HUFF_CHOICES; i++)
-      cpi->acN_bits[chroma][i] += cpi->HuffCodeLengthArray_VP3x[offset+i][token];
+      cpi->acN_bits[chroma][i] += cpi->huff_codes[offset+i][token].nbits;
   }
 }
 

Deleted: branches/theora-thusnelda/lib/enc/dsp.c
===================================================================
--- branches/theora-thusnelda/lib/enc/dsp.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/dsp.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -1,213 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id$
-
- ********************************************************************/
-
-#include <stdlib.h>
-#include <string.h>
-#include "codec_internal.h"
-#include "../cpu.c"
-
-#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
-#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
-#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
-
-static void set8x8__c (unsigned char val, unsigned char *ptr,
-		       ogg_uint32_t PixelsPerLine){
-  /* For each block row */
-  memset(ptr,val,8);
-  ptr+=PixelsPerLine;
-  memset(ptr,val,8);
-  ptr+=PixelsPerLine;
-  memset(ptr,val,8);
-  ptr+=PixelsPerLine;
-  memset(ptr,val,8);
-  ptr+=PixelsPerLine;
-  memset(ptr,val,8);
-  ptr+=PixelsPerLine;
-  memset(ptr,val,8);
-  ptr+=PixelsPerLine;
-  memset(ptr,val,8);
-  ptr+=PixelsPerLine;
-  memset(ptr,val,8);
-}
-
-static void sub8x8__c (const unsigned char *FiltPtr, const unsigned char *ReconPtr,
-		       ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine){
-  int i;
-
-  /* For each block row */
-  for (i=8; i; i--) {
-    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], ReconPtr[0]);
-    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], ReconPtr[1]);
-    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], ReconPtr[2]);
-    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], ReconPtr[3]);
-    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], ReconPtr[4]);
-    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], ReconPtr[5]);
-    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], ReconPtr[6]);
-    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], ReconPtr[7]);
-
-    /* Start next row */
-    FiltPtr += PixelsPerLine;
-    ReconPtr += PixelsPerLine;
-    DctInputPtr += 8;
-  }
-}
-
-static void sub8x8_128__c (const unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
-			   ogg_uint32_t PixelsPerLine) {
-  int i;
-  /* For each block row */
-  for (i=8; i; i--) {
-    /* INTRA mode so code raw image data */
-    /* We convert the data to 8 bit signed (by subtracting 128) as
-       this reduces the internal precision requirments in the DCT
-       transform. */
-    DctInputPtr[0] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[0], 128);
-    DctInputPtr[1] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[1], 128);
-    DctInputPtr[2] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[2], 128);
-    DctInputPtr[3] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[3], 128);
-    DctInputPtr[4] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[4], 128);
-    DctInputPtr[5] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[5], 128);
-    DctInputPtr[6] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[6], 128);
-    DctInputPtr[7] = (ogg_int16_t) DSP_OP_DIFF (FiltPtr[7], 128);
-
-    /* Start next row */
-    FiltPtr += PixelsPerLine;
-    DctInputPtr += 8;
-  }
-}
-
-static ogg_uint32_t sad8x8__c (const unsigned char *ptr1, 
-			       const unsigned char *ptr2, 
-			       ogg_uint32_t stride)
-{
-  ogg_uint32_t  i;
-  ogg_uint32_t  sad = 0;
-
-  for (i=8; i; i--) {
-    sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
-    sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
-    sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]);
-    sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]);
-    sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]);
-    sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]);
-    sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]);
-    sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]);
-
-    /* Step to next row of block. */
-    ptr1 += stride;
-    ptr2 += stride;
-  }
-
-  return sad;
-}
-
-static ogg_uint32_t sad8x8_thres__c (const unsigned char *ptr1, 
-				     const unsigned char *ptr2, 
-				     ogg_uint32_t stride, 
-				     ogg_uint32_t thres)
-{
-  ogg_uint32_t  i;
-  ogg_uint32_t  sad = 0;
-
-  for (i=8; i; i--) {
-    sad += DSP_OP_ABS_DIFF(ptr1[0], ptr2[0]);
-    sad += DSP_OP_ABS_DIFF(ptr1[1], ptr2[1]);
-    sad += DSP_OP_ABS_DIFF(ptr1[2], ptr2[2]);
-    sad += DSP_OP_ABS_DIFF(ptr1[3], ptr2[3]);
-    sad += DSP_OP_ABS_DIFF(ptr1[4], ptr2[4]);
-    sad += DSP_OP_ABS_DIFF(ptr1[5], ptr2[5]);
-    sad += DSP_OP_ABS_DIFF(ptr1[6], ptr2[6]);
-    sad += DSP_OP_ABS_DIFF(ptr1[7], ptr2[7]);
-
-    if (sad > thres )
-      break;
-
-    /* Step to next row of block. */
-    ptr1 += stride;
-    ptr2 += stride;
-  }
-
-  return sad;
-}
-
-static ogg_uint32_t sad8x8_xy2_thres__c (const unsigned char *SrcData, 
-					 const unsigned char *RefDataPtr1,
-					 const unsigned char *RefDataPtr2, 
-					 ogg_uint32_t Stride,
-					 ogg_uint32_t thres)
-{
-  ogg_uint32_t  i;
-  ogg_uint32_t  sad = 0;
-
-  for (i=8; i; i--) {
-    sad += DSP_OP_ABS_DIFF(SrcData[0], DSP_OP_AVG (RefDataPtr1[0], RefDataPtr2[0]));
-    sad += DSP_OP_ABS_DIFF(SrcData[1], DSP_OP_AVG (RefDataPtr1[1], RefDataPtr2[1]));
-    sad += DSP_OP_ABS_DIFF(SrcData[2], DSP_OP_AVG (RefDataPtr1[2], RefDataPtr2[2]));
-    sad += DSP_OP_ABS_DIFF(SrcData[3], DSP_OP_AVG (RefDataPtr1[3], RefDataPtr2[3]));
-    sad += DSP_OP_ABS_DIFF(SrcData[4], DSP_OP_AVG (RefDataPtr1[4], RefDataPtr2[4]));
-    sad += DSP_OP_ABS_DIFF(SrcData[5], DSP_OP_AVG (RefDataPtr1[5], RefDataPtr2[5]));
-    sad += DSP_OP_ABS_DIFF(SrcData[6], DSP_OP_AVG (RefDataPtr1[6], RefDataPtr2[6]));
-    sad += DSP_OP_ABS_DIFF(SrcData[7], DSP_OP_AVG (RefDataPtr1[7], RefDataPtr2[7]));
-
-    if ( sad > thres )
-      break;
-
-    /* Step to next row of block. */
-    SrcData += Stride;
-    RefDataPtr1 += Stride;
-    RefDataPtr2 += Stride;
-  }
-
-  return sad;
-}
-
-static void nop (void) { /* NOP */ }
-
-void dsp_init(DspFunctions *funcs)
-{
-  funcs->save_fpu = nop;
-  funcs->restore_fpu = nop;
-  funcs->set8x8 = set8x8__c;
-  funcs->sub8x8 = sub8x8__c;
-  funcs->sub8x8_128 = sub8x8_128__c;
-  funcs->sad8x8 = sad8x8__c;
-  funcs->sad8x8_thres = sad8x8_thres__c;
-  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__c;
-}
-
-void dsp_static_init(DspFunctions *funcs)
-{
-  ogg_uint32_t cpuflags;
-
-  cpuflags = oc_cpu_flags_get ();
-  dsp_init (funcs);
-
-  dsp_recon_init (funcs, cpuflags);
-  dsp_dct_init (funcs, cpuflags);
-#if defined(USE_ASM)
-  if (cpuflags & OC_CPU_X86_MMX) {
-    dsp_mmx_init(funcs);
-  }
-# ifndef WIN32
-  /* This is implemented for win32 yet */
-  if (cpuflags & OC_CPU_X86_MMXEXT) {
-    dsp_mmxext_init(funcs);
-  }
-# endif
-#endif
-}
-

Deleted: branches/theora-thusnelda/lib/enc/dsp.h
===================================================================
--- branches/theora-thusnelda/lib/enc/dsp.h	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/dsp.h	2009-04-26 14:30:15 UTC (rev 15953)
@@ -1,135 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id$
-
- ********************************************************************/
-
-#ifndef DSP_H
-#define DSP_H
-
-typedef struct DspFunctions DspFunctions;
-
-#include "theora/theora.h"
-#include "codec_internal.h"
-#include "../cpu.h"
-
-struct DspFunctions{
-  void   (*save_fpu)              (void);
-  void   (*restore_fpu)           (void);
-
-  void   (*set8x8)                (unsigned char val, unsigned char *ptr,
-				   ogg_uint32_t stride);
-
-  void   (*sub8x8)                (const unsigned char *FiltPtr, const unsigned char *ReconPtr,
-				   ogg_int16_t *DctInputPtr, ogg_uint32_t stride);
-
-  void   (*sub8x8_128)            (const unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
-				   ogg_uint32_t stride);
-
-  void   (*copy8x8)               (const unsigned char *src, unsigned char *dest, 
-				   ogg_uint32_t stride);
-  
-  void   (*copy8x8_half)          (const unsigned char *src1, const unsigned char *src2, 
-				   unsigned char *dest, ogg_uint32_t stride);
-
-  void   (*recon8x8)              (unsigned char *ReconPtr, const ogg_int16_t *ChangePtr, 
-				   ogg_uint32_t stride);
-
-  void   (*fdct_short)            (const ogg_int16_t *InputData, ogg_int16_t *OutputData);
-
-  ogg_uint32_t (*sad8x8)          (const unsigned char *ptr1, const unsigned char *ptr2, 
-				   ogg_uint32_t stride);
-
-  ogg_uint32_t (*sad8x8_thres)    (const unsigned char *ptr1, const unsigned char *ptr2, 
-				   ogg_uint32_t stride, ogg_uint32_t thres);
-
-  ogg_uint32_t (*sad8x8_xy2_thres)(const unsigned char *SrcData, const unsigned char *RefDataPtr1,
-				   const unsigned char *RefDataPtr2, ogg_uint32_t stride,
-				   ogg_uint32_t thres);
-                 
-  void (*LoopFilter)              (CP_INSTANCE *cpi, int FLimit);
-
-  void (*FilterVert)              (unsigned char * PixelPtr,
-				   ogg_int32_t LineLength, ogg_int16_t *BoundingValuePtr);
-  
-  void (*IDctSlow)                (const ogg_int16_t *InputData, 
-				   const ogg_int16_t *QuantMatrix, 
-				   ogg_int16_t *OutputData);
-
-  void (*IDct3)                   (const ogg_int16_t *InputData, 
-				   const ogg_int16_t *QuantMatrix, 
-				   ogg_int16_t *OutputData);
-  
-  void (*IDct10)                  (const ogg_int16_t *InputData, 
-				   const ogg_int16_t *QuantMatrix, 
-				   ogg_int16_t *OutputData);
-};
-
-extern void dsp_dct_init(DspFunctions *funcs, ogg_uint32_t cpu_flags);
-extern void dsp_recon_init (DspFunctions *funcs, ogg_uint32_t cpu_flags);
-extern void dsp_dct_decode_init(DspFunctions *funcs, ogg_uint32_t cpu_flags);
-extern void dsp_idct_init(DspFunctions *funcs, ogg_uint32_t cpu_flags);
-
-void dsp_init(DspFunctions *funcs);
-void dsp_static_init(DspFunctions *funcs);
-#if defined(USE_ASM) && (defined(__i386__) || defined(__x86_64__) || defined(WIN32))
-extern void dsp_mmx_init(DspFunctions *funcs);
-extern void dsp_mmxext_init(DspFunctions *funcs);
-extern void dsp_mmx_fdct_init(DspFunctions *funcs);
-extern void dsp_mmx_recon_init(DspFunctions *funcs);
-extern void dsp_mmx_dct_decode_init(DspFunctions *funcs);
-extern void dsp_mmx_idct_init(DspFunctions *funcs);
-# if defined(__amd64__)||defined(__x86_64__)
-extern void dsp_sse2_fdct_init(DspFunctions *funcs);
-# endif
-#endif
-
-#define dsp_save_fpu(funcs) (funcs.save_fpu ())
-
-#define dsp_restore_fpu(funcs) (funcs.restore_fpu ())
-
-#define dsp_set8x8(funcs,a1,a2,a3) (funcs.set8x8 (a1,a2,a3))
-
-#define dsp_sub8x8(funcs,a1,a2,a3,a4) (funcs.sub8x8 (a1,a2,a3,a4))
-
-#define dsp_sub8x8_128(funcs,a1,a2,a3) (funcs.sub8x8_128 (a1,a2,a3))
-
-#define dsp_copy8x8(funcs,ptr1,ptr2,str1) (funcs.copy8x8 (ptr1,ptr2,str1))
-
-#define dsp_copy8x8_half(funcs,ptr1,ptr2,ptr3,str1) (funcs.copy8x8_half (ptr1,ptr2,ptr3,str1))
-
-#define dsp_recon8x8(funcs,ptr1,ptr2,str1) (funcs.recon8x8 (ptr1,ptr2,str1))
-
-#define dsp_fdct_short(funcs,in,out) (funcs.fdct_short (in,out))
-
-#define dsp_sad8x8(funcs,ptr1,ptr2,str) (funcs.sad8x8 (ptr1,ptr2,str))
-
-#define dsp_sad8x8_thres(funcs,ptr1,ptr2,str,t) (funcs.sad8x8_thres (ptr1,ptr2,str,t))
-
-#define dsp_sad8x8_xy2_thres(funcs,ptr1,ptr2,ptr3,str,t) \
-  (funcs.sad8x8_xy2_thres (ptr1,ptr2,ptr3,str,t))
-
-#define dsp_LoopFilter(funcs, ptr1, i) \
-  (funcs.LoopFilter(ptr1, i))
-
-#define dsp_IDctSlow(funcs, ptr1, ptr2, ptr3) \
-    (funcs.IDctSlow(ptr1, ptr2, ptr3))
-
-#define dsp_IDct3(funcs, ptr1, ptr2, ptr3) \
-    (funcs.IDctSlow(ptr1, ptr2, ptr3))
-
-#define dsp_IDct10(funcs, ptr1, ptr2, ptr3) \
-   (funcs.IDctSlow(ptr1, ptr2, ptr3))
-
-#endif /* DSP_H */

Modified: branches/theora-thusnelda/lib/enc/encapiwrapper.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encapiwrapper.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/encapiwrapper.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -8,818 +8,10 @@
 /*Wrapper to translate the new API into the old API.
   Eventually we need to convert the old functions to support the new API
    natively and do the translation the other way.
-  theora-exp already the necessary code to do so.*/
+  theora-exp already has the necessary code to do so.*/
 
 
 
-/*The default Huffman codes used for VP3.1.
-  It's kind of useless to include this, as TH_ENCCTL_SET_HUFFMAN_CODES is not
-   actually implemented in the old encoder, but it's part of the public API.*/
-const th_huff_code TH_VP31_HUFF_CODES[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]={
-  {
-    {0x002D, 6},{0x0026, 7},{0x0166, 9},{0x004E, 8},
-    {0x02CE,10},{0x059E,11},{0x027D,11},{0x0008, 5},
-    {0x04F9,12},{0x000F, 4},{0x000E, 4},{0x001B, 5},
-    {0x0006, 4},{0x0008, 4},{0x0005, 4},{0x001A, 5},
-    {0x0015, 5},{0x0007, 4},{0x000C, 4},{0x0001, 3},
-    {0x0000, 3},{0x0009, 4},{0x0017, 5},{0x0029, 6},
-    {0x0028, 6},{0x00B2, 8},{0x04F8,12},{0x059F,11},
-    {0x009E, 9},{0x013F,10},{0x0012, 6},{0x0058, 7}
-  },
-  {
-    {0x0010, 5},{0x0047, 7},{0x01FF, 9},{0x008C, 8},
-    {0x03FC,10},{0x046A,11},{0x0469,11},{0x0022, 6},
-    {0x11A1,13},{0x000E, 4},{0x000D, 4},{0x0004, 4},
-    {0x0005, 4},{0x0009, 4},{0x0006, 4},{0x001E, 5},
-    {0x0016, 5},{0x0007, 4},{0x000C, 4},{0x0001, 3},
-    {0x0000, 3},{0x000A, 4},{0x0017, 5},{0x007D, 7},
-    {0x007E, 7},{0x011B, 9},{0x08D1,12},{0x03FD,10},
-    {0x046B,11},{0x11A0,13},{0x007C, 7},{0x00FE, 8}
-  },
-  {
-    {0x0016, 5},{0x0020, 6},{0x0086, 8},{0x0087, 8},
-    {0x0367,10},{0x06CC,11},{0x06CB,11},{0x006E, 7},
-    {0x366D,14},{0x000F, 4},{0x000E, 4},{0x0004, 4},
-    {0x0005, 4},{0x000A, 4},{0x0006, 4},{0x001A, 5},
-    {0x0011, 5},{0x0007, 4},{0x000C, 4},{0x0001, 3},
-    {0x0000, 3},{0x0009, 4},{0x0017, 5},{0x006F, 7},
-    {0x006D, 7},{0x0364,10},{0x0D9A,12},{0x06CA,11},
-    {0x1B37,13},{0x366C,14},{0x0042, 7},{0x00D8, 8}
-  },
-  {
-    {0x0000, 4},{0x002D, 6},{0x00F7, 8},{0x0058, 7},
-    {0x0167, 9},{0x02CB,10},{0x02CA,10},{0x000E, 6},
-    {0x1661,13},{0x0003, 3},{0x0002, 3},{0x0008, 4},
-    {0x0009, 4},{0x000D, 4},{0x0002, 4},{0x001F, 5},
-    {0x0017, 5},{0x0001, 4},{0x000C, 4},{0x000E, 4},
-    {0x000A, 4},{0x0006, 5},{0x0078, 7},{0x000F, 6},
-    {0x007A, 7},{0x0164, 9},{0x0599,11},{0x02CD,10},
-    {0x0B31,12},{0x1660,13},{0x0079, 7},{0x00F6, 8}
-  },
-  {
-    {0x0003, 4},{0x003C, 6},{0x000F, 7},{0x007A, 7},
-    {0x001D, 8},{0x0020, 9},{0x0072,10},{0x0006, 6},
-    {0x0399,13},{0x0004, 3},{0x0005, 3},{0x0005, 4},
-    {0x0006, 4},{0x000E, 4},{0x0004, 4},{0x0000, 4},
-    {0x0019, 5},{0x0002, 4},{0x000D, 4},{0x0007, 4},
-    {0x001F, 5},{0x0030, 6},{0x0011, 8},{0x0031, 6},
-    {0x0005, 6},{0x0021, 9},{0x00E7,11},{0x0038, 9},
-    {0x01CD,12},{0x0398,13},{0x007B, 7},{0x0009, 7}
-  },
-  {
-    {0x0009, 4},{0x0002, 5},{0x0074, 7},{0x0007, 6},
-    {0x00EC, 8},{0x00D1, 9},{0x01A6,10},{0x0006, 6},
-    {0x0D21,13},{0x0005, 3},{0x0006, 3},{0x0008, 4},
-    {0x0007, 4},{0x000F, 4},{0x0004, 4},{0x0000, 4},
-    {0x001C, 5},{0x0002, 4},{0x0005, 4},{0x0003, 4},
-    {0x000C, 5},{0x0035, 7},{0x01A7,10},{0x001B, 6},
-    {0x0077, 7},{0x01A5,10},{0x0349,11},{0x00D0, 9},
-    {0x0691,12},{0x0D20,13},{0x0075, 7},{0x00ED, 8}
-  },
-  {
-    {0x000A, 4},{0x000C, 5},{0x0012, 6},{0x001B, 6},
-    {0x00B7, 8},{0x016C, 9},{0x0099, 9},{0x005A, 7},
-    {0x16D8,13},{0x0007, 3},{0x0006, 3},{0x0009, 4},
-    {0x0008, 4},{0x0000, 3},{0x0005, 4},{0x0017, 5},
-    {0x000E, 5},{0x0002, 4},{0x0003, 4},{0x000F, 5},
-    {0x001A, 6},{0x004D, 8},{0x2DB3,14},{0x002C, 6},
-    {0x0011, 6},{0x02DA,10},{0x05B7,11},{0x0098, 9},
-    {0x0B6D,12},{0x2DB2,14},{0x0010, 6},{0x0027, 7}
-  },
-  {
-    {0x000D, 4},{0x000F, 5},{0x001D, 6},{0x0008, 5},
-    {0x0051, 7},{0x0056, 8},{0x00AF, 9},{0x002A, 7},
-    {0x148A,13},{0x0007, 3},{0x0000, 2},{0x0008, 4},
-    {0x0009, 4},{0x000C, 4},{0x0006, 4},{0x0017, 5},
-    {0x000B, 5},{0x0016, 5},{0x0015, 5},{0x0009, 5},
-    {0x0050, 7},{0x00AE, 9},{0x2917,14},{0x001C, 6},
-    {0x0014, 6},{0x0290,10},{0x0523,11},{0x0149, 9},
-    {0x0A44,12},{0x2916,14},{0x0053, 7},{0x00A5, 8}
-  },
-  {
-    {0x0001, 4},{0x001D, 6},{0x00F5, 8},{0x00F4, 8},
-    {0x024D,10},{0x0499,11},{0x0498,11},{0x0001, 5},
-    {0x0021, 6},{0x0006, 3},{0x0005, 3},{0x0006, 4},
-    {0x0005, 4},{0x0002, 4},{0x0007, 5},{0x0025, 6},
-    {0x007B, 7},{0x001C, 6},{0x0020, 6},{0x000D, 6},
-    {0x0048, 7},{0x0092, 8},{0x0127, 9},{0x000E, 4},
-    {0x0004, 4},{0x0011, 5},{0x000C, 6},{0x003C, 6},
-    {0x000F, 5},{0x0000, 5},{0x001F, 5},{0x0013, 5}
-  },
-  {
-    {0x0005, 4},{0x003C, 6},{0x0040, 7},{0x000D, 7},
-    {0x0031, 9},{0x0061,10},{0x0060,10},{0x0002, 5},
-    {0x00F5, 8},{0x0006, 3},{0x0005, 3},{0x0007, 4},
-    {0x0006, 4},{0x0002, 4},{0x0009, 5},{0x0025, 6},
-    {0x0007, 6},{0x0021, 6},{0x0024, 6},{0x0010, 6},
-    {0x0041, 7},{0x00F4, 8},{0x0019, 8},{0x000E, 4},
-    {0x0003, 4},{0x0011, 5},{0x0011, 6},{0x003F, 6},
-    {0x003E, 6},{0x007B, 7},{0x0000, 4},{0x0013, 5}
-  },
-  {
-    {0x000A, 4},{0x0007, 5},{0x0001, 6},{0x0009, 6},
-    {0x0131, 9},{0x0261,10},{0x0260,10},{0x0015, 6},
-    {0x0001, 7},{0x0007, 3},{0x0006, 3},{0x0008, 4},
-    {0x0007, 4},{0x0006, 4},{0x0012, 5},{0x002F, 6},
-    {0x0014, 6},{0x0027, 6},{0x002D, 6},{0x0016, 6},
-    {0x004D, 7},{0x0099, 8},{0x0000, 7},{0x0004, 4},
-    {0x0001, 4},{0x0005, 5},{0x0017, 6},{0x002E, 6},
-    {0x002C, 6},{0x0008, 6},{0x0006, 5},{0x0001, 5}
-  },
-  {
-    {0x0000, 3},{0x000E, 5},{0x0017, 6},{0x002A, 6},
-    {0x0010, 7},{0x00F9,10},{0x00F8,10},{0x001E, 7},
-    {0x003F, 8},{0x0007, 3},{0x0006, 3},{0x0009, 4},
-    {0x0008, 4},{0x0006, 4},{0x000F, 5},{0x0005, 5},
-    {0x0016, 6},{0x0029, 6},{0x002B, 6},{0x0015, 6},
-    {0x0050, 7},{0x0011, 7},{0x007D, 9},{0x0004, 4},
-    {0x0017, 5},{0x0006, 5},{0x0014, 6},{0x002C, 6},
-    {0x002D, 6},{0x000E, 6},{0x0009, 6},{0x0051, 7}
-  },
-  {
-    {0x0002, 3},{0x0018, 5},{0x002F, 6},{0x000D, 5},
-    {0x0053, 7},{0x0295,10},{0x0294,10},{0x00A4, 8},
-    {0x007C, 8},{0x0000, 2},{0x0007, 3},{0x0009, 4},
-    {0x0008, 4},{0x001B, 5},{0x000C, 5},{0x0028, 6},
-    {0x006A, 7},{0x001E, 6},{0x001D, 6},{0x0069, 7},
-    {0x00D7, 8},{0x007D, 8},{0x014B, 9},{0x0019, 5},
-    {0x0016, 5},{0x002E, 6},{0x001C, 6},{0x002B, 6},
-    {0x002A, 6},{0x0068, 7},{0x003F, 7},{0x00D6, 8}
-  },
-  {
-    {0x0002, 3},{0x001B, 5},{0x000C, 5},{0x0018, 5},
-    {0x0029, 6},{0x007F, 8},{0x02F0,10},{0x0198, 9},
-    {0x0179, 9},{0x0000, 2},{0x0007, 3},{0x0009, 4},
-    {0x0008, 4},{0x001A, 5},{0x000D, 5},{0x002A, 6},
-    {0x0064, 7},{0x001E, 6},{0x0067, 7},{0x005F, 7},
-    {0x00CD, 8},{0x007E, 8},{0x02F1,10},{0x0016, 5},
-    {0x000E, 5},{0x002E, 6},{0x0065, 7},{0x002B, 6},
-    {0x0028, 6},{0x003E, 7},{0x00BD, 8},{0x0199, 9}
-  },
-  {
-    {0x0002, 3},{0x0007, 4},{0x0016, 5},{0x0006, 4},
-    {0x0036, 6},{0x005C, 7},{0x015D, 9},{0x015C, 9},
-    {0x02BF,10},{0x0000, 2},{0x0007, 3},{0x0009, 4},
-    {0x0008, 4},{0x0018, 5},{0x0034, 6},{0x002A, 6},
-    {0x005E, 7},{0x006A, 7},{0x0064, 7},{0x005D, 7},
-    {0x00CB, 8},{0x00AD, 8},{0x02BE,10},{0x0014, 5},
-    {0x0033, 6},{0x006E, 7},{0x005F, 7},{0x006F, 7},
-    {0x006B, 7},{0x00CA, 8},{0x00AC, 8},{0x015E, 9}
-  },
-  {
-    {0x000F, 4},{0x001D, 5},{0x0018, 5},{0x000B, 4},
-    {0x0019, 5},{0x0029, 6},{0x00D6, 8},{0x0551,11},
-    {0x0AA1,12},{0x0001, 2},{0x0000, 2},{0x0009, 4},
-    {0x0008, 4},{0x001B, 5},{0x0038, 6},{0x0028, 6},
-    {0x0057, 7},{0x006A, 7},{0x0068, 7},{0x0056, 7},
-    {0x00E5, 8},{0x0155, 9},{0x0AA0,12},{0x0073, 7},
-    {0x0069, 7},{0x00D7, 8},{0x00AB, 8},{0x00E4, 8},
-    {0x00A9, 8},{0x0151, 9},{0x0150, 9},{0x02A9,10}
-  },
-  {
-    {0x0008, 5},{0x0025, 7},{0x017A, 9},{0x02F7,10},
-    {0x0BDB,12},{0x17B4,13},{0x2F6B,14},{0x001D, 5},
-    {0x2F6A,14},{0x0008, 4},{0x0007, 4},{0x0001, 4},
-    {0x0002, 4},{0x000A, 4},{0x0006, 4},{0x0000, 4},
-    {0x001C, 5},{0x0009, 4},{0x000D, 4},{0x000F, 4},
-    {0x000C, 4},{0x0003, 4},{0x000A, 5},{0x0016, 5},
-    {0x0013, 6},{0x005D, 7},{0x0024, 7},{0x00BC, 8},
-    {0x005C, 7},{0x05EC,11},{0x000B, 5},{0x005F, 7}
-  },
-  {
-    {0x000F, 5},{0x0010, 6},{0x004B, 8},{0x00C6, 8},
-    {0x031D,10},{0x0C71,12},{0x0C70,12},{0x0001, 4},
-    {0x0C73,12},{0x0008, 4},{0x0009, 4},{0x0002, 4},
-    {0x0003, 4},{0x000B, 4},{0x0006, 4},{0x0000, 4},
-    {0x001C, 5},{0x0005, 4},{0x000D, 4},{0x000F, 4},
-    {0x000A, 4},{0x0019, 5},{0x0013, 6},{0x001D, 5},
-    {0x0030, 6},{0x0062, 7},{0x0024, 7},{0x004A, 8},
-    {0x018F, 9},{0x0C72,12},{0x000E, 5},{0x0011, 6}
-  },
-  {
-    {0x001B, 5},{0x0003, 6},{0x008D, 8},{0x0040, 7},
-    {0x0239,10},{0x0471,11},{0x08E0,12},{0x0003, 4},
-    {0x11C3,13},{0x000A, 4},{0x0009, 4},{0x0004, 4},
-    {0x0005, 4},{0x000E, 4},{0x0007, 4},{0x0001, 4},
-    {0x001E, 5},{0x0006, 4},{0x000C, 4},{0x000B, 4},
-    {0x0002, 4},{0x0000, 5},{0x0041, 7},{0x001F, 5},
-    {0x0022, 6},{0x0002, 6},{0x008F, 8},{0x008C, 8},
-    {0x011D, 9},{0x11C2,13},{0x001A, 5},{0x0021, 6}
-  },
-  {
-    {0x001F, 5},{0x0003, 6},{0x0003, 7},{0x0043, 7},
-    {0x000B, 9},{0x0015,10},{0x0051,12},{0x0003, 4},
-    {0x0050,12},{0x000D, 4},{0x000C, 4},{0x0004, 4},
-    {0x0006, 4},{0x000E, 4},{0x000A, 4},{0x0001, 4},
-    {0x001E, 5},{0x0005, 4},{0x0009, 4},{0x0007, 4},
-    {0x0011, 5},{0x0002, 6},{0x0004, 8},{0x0002, 4},
-    {0x002D, 6},{0x0020, 6},{0x0042, 7},{0x0001, 7},
-    {0x0000, 7},{0x0029,11},{0x0017, 5},{0x002C, 6}
-  },
-  {
-    {0x0003, 4},{0x001F, 6},{0x003A, 7},{0x005D, 7},
-    {0x0173, 9},{0x02E4,10},{0x172D,13},{0x0004, 4},
-    {0x172C,13},{0x000F, 4},{0x000E, 4},{0x0009, 4},
-    {0x0008, 4},{0x000C, 4},{0x000A, 4},{0x0001, 4},
-    {0x0016, 5},{0x0002, 4},{0x0005, 4},{0x001A, 5},
-    {0x002F, 6},{0x0038, 7},{0x05CA,11},{0x0006, 4},
-    {0x0037, 6},{0x001E, 6},{0x003B, 7},{0x0039, 7},
-    {0x00B8, 8},{0x0B97,12},{0x0000, 4},{0x0036, 6}
-  },
-  {
-    {0x0006, 4},{0x0037, 6},{0x005D, 7},{0x000C, 6},
-    {0x00B9, 8},{0x02E3,10},{0x05C4,11},{0x0004, 4},
-    {0x1715,13},{0x0000, 3},{0x000F, 4},{0x0008, 4},
-    {0x0007, 4},{0x000C, 4},{0x0009, 4},{0x001D, 5},
-    {0x0016, 5},{0x001C, 5},{0x001A, 5},{0x000B, 5},
-    {0x005E, 7},{0x0170, 9},{0x1714,13},{0x000A, 4},
-    {0x000A, 5},{0x0036, 6},{0x005F, 7},{0x001B, 7},
-    {0x001A, 7},{0x0B8B,12},{0x0002, 4},{0x0007, 5}
-  },
-  {
-    {0x000C, 4},{0x000B, 5},{0x0079, 7},{0x0022, 6},
-    {0x00F0, 8},{0x0119, 9},{0x0230,10},{0x001D, 5},
-    {0x08C4,12},{0x0001, 3},{0x0000, 3},{0x000A, 4},
-    {0x0009, 4},{0x000B, 4},{0x0007, 4},{0x001C, 5},
-    {0x003D, 6},{0x000D, 5},{0x0008, 5},{0x0015, 6},
-    {0x008D, 8},{0x118B,13},{0x118A,13},{0x000D, 4},
-    {0x0010, 5},{0x0009, 5},{0x0014, 6},{0x0047, 7},
-    {0x00F1, 8},{0x0463,11},{0x001F, 5},{0x000C, 5}
-  },
-  {
-    {0x0000, 3},{0x001A, 5},{0x0033, 6},{0x000C, 5},
-    {0x0046, 7},{0x01E3, 9},{0x03C5,10},{0x0017, 5},
-    {0x1E21,13},{0x0002, 3},{0x0001, 3},{0x0009, 4},
-    {0x000A, 4},{0x0007, 4},{0x001B, 5},{0x003D, 6},
-    {0x001B, 6},{0x0022, 6},{0x0079, 7},{0x00F0, 8},
-    {0x1E20,13},{0x1E23,13},{0x1E22,13},{0x000E, 4},
-    {0x0016, 5},{0x0018, 5},{0x0032, 6},{0x001A, 6},
-    {0x0047, 7},{0x0789,11},{0x001F, 5},{0x0010, 5}
-  },
-  {
-    {0x001D, 5},{0x0061, 7},{0x004E, 8},{0x009E, 9},
-    {0x027C,11},{0x09F5,13},{0x09F4,13},{0x0003, 4},
-    {0x0060, 7},{0x0000, 3},{0x000F, 4},{0x000B, 4},
-    {0x000A, 4},{0x0009, 4},{0x0005, 4},{0x000D, 5},
-    {0x0031, 6},{0x0008, 5},{0x0038, 6},{0x0012, 6},
-    {0x0026, 7},{0x013F,10},{0x04FB,12},{0x000D, 4},
-    {0x0002, 4},{0x000C, 5},{0x0039, 6},{0x001C, 6},
-    {0x000F, 5},{0x001D, 6},{0x0008, 4},{0x0019, 5}
-  },
-  {
-    {0x0007, 4},{0x0019, 6},{0x00AB, 8},{0x00AA, 8},
-    {0x0119,10},{0x0461,12},{0x0460,12},{0x001B, 5},
-    {0x0047, 8},{0x0001, 3},{0x0000, 3},{0x000C, 4},
-    {0x000B, 4},{0x0009, 4},{0x0005, 4},{0x000D, 5},
-    {0x0035, 6},{0x003D, 6},{0x003C, 6},{0x0018, 6},
-    {0x0022, 7},{0x008D, 9},{0x0231,11},{0x000E, 4},
-    {0x001F, 5},{0x0009, 5},{0x002B, 6},{0x0010, 6},
-    {0x0034, 6},{0x0054, 7},{0x0008, 4},{0x0014, 5}
-  },
-  {
-    {0x000C, 4},{0x0005, 5},{0x0008, 6},{0x005B, 7},
-    {0x004D, 9},{0x0131,11},{0x0261,12},{0x001A, 5},
-    {0x0012, 7},{0x0000, 3},{0x000F, 4},{0x000A, 4},
-    {0x0009, 4},{0x0006, 4},{0x001B, 5},{0x0006, 5},
-    {0x001C, 6},{0x002C, 6},{0x0015, 6},{0x005A, 7},
-    {0x0027, 8},{0x0099,10},{0x0260,12},{0x000E, 4},
-    {0x0004, 4},{0x000F, 5},{0x0007, 5},{0x001D, 6},
-    {0x000B, 5},{0x0014, 6},{0x0008, 4},{0x0017, 5}
-  },
-  {
-    {0x000F, 4},{0x0013, 5},{0x0075, 7},{0x0024, 6},
-    {0x0095, 8},{0x0251,10},{0x04A0,11},{0x0010, 5},
-    {0x00C8, 8},{0x0002, 3},{0x0001, 3},{0x0001, 4},
-    {0x0000, 4},{0x001A, 5},{0x0011, 5},{0x002C, 6},
-    {0x0065, 7},{0x0074, 7},{0x004B, 7},{0x00C9, 8},
-    {0x0129, 9},{0x0943,12},{0x0942,12},{0x0003, 3},
-    {0x000A, 4},{0x001C, 5},{0x0018, 5},{0x0033, 6},
-    {0x0017, 5},{0x002D, 6},{0x001B, 5},{0x003B, 6}
-  },
-  {
-    {0x0003, 3},{0x001A, 5},{0x002D, 6},{0x0038, 6},
-    {0x0028, 7},{0x0395,10},{0x0E51,12},{0x0037, 6},
-    {0x00E4, 8},{0x0001, 3},{0x0000, 3},{0x001F, 5},
-    {0x001E, 5},{0x0017, 5},{0x003A, 6},{0x0073, 7},
-    {0x002A, 7},{0x002B, 7},{0x0029, 7},{0x01CB, 9},
-    {0x0729,11},{0x1CA1,13},{0x1CA0,13},{0x0004, 3},
-    {0x000A, 4},{0x0004, 4},{0x0018, 5},{0x0036, 6},
-    {0x000B, 5},{0x002C, 6},{0x0019, 5},{0x003B, 6}
-  },
-  {
-    {0x0004, 3},{0x0004, 4},{0x003F, 6},{0x0017, 5},
-    {0x0075, 7},{0x01F5, 9},{0x07D1,11},{0x0017, 6},
-    {0x01F6, 9},{0x0001, 3},{0x0000, 3},{0x001B, 5},
-    {0x001A, 5},{0x000A, 5},{0x0032, 6},{0x0074, 7},
-    {0x00F8, 8},{0x00F9, 8},{0x01F7, 9},{0x03E9,10},
-    {0x0FA0,12},{0x1F43,13},{0x1F42,13},{0x0003, 3},
-    {0x000A, 4},{0x001E, 5},{0x001C, 5},{0x003B, 6},
-    {0x0018, 5},{0x0016, 6},{0x0016, 5},{0x0033, 6}
-  },
-  {
-    {0x0004, 3},{0x0007, 4},{0x0018, 5},{0x001E, 5},
-    {0x0036, 6},{0x0031, 7},{0x0177, 9},{0x0077, 7},
-    {0x0176, 9},{0x0001, 3},{0x0000, 3},{0x001A, 5},
-    {0x0019, 5},{0x003A, 6},{0x0019, 6},{0x005C, 7},
-    {0x00BA, 8},{0x0061, 8},{0x00C1, 9},{0x0180,10},
-    {0x0302,11},{0x0607,12},{0x0606,12},{0x0002, 3},
-    {0x000A, 4},{0x001F, 5},{0x001C, 5},{0x0037, 6},
-    {0x0016, 5},{0x0076, 7},{0x000D, 5},{0x002F, 6}
-  },
-  {
-    {0x0000, 3},{0x000A, 4},{0x001A, 5},{0x000C, 4},
-    {0x001D, 5},{0x0039, 6},{0x0078, 7},{0x005E, 7},
-    {0x0393,11},{0x0002, 3},{0x0001, 3},{0x0016, 5},
-    {0x000F, 5},{0x002E, 6},{0x005F, 7},{0x0073, 8},
-    {0x00E5, 9},{0x01C8,10},{0x0E4A,13},{0x1C97,14},
-    {0x1C96,14},{0x0E49,13},{0x0E48,13},{0x0004, 3},
-    {0x0006, 4},{0x001F, 5},{0x001B, 5},{0x001D, 6},
-    {0x0038, 6},{0x0038, 7},{0x003D, 6},{0x0079, 7}
-  },
-  {
-    {0x000B, 5},{0x002B, 7},{0x0054, 8},{0x01B7, 9},
-    {0x06D9,11},{0x0DB1,12},{0x0DB0,12},{0x0002, 4},
-    {0x00AB, 9},{0x0009, 4},{0x000A, 4},{0x0007, 4},
-    {0x0008, 4},{0x000F, 4},{0x000C, 4},{0x0003, 4},
-    {0x001D, 5},{0x0004, 4},{0x000B, 4},{0x0006, 4},
-    {0x001A, 5},{0x0003, 6},{0x00AA, 9},{0x0001, 4},
-    {0x0000, 5},{0x0014, 6},{0x006C, 7},{0x00DA, 8},
-    {0x0002, 6},{0x036D,10},{0x001C, 5},{0x0037, 6}
-  },
-  {
-    {0x001D, 5},{0x0004, 6},{0x00B6, 8},{0x006A, 8},
-    {0x05B9,11},{0x16E1,13},{0x16E0,13},{0x0007, 4},
-    {0x016F, 9},{0x000C, 4},{0x000D, 4},{0x0009, 4},
-    {0x0008, 4},{0x000F, 4},{0x000A, 4},{0x0003, 4},
-    {0x0017, 5},{0x0002, 4},{0x0004, 4},{0x001C, 5},
-    {0x002C, 6},{0x006B, 8},{0x0B71,12},{0x0005, 4},
-    {0x0003, 5},{0x001B, 6},{0x005A, 7},{0x0034, 7},
-    {0x0005, 6},{0x02DD,10},{0x0000, 4},{0x000C, 5}
-  },
-  {
-    {0x0003, 4},{0x007F, 7},{0x00A1, 8},{0x00A0, 8},
-    {0x020C,10},{0x0834,12},{0x106B,13},{0x0007, 4},
-    {0x0082, 8},{0x000E, 4},{0x000D, 4},{0x000B, 4},
-    {0x000C, 4},{0x0000, 3},{0x0009, 4},{0x0002, 4},
-    {0x0011, 5},{0x001E, 5},{0x0015, 5},{0x003E, 6},
-    {0x0040, 7},{0x041B,11},{0x106A,13},{0x0006, 4},
-    {0x000A, 5},{0x0029, 6},{0x007E, 7},{0x0051, 7},
-    {0x0021, 6},{0x0107, 9},{0x0004, 4},{0x000B, 5}
-  },
-  {
-    {0x0007, 4},{0x001B, 6},{0x00F6, 8},{0x00E9, 8},
-    {0x03A1,10},{0x0740,11},{0x0E82,12},{0x001F, 5},
-    {0x01EF, 9},{0x0001, 3},{0x0002, 3},{0x000B, 4},
-    {0x000C, 4},{0x000D, 4},{0x0008, 4},{0x001C, 5},
-    {0x0003, 5},{0x0012, 5},{0x0002, 5},{0x0075, 7},
-    {0x01D1, 9},{0x1D07,13},{0x1D06,13},{0x000A, 4},
-    {0x0013, 5},{0x003B, 6},{0x001A, 6},{0x007A, 7},
-    {0x003C, 6},{0x01EE, 9},{0x0000, 4},{0x000C, 5}
-  },
-  {
-    {0x000D, 4},{0x003D, 6},{0x0042, 7},{0x0037, 7},
-    {0x00D9, 9},{0x0362,11},{0x06C6,12},{0x001F, 5},
-    {0x0086, 8},{0x0001, 3},{0x0002, 3},{0x000C, 4},
-    {0x000B, 4},{0x000A, 4},{0x0001, 4},{0x000F, 5},
-    {0x0025, 6},{0x003C, 6},{0x001A, 6},{0x0087, 8},
-    {0x01B0,10},{0x0D8F,13},{0x0D8E,13},{0x000E, 4},
-    {0x0013, 5},{0x000C, 5},{0x0024, 6},{0x0020, 6},
-    {0x0011, 5},{0x006D, 8},{0x0000, 4},{0x000E, 5}
-  },
-  {
-    {0x0000, 3},{0x0012, 5},{0x0076, 7},{0x0077, 7},
-    {0x014D, 9},{0x0533,11},{0x14C9,13},{0x0013, 5},
-    {0x00A5, 8},{0x0002, 3},{0x0003, 3},{0x000B, 4},
-    {0x000C, 4},{0x0008, 4},{0x001A, 5},{0x002B, 6},
-    {0x0075, 7},{0x0074, 7},{0x00A7, 8},{0x0298,10},
-    {0x14C8,13},{0x14CB,13},{0x14CA,13},{0x000F, 4},
-    {0x001C, 5},{0x0007, 5},{0x002A, 6},{0x0028, 6},
-    {0x001B, 5},{0x00A4, 8},{0x0002, 4},{0x0006, 5}
-  },
-  {
-    {0x0002, 3},{0x001A, 5},{0x002B, 6},{0x003A, 6},
-    {0x00ED, 8},{0x0283,10},{0x0A0A,12},{0x0004, 5},
-    {0x00A1, 8},{0x0004, 3},{0x0003, 3},{0x000B, 4},
-    {0x000C, 4},{0x001F, 5},{0x0006, 5},{0x0077, 7},
-    {0x00A3, 8},{0x00A2, 8},{0x0140, 9},{0x1417,13},
-    {0x1416,13},{0x0A09,12},{0x0A08,12},{0x0000, 3},
-    {0x001E, 5},{0x0007, 5},{0x002A, 6},{0x0029, 6},
-    {0x001C, 5},{0x00EC, 8},{0x001B, 5},{0x0005, 5}
-  },
-  {
-    {0x0002, 3},{0x0002, 4},{0x0018, 5},{0x001D, 5},
-    {0x0035, 6},{0x00E4, 8},{0x01CF,11},{0x001D, 7},
-    {0x0072, 9},{0x0004, 3},{0x0005, 3},{0x0006, 4},
-    {0x0007, 4},{0x0006, 5},{0x0073, 7},{0x0038, 8},
-    {0x01CE,11},{0x039B,12},{0x0398,12},{0x0733,13},
-    {0x0732,13},{0x0735,13},{0x0734,13},{0x0000, 3},
-    {0x001F, 5},{0x001B, 5},{0x0034, 6},{0x000F, 6},
-    {0x001E, 5},{0x00E5, 8},{0x0019, 5},{0x0038, 6}
-  },
-  {
-    {0x0016, 5},{0x0050, 7},{0x0172, 9},{0x02E7,10},
-    {0x1732,13},{0x2E67,14},{0x2E66,14},{0x0006, 4},
-    {0x0051, 7},{0x0001, 3},{0x0000, 3},{0x000D, 4},
-    {0x000C, 4},{0x0009, 4},{0x001C, 5},{0x0009, 5},
-    {0x001C, 6},{0x001D, 6},{0x005D, 7},{0x00B8, 8},
-    {0x05CD,11},{0x1731,13},{0x1730,13},{0x000F, 4},
-    {0x0005, 4},{0x000F, 5},{0x0008, 5},{0x0029, 6},
-    {0x001D, 5},{0x002F, 6},{0x0008, 4},{0x0015, 5}
-  },
-  {
-    {0x0009, 4},{0x0021, 6},{0x0040, 7},{0x00AD, 8},
-    {0x02B0,10},{0x1589,13},{0x1588,13},{0x001C, 5},
-    {0x005F, 7},{0x0000, 3},{0x000F, 4},{0x000D, 4},
-    {0x000C, 4},{0x0006, 4},{0x0011, 5},{0x002A, 6},
-    {0x0057, 7},{0x005E, 7},{0x0041, 7},{0x0159, 9},
-    {0x0563,11},{0x158B,13},{0x158A,13},{0x0001, 3},
-    {0x0005, 4},{0x0014, 5},{0x003B, 6},{0x002E, 6},
-    {0x0004, 4},{0x003A, 6},{0x0007, 4},{0x0016, 5}
-  },
-  {
-    {0x000E, 4},{0x0007, 5},{0x0046, 7},{0x0045, 7},
-    {0x0064, 9},{0x032A,12},{0x0657,13},{0x0018, 5},
-    {0x000D, 6},{0x0000, 3},{0x000F, 4},{0x000A, 4},
-    {0x000B, 4},{0x001A, 5},{0x0036, 6},{0x0047, 7},
-    {0x0044, 7},{0x0018, 7},{0x0033, 8},{0x00CB,10},
-    {0x0656,13},{0x0329,12},{0x0328,12},{0x0002, 3},
-    {0x0006, 4},{0x0019, 5},{0x000E, 5},{0x0037, 6},
-    {0x0009, 4},{0x000F, 5},{0x0002, 4},{0x0010, 5}
-  },
-  {
-    {0x0003, 3},{0x0018, 5},{0x0023, 6},{0x0077, 7},
-    {0x0194, 9},{0x1956,13},{0x32AF,14},{0x003A, 6},
-    {0x0076, 7},{0x0002, 3},{0x0001, 3},{0x001F, 5},
-    {0x001E, 5},{0x0014, 5},{0x0022, 6},{0x0064, 7},
-    {0x0197, 9},{0x0196, 9},{0x032B,10},{0x0654,11},
-    {0x32AE,14},{0x1955,13},{0x1954,13},{0x0000, 3},
-    {0x0009, 4},{0x001C, 5},{0x0015, 5},{0x0010, 5},
-    {0x000D, 4},{0x0017, 5},{0x0016, 5},{0x0033, 6}
-  },
-  {
-    {0x0005, 3},{0x0006, 4},{0x003E, 6},{0x0010, 5},
-    {0x0048, 7},{0x093F,12},{0x24FA,14},{0x0032, 6},
-    {0x0067, 7},{0x0002, 3},{0x0001, 3},{0x001B, 5},
-    {0x001E, 5},{0x0034, 6},{0x0066, 7},{0x0092, 8},
-    {0x0126, 9},{0x024E,10},{0x049E,11},{0x49F7,15},
-    {0x49F6,15},{0x24F9,14},{0x24F8,14},{0x0000, 3},
-    {0x0007, 4},{0x0018, 5},{0x0011, 5},{0x003F, 6},
-    {0x000E, 4},{0x0013, 5},{0x0035, 6},{0x0025, 6}
-  },
-  {
-    {0x0005, 3},{0x0008, 4},{0x0012, 5},{0x001C, 5},
-    {0x001C, 6},{0x00EA, 9},{0x1D75,14},{0x001E, 6},
-    {0x0066, 7},{0x0001, 3},{0x0002, 3},{0x001B, 5},
-    {0x001A, 5},{0x001F, 6},{0x003B, 7},{0x0074, 8},
-    {0x01D6,10},{0x03AF,11},{0x1D74,14},{0x1D77,14},
-    {0x1D76,14},{0x0EB9,13},{0x0EB8,13},{0x000F, 4},
-    {0x0006, 4},{0x0013, 5},{0x003B, 6},{0x003A, 6},
-    {0x0000, 3},{0x0018, 5},{0x0032, 6},{0x0067, 7}
-  },
-  {
-    {0x0004, 3},{0x000A, 4},{0x001B, 5},{0x000C, 4},
-    {0x000D, 5},{0x00E6, 8},{0x0684,11},{0x0072, 7},
-    {0x00E7, 8},{0x0002, 3},{0x0001, 3},{0x0017, 5},
-    {0x0016, 5},{0x0018, 6},{0x00D1, 8},{0x01A0, 9},
-    {0x0686,11},{0x0D0F,12},{0x0D0A,12},{0x1A17,13},
-    {0x1A16,13},{0x1A1D,13},{0x1A1C,13},{0x000F, 4},
-    {0x001D, 5},{0x000E, 5},{0x0035, 6},{0x0038, 6},
-    {0x0000, 3},{0x000F, 5},{0x0019, 6},{0x0069, 7}
-  },
-  {
-    {0x0003, 3},{0x000C, 4},{0x001B, 5},{0x0000, 3},
-    {0x0003, 4},{0x002E, 6},{0x0051, 9},{0x00BC, 8},
-    {0x0053, 9},{0x0004, 3},{0x0002, 3},{0x0016, 5},
-    {0x0015, 5},{0x0015, 7},{0x0050, 9},{0x00A4,10},
-    {0x0294,12},{0x052B,13},{0x052A,13},{0x052D,13},
-    {0x052C,13},{0x052F,13},{0x052E,13},{0x000E, 4},
-    {0x001A, 5},{0x0004, 5},{0x0028, 6},{0x0029, 6},
-    {0x000F, 4},{0x000B, 6},{0x005F, 7},{0x00BD, 8}
-  },
-  {
-    {0x0003, 4},{0x0009, 6},{0x00D0, 8},{0x01A3, 9},
-    {0x0344,10},{0x0D14,12},{0x1A2B,13},{0x0004, 4},
-    {0x0015, 7},{0x0000, 3},{0x000F, 4},{0x000B, 4},
-    {0x000C, 4},{0x000E, 4},{0x0009, 4},{0x001B, 5},
-    {0x000A, 5},{0x0014, 5},{0x000D, 5},{0x002A, 6},
-    {0x0014, 7},{0x068B,11},{0x1A2A,13},{0x0008, 4},
-    {0x000B, 5},{0x002B, 6},{0x000B, 6},{0x0069, 7},
-    {0x0035, 6},{0x0008, 6},{0x0007, 4},{0x000C, 5}
-  },
-  {
-    {0x000A, 4},{0x003C, 6},{0x0032, 7},{0x0030, 7},
-    {0x00C5, 9},{0x0621,12},{0x0620,12},{0x001F, 5},
-    {0x0033, 7},{0x0001, 3},{0x0000, 3},{0x000E, 4},
-    {0x000D, 4},{0x000C, 4},{0x0004, 4},{0x000D, 5},
-    {0x0026, 6},{0x0027, 6},{0x0014, 6},{0x0063, 8},
-    {0x0189,10},{0x0623,12},{0x0622,12},{0x000B, 4},
-    {0x0012, 5},{0x003D, 6},{0x0022, 6},{0x0015, 6},
-    {0x000B, 5},{0x0023, 6},{0x0007, 4},{0x0010, 5}
-  },
-  {
-    {0x000F, 4},{0x000C, 5},{0x0043, 7},{0x0010, 6},
-    {0x0044, 8},{0x0114,10},{0x0455,12},{0x0018, 5},
-    {0x0023, 7},{0x0001, 3},{0x0000, 3},{0x000E, 4},
-    {0x000D, 4},{0x0009, 4},{0x0019, 5},{0x0009, 5},
-    {0x0017, 6},{0x0016, 6},{0x0042, 7},{0x008B, 9},
-    {0x0454,12},{0x0457,12},{0x0456,12},{0x000B, 4},
-    {0x0015, 5},{0x000A, 5},{0x0029, 6},{0x0020, 6},
-    {0x000D, 5},{0x0028, 6},{0x0007, 4},{0x0011, 5}
-  },
-  {
-    {0x0001, 3},{0x001A, 5},{0x0029, 6},{0x002A, 6},
-    {0x00A0, 8},{0x0285,10},{0x1425,13},{0x0002, 5},
-    {0x0000, 7},{0x0002, 3},{0x0003, 3},{0x000C, 4},
-    {0x000B, 4},{0x0008, 4},{0x0012, 5},{0x0001, 6},
-    {0x0051, 7},{0x0001, 7},{0x0143, 9},{0x0508,11},
-    {0x1424,13},{0x1427,13},{0x1426,13},{0x000F, 4},
-    {0x001C, 5},{0x0003, 5},{0x0037, 6},{0x002B, 6},
-    {0x0013, 5},{0x0036, 6},{0x001D, 5},{0x0001, 5}
-  },
-  {
-    {0x0004, 3},{0x001F, 5},{0x003D, 6},{0x0006, 5},
-    {0x0016, 7},{0x0053, 9},{0x014A,11},{0x0034, 6},
-    {0x002A, 8},{0x0002, 3},{0x0003, 3},{0x000B, 4},
-    {0x000C, 4},{0x001C, 5},{0x0037, 6},{0x0017, 7},
-    {0x002B, 8},{0x0028, 8},{0x00A4,10},{0x052D,13},
-    {0x052C,13},{0x052F,13},{0x052E,13},{0x0000, 3},
-    {0x001D, 5},{0x0007, 5},{0x0004, 5},{0x0035, 6},
-    {0x0014, 5},{0x0036, 6},{0x0015, 5},{0x003C, 6}
-  },
-  {
-    {0x0004, 3},{0x000A, 4},{0x0007, 5},{0x001D, 5},
-    {0x0009, 6},{0x01F3, 9},{0x07C7,11},{0x0008, 6},
-    {0x01F0, 9},{0x0003, 3},{0x0002, 3},{0x000D, 4},
-    {0x000C, 4},{0x0017, 5},{0x007D, 7},{0x01F2, 9},
-    {0x07C6,11},{0x07C5,11},{0x1F12,13},{0x3E27,14},
-    {0x3E26,14},{0x1F11,13},{0x1F10,13},{0x0000, 3},
-    {0x001E, 5},{0x0006, 5},{0x0039, 6},{0x0038, 6},
-    {0x003F, 6},{0x002C, 6},{0x0005, 5},{0x002D, 6}
-  },
-  {
-    {0x0002, 3},{0x0007, 4},{0x0018, 5},{0x0003, 4},
-    {0x0005, 5},{0x0035, 7},{0x004F, 9},{0x0012, 7},
-    {0x04E5,13},{0x0005, 3},{0x0004, 3},{0x000D, 4},
-    {0x000E, 4},{0x0033, 6},{0x0026, 8},{0x009D,10},
-    {0x04E4,13},{0x04E7,13},{0x04E6,13},{0x04E1,13},
-    {0x04E0,13},{0x04E3,13},{0x04E2,13},{0x0000, 3},
-    {0x001F, 5},{0x000C, 5},{0x003D, 6},{0x003C, 6},
-    {0x0032, 6},{0x0034, 7},{0x001B, 6},{0x0008, 6}
-  },
-  {
-    {0x0000, 3},{0x0004, 4},{0x001C, 5},{0x000F, 4},
-    {0x0002, 4},{0x0007, 5},{0x0075, 7},{0x00E8, 8},
-    {0x1D2A,13},{0x0005, 3},{0x0004, 3},{0x000D, 4},
-    {0x000C, 4},{0x0077, 7},{0x0E96,12},{0x3A57,14},
-    {0x3A56,14},{0x3A5D,14},{0x3A5C,14},{0x3A5F,14},
-    {0x3A5E,14},{0x1D29,13},{0x1D28,13},{0x0003, 3},
-    {0x0006, 5},{0x000A, 5},{0x002C, 7},{0x0017, 6},
-    {0x0076, 7},{0x01D3, 9},{0x03A4,10},{0x002D, 7}
-  },
-  {
-    {0x000A, 4},{0x0024, 6},{0x00BF, 8},{0x0085, 8},
-    {0x0211,10},{0x0842,12},{0x1087,13},{0x0018, 5},
-    {0x0020, 6},{0x0001, 3},{0x0002, 3},{0x000E, 4},
-    {0x000D, 4},{0x0007, 4},{0x0013, 5},{0x0025, 6},
-    {0x005E, 7},{0x0043, 7},{0x00BE, 8},{0x0109, 9},
-    {0x1086,13},{0x0841,12},{0x0840,12},{0x000F, 4},
-    {0x0001, 4},{0x0011, 5},{0x0000, 5},{0x002E, 6},
-    {0x0019, 5},{0x0001, 5},{0x0006, 4},{0x0016, 5}
-  },
-  {
-    {0x0002, 3},{0x000F, 5},{0x006F, 7},{0x0061, 7},
-    {0x0374,10},{0x1BA8,13},{0x3753,14},{0x0012, 5},
-    {0x0036, 6},{0x0000, 3},{0x0001, 3},{0x000A, 4},
-    {0x000B, 4},{0x001A, 5},{0x0031, 6},{0x0060, 7},
-    {0x00DC, 8},{0x01BB, 9},{0x06EB,11},{0x1BAB,13},
-    {0x3752,14},{0x3755,14},{0x3754,14},{0x000E, 4},
-    {0x0006, 4},{0x0013, 5},{0x000E, 5},{0x003E, 6},
-    {0x0008, 4},{0x001E, 5},{0x0019, 5},{0x003F, 6}
-  },
-  {
-    {0x0003, 3},{0x001C, 5},{0x0025, 6},{0x0024, 6},
-    {0x01DA, 9},{0x1DBD,13},{0x3B7C,14},{0x003C, 6},
-    {0x003D, 6},{0x0000, 3},{0x0001, 3},{0x000B, 4},
-    {0x000A, 4},{0x000B, 5},{0x0077, 7},{0x00EC, 8},
-    {0x03B6,10},{0x076E,11},{0x1DBF,13},{0x76FB,15},
-    {0x76FA,15},{0x3B79,14},{0x3B78,14},{0x000D, 4},
-    {0x001F, 5},{0x0013, 5},{0x000A, 5},{0x0008, 5},
-    {0x000C, 4},{0x0008, 4},{0x0009, 5},{0x003A, 6}
-  },
-  {
-    {0x0005, 3},{0x0003, 4},{0x0004, 5},{0x0010, 5},
-    {0x008F, 8},{0x0475,11},{0x11D1,13},{0x0079, 7},
-    {0x0027, 6},{0x0002, 3},{0x0003, 3},{0x0001, 4},
-    {0x0000, 4},{0x0026, 6},{0x0046, 7},{0x011C, 9},
-    {0x0477,11},{0x08ED,12},{0x11D0,13},{0x11D3,13},
-    {0x11D2,13},{0x11D9,13},{0x11D8,13},{0x000D, 4},
-    {0x001F, 5},{0x0012, 5},{0x0005, 5},{0x003D, 6},
-    {0x000C, 4},{0x000E, 4},{0x0022, 6},{0x0078, 7}
-  },
-  {
-    {0x0005, 3},{0x000C, 4},{0x001B, 5},{0x0000, 4},
-    {0x0006, 6},{0x03E2,10},{0x3E3D,14},{0x000F, 7},
-    {0x0034, 6},{0x0003, 3},{0x0002, 3},{0x001E, 5},
-    {0x001D, 5},{0x007D, 7},{0x01F0, 9},{0x07C6,11},
-    {0x3E3C,14},{0x3E3F,14},{0x3E3E,14},{0x3E39,14},
-    {0x3E38,14},{0x3E3B,14},{0x3E3A,14},{0x0008, 4},
-    {0x001C, 5},{0x0002, 5},{0x003F, 6},{0x0035, 6},
-    {0x0009, 4},{0x0001, 3},{0x000E, 7},{0x00F9, 8}
-  },
-  {
-    {0x0004, 3},{0x000B, 4},{0x0001, 4},{0x000A, 4},
-    {0x001E, 6},{0x00E0, 9},{0x0E1E,13},{0x0071, 8},
-    {0x0039, 7},{0x0007, 3},{0x0006, 3},{0x000D, 5},
-    {0x000C, 5},{0x0020, 7},{0x01C2,10},{0x1C3F,14},
-    {0x1C3E,14},{0x0E19,13},{0x0E18,13},{0x0E1B,13},
-    {0x0E1A,13},{0x0E1D,13},{0x0E1C,13},{0x0000, 4},
-    {0x0009, 5},{0x001D, 6},{0x001F, 6},{0x0011, 6},
-    {0x0005, 4},{0x0001, 3},{0x0043, 8},{0x0042, 8}
-  },
-  {
-    {0x0004, 3},{0x000D, 4},{0x0007, 4},{0x0002, 3},
-    {0x0014, 5},{0x016C, 9},{0x16D1,13},{0x02DF,10},
-    {0x016E, 9},{0x0000, 2},{0x0007, 3},{0x002C, 6},
-    {0x002B, 6},{0x02DE,10},{0x16D0,13},{0x16D3,13},
-    {0x16D2,13},{0x2DB5,14},{0x2DB4,14},{0x2DB7,14},
-    {0x2DB6,14},{0x16D9,13},{0x16D8,13},{0x000C, 5},
-    {0x002A, 6},{0x005A, 7},{0x001B, 6},{0x001A, 6},
-    {0x0017, 5},{0x000C, 4},{0x05B7,11},{0x05B5,11}
-  },
-  {
-    {0x0002, 2},{0x000F, 4},{0x001C, 5},{0x000C, 4},
-    {0x003B, 6},{0x01AC, 9},{0x1AD8,13},{0x35B3,14},
-    {0x35B2,14},{0x0001, 2},{0x0000, 2},{0x0069, 7},
-    {0x0068, 7},{0x35BD,14},{0x35BC,14},{0x35BF,14},
-    {0x35BE,14},{0x35B9,14},{0x35B8,14},{0x35BB,14},
-    {0x35BA,14},{0x35B5,14},{0x35B4,14},{0x01A9, 9},
-    {0x01A8, 9},{0x035A,10},{0x00D7, 8},{0x00D5, 8},
-    {0x003A, 6},{0x001B, 5},{0x35B7,14},{0x35B6,14}
-  },
-  {
-    {0x0000, 3},{0x0010, 5},{0x0072, 7},{0x0071, 7},
-    {0x0154, 9},{0x0AAB,12},{0x0AA8,12},{0x0014, 5},
-    {0x0070, 7},{0x0002, 3},{0x0003, 3},{0x000C, 4},
-    {0x000B, 4},{0x0003, 4},{0x0011, 5},{0x0073, 7},
-    {0x0054, 7},{0x00AB, 8},{0x02AB,10},{0x1553,13},
-    {0x1552,13},{0x1555,13},{0x1554,13},{0x000D, 4},
-    {0x001E, 5},{0x0012, 5},{0x003E, 6},{0x002B, 6},
-    {0x0002, 4},{0x003F, 6},{0x001D, 5},{0x0013, 5}
-  },
-  {
-    {0x0003, 3},{0x001F, 5},{0x0029, 6},{0x003D, 6},
-    {0x000C, 7},{0x0069,10},{0x0345,13},{0x0002, 5},
-    {0x0028, 6},{0x0002, 3},{0x0001, 3},{0x000E, 4},
-    {0x000C, 4},{0x0015, 5},{0x0007, 6},{0x001B, 8},
-    {0x006B,10},{0x006A,10},{0x0344,13},{0x0347,13},
-    {0x0346,13},{0x01A1,12},{0x01A0,12},{0x000B, 4},
-    {0x001A, 5},{0x0012, 5},{0x0000, 5},{0x003C, 6},
-    {0x0008, 4},{0x001B, 5},{0x0013, 5},{0x0001, 5}
-  },
-  {
-    {0x0004, 3},{0x0004, 4},{0x003F, 6},{0x0014, 5},
-    {0x0056, 7},{0x015C, 9},{0x15D5,13},{0x003C, 6},
-    {0x002A, 6},{0x0000, 3},{0x0001, 3},{0x000E, 4},
-    {0x000D, 4},{0x000C, 5},{0x00AF, 8},{0x02BB,10},
-    {0x15D4,13},{0x15D7,13},{0x15D6,13},{0x15D1,13},
-    {0x15D0,13},{0x15D3,13},{0x15D2,13},{0x000B, 4},
-    {0x0019, 5},{0x000D, 5},{0x003E, 6},{0x0031, 6},
-    {0x0007, 4},{0x0005, 4},{0x003D, 6},{0x0030, 6}
-  },
-  {
-    {0x0005, 3},{0x0008, 4},{0x001A, 5},{0x0000, 4},
-    {0x0036, 6},{0x0011, 8},{0x0106,12},{0x000A, 7},
-    {0x006E, 7},{0x0002, 3},{0x0003, 3},{0x0003, 4},
-    {0x0002, 4},{0x006F, 7},{0x0021, 9},{0x020F,13},
-    {0x020E,13},{0x0101,12},{0x0100,12},{0x0103,12},
-    {0x0102,12},{0x0105,12},{0x0104,12},{0x000C, 4},
-    {0x001E, 5},{0x0003, 5},{0x003E, 6},{0x003F, 6},
-    {0x0009, 4},{0x000E, 4},{0x000B, 7},{0x0009, 7}
-  },
-  {
-    {0x0002, 3},{0x000E, 4},{0x001E, 5},{0x000C, 4},
-    {0x001F, 5},{0x006E, 7},{0x00AD,10},{0x00AF,10},
-    {0x0014, 7},{0x0004, 3},{0x0003, 3},{0x001A, 5},
-    {0x0017, 5},{0x002A, 8},{0x0576,13},{0x0AEF,14},
-    {0x0AEE,14},{0x0571,13},{0x0570,13},{0x0573,13},
-    {0x0572,13},{0x0575,13},{0x0574,13},{0x0003, 4},
-    {0x0016, 5},{0x0004, 5},{0x0036, 6},{0x000B, 6},
-    {0x000A, 4},{0x0000, 3},{0x006F, 7},{0x00AC,10}
-  },
-  {
-    {0x0004, 3},{0x0005, 4},{0x0003, 3},{0x0001, 3},
-    {0x0004, 4},{0x002F, 6},{0x0526,11},{0x1495,13},
-    {0x00A6, 8},{0x0007, 3},{0x0006, 3},{0x002D, 6},
-    {0x002C, 6},{0x1494,13},{0x1497,13},{0x1496,13},
-    {0x1491,13},{0x1490,13},{0x1493,13},{0x1492,13},
-    {0x293D,14},{0x293C,14},{0x293F,14},{0x0000, 3},
-    {0x0028, 6},{0x00A5, 8},{0x0148, 9},{0x00A7, 8},
-    {0x002E, 6},{0x0015, 5},{0x0A4E,12},{0x293E,14}
-  },
-  {
-    {0x0004, 3},{0x0005, 4},{0x0003, 3},{0x0001, 3},
-    {0x0004, 4},{0x002F, 6},{0x0526,11},{0x1495,13},
-    {0x00A6, 8},{0x0007, 3},{0x0006, 3},{0x002D, 6},
-    {0x002C, 6},{0x1494,13},{0x1497,13},{0x1496,13},
-    {0x1491,13},{0x1490,13},{0x1493,13},{0x1492,13},
-    {0x293D,14},{0x293C,14},{0x293F,14},{0x0000, 3},
-    {0x0028, 6},{0x00A5, 8},{0x0148, 9},{0x00A7, 8},
-    {0x002E, 6},{0x0015, 5},{0x0A4E,12},{0x293E,14}
-  },
-  {
-    {0x0004, 3},{0x0005, 4},{0x0003, 3},{0x0001, 3},
-    {0x0004, 4},{0x002F, 6},{0x0526,11},{0x1495,13},
-    {0x00A6, 8},{0x0007, 3},{0x0006, 3},{0x002D, 6},
-    {0x002C, 6},{0x1494,13},{0x1497,13},{0x1496,13},
-    {0x1491,13},{0x1490,13},{0x1493,13},{0x1492,13},
-    {0x293D,14},{0x293C,14},{0x293F,14},{0x0000, 3},
-    {0x0028, 6},{0x00A5, 8},{0x0148, 9},{0x00A7, 8},
-    {0x002E, 6},{0x0015, 5},{0x0A4E,12},{0x293E,14}
-  },
-  {
-    {0x0003, 3},{0x0011, 5},{0x0020, 6},{0x0074, 7},
-    {0x010D, 9},{0x0863,12},{0x0860,12},{0x000A, 5},
-    {0x0075, 7},{0x0001, 3},{0x0000, 3},{0x000B, 4},
-    {0x000A, 4},{0x0018, 5},{0x0038, 6},{0x0042, 7},
-    {0x010F, 9},{0x010E, 9},{0x0219,10},{0x10C3,13},
-    {0x10C2,13},{0x10C5,13},{0x10C4,13},{0x000F, 4},
-    {0x0004, 4},{0x0019, 5},{0x000B, 5},{0x0039, 6},
-    {0x0009, 4},{0x001B, 5},{0x001A, 5},{0x003B, 6}
-  },
-  {
-    {0x0005, 3},{0x0001, 4},{0x003E, 6},{0x0001, 5},
-    {0x00E2, 8},{0x1C6F,13},{0x38D9,14},{0x0039, 6},
-    {0x001F, 6},{0x0002, 3},{0x0001, 3},{0x0009, 4},
-    {0x0008, 4},{0x0000, 5},{0x0070, 7},{0x01C7, 9},
-    {0x038C,10},{0x071A,11},{0x38D8,14},{0x38DB,14},
-    {0x38DA,14},{0x38DD,14},{0x38DC,14},{0x000D, 4},
-    {0x001D, 5},{0x000E, 5},{0x003F, 6},{0x003C, 6},
-    {0x000C, 4},{0x0006, 4},{0x003D, 6},{0x001E, 6}
-  },
-  {
-    {0x0006, 3},{0x000B, 4},{0x0011, 5},{0x001E, 5},
-    {0x0074, 7},{0x03AA,10},{0x1D5C,13},{0x0001, 6},
-    {0x0021, 6},{0x0001, 3},{0x0002, 3},{0x0007, 4},
-    {0x0006, 4},{0x003E, 6},{0x00EB, 8},{0x01D4, 9},
-    {0x0EAF,12},{0x3ABB,14},{0x3ABA,14},{0x1D59,13},
-    {0x1D58,13},{0x1D5B,13},{0x1D5A,13},{0x000A, 4},
-    {0x001C, 5},{0x0001, 5},{0x003F, 6},{0x003B, 6},
-    {0x0001, 4},{0x0009, 4},{0x0020, 6},{0x0000, 6}
-  },
-  {
-    {0x0004, 3},{0x000A, 4},{0x0017, 5},{0x0004, 4},
-    {0x0016, 6},{0x016A, 9},{0x16B1,13},{0x0017, 7},
-    {0x005B, 7},{0x0006, 3},{0x0007, 3},{0x0001, 4},
-    {0x0000, 4},{0x000A, 6},{0x02D7,10},{0x0B5A,12},
-    {0x16B0,13},{0x16B3,13},{0x16B2,13},{0x2D6D,14},
-    {0x2D6C,14},{0x2D6F,14},{0x2D6E,14},{0x0006, 4},
-    {0x000A, 5},{0x0004, 5},{0x002C, 6},{0x0017, 6},
-    {0x0003, 4},{0x0007, 4},{0x0016, 7},{0x00B4, 8}
-  },
-  {
-    {0x0005, 3},{0x000D, 4},{0x0005, 4},{0x0009, 4},
-    {0x0033, 6},{0x0193, 9},{0x192C,13},{0x0061, 8},
-    {0x0031, 7},{0x0000, 2},{0x0007, 3},{0x0010, 5},
-    {0x0011, 5},{0x00C8, 8},{0x192F,13},{0x325B,14},
-    {0x325A,14},{0x1929,13},{0x1928,13},{0x192B,13},
-    {0x192A,13},{0x325D,14},{0x325C,14},{0x0018, 5},
-    {0x001A, 6},{0x001B, 6},{0x0065, 7},{0x0019, 6},
-    {0x0004, 4},{0x0007, 4},{0x0060, 8},{0x0324,10}
-  },
-  {
-    {0x0006, 3},{0x0000, 3},{0x0002, 4},{0x000F, 4},
-    {0x0039, 6},{0x01D9, 9},{0x1D82,13},{0x0761,11},
-    {0x03BE,10},{0x0001, 2},{0x0002, 2},{0x000F, 6},
-    {0x000E, 6},{0x0762,11},{0x3B07,14},{0x3B06,14},
-    {0x3B1D,14},{0x3B1C,14},{0x3B1F,14},{0x3B1E,14},
-    {0x3B19,14},{0x3B18,14},{0x3B1B,14},{0x0038, 6},
-    {0x01DE, 9},{0x00ED, 8},{0x03BF,10},{0x00EE, 8},
-    {0x003A, 6},{0x0006, 5},{0x0EC0,12},{0x3B1A,14}
-  },
-  {
-    {0x0000, 2},{0x0002, 3},{0x000F, 5},{0x0006, 4},
-    {0x001C, 6},{0x01D0,10},{0x0E8C,13},{0x1D1B,14},
-    {0x1D1A,14},{0x0003, 2},{0x0002, 2},{0x00EA, 9},
-    {0x00E9, 9},{0x0E89,13},{0x0E88,13},{0x0E8B,13},
-    {0x0E8A,13},{0x1D65,14},{0x1D64,14},{0x1D67,14},
-    {0x1D66,14},{0x1D61,14},{0x1D60,14},{0x03AD,11},
-    {0x1D63,14},{0x1D62,14},{0x1D1D,14},{0x1D1C,14},
-    {0x003B, 7},{0x01D7,10},{0x1D1F,14},{0x1D1E,14}
-  },
-  {
-    {0x0002, 2},{0x000F, 4},{0x001C, 5},{0x000C, 4},
-    {0x003B, 6},{0x01AC, 9},{0x1AD8,13},{0x35B3,14},
-    {0x35B2,14},{0x0001, 2},{0x0000, 2},{0x0069, 7},
-    {0x0068, 7},{0x35BD,14},{0x35BC,14},{0x35BF,14},
-    {0x35BE,14},{0x35B9,14},{0x35B8,14},{0x35BB,14},
-    {0x35BA,14},{0x35B5,14},{0x35B4,14},{0x01A9, 9},
-    {0x01A8, 9},{0x035A,10},{0x00D7, 8},{0x00D5, 8},
-    {0x003A, 6},{0x001B, 5},{0x35B7,14},{0x35B6,14}
-  }
-};
-
-
-
 static void th_info2theora_info(theora_info *_ci,const th_info *_info){
   _ci->version_major=_info->version_major;
   _ci->version_minor=_info->version_minor;

Copied: branches/theora-thusnelda/lib/enc/encfrag.c (from rev 15940, branches/theora-thusnelda/lib/enc/dsp.c)
===================================================================
--- branches/theora-thusnelda/lib/enc/encfrag.c	                        (rev 0)
+++ branches/theora-thusnelda/lib/enc/encfrag.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -0,0 +1,151 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id$
+
+ ********************************************************************/
+#include <stdlib.h>
+#include <string.h>
+#include "codec_internal.h"
+
+
+void oc_enc_frag_sub(const CP_INSTANCE *_cpi,ogg_int16_t _diff[64],
+ const unsigned char *_src,const unsigned char *_ref,int _ystride){
+  (*_cpi->opt_vtable.frag_sub)(_diff,_src,_ref,_ystride);
+}
+
+void oc_enc_frag_sub_c(ogg_int16_t _diff[64],const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  int i;
+  for(i=0;i<8;i++){
+    int j;
+    for(j=0;j<8;j++)_diff[i*8+j]=(ogg_int16_t)(_src[j]-_ref[j]);
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+}
+
+void oc_enc_frag_sub_128(const CP_INSTANCE *_cpi,ogg_int16_t _diff[64],
+ const unsigned char *_src,int _ystride){
+  (*_cpi->opt_vtable.frag_sub_128)(_diff,_src,_ystride);
+}
+
+void oc_enc_frag_sub_128_c(ogg_int16_t *_diff,
+ const unsigned char *_src,int _ystride){
+  int i;
+  for(i=0;i<8;i++){
+    int j;
+    for(j=0;j<8;j++)_diff[i*8+j]=(ogg_int16_t)(_src[j]-128);
+    _src+=_ystride;
+  }
+}
+
+unsigned oc_enc_frag_sad(const CP_INSTANCE *_cpi,const unsigned char *_x,
+ const unsigned char *_y,int _ystride){
+  return (*_cpi->opt_vtable.frag_sad)(_x,_y,_ystride);
+}
+
+unsigned oc_enc_frag_sad_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  unsigned sad;
+  int      i;
+  sad=0;
+  for(i=8;i-->0;){
+    int j;
+    for(j=0;j<8;j++)sad+=abs(_src[j]-_ref[j]);
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+  return sad;
+}
+
+unsigned oc_enc_frag_sad_thresh(const CP_INSTANCE *_cpi,
+ const unsigned char *_src,const unsigned char *_ref,int _ystride,
+ unsigned _thresh){
+  return (*_cpi->opt_vtable.frag_sad_thresh)(_src,_ref,_ystride,_thresh);
+}
+
+unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh){
+  unsigned sad;
+  int      i;
+  sad=0;
+  for(i=8;i-->0;){
+    int j;
+    for(j=0;j<8;j++)sad+=abs(_src[j]-_ref[j]);
+    if(sad>_thresh)break;
+    _src+=_ystride;
+    _ref+=_ystride;
+  }
+  return sad;
+}
+
+unsigned oc_enc_frag_sad2_thresh(const CP_INSTANCE *_cpi,
+ const unsigned char *_src,const unsigned char *_ref1,
+ const unsigned char *_ref2,int _ystride,unsigned _thresh){
+  return (*_cpi->opt_vtable.frag_sad2_thresh)(_src,_ref1,_ref2,_ystride,
+   _thresh);
+}
+
+unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh){
+  unsigned sad;
+  int      i;
+  sad=0;
+  for(i=8;i-->0;){
+    int j;
+    for(j=0;j<8;j++)sad+=abs(_src[j]-(_ref1[j]+_ref2[j]>>1));
+    if(sad>_thresh)break;
+    _src+=_ystride;
+    _ref1+=_ystride;
+    _ref2+=_ystride;
+  }
+  return sad;
+}
+
+void oc_enc_frag_copy(const CP_INSTANCE *_cpi,unsigned char *_dst,
+ const unsigned char *_src,int _ystride){
+  (*_cpi->opt_vtable.frag_copy)(_dst,_src,_ystride);
+}
+
+void oc_enc_frag_copy2(const CP_INSTANCE *_cpi,unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride){
+  (*_cpi->opt_vtable.frag_copy2)(_dst,_src1,_src2,_ystride);
+}
+
+void oc_enc_frag_copy2_c(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride){
+  int i;
+  int j;
+  for(i=8;i-->0;){
+    for(j=0;j<8;j++)_dst[j]=_src1[j]+_src2[j]>>1;
+    _dst+=_ystride;
+    _src1+=_ystride;
+    _src2+=_ystride;
+  }
+}
+
+void oc_enc_frag_recon_intra(const CP_INSTANCE *_cpi,
+ unsigned char *_dst,int _ystride,const ogg_int16_t _residue[64]){
+  (*_cpi->opt_vtable.frag_recon_intra)(_dst,_ystride,_residue);
+}
+
+void oc_enc_frag_recon_inter(const CP_INSTANCE *_cpi,unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
+  (*_cpi->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue);
+}
+
+void oc_enc_restore_fpu(const CP_INSTANCE *_cpi){
+  (*_cpi->opt_vtable.restore_fpu)();
+}

Modified: branches/theora-thusnelda/lib/enc/encode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encode.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/encode.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -149,19 +149,20 @@
 
   for(i=0; i<cpi->dct_token_ycount[group]; i++){
     if(token[i] < DCT_NOOP){
-      oggpackB_write( opb, cpi->HuffCodeArray_VP3x[huffY][token[i]],
-                      cpi->HuffCodeLengthArray_VP3x[huffY][token[i]] );
-      if (cpi->ExtraBitLengths_VP3x[token[i]] > 0)
-        oggpackB_write( opb, eb[i], cpi->ExtraBitLengths_VP3x[token[i]] );
+      oggpackB_write(opb,cpi->huff_codes[huffY][token[i]].pattern,
+       cpi->huff_codes[huffY][token[i]].nbits);
+      if(OC_DCT_TOKEN_EXTRA_BITS[token[i]]>0){
+        oggpackB_write(opb,eb[i],OC_DCT_TOKEN_EXTRA_BITS[token[i]]);
+      }
     }
   }
 
   for(; i<cpi->dct_token_count[group]; i++){
     if(token[i] < DCT_NOOP){
-      oggpackB_write( opb, cpi->HuffCodeArray_VP3x[huffC][token[i]],
-                      cpi->HuffCodeLengthArray_VP3x[huffC][token[i]] );
-      if (cpi->ExtraBitLengths_VP3x[token[i]] > 0)
-        oggpackB_write( opb, eb[i], cpi->ExtraBitLengths_VP3x[token[i]] );
+      oggpackB_write(opb,cpi->huff_codes[huffC][token[i]].pattern,
+       cpi->huff_codes[huffC][token[i]].nbits);
+      if (OC_DCT_TOKEN_EXTRA_BITS[token[i]] > 0)
+        oggpackB_write( opb, eb[i], OC_DCT_TOKEN_EXTRA_BITS[token[i]] );
     }
   }
 }
@@ -404,7 +405,7 @@
   }
 #endif
 #endif
-  dsp_restore_fpu (cpi->dsp);
+  oc_enc_restore_fpu(cpi);
 }
 
 void WriteFrameHeader( CP_INSTANCE *cpi) {
@@ -426,3 +427,31 @@
   }
 }
 
+void oc_enc_dequant_idct8x8(const CP_INSTANCE *_cpi,ogg_int16_t _y[64],
+ const ogg_int16_t _x[64],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]){
+  (*_cpi->opt_vtable.dequant_idct8x8)(_y,_x,_last_zzi,_ncoefs,
+   _dc_quant,_ac_quant);
+}
+
+void oc_enc_loop_filter(CP_INSTANCE *_cpi,int _flimit){
+  (*_cpi->opt_vtable.enc_loop_filter)(_cpi,_flimit);
+}
+
+void oc_enc_vtable_init_c(CP_INSTANCE *_cpi){
+  /*The implementations prefixed with oc_enc_ are encoder-specific.
+    The rest we re-use from the decoder.*/
+  _cpi->opt_vtable.frag_sad=oc_enc_frag_sad_c;
+  _cpi->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_c;
+  _cpi->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_c;
+  _cpi->opt_vtable.frag_sub=oc_enc_frag_sub_c;
+  _cpi->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_c;
+  _cpi->opt_vtable.frag_copy=oc_frag_copy_c;
+  _cpi->opt_vtable.frag_copy2=oc_enc_frag_copy2_c;
+  _cpi->opt_vtable.frag_recon_intra=oc_frag_recon_intra_c;
+  _cpi->opt_vtable.frag_recon_inter=oc_frag_recon_inter_c;
+  _cpi->opt_vtable.fdct8x8=oc_enc_fdct8x8_c;
+  _cpi->opt_vtable.dequant_idct8x8=oc_dequant_idct8x8_c;
+  _cpi->opt_vtable.enc_loop_filter=oc_enc_loop_filter_c;
+  _cpi->opt_vtable.restore_fpu=oc_restore_fpu_c;
+}

Deleted: branches/theora-thusnelda/lib/enc/encoder_huffman.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encoder_huffman.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/encoder_huffman.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -1,252 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id$
-
- ********************************************************************/
-
-#include <stdlib.h>
-#include <stdio.h>
-#include "codec_internal.h"
-#include "hufftables.h"
-
-static void CreateHuffmanList(HUFF_ENTRY ** HuffRoot,
-                              ogg_uint32_t HIndex,
-                              const ogg_uint32_t *FreqList ) {
-  int i;
-  HUFF_ENTRY *entry_ptr;
-  HUFF_ENTRY *search_ptr;
-
-  /* Create a HUFF entry for token zero. */
-  HuffRoot[HIndex] = (HUFF_ENTRY *)_ogg_calloc(1,sizeof(*HuffRoot[HIndex]));
-
-  HuffRoot[HIndex]->Previous = NULL;
-  HuffRoot[HIndex]->Next = NULL;
-  HuffRoot[HIndex]->ZeroChild = NULL;
-  HuffRoot[HIndex]->OneChild = NULL;
-  HuffRoot[HIndex]->Value = 0;
-  HuffRoot[HIndex]->Frequency = FreqList[0];
-
-  if ( HuffRoot[HIndex]->Frequency == 0 )
-    HuffRoot[HIndex]->Frequency = 1;
-
-  /* Now add entries for all the other possible tokens. */
-  for ( i = 1; i < MAX_ENTROPY_TOKENS; i++ ) {
-    entry_ptr = (HUFF_ENTRY *)_ogg_calloc(1,sizeof(*entry_ptr));
-
-    entry_ptr->Value = i;
-    entry_ptr->Frequency = FreqList[i];
-    entry_ptr->ZeroChild = NULL;
-    entry_ptr->OneChild = NULL;
-
-    /* Force min value of 1. This prevents the tree getting too deep. */
-    if ( entry_ptr->Frequency == 0 )
-      entry_ptr->Frequency = 1;
-
-    if ( entry_ptr->Frequency <= HuffRoot[HIndex]->Frequency ){
-      entry_ptr->Next = HuffRoot[HIndex];
-      HuffRoot[HIndex]->Previous = entry_ptr;
-      entry_ptr->Previous = NULL;
-      HuffRoot[HIndex] = entry_ptr;
-    }else{
-      search_ptr = HuffRoot[HIndex];
-      while ( (search_ptr->Next != NULL) &&
-              (search_ptr->Frequency < entry_ptr->Frequency) ){
-        search_ptr = (HUFF_ENTRY *)search_ptr->Next;
-      }
-
-      if ( search_ptr->Frequency < entry_ptr->Frequency ){
-        entry_ptr->Next = NULL;
-        entry_ptr->Previous = search_ptr;
-        search_ptr->Next = entry_ptr;
-      }else{
-        entry_ptr->Next = search_ptr;
-        entry_ptr->Previous = search_ptr->Previous;
-        search_ptr->Previous->Next = entry_ptr;
-        search_ptr->Previous = entry_ptr;
-      }
-    }
-  }
-}
-
-static void CreateCodeArray( HUFF_ENTRY * HuffRoot,
-                      ogg_uint32_t *HuffCodeArray,
-                      unsigned char *HuffCodeLengthArray,
-                      ogg_uint32_t CodeValue,
-                      unsigned char CodeLength ) {
-
-  /* If we are at a leaf then fill in a code array entry. */
-  if ( ( HuffRoot->ZeroChild == NULL ) && ( HuffRoot->OneChild == NULL ) ){
-    HuffCodeArray[HuffRoot->Value] = CodeValue;
-    HuffCodeLengthArray[HuffRoot->Value] = CodeLength;
-  }else{
-    /* Recursive calls to scan down the tree. */
-    CodeLength++;
-    CreateCodeArray(HuffRoot->ZeroChild, HuffCodeArray, HuffCodeLengthArray,
-                    ((CodeValue << 1) + 0), CodeLength);
-    CreateCodeArray(HuffRoot->OneChild, HuffCodeArray, HuffCodeLengthArray,
-                    ((CodeValue << 1) + 1), CodeLength);
-  }
-}
-
-static void  BuildHuffmanTree( HUFF_ENTRY **HuffRoot,
-                        ogg_uint32_t *HuffCodeArray,
-                        unsigned char *HuffCodeLengthArray,
-                        ogg_uint32_t HIndex,
-                        const ogg_uint32_t *FreqList ){
-
-  HUFF_ENTRY *entry_ptr;
-  HUFF_ENTRY *search_ptr;
-
-  /* First create a sorted linked list representing the frequencies of
-     each token. */
-  CreateHuffmanList( HuffRoot, HIndex, FreqList );
-
-  /* Now build the tree from the list. */
-
-  /* While there are at least two items left in the list. */
-  while ( HuffRoot[HIndex]->Next != NULL ){
-    /* Create the new node as the parent of the first two in the list. */
-    entry_ptr = (HUFF_ENTRY *)_ogg_calloc(1,sizeof(*entry_ptr));
-    entry_ptr->Value = -1;
-    entry_ptr->Frequency = HuffRoot[HIndex]->Frequency +
-      HuffRoot[HIndex]->Next->Frequency ;
-    entry_ptr->ZeroChild = HuffRoot[HIndex];
-    entry_ptr->OneChild = HuffRoot[HIndex]->Next;
-
-    /* If there are still more items in the list then insert the new
-       node into the list. */
-    if (entry_ptr->OneChild->Next != NULL ){
-      /* Set up the provisional 'new root' */
-      HuffRoot[HIndex] = entry_ptr->OneChild->Next;
-      HuffRoot[HIndex]->Previous = NULL;
-
-      /* Now scan through the remaining list to insert the new entry
-         at the appropriate point. */
-      if ( entry_ptr->Frequency <= HuffRoot[HIndex]->Frequency ){
-        entry_ptr->Next = HuffRoot[HIndex];
-        HuffRoot[HIndex]->Previous = entry_ptr;
-        entry_ptr->Previous = NULL;
-        HuffRoot[HIndex] = entry_ptr;
-      }else{
-        search_ptr = HuffRoot[HIndex];
-        while ( (search_ptr->Next != NULL) &&
-                (search_ptr->Frequency < entry_ptr->Frequency) ){
-          search_ptr = search_ptr->Next;
-        }
-
-        if ( search_ptr->Frequency < entry_ptr->Frequency ){
-          entry_ptr->Next = NULL;
-          entry_ptr->Previous = search_ptr;
-          search_ptr->Next = entry_ptr;
-        }else{
-          entry_ptr->Next = search_ptr;
-          entry_ptr->Previous = search_ptr->Previous;
-          search_ptr->Previous->Next = entry_ptr;
-          search_ptr->Previous = entry_ptr;
-        }
-      }
-    }else{
-      /* Build has finished. */
-      entry_ptr->Next = NULL;
-      entry_ptr->Previous = NULL;
-      HuffRoot[HIndex] = entry_ptr;
-    }
-
-    /* Delete the Next/Previous properties of the children (PROB NOT NEC). */
-    entry_ptr->ZeroChild->Next = NULL;
-    entry_ptr->ZeroChild->Previous = NULL;
-    entry_ptr->OneChild->Next = NULL;
-    entry_ptr->OneChild->Previous = NULL;
-
-  }
-
-  /* Now build a code array from the tree. */
-  CreateCodeArray( HuffRoot[HIndex], HuffCodeArray,
-                   HuffCodeLengthArray, 0, 0);
-}
-
-static void  DestroyHuffTree(HUFF_ENTRY *root_ptr){
-  if (root_ptr){
-    if ( root_ptr->ZeroChild )
-      DestroyHuffTree(root_ptr->ZeroChild);
-
-    if ( root_ptr->OneChild )
-      DestroyHuffTree(root_ptr->OneChild);
-
-    _ogg_free(root_ptr);
-  }
-}
-
-static void  ClearHuffmanTrees(HUFF_ENTRY *HuffRoot[NUM_HUFF_TABLES]){
-  int i;
-  for(i=0; i<NUM_HUFF_TABLES; i++) {
-    DestroyHuffTree(HuffRoot[i]);
-    HuffRoot[i] = NULL;
-  }
-}
-
-void ClearHuffmanSet( CP_INSTANCE *cpi ){
-  int i;
-
-  ClearHuffmanTrees(cpi->HuffRoot_VP3x);
-
-  for ( i = 0; i < NUM_HUFF_TABLES; i++ )
-    if (cpi->HuffCodeArray_VP3x[i])
-      _ogg_free (cpi->HuffCodeArray_VP3x[i]);
-
-  for ( i = 0; i < NUM_HUFF_TABLES; i++ )
-    if (cpi->HuffCodeLengthArray_VP3x[i])
-      _ogg_free (cpi->HuffCodeLengthArray_VP3x[i]);
-}
-
-void InitHuffmanSet( CP_INSTANCE *cpi ){
-  int i;
-
-  ClearHuffmanSet(cpi);
-
-  cpi->ExtraBitLengths_VP3x = ExtraBitLengths_VP31;
-
-  for ( i = 0; i < NUM_HUFF_TABLES; i++ ){
-    cpi->HuffCodeArray_VP3x[i] =
-      _ogg_calloc(MAX_ENTROPY_TOKENS,
-                  sizeof(*cpi->HuffCodeArray_VP3x[i]));
-    cpi->HuffCodeLengthArray_VP3x[i] =
-      _ogg_calloc(MAX_ENTROPY_TOKENS,
-                  sizeof(*cpi->HuffCodeLengthArray_VP3x[i]));
-    BuildHuffmanTree( cpi->HuffRoot_VP3x,
-                      cpi->HuffCodeArray_VP3x[i],
-                      cpi->HuffCodeLengthArray_VP3x[i],
-                      i, FrequencyCounts_VP3[i]);
-  }
-}
-
-static void WriteHuffTree(HUFF_ENTRY *HuffRoot, oggpack_buffer *opb) {
-  if (HuffRoot->Value >= 0) {
-    oggpackB_write(opb, 1, 1);
-    oggpackB_write(opb, HuffRoot->Value, 5);
-  } else {
-    oggpackB_write(opb, 0, 1);
-    WriteHuffTree(HuffRoot->ZeroChild, opb);
-    WriteHuffTree(HuffRoot->OneChild, opb);
-  }
-}
-
-void WriteHuffmanTrees(HUFF_ENTRY *HuffRoot[NUM_HUFF_TABLES],
-                       oggpack_buffer *opb) {
-  int i;
-  for(i=0; i<NUM_HUFF_TABLES; i++) {
-    WriteHuffTree(HuffRoot[i], opb);
-  }
-}
-

Deleted: branches/theora-thusnelda/lib/enc/encoder_idct.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encoder_idct.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/encoder_idct.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -1,569 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function: C implementation of the Theora iDCT
-  last mod: $Id$
-
- ********************************************************************/
-
-#include <string.h>
-#include "codec_internal.h"
-
-#include "quant_lookup.h"
-
-#define IdctAdjustBeforeShift 8
-/* cos(n*pi/16) or sin(8-n)*pi/16) */
-#define xC1S7 64277
-#define xC2S6 60547
-#define xC3S5 54491
-#define xC4S4 46341
-#define xC5S3 36410
-#define xC6S2 25080
-#define xC7S1 12785
-
-/* compute the 16 bit signed 1D inverse DCT - spec version */
-/*
-static void idct_short__c ( ogg_int16_t * InputData, ogg_int16_t * OutputData ) {
-  ogg_int32_t t[8], r;
-  ogg_int16_t *y = InputData;
-  ogg_int16_t *x = OutputData;
-
-  t[0] = y[0] + y[4];
-  t[0] &= 0xffff;
-  t[0] = (xC4S4 * t[0]) >> 16;
-
-  t[1] = y[0] - y[4];
-  t[1] &= 0xffff;
-  t[1] = (xC4S4 * t[1]) >> 16;
-
-  t[2] = ((xC6S2 * y[2]) >> 16) - ((xC2S6 * y[6]) >> 16);
-  t[3] = ((xC2S6 * y[2]) >> 16) + ((xC6S2 * y[6]) >> 16);
-  t[4] = ((xC7S1 * y[1]) >> 16) - ((xC1S7 * y[7]) >> 16);
-  t[5] = ((xC3S5 * y[5]) >> 16) - ((xC5S3 * y[3]) >> 16);
-  t[6] = ((xC5S3 * y[5]) >> 16) + ((xC3S5 * y[3]) >> 16);
-  t[7] = ((xC1S7 * y[1]) >> 16) + ((xC7S1 * y[7]) >> 16);
-
-  r = t[4] + t[5];
-  t[5] = t[4] - t[5];
-  t[5] &= 0xffff;
-  t[5] = (xC4S4 * (-t[5])) >> 16;
-  t[4] = r;
-
-  r = t[7] + t[6];
-  t[6] = t[7] - t[6];
-  t[6] &= 0xffff;
-  t[6] = (xC4S4 * t[6]) >> 16;
-  t[7] = r;
-
-  r = t[0] + t[3];
-  t[3] = t[0] - t[3];
-  t[0] = r;
-
-  r = t[1] + t[2];
-  t[2] = t[1] - t[2];
-  t[1] = r;
-
-  r = t[6] + t[5];
-  t[5] = t[6] - t[5];
-  t[6] = r;
-
-  r = t[0] + t[7];
-  r &= 0xffff;
-  x[0] = r;
-
-  r = t[1] + t[6];
-  r &= 0xffff;
-  x[1] = r;
-
-  r = t[2] + t[5];
-  r &= 0xffff;
-  x[2] = r;
-
-  r = t[3] + t[4];
-  r &= 0xffff;
-  x[3] = r;
-
-  r = t[3] - t[4];
-  r &= 0xffff;
-  x[4] = r;
-
-  r = t[2] - t[5];
-  r &= 0xffff;
-  x[5] = r;
-
-  r = t[1] - t[6];
-  r &= 0xffff;
-  x[6] = r;
-
-  r = t[0] - t[7];
-  r &= 0xffff;
-  x[7] = r;
-
-}
-*/
-
-static void dequant_slow( const ogg_int16_t * dequant_coeffs,
-			  const ogg_int16_t * quantized_list,
-			  ogg_int32_t * DCT_block) {
-  int i;
-  for(i=0;i<64;i++)
-    DCT_block[dezigzag_index[i]] = quantized_list[i] * dequant_coeffs[i];
-}
-
-
-
-void IDctSlow__c(  const ogg_int16_t * InputData,
-		   const ogg_int16_t *QuantMatrix,
-		   ogg_int16_t * OutputData ) {
-  ogg_int32_t IntermediateData[64];
-  ogg_int32_t * ip = IntermediateData;
-  ogg_int16_t * op = OutputData;
-
-  ogg_int32_t _A, _B, _C, _D, _Ad, _Bd, _Cd, _Dd, _E, _F, _G, _H;
-  ogg_int32_t _Ed, _Gd, _Add, _Bdd, _Fd, _Hd;
-  ogg_int32_t t1, t2;
-
-  int loop;
-
-  dequant_slow( QuantMatrix, InputData, IntermediateData);
-
-  /* Inverse DCT on the rows now */
-  for ( loop = 0; loop < 8; loop++){
-    /* Check for non-zero values */
-    if ( ip[0] | ip[1] | ip[2] | ip[3] | ip[4] | ip[5] | ip[6] | ip[7] ) {
-      t1 = (xC1S7 * ip[1]);
-      t2 = (xC7S1 * ip[7]);
-      t1 >>= 16;
-      t2 >>= 16;
-      _A = t1 + t2;
-
-      t1 = (xC7S1 * ip[1]);
-      t2 = (xC1S7 * ip[7]);
-      t1 >>= 16;
-      t2 >>= 16;
-      _B = t1 - t2;
-
-      t1 = (xC3S5 * ip[3]);
-      t2 = (xC5S3 * ip[5]);
-      t1 >>= 16;
-      t2 >>= 16;
-      _C = t1 + t2;
-
-      t1 = (xC3S5 * ip[5]);
-      t2 = (xC5S3 * ip[3]);
-      t1 >>= 16;
-      t2 >>= 16;
-      _D = t1 - t2;
-
-      t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
-      t1 >>= 16;
-      _Ad = t1;
-
-      t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
-      t1 >>= 16;
-      _Bd = t1;
-
-
-      _Cd = _A + _C;
-      _Dd = _B + _D;
-
-      t1 = (xC4S4 * (ogg_int16_t)(ip[0] + ip[4]));
-      t1 >>= 16;
-      _E = t1;
-
-      t1 = (xC4S4 * (ogg_int16_t)(ip[0] - ip[4]));
-      t1 >>= 16;
-      _F = t1;
-
-      t1 = (xC2S6 * ip[2]);
-      t2 = (xC6S2 * ip[6]);
-      t1 >>= 16;
-      t2 >>= 16;
-      _G = t1 + t2;
-
-      t1 = (xC6S2 * ip[2]);
-      t2 = (xC2S6 * ip[6]);
-      t1 >>= 16;
-      t2 >>= 16;
-      _H = t1 - t2;
-
-
-      _Ed = _E - _G;
-      _Gd = _E + _G;
-
-      _Add = _F + _Ad;
-      _Bdd = _Bd - _H;
-
-      _Fd = _F - _Ad;
-      _Hd = _Bd + _H;
-
-      /* Final sequence of operations over-write original inputs. */
-      ip[0] = (ogg_int16_t)((_Gd + _Cd )   >> 0);
-      ip[7] = (ogg_int16_t)((_Gd - _Cd )   >> 0);
-
-      ip[1] = (ogg_int16_t)((_Add + _Hd )  >> 0);
-      ip[2] = (ogg_int16_t)((_Add - _Hd )  >> 0);
-
-      ip[3] = (ogg_int16_t)((_Ed + _Dd )   >> 0);
-      ip[4] = (ogg_int16_t)((_Ed - _Dd )   >> 0);
-
-      ip[5] = (ogg_int16_t)((_Fd + _Bdd )  >> 0);
-      ip[6] = (ogg_int16_t)((_Fd - _Bdd )  >> 0);
-
-    }
-
-    ip += 8;                    /* next row */
-  }
-
-  ip = IntermediateData;
-
-  for ( loop = 0; loop < 8; loop++){
-    /* Check for non-zero values (bitwise or faster than ||) */
-    if ( ip[0 * 8] | ip[1 * 8] | ip[2 * 8] | ip[3 * 8] |
-         ip[4 * 8] | ip[5 * 8] | ip[6 * 8] | ip[7 * 8] ) {
-
-      t1 = (xC1S7 * ip[1*8]);
-      t2 = (xC7S1 * ip[7*8]);
-      t1 >>= 16;
-      t2 >>= 16;
-      _A = t1 + t2;
-
-      t1 = (xC7S1 * ip[1*8]);
-      t2 = (xC1S7 * ip[7*8]);
-      t1 >>= 16;
-      t2 >>= 16;
-      _B = t1 - t2;
-
-      t1 = (xC3S5 * ip[3*8]);
-      t2 = (xC5S3 * ip[5*8]);
-      t1 >>= 16;
-      t2 >>= 16;
-      _C = t1 + t2;
-
-      t1 = (xC3S5 * ip[5*8]);
-      t2 = (xC5S3 * ip[3*8]);
-      t1 >>= 16;
-      t2 >>= 16;
-      _D = t1 - t2;
-
-      t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
-      t1 >>= 16;
-      _Ad = t1;
-
-      t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
-      t1 >>= 16;
-      _Bd = t1;
-
-
-      _Cd = _A + _C;
-      _Dd = _B + _D;
-
-      t1 = (xC4S4 * (ogg_int16_t)(ip[0*8] + ip[4*8]));
-      t1 >>= 16;
-      _E = t1;
-
-      t1 = (xC4S4 * (ogg_int16_t)(ip[0*8] - ip[4*8]));
-      t1 >>= 16;
-      _F = t1;
-
-      t1 = (xC2S6 * ip[2*8]);
-      t2 = (xC6S2 * ip[6*8]);
-      t1 >>= 16;
-      t2 >>= 16;
-      _G = t1 + t2;
-
-      t1 = (xC6S2 * ip[2*8]);
-      t2 = (xC2S6 * ip[6*8]);
-      t1 >>= 16;
-      t2 >>= 16;
-      _H = t1 - t2;
-
-      _Ed = _E - _G;
-      _Gd = _E + _G;
-
-      _Add = _F + _Ad;
-      _Bdd = _Bd - _H;
-
-      _Fd = _F - _Ad;
-      _Hd = _Bd + _H;
-
-      _Gd += IdctAdjustBeforeShift;
-      _Add += IdctAdjustBeforeShift;
-      _Ed += IdctAdjustBeforeShift;
-      _Fd += IdctAdjustBeforeShift;
-
-      /* Final sequence of operations over-write original inputs. */
-      op[0*8] = (ogg_int16_t)((_Gd + _Cd )   >> 4);
-      op[7*8] = (ogg_int16_t)((_Gd - _Cd )   >> 4);
-
-      op[1*8] = (ogg_int16_t)((_Add + _Hd )  >> 4);
-      op[2*8] = (ogg_int16_t)((_Add - _Hd )  >> 4);
-
-      op[3*8] = (ogg_int16_t)((_Ed + _Dd )   >> 4);
-      op[4*8] = (ogg_int16_t)((_Ed - _Dd )   >> 4);
-
-      op[5*8] = (ogg_int16_t)((_Fd + _Bdd )  >> 4);
-      op[6*8] = (ogg_int16_t)((_Fd - _Bdd )  >> 4);
-    }else{
-      op[0*8] = 0;
-      op[7*8] = 0;
-      op[1*8] = 0;
-      op[2*8] = 0;
-      op[3*8] = 0;
-      op[4*8] = 0;
-      op[5*8] = 0;
-      op[6*8] = 0;
-    }
-
-    ip++;                       /* next column */
-    op++;
-  }
-}
-
-/************************
-  x  x  x  x  0  0  0  0
-  x  x  x  0  0  0  0  0
-  x  x  0  0  0  0  0  0
-  x  0  0  0  0  0  0  0
-  0  0  0  0  0  0  0  0
-  0  0  0  0  0  0  0  0
-  0  0  0  0  0  0  0  0
-  0  0  0  0  0  0  0  0
-*************************/
-
-static void dequant_slow10( const ogg_int16_t * dequant_coeffs,
-			    const ogg_int16_t * quantized_list,
-			    ogg_int32_t * DCT_block){
-  int i;
-  memset(DCT_block,0, 128);
-  for(i=0;i<10;i++)
-    DCT_block[dezigzag_index[i]] = quantized_list[i] * dequant_coeffs[i];
-
-}
-
-void IDct10__c( const ogg_int16_t * InputData,
-		const ogg_int16_t *QuantMatrix,
-		ogg_int16_t * OutputData ){
-  ogg_int32_t IntermediateData[64];
-  ogg_int32_t * ip = IntermediateData;
-  ogg_int16_t * op = OutputData;
-
-  ogg_int32_t _A, _B, _C, _D, _Ad, _Bd, _Cd, _Dd, _E, _F, _G, _H;
-  ogg_int32_t _Ed, _Gd, _Add, _Bdd, _Fd, _Hd;
-  ogg_int32_t t1, t2;
-
-  int loop;
-
-  dequant_slow10( QuantMatrix, InputData, IntermediateData);
-
-  /* Inverse DCT on the rows now */
-  for ( loop = 0; loop < 4; loop++){
-    /* Check for non-zero values */
-    if ( ip[0] | ip[1] | ip[2] | ip[3] ){
-      t1 = (xC1S7 * ip[1]);
-      t1 >>= 16;
-      _A = t1;
-
-      t1 = (xC7S1 * ip[1]);
-      t1 >>= 16;
-      _B = t1 ;
-
-      t1 = (xC3S5 * ip[3]);
-      t1 >>= 16;
-      _C = t1;
-
-      t2 = (xC5S3 * ip[3]);
-      t2 >>= 16;
-      _D = -t2;
-
-
-      t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
-      t1 >>= 16;
-      _Ad = t1;
-
-      t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
-      t1 >>= 16;
-      _Bd = t1;
-
-
-      _Cd = _A + _C;
-      _Dd = _B + _D;
-
-      t1 = (xC4S4 * ip[0] );
-      t1 >>= 16;
-      _E = t1;
-
-      _F = t1;
-
-      t1 = (xC2S6 * ip[2]);
-      t1 >>= 16;
-      _G = t1;
-
-      t1 = (xC6S2 * ip[2]);
-      t1 >>= 16;
-      _H = t1 ;
-
-
-      _Ed = _E - _G;
-      _Gd = _E + _G;
-
-      _Add = _F + _Ad;
-      _Bdd = _Bd - _H;
-
-      _Fd = _F - _Ad;
-      _Hd = _Bd + _H;
-
-      /* Final sequence of operations over-write original inputs. */
-      ip[0] = (ogg_int16_t)((_Gd + _Cd )   >> 0);
-      ip[7] = (ogg_int16_t)((_Gd - _Cd )   >> 0);
-
-      ip[1] = (ogg_int16_t)((_Add + _Hd )  >> 0);
-      ip[2] = (ogg_int16_t)((_Add - _Hd )  >> 0);
-
-      ip[3] = (ogg_int16_t)((_Ed + _Dd )   >> 0);
-      ip[4] = (ogg_int16_t)((_Ed - _Dd )   >> 0);
-
-      ip[5] = (ogg_int16_t)((_Fd + _Bdd )  >> 0);
-      ip[6] = (ogg_int16_t)((_Fd - _Bdd )  >> 0);
-
-    }
-
-    ip += 8;                    /* next row */
-  }
-
-  ip = IntermediateData;
-
-  for ( loop = 0; loop < 8; loop++) {
-    /* Check for non-zero values (bitwise or faster than ||) */
-    if ( ip[0 * 8] | ip[1 * 8] | ip[2 * 8] | ip[3 * 8] ) {
-
-      t1 = (xC1S7 * ip[1*8]);
-      t1 >>= 16;
-      _A = t1 ;
-
-      t1 = (xC7S1 * ip[1*8]);
-      t1 >>= 16;
-      _B = t1 ;
-
-      t1 = (xC3S5 * ip[3*8]);
-      t1 >>= 16;
-      _C = t1 ;
-
-      t2 = (xC5S3 * ip[3*8]);
-      t2 >>= 16;
-      _D = - t2;
-
-
-      t1 = (xC4S4 * (ogg_int16_t)(_A - _C));
-      t1 >>= 16;
-      _Ad = t1;
-
-      t1 = (xC4S4 * (ogg_int16_t)(_B - _D));
-      t1 >>= 16;
-      _Bd = t1;
-
-
-      _Cd = _A + _C;
-      _Dd = _B + _D;
-
-      t1 = (xC4S4 * ip[0*8]);
-      t1 >>= 16;
-      _E = t1;
-      _F = t1;
-
-      t1 = (xC2S6 * ip[2*8]);
-      t1 >>= 16;
-      _G = t1;
-
-      t1 = (xC6S2 * ip[2*8]);
-      t1 >>= 16;
-      _H = t1;
-
-
-      _Ed = _E - _G;
-      _Gd = _E + _G;
-
-      _Add = _F + _Ad;
-      _Bdd = _Bd - _H;
-
-      _Fd = _F - _Ad;
-      _Hd = _Bd + _H;
-
-      _Gd += IdctAdjustBeforeShift;
-      _Add += IdctAdjustBeforeShift;
-      _Ed += IdctAdjustBeforeShift;
-      _Fd += IdctAdjustBeforeShift;
-
-      /* Final sequence of operations over-write original inputs. */
-      op[0*8] = (ogg_int16_t)((_Gd + _Cd )   >> 4);
-      op[7*8] = (ogg_int16_t)((_Gd - _Cd )   >> 4);
-
-      op[1*8] = (ogg_int16_t)((_Add + _Hd )  >> 4);
-      op[2*8] = (ogg_int16_t)((_Add - _Hd )  >> 4);
-
-      op[3*8] = (ogg_int16_t)((_Ed + _Dd )   >> 4);
-      op[4*8] = (ogg_int16_t)((_Ed - _Dd )   >> 4);
-
-      op[5*8] = (ogg_int16_t)((_Fd + _Bdd )  >> 4);
-      op[6*8] = (ogg_int16_t)((_Fd - _Bdd )  >> 4);
-    }else{
-      op[0*8] = 0;
-      op[7*8] = 0;
-      op[1*8] = 0;
-      op[2*8] = 0;
-      op[3*8] = 0;
-      op[4*8] = 0;
-      op[5*8] = 0;
-      op[6*8] = 0;
-    }
-
-    ip++;                       /* next column */
-    op++;
-  }
-}
-
-/***************************
-  x   0   0  0  0  0  0  0
-  0   0   0  0  0  0  0  0
-  0   0   0  0  0  0  0  0
-  0   0   0  0  0  0  0  0
-  0   0   0  0  0  0  0  0
-  0   0   0  0  0  0  0  0
-  0   0   0  0  0  0  0  0
-  0   0   0  0  0  0  0  0
-**************************/
-
-void IDct1( const ogg_int16_t * InputData,
-            const ogg_int16_t *QuantMatrix,
-            ogg_int16_t * OutputData ){
-  int loop;
-
-  ogg_int16_t  OutD;
-  
-  OutD=(ogg_int16_t) ((ogg_int32_t)(InputData[0]*QuantMatrix[0]+15)>>5);
-  
-  for(loop=0;loop<64;loop++)
-    OutputData[loop]=OutD;
-  
-}
-
-void dsp_idct_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)
-{
-  funcs->IDctSlow = IDctSlow__c;
-  funcs->IDct10 = IDct10__c;
-  funcs->IDct3 = IDct10__c;
-#if defined(USE_ASM)
-  if (cpu_flags & OC_CPU_X86_MMX) {
-    dsp_mmx_idct_init(funcs);
-  }
-#endif
-}

Modified: branches/theora-thusnelda/lib/enc/encoder_quant.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encoder_quant.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/encoder_quant.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -263,7 +263,7 @@
     /*Now compute an "average" quantizer for each qi level.
       We do one for INTER and one for INTRA, since their behavior is very
        different, but average across chroma channels.
-      The basic approach is to compute a geometric average of the squared
+      The basic approach is to compute a harmonic average of the squared
        quantizer, weighted by the expected squared magnitude of the DCT
        coefficients.
       Under the (not quite true) assumption that DCT coefficients are

Modified: branches/theora-thusnelda/lib/enc/encoder_toplevel.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encoder_toplevel.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/encoder_toplevel.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -23,10 +23,12 @@
 #include <string.h>
 #include "toplevel_lookup.h"
 #include "../internal.h"
-#include "dsp.h"
 #include "codec_internal.h"
 #include "mathops.h"
 #include "../dec/ocintrin.h"
+#if defined(OC_X86_ASM)
+# include "x86/x86enc.h"
+#endif
 
 
 
@@ -174,15 +176,12 @@
   nframes[1]=buf_delay-nframes[0];
   rate_total=cpi->rc.fullness-cpi->rc.target
    +buf_delay*cpi->rc.bits_per_frame;
-  /*Downgrade the frame rate to correspond to the current dup count.
+  /*Downgrade the delta frame rate to correspond to the current dup count.
     This will way over-estimate the bits to use for an occasional dup (as
      opposed to a consistent dup count, as used with VFR input), but the
      hysteresis on the quantizer below will keep us from going out of control,
      and we _do_ have more bits to spend after all.*/
-  if(cpi->dup_count>0){
-    nframes[0]=(nframes[0]+cpi->dup_count)/(cpi->dup_count+1);
-    nframes[1]=(nframes[1]+cpi->dup_count)/(cpi->dup_count+1);
-  }
+  if(cpi->dup_count>0)nframes[1]=(nframes[1]+cpi->dup_count)/(cpi->dup_count+1);
   /*If there aren't enough bits to achieve our desired fullness level, use the
      minimum quality permitted.*/
   if(rate_total<=0)log_qtarget=OC_QUANT_MAX_LOG;
@@ -343,9 +342,12 @@
   th->internal_encode=cpi=_ogg_calloc(1,sizeof(*cpi));
   theora_encode_dispatch_init(cpi);
   oc_mode_scheme_chooser_init(&cpi->chooser);
+#if defined(OC_X86_ASM)
+  oc_enc_vtable_init_x86(cpi);
+#else
+  oc_enc_vtable_init_c(cpi);
+#endif
 
-  dsp_static_init (&cpi->dsp);
-
   c->version_major=TH_VERSION_MAJOR;
   c->version_minor=TH_VERSION_MINOR;
   c->version_subminor=TH_VERSION_SUB;
@@ -393,7 +395,7 @@
   /* We always start at frame 1 */
   cpi->CurrentFrame = 1;
 
-  InitHuffmanSet(cpi);
+  memcpy(cpi->huff_codes,TH_VP31_HUFF_CODES,sizeof(cpi->huff_codes));
 
   /* This makes sure encoder version specific tables are initialised */
   memcpy(&cpi->quant_info, &TH_VP31_QUANT_INFO, sizeof(th_quant_info));
@@ -668,7 +670,7 @@
   _tp_writebuffer(cpi->oggbuffer,"theora",6);
 
   oc_quant_params_pack(cpi->oggbuffer,&cpi->quant_info);
-  WriteHuffmanTrees(cpi->HuffRoot_VP3x,cpi->oggbuffer);
+  oc_huff_codes_pack(cpi->oggbuffer,(const th_huff_table *)cpi->huff_codes);
 
   op->packet=oggpackB_get_buffer(cpi->oggbuffer);
   op->bytes=oggpackB_bytes(cpi->oggbuffer);
@@ -691,7 +693,6 @@
   cpi=(CP_INSTANCE *)th->internal_encode;
   if(cpi){
 
-    ClearHuffmanSet(cpi);
     ClearFrameInfo(cpi);
 
     oggpackB_writeclear(cpi->oggbuffer);
@@ -763,6 +764,21 @@
       InitQTables(cpi);
 
       return 0;
+    case TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE:{
+      ogg_uint32_t keyframe_frequency_force;
+      if(buf==NULL)return TH_EFAULT;
+      if(buf_sz!=sizeof(keyframe_frequency_force))return TH_EINVAL;
+      keyframe_frequency_force=*(ogg_uint32_t *)buf;
+      if(cpi->HeadersWritten){
+        /*It's still early enough to enlarge keyframe_granule_shift.*/
+        cpi->keyframe_granule_shift=OC_CLAMPI(cpi->keyframe_granule_shift,
+         OC_ILOG_32(keyframe_frequency_force-1),31);
+      }
+      cpi->info.keyframe_frequency_force=OC_MINI(keyframe_frequency_force,
+       (ogg_uint32_t)1U<<cpi->keyframe_granule_shift);
+      *(ogg_uint32_t *)buf=cpi->info.keyframe_frequency_force;
+      return 0;
+    }
     case TH_ENCCTL_SET_VP3_COMPATIBLE:
       if(cpi->HeadersWritten)
         return TH_EINVAL;

Copied: branches/theora-thusnelda/lib/enc/huffenc.c (from rev 15592, trunk/theora-exp/lib/huffenc.c)
===================================================================
--- branches/theora-thusnelda/lib/enc/huffenc.c	                        (rev 0)
+++ branches/theora-thusnelda/lib/enc/huffenc.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -0,0 +1,910 @@
+#include <stdlib.h>
+#include <string.h>
+#include <ogg/ogg.h>
+#include "huffenc.h"
+
+
+
+/*The default Huffman codes used for VP3.1.*/
+const th_huff_code TH_VP31_HUFF_CODES[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]={
+  {
+    {0x002D, 6},{0x0026, 7},{0x0166, 9},{0x004E, 8},
+    {0x02CE,10},{0x059E,11},{0x027D,11},{0x0008, 5},
+    {0x04F9,12},{0x000F, 4},{0x000E, 4},{0x001B, 5},
+    {0x0006, 4},{0x0008, 4},{0x0005, 4},{0x001A, 5},
+    {0x0015, 5},{0x0007, 4},{0x000C, 4},{0x0001, 3},
+    {0x0000, 3},{0x0009, 4},{0x0017, 5},{0x0029, 6},
+    {0x0028, 6},{0x00B2, 8},{0x04F8,12},{0x059F,11},
+    {0x009E, 9},{0x013F,10},{0x0012, 6},{0x0058, 7}
+  },
+  {
+    {0x0010, 5},{0x0047, 7},{0x01FF, 9},{0x008C, 8},
+    {0x03FC,10},{0x046A,11},{0x0469,11},{0x0022, 6},
+    {0x11A1,13},{0x000E, 4},{0x000D, 4},{0x0004, 4},
+    {0x0005, 4},{0x0009, 4},{0x0006, 4},{0x001E, 5},
+    {0x0016, 5},{0x0007, 4},{0x000C, 4},{0x0001, 3},
+    {0x0000, 3},{0x000A, 4},{0x0017, 5},{0x007D, 7},
+    {0x007E, 7},{0x011B, 9},{0x08D1,12},{0x03FD,10},
+    {0x046B,11},{0x11A0,13},{0x007C, 7},{0x00FE, 8}
+  },
+  {
+    {0x0016, 5},{0x0020, 6},{0x0086, 8},{0x0087, 8},
+    {0x0367,10},{0x06CC,11},{0x06CB,11},{0x006E, 7},
+    {0x366D,14},{0x000F, 4},{0x000E, 4},{0x0004, 4},
+    {0x0005, 4},{0x000A, 4},{0x0006, 4},{0x001A, 5},
+    {0x0011, 5},{0x0007, 4},{0x000C, 4},{0x0001, 3},
+    {0x0000, 3},{0x0009, 4},{0x0017, 5},{0x006F, 7},
+    {0x006D, 7},{0x0364,10},{0x0D9A,12},{0x06CA,11},
+    {0x1B37,13},{0x366C,14},{0x0042, 7},{0x00D8, 8}
+  },
+  {
+    {0x0000, 4},{0x002D, 6},{0x00F7, 8},{0x0058, 7},
+    {0x0167, 9},{0x02CB,10},{0x02CA,10},{0x000E, 6},
+    {0x1661,13},{0x0003, 3},{0x0002, 3},{0x0008, 4},
+    {0x0009, 4},{0x000D, 4},{0x0002, 4},{0x001F, 5},
+    {0x0017, 5},{0x0001, 4},{0x000C, 4},{0x000E, 4},
+    {0x000A, 4},{0x0006, 5},{0x0078, 7},{0x000F, 6},
+    {0x007A, 7},{0x0164, 9},{0x0599,11},{0x02CD,10},
+    {0x0B31,12},{0x1660,13},{0x0079, 7},{0x00F6, 8}
+  },
+  {
+    {0x0003, 4},{0x003C, 6},{0x000F, 7},{0x007A, 7},
+    {0x001D, 8},{0x0020, 9},{0x0072,10},{0x0006, 6},
+    {0x0399,13},{0x0004, 3},{0x0005, 3},{0x0005, 4},
+    {0x0006, 4},{0x000E, 4},{0x0004, 4},{0x0000, 4},
+    {0x0019, 5},{0x0002, 4},{0x000D, 4},{0x0007, 4},
+    {0x001F, 5},{0x0030, 6},{0x0011, 8},{0x0031, 6},
+    {0x0005, 6},{0x0021, 9},{0x00E7,11},{0x0038, 9},
+    {0x01CD,12},{0x0398,13},{0x007B, 7},{0x0009, 7}
+  },
+  {
+    {0x0009, 4},{0x0002, 5},{0x0074, 7},{0x0007, 6},
+    {0x00EC, 8},{0x00D1, 9},{0x01A6,10},{0x0006, 6},
+    {0x0D21,13},{0x0005, 3},{0x0006, 3},{0x0008, 4},
+    {0x0007, 4},{0x000F, 4},{0x0004, 4},{0x0000, 4},
+    {0x001C, 5},{0x0002, 4},{0x0005, 4},{0x0003, 4},
+    {0x000C, 5},{0x0035, 7},{0x01A7,10},{0x001B, 6},
+    {0x0077, 7},{0x01A5,10},{0x0349,11},{0x00D0, 9},
+    {0x0691,12},{0x0D20,13},{0x0075, 7},{0x00ED, 8}
+  },
+  {
+    {0x000A, 4},{0x000C, 5},{0x0012, 6},{0x001B, 6},
+    {0x00B7, 8},{0x016C, 9},{0x0099, 9},{0x005A, 7},
+    {0x16D8,13},{0x0007, 3},{0x0006, 3},{0x0009, 4},
+    {0x0008, 4},{0x0000, 3},{0x0005, 4},{0x0017, 5},
+    {0x000E, 5},{0x0002, 4},{0x0003, 4},{0x000F, 5},
+    {0x001A, 6},{0x004D, 8},{0x2DB3,14},{0x002C, 6},
+    {0x0011, 6},{0x02DA,10},{0x05B7,11},{0x0098, 9},
+    {0x0B6D,12},{0x2DB2,14},{0x0010, 6},{0x0027, 7}
+  },
+  {
+    {0x000D, 4},{0x000F, 5},{0x001D, 6},{0x0008, 5},
+    {0x0051, 7},{0x0056, 8},{0x00AF, 9},{0x002A, 7},
+    {0x148A,13},{0x0007, 3},{0x0000, 2},{0x0008, 4},
+    {0x0009, 4},{0x000C, 4},{0x0006, 4},{0x0017, 5},
+    {0x000B, 5},{0x0016, 5},{0x0015, 5},{0x0009, 5},
+    {0x0050, 7},{0x00AE, 9},{0x2917,14},{0x001C, 6},
+    {0x0014, 6},{0x0290,10},{0x0523,11},{0x0149, 9},
+    {0x0A44,12},{0x2916,14},{0x0053, 7},{0x00A5, 8}
+  },
+  {
+    {0x0001, 4},{0x001D, 6},{0x00F5, 8},{0x00F4, 8},
+    {0x024D,10},{0x0499,11},{0x0498,11},{0x0001, 5},
+    {0x0021, 6},{0x0006, 3},{0x0005, 3},{0x0006, 4},
+    {0x0005, 4},{0x0002, 4},{0x0007, 5},{0x0025, 6},
+    {0x007B, 7},{0x001C, 6},{0x0020, 6},{0x000D, 6},
+    {0x0048, 7},{0x0092, 8},{0x0127, 9},{0x000E, 4},
+    {0x0004, 4},{0x0011, 5},{0x000C, 6},{0x003C, 6},
+    {0x000F, 5},{0x0000, 5},{0x001F, 5},{0x0013, 5}
+  },
+  {
+    {0x0005, 4},{0x003C, 6},{0x0040, 7},{0x000D, 7},
+    {0x0031, 9},{0x0061,10},{0x0060,10},{0x0002, 5},
+    {0x00F5, 8},{0x0006, 3},{0x0005, 3},{0x0007, 4},
+    {0x0006, 4},{0x0002, 4},{0x0009, 5},{0x0025, 6},
+    {0x0007, 6},{0x0021, 6},{0x0024, 6},{0x0010, 6},
+    {0x0041, 7},{0x00F4, 8},{0x0019, 8},{0x000E, 4},
+    {0x0003, 4},{0x0011, 5},{0x0011, 6},{0x003F, 6},
+    {0x003E, 6},{0x007B, 7},{0x0000, 4},{0x0013, 5}
+  },
+  {
+    {0x000A, 4},{0x0007, 5},{0x0001, 6},{0x0009, 6},
+    {0x0131, 9},{0x0261,10},{0x0260,10},{0x0015, 6},
+    {0x0001, 7},{0x0007, 3},{0x0006, 3},{0x0008, 4},
+    {0x0007, 4},{0x0006, 4},{0x0012, 5},{0x002F, 6},
+    {0x0014, 6},{0x0027, 6},{0x002D, 6},{0x0016, 6},
+    {0x004D, 7},{0x0099, 8},{0x0000, 7},{0x0004, 4},
+    {0x0001, 4},{0x0005, 5},{0x0017, 6},{0x002E, 6},
+    {0x002C, 6},{0x0008, 6},{0x0006, 5},{0x0001, 5}
+  },
+  {
+    {0x0000, 3},{0x000E, 5},{0x0017, 6},{0x002A, 6},
+    {0x0010, 7},{0x00F9,10},{0x00F8,10},{0x001E, 7},
+    {0x003F, 8},{0x0007, 3},{0x0006, 3},{0x0009, 4},
+    {0x0008, 4},{0x0006, 4},{0x000F, 5},{0x0005, 5},
+    {0x0016, 6},{0x0029, 6},{0x002B, 6},{0x0015, 6},
+    {0x0050, 7},{0x0011, 7},{0x007D, 9},{0x0004, 4},
+    {0x0017, 5},{0x0006, 5},{0x0014, 6},{0x002C, 6},
+    {0x002D, 6},{0x000E, 6},{0x0009, 6},{0x0051, 7}
+  },
+  {
+    {0x0002, 3},{0x0018, 5},{0x002F, 6},{0x000D, 5},
+    {0x0053, 7},{0x0295,10},{0x0294,10},{0x00A4, 8},
+    {0x007C, 8},{0x0000, 2},{0x0007, 3},{0x0009, 4},
+    {0x0008, 4},{0x001B, 5},{0x000C, 5},{0x0028, 6},
+    {0x006A, 7},{0x001E, 6},{0x001D, 6},{0x0069, 7},
+    {0x00D7, 8},{0x007D, 8},{0x014B, 9},{0x0019, 5},
+    {0x0016, 5},{0x002E, 6},{0x001C, 6},{0x002B, 6},
+    {0x002A, 6},{0x0068, 7},{0x003F, 7},{0x00D6, 8}
+  },
+  {
+    {0x0002, 3},{0x001B, 5},{0x000C, 5},{0x0018, 5},
+    {0x0029, 6},{0x007F, 8},{0x02F0,10},{0x0198, 9},
+    {0x0179, 9},{0x0000, 2},{0x0007, 3},{0x0009, 4},
+    {0x0008, 4},{0x001A, 5},{0x000D, 5},{0x002A, 6},
+    {0x0064, 7},{0x001E, 6},{0x0067, 7},{0x005F, 7},
+    {0x00CD, 8},{0x007E, 8},{0x02F1,10},{0x0016, 5},
+    {0x000E, 5},{0x002E, 6},{0x0065, 7},{0x002B, 6},
+    {0x0028, 6},{0x003E, 7},{0x00BD, 8},{0x0199, 9}
+  },
+  {
+    {0x0002, 3},{0x0007, 4},{0x0016, 5},{0x0006, 4},
+    {0x0036, 6},{0x005C, 7},{0x015D, 9},{0x015C, 9},
+    {0x02BF,10},{0x0000, 2},{0x0007, 3},{0x0009, 4},
+    {0x0008, 4},{0x0018, 5},{0x0034, 6},{0x002A, 6},
+    {0x005E, 7},{0x006A, 7},{0x0064, 7},{0x005D, 7},
+    {0x00CB, 8},{0x00AD, 8},{0x02BE,10},{0x0014, 5},
+    {0x0033, 6},{0x006E, 7},{0x005F, 7},{0x006F, 7},
+    {0x006B, 7},{0x00CA, 8},{0x00AC, 8},{0x015E, 9}
+  },
+  {
+    {0x000F, 4},{0x001D, 5},{0x0018, 5},{0x000B, 4},
+    {0x0019, 5},{0x0029, 6},{0x00D6, 8},{0x0551,11},
+    {0x0AA1,12},{0x0001, 2},{0x0000, 2},{0x0009, 4},
+    {0x0008, 4},{0x001B, 5},{0x0038, 6},{0x0028, 6},
+    {0x0057, 7},{0x006A, 7},{0x0068, 7},{0x0056, 7},
+    {0x00E5, 8},{0x0155, 9},{0x0AA0,12},{0x0073, 7},
+    {0x0069, 7},{0x00D7, 8},{0x00AB, 8},{0x00E4, 8},
+    {0x00A9, 8},{0x0151, 9},{0x0150, 9},{0x02A9,10}
+  },
+  {
+    {0x0008, 5},{0x0025, 7},{0x017A, 9},{0x02F7,10},
+    {0x0BDB,12},{0x17B4,13},{0x2F6B,14},{0x001D, 5},
+    {0x2F6A,14},{0x0008, 4},{0x0007, 4},{0x0001, 4},
+    {0x0002, 4},{0x000A, 4},{0x0006, 4},{0x0000, 4},
+    {0x001C, 5},{0x0009, 4},{0x000D, 4},{0x000F, 4},
+    {0x000C, 4},{0x0003, 4},{0x000A, 5},{0x0016, 5},
+    {0x0013, 6},{0x005D, 7},{0x0024, 7},{0x00BC, 8},
+    {0x005C, 7},{0x05EC,11},{0x000B, 5},{0x005F, 7}
+  },
+  {
+    {0x000F, 5},{0x0010, 6},{0x004B, 8},{0x00C6, 8},
+    {0x031D,10},{0x0C71,12},{0x0C70,12},{0x0001, 4},
+    {0x0C73,12},{0x0008, 4},{0x0009, 4},{0x0002, 4},
+    {0x0003, 4},{0x000B, 4},{0x0006, 4},{0x0000, 4},
+    {0x001C, 5},{0x0005, 4},{0x000D, 4},{0x000F, 4},
+    {0x000A, 4},{0x0019, 5},{0x0013, 6},{0x001D, 5},
+    {0x0030, 6},{0x0062, 7},{0x0024, 7},{0x004A, 8},
+    {0x018F, 9},{0x0C72,12},{0x000E, 5},{0x0011, 6}
+  },
+  {
+    {0x001B, 5},{0x0003, 6},{0x008D, 8},{0x0040, 7},
+    {0x0239,10},{0x0471,11},{0x08E0,12},{0x0003, 4},
+    {0x11C3,13},{0x000A, 4},{0x0009, 4},{0x0004, 4},
+    {0x0005, 4},{0x000E, 4},{0x0007, 4},{0x0001, 4},
+    {0x001E, 5},{0x0006, 4},{0x000C, 4},{0x000B, 4},
+    {0x0002, 4},{0x0000, 5},{0x0041, 7},{0x001F, 5},
+    {0x0022, 6},{0x0002, 6},{0x008F, 8},{0x008C, 8},
+    {0x011D, 9},{0x11C2,13},{0x001A, 5},{0x0021, 6}
+  },
+  {
+    {0x001F, 5},{0x0003, 6},{0x0003, 7},{0x0043, 7},
+    {0x000B, 9},{0x0015,10},{0x0051,12},{0x0003, 4},
+    {0x0050,12},{0x000D, 4},{0x000C, 4},{0x0004, 4},
+    {0x0006, 4},{0x000E, 4},{0x000A, 4},{0x0001, 4},
+    {0x001E, 5},{0x0005, 4},{0x0009, 4},{0x0007, 4},
+    {0x0011, 5},{0x0002, 6},{0x0004, 8},{0x0002, 4},
+    {0x002D, 6},{0x0020, 6},{0x0042, 7},{0x0001, 7},
+    {0x0000, 7},{0x0029,11},{0x0017, 5},{0x002C, 6}
+  },
+  {
+    {0x0003, 4},{0x001F, 6},{0x003A, 7},{0x005D, 7},
+    {0x0173, 9},{0x02E4,10},{0x172D,13},{0x0004, 4},
+    {0x172C,13},{0x000F, 4},{0x000E, 4},{0x0009, 4},
+    {0x0008, 4},{0x000C, 4},{0x000A, 4},{0x0001, 4},
+    {0x0016, 5},{0x0002, 4},{0x0005, 4},{0x001A, 5},
+    {0x002F, 6},{0x0038, 7},{0x05CA,11},{0x0006, 4},
+    {0x0037, 6},{0x001E, 6},{0x003B, 7},{0x0039, 7},
+    {0x00B8, 8},{0x0B97,12},{0x0000, 4},{0x0036, 6}
+  },
+  {
+    {0x0006, 4},{0x0037, 6},{0x005D, 7},{0x000C, 6},
+    {0x00B9, 8},{0x02E3,10},{0x05C4,11},{0x0004, 4},
+    {0x1715,13},{0x0000, 3},{0x000F, 4},{0x0008, 4},
+    {0x0007, 4},{0x000C, 4},{0x0009, 4},{0x001D, 5},
+    {0x0016, 5},{0x001C, 5},{0x001A, 5},{0x000B, 5},
+    {0x005E, 7},{0x0170, 9},{0x1714,13},{0x000A, 4},
+    {0x000A, 5},{0x0036, 6},{0x005F, 7},{0x001B, 7},
+    {0x001A, 7},{0x0B8B,12},{0x0002, 4},{0x0007, 5}
+  },
+  {
+    {0x000C, 4},{0x000B, 5},{0x0079, 7},{0x0022, 6},
+    {0x00F0, 8},{0x0119, 9},{0x0230,10},{0x001D, 5},
+    {0x08C4,12},{0x0001, 3},{0x0000, 3},{0x000A, 4},
+    {0x0009, 4},{0x000B, 4},{0x0007, 4},{0x001C, 5},
+    {0x003D, 6},{0x000D, 5},{0x0008, 5},{0x0015, 6},
+    {0x008D, 8},{0x118B,13},{0x118A,13},{0x000D, 4},
+    {0x0010, 5},{0x0009, 5},{0x0014, 6},{0x0047, 7},
+    {0x00F1, 8},{0x0463,11},{0x001F, 5},{0x000C, 5}
+  },
+  {
+    {0x0000, 3},{0x001A, 5},{0x0033, 6},{0x000C, 5},
+    {0x0046, 7},{0x01E3, 9},{0x03C5,10},{0x0017, 5},
+    {0x1E21,13},{0x0002, 3},{0x0001, 3},{0x0009, 4},
+    {0x000A, 4},{0x0007, 4},{0x001B, 5},{0x003D, 6},
+    {0x001B, 6},{0x0022, 6},{0x0079, 7},{0x00F0, 8},
+    {0x1E20,13},{0x1E23,13},{0x1E22,13},{0x000E, 4},
+    {0x0016, 5},{0x0018, 5},{0x0032, 6},{0x001A, 6},
+    {0x0047, 7},{0x0789,11},{0x001F, 5},{0x0010, 5}
+  },
+  {
+    {0x001D, 5},{0x0061, 7},{0x004E, 8},{0x009E, 9},
+    {0x027C,11},{0x09F5,13},{0x09F4,13},{0x0003, 4},
+    {0x0060, 7},{0x0000, 3},{0x000F, 4},{0x000B, 4},
+    {0x000A, 4},{0x0009, 4},{0x0005, 4},{0x000D, 5},
+    {0x0031, 6},{0x0008, 5},{0x0038, 6},{0x0012, 6},
+    {0x0026, 7},{0x013F,10},{0x04FB,12},{0x000D, 4},
+    {0x0002, 4},{0x000C, 5},{0x0039, 6},{0x001C, 6},
+    {0x000F, 5},{0x001D, 6},{0x0008, 4},{0x0019, 5}
+  },
+  {
+    {0x0007, 4},{0x0019, 6},{0x00AB, 8},{0x00AA, 8},
+    {0x0119,10},{0x0461,12},{0x0460,12},{0x001B, 5},
+    {0x0047, 8},{0x0001, 3},{0x0000, 3},{0x000C, 4},
+    {0x000B, 4},{0x0009, 4},{0x0005, 4},{0x000D, 5},
+    {0x0035, 6},{0x003D, 6},{0x003C, 6},{0x0018, 6},
+    {0x0022, 7},{0x008D, 9},{0x0231,11},{0x000E, 4},
+    {0x001F, 5},{0x0009, 5},{0x002B, 6},{0x0010, 6},
+    {0x0034, 6},{0x0054, 7},{0x0008, 4},{0x0014, 5}
+  },
+  {
+    {0x000C, 4},{0x0005, 5},{0x0008, 6},{0x005B, 7},
+    {0x004D, 9},{0x0131,11},{0x0261,12},{0x001A, 5},
+    {0x0012, 7},{0x0000, 3},{0x000F, 4},{0x000A, 4},
+    {0x0009, 4},{0x0006, 4},{0x001B, 5},{0x0006, 5},
+    {0x001C, 6},{0x002C, 6},{0x0015, 6},{0x005A, 7},
+    {0x0027, 8},{0x0099,10},{0x0260,12},{0x000E, 4},
+    {0x0004, 4},{0x000F, 5},{0x0007, 5},{0x001D, 6},
+    {0x000B, 5},{0x0014, 6},{0x0008, 4},{0x0017, 5}
+  },
+  {
+    {0x000F, 4},{0x0013, 5},{0x0075, 7},{0x0024, 6},
+    {0x0095, 8},{0x0251,10},{0x04A0,11},{0x0010, 5},
+    {0x00C8, 8},{0x0002, 3},{0x0001, 3},{0x0001, 4},
+    {0x0000, 4},{0x001A, 5},{0x0011, 5},{0x002C, 6},
+    {0x0065, 7},{0x0074, 7},{0x004B, 7},{0x00C9, 8},
+    {0x0129, 9},{0x0943,12},{0x0942,12},{0x0003, 3},
+    {0x000A, 4},{0x001C, 5},{0x0018, 5},{0x0033, 6},
+    {0x0017, 5},{0x002D, 6},{0x001B, 5},{0x003B, 6}
+  },
+  {
+    {0x0003, 3},{0x001A, 5},{0x002D, 6},{0x0038, 6},
+    {0x0028, 7},{0x0395,10},{0x0E51,12},{0x0037, 6},
+    {0x00E4, 8},{0x0001, 3},{0x0000, 3},{0x001F, 5},
+    {0x001E, 5},{0x0017, 5},{0x003A, 6},{0x0073, 7},
+    {0x002A, 7},{0x002B, 7},{0x0029, 7},{0x01CB, 9},
+    {0x0729,11},{0x1CA1,13},{0x1CA0,13},{0x0004, 3},
+    {0x000A, 4},{0x0004, 4},{0x0018, 5},{0x0036, 6},
+    {0x000B, 5},{0x002C, 6},{0x0019, 5},{0x003B, 6}
+  },
+  {
+    {0x0004, 3},{0x0004, 4},{0x003F, 6},{0x0017, 5},
+    {0x0075, 7},{0x01F5, 9},{0x07D1,11},{0x0017, 6},
+    {0x01F6, 9},{0x0001, 3},{0x0000, 3},{0x001B, 5},
+    {0x001A, 5},{0x000A, 5},{0x0032, 6},{0x0074, 7},
+    {0x00F8, 8},{0x00F9, 8},{0x01F7, 9},{0x03E9,10},
+    {0x0FA0,12},{0x1F43,13},{0x1F42,13},{0x0003, 3},
+    {0x000A, 4},{0x001E, 5},{0x001C, 5},{0x003B, 6},
+    {0x0018, 5},{0x0016, 6},{0x0016, 5},{0x0033, 6}
+  },
+  {
+    {0x0004, 3},{0x0007, 4},{0x0018, 5},{0x001E, 5},
+    {0x0036, 6},{0x0031, 7},{0x0177, 9},{0x0077, 7},
+    {0x0176, 9},{0x0001, 3},{0x0000, 3},{0x001A, 5},
+    {0x0019, 5},{0x003A, 6},{0x0019, 6},{0x005C, 7},
+    {0x00BA, 8},{0x0061, 8},{0x00C1, 9},{0x0180,10},
+    {0x0302,11},{0x0607,12},{0x0606,12},{0x0002, 3},
+    {0x000A, 4},{0x001F, 5},{0x001C, 5},{0x0037, 6},
+    {0x0016, 5},{0x0076, 7},{0x000D, 5},{0x002F, 6}
+  },
+  {
+    {0x0000, 3},{0x000A, 4},{0x001A, 5},{0x000C, 4},
+    {0x001D, 5},{0x0039, 6},{0x0078, 7},{0x005E, 7},
+    {0x0393,11},{0x0002, 3},{0x0001, 3},{0x0016, 5},
+    {0x000F, 5},{0x002E, 6},{0x005F, 7},{0x0073, 8},
+    {0x00E5, 9},{0x01C8,10},{0x0E4A,13},{0x1C97,14},
+    {0x1C96,14},{0x0E49,13},{0x0E48,13},{0x0004, 3},
+    {0x0006, 4},{0x001F, 5},{0x001B, 5},{0x001D, 6},
+    {0x0038, 6},{0x0038, 7},{0x003D, 6},{0x0079, 7}
+  },
+  {
+    {0x000B, 5},{0x002B, 7},{0x0054, 8},{0x01B7, 9},
+    {0x06D9,11},{0x0DB1,12},{0x0DB0,12},{0x0002, 4},
+    {0x00AB, 9},{0x0009, 4},{0x000A, 4},{0x0007, 4},
+    {0x0008, 4},{0x000F, 4},{0x000C, 4},{0x0003, 4},
+    {0x001D, 5},{0x0004, 4},{0x000B, 4},{0x0006, 4},
+    {0x001A, 5},{0x0003, 6},{0x00AA, 9},{0x0001, 4},
+    {0x0000, 5},{0x0014, 6},{0x006C, 7},{0x00DA, 8},
+    {0x0002, 6},{0x036D,10},{0x001C, 5},{0x0037, 6}
+  },
+  {
+    {0x001D, 5},{0x0004, 6},{0x00B6, 8},{0x006A, 8},
+    {0x05B9,11},{0x16E1,13},{0x16E0,13},{0x0007, 4},
+    {0x016F, 9},{0x000C, 4},{0x000D, 4},{0x0009, 4},
+    {0x0008, 4},{0x000F, 4},{0x000A, 4},{0x0003, 4},
+    {0x0017, 5},{0x0002, 4},{0x0004, 4},{0x001C, 5},
+    {0x002C, 6},{0x006B, 8},{0x0B71,12},{0x0005, 4},
+    {0x0003, 5},{0x001B, 6},{0x005A, 7},{0x0034, 7},
+    {0x0005, 6},{0x02DD,10},{0x0000, 4},{0x000C, 5}
+  },
+  {
+    {0x0003, 4},{0x007F, 7},{0x00A1, 8},{0x00A0, 8},
+    {0x020C,10},{0x0834,12},{0x106B,13},{0x0007, 4},
+    {0x0082, 8},{0x000E, 4},{0x000D, 4},{0x000B, 4},
+    {0x000C, 4},{0x0000, 3},{0x0009, 4},{0x0002, 4},
+    {0x0011, 5},{0x001E, 5},{0x0015, 5},{0x003E, 6},
+    {0x0040, 7},{0x041B,11},{0x106A,13},{0x0006, 4},
+    {0x000A, 5},{0x0029, 6},{0x007E, 7},{0x0051, 7},
+    {0x0021, 6},{0x0107, 9},{0x0004, 4},{0x000B, 5}
+  },
+  {
+    {0x0007, 4},{0x001B, 6},{0x00F6, 8},{0x00E9, 8},
+    {0x03A1,10},{0x0740,11},{0x0E82,12},{0x001F, 5},
+    {0x01EF, 9},{0x0001, 3},{0x0002, 3},{0x000B, 4},
+    {0x000C, 4},{0x000D, 4},{0x0008, 4},{0x001C, 5},
+    {0x0003, 5},{0x0012, 5},{0x0002, 5},{0x0075, 7},
+    {0x01D1, 9},{0x1D07,13},{0x1D06,13},{0x000A, 4},
+    {0x0013, 5},{0x003B, 6},{0x001A, 6},{0x007A, 7},
+    {0x003C, 6},{0x01EE, 9},{0x0000, 4},{0x000C, 5}
+  },
+  {
+    {0x000D, 4},{0x003D, 6},{0x0042, 7},{0x0037, 7},
+    {0x00D9, 9},{0x0362,11},{0x06C6,12},{0x001F, 5},
+    {0x0086, 8},{0x0001, 3},{0x0002, 3},{0x000C, 4},
+    {0x000B, 4},{0x000A, 4},{0x0001, 4},{0x000F, 5},
+    {0x0025, 6},{0x003C, 6},{0x001A, 6},{0x0087, 8},
+    {0x01B0,10},{0x0D8F,13},{0x0D8E,13},{0x000E, 4},
+    {0x0013, 5},{0x000C, 5},{0x0024, 6},{0x0020, 6},
+    {0x0011, 5},{0x006D, 8},{0x0000, 4},{0x000E, 5}
+  },
+  {
+    {0x0000, 3},{0x0012, 5},{0x0076, 7},{0x0077, 7},
+    {0x014D, 9},{0x0533,11},{0x14C9,13},{0x0013, 5},
+    {0x00A5, 8},{0x0002, 3},{0x0003, 3},{0x000B, 4},
+    {0x000C, 4},{0x0008, 4},{0x001A, 5},{0x002B, 6},
+    {0x0075, 7},{0x0074, 7},{0x00A7, 8},{0x0298,10},
+    {0x14C8,13},{0x14CB,13},{0x14CA,13},{0x000F, 4},
+    {0x001C, 5},{0x0007, 5},{0x002A, 6},{0x0028, 6},
+    {0x001B, 5},{0x00A4, 8},{0x0002, 4},{0x0006, 5}
+  },
+  {
+    {0x0002, 3},{0x001A, 5},{0x002B, 6},{0x003A, 6},
+    {0x00ED, 8},{0x0283,10},{0x0A0A,12},{0x0004, 5},
+    {0x00A1, 8},{0x0004, 3},{0x0003, 3},{0x000B, 4},
+    {0x000C, 4},{0x001F, 5},{0x0006, 5},{0x0077, 7},
+    {0x00A3, 8},{0x00A2, 8},{0x0140, 9},{0x1417,13},
+    {0x1416,13},{0x0A09,12},{0x0A08,12},{0x0000, 3},
+    {0x001E, 5},{0x0007, 5},{0x002A, 6},{0x0029, 6},
+    {0x001C, 5},{0x00EC, 8},{0x001B, 5},{0x0005, 5}
+  },
+  {
+    {0x0002, 3},{0x0002, 4},{0x0018, 5},{0x001D, 5},
+    {0x0035, 6},{0x00E4, 8},{0x01CF,11},{0x001D, 7},
+    {0x0072, 9},{0x0004, 3},{0x0005, 3},{0x0006, 4},
+    {0x0007, 4},{0x0006, 5},{0x0073, 7},{0x0038, 8},
+    {0x01CE,11},{0x039B,12},{0x0398,12},{0x0733,13},
+    {0x0732,13},{0x0735,13},{0x0734,13},{0x0000, 3},
+    {0x001F, 5},{0x001B, 5},{0x0034, 6},{0x000F, 6},
+    {0x001E, 5},{0x00E5, 8},{0x0019, 5},{0x0038, 6}
+  },
+  {
+    {0x0016, 5},{0x0050, 7},{0x0172, 9},{0x02E7,10},
+    {0x1732,13},{0x2E67,14},{0x2E66,14},{0x0006, 4},
+    {0x0051, 7},{0x0001, 3},{0x0000, 3},{0x000D, 4},
+    {0x000C, 4},{0x0009, 4},{0x001C, 5},{0x0009, 5},
+    {0x001C, 6},{0x001D, 6},{0x005D, 7},{0x00B8, 8},
+    {0x05CD,11},{0x1731,13},{0x1730,13},{0x000F, 4},
+    {0x0005, 4},{0x000F, 5},{0x0008, 5},{0x0029, 6},
+    {0x001D, 5},{0x002F, 6},{0x0008, 4},{0x0015, 5}
+  },
+  {
+    {0x0009, 4},{0x0021, 6},{0x0040, 7},{0x00AD, 8},
+    {0x02B0,10},{0x1589,13},{0x1588,13},{0x001C, 5},
+    {0x005F, 7},{0x0000, 3},{0x000F, 4},{0x000D, 4},
+    {0x000C, 4},{0x0006, 4},{0x0011, 5},{0x002A, 6},
+    {0x0057, 7},{0x005E, 7},{0x0041, 7},{0x0159, 9},
+    {0x0563,11},{0x158B,13},{0x158A,13},{0x0001, 3},
+    {0x0005, 4},{0x0014, 5},{0x003B, 6},{0x002E, 6},
+    {0x0004, 4},{0x003A, 6},{0x0007, 4},{0x0016, 5}
+  },
+  {
+    {0x000E, 4},{0x0007, 5},{0x0046, 7},{0x0045, 7},
+    {0x0064, 9},{0x032A,12},{0x0657,13},{0x0018, 5},
+    {0x000D, 6},{0x0000, 3},{0x000F, 4},{0x000A, 4},
+    {0x000B, 4},{0x001A, 5},{0x0036, 6},{0x0047, 7},
+    {0x0044, 7},{0x0018, 7},{0x0033, 8},{0x00CB,10},
+    {0x0656,13},{0x0329,12},{0x0328,12},{0x0002, 3},
+    {0x0006, 4},{0x0019, 5},{0x000E, 5},{0x0037, 6},
+    {0x0009, 4},{0x000F, 5},{0x0002, 4},{0x0010, 5}
+  },
+  {
+    {0x0003, 3},{0x0018, 5},{0x0023, 6},{0x0077, 7},
+    {0x0194, 9},{0x1956,13},{0x32AF,14},{0x003A, 6},
+    {0x0076, 7},{0x0002, 3},{0x0001, 3},{0x001F, 5},
+    {0x001E, 5},{0x0014, 5},{0x0022, 6},{0x0064, 7},
+    {0x0197, 9},{0x0196, 9},{0x032B,10},{0x0654,11},
+    {0x32AE,14},{0x1955,13},{0x1954,13},{0x0000, 3},
+    {0x0009, 4},{0x001C, 5},{0x0015, 5},{0x0010, 5},
+    {0x000D, 4},{0x0017, 5},{0x0016, 5},{0x0033, 6}
+  },
+  {
+    {0x0005, 3},{0x0006, 4},{0x003E, 6},{0x0010, 5},
+    {0x0048, 7},{0x093F,12},{0x24FA,14},{0x0032, 6},
+    {0x0067, 7},{0x0002, 3},{0x0001, 3},{0x001B, 5},
+    {0x001E, 5},{0x0034, 6},{0x0066, 7},{0x0092, 8},
+    {0x0126, 9},{0x024E,10},{0x049E,11},{0x49F7,15},
+    {0x49F6,15},{0x24F9,14},{0x24F8,14},{0x0000, 3},
+    {0x0007, 4},{0x0018, 5},{0x0011, 5},{0x003F, 6},
+    {0x000E, 4},{0x0013, 5},{0x0035, 6},{0x0025, 6}
+  },
+  {
+    {0x0005, 3},{0x0008, 4},{0x0012, 5},{0x001C, 5},
+    {0x001C, 6},{0x00EA, 9},{0x1D75,14},{0x001E, 6},
+    {0x0066, 7},{0x0001, 3},{0x0002, 3},{0x001B, 5},
+    {0x001A, 5},{0x001F, 6},{0x003B, 7},{0x0074, 8},
+    {0x01D6,10},{0x03AF,11},{0x1D74,14},{0x1D77,14},
+    {0x1D76,14},{0x0EB9,13},{0x0EB8,13},{0x000F, 4},
+    {0x0006, 4},{0x0013, 5},{0x003B, 6},{0x003A, 6},
+    {0x0000, 3},{0x0018, 5},{0x0032, 6},{0x0067, 7}
+  },
+  {
+    {0x0004, 3},{0x000A, 4},{0x001B, 5},{0x000C, 4},
+    {0x000D, 5},{0x00E6, 8},{0x0684,11},{0x0072, 7},
+    {0x00E7, 8},{0x0002, 3},{0x0001, 3},{0x0017, 5},
+    {0x0016, 5},{0x0018, 6},{0x00D1, 8},{0x01A0, 9},
+    {0x0686,11},{0x0D0F,12},{0x0D0A,12},{0x1A17,13},
+    {0x1A16,13},{0x1A1D,13},{0x1A1C,13},{0x000F, 4},
+    {0x001D, 5},{0x000E, 5},{0x0035, 6},{0x0038, 6},
+    {0x0000, 3},{0x000F, 5},{0x0019, 6},{0x0069, 7}
+  },
+  {
+    {0x0003, 3},{0x000C, 4},{0x001B, 5},{0x0000, 3},
+    {0x0003, 4},{0x002E, 6},{0x0051, 9},{0x00BC, 8},
+    {0x0053, 9},{0x0004, 3},{0x0002, 3},{0x0016, 5},
+    {0x0015, 5},{0x0015, 7},{0x0050, 9},{0x00A4,10},
+    {0x0294,12},{0x052B,13},{0x052A,13},{0x052D,13},
+    {0x052C,13},{0x052F,13},{0x052E,13},{0x000E, 4},
+    {0x001A, 5},{0x0004, 5},{0x0028, 6},{0x0029, 6},
+    {0x000F, 4},{0x000B, 6},{0x005F, 7},{0x00BD, 8}
+  },
+  {
+    {0x0003, 4},{0x0009, 6},{0x00D0, 8},{0x01A3, 9},
+    {0x0344,10},{0x0D14,12},{0x1A2B,13},{0x0004, 4},
+    {0x0015, 7},{0x0000, 3},{0x000F, 4},{0x000B, 4},
+    {0x000C, 4},{0x000E, 4},{0x0009, 4},{0x001B, 5},
+    {0x000A, 5},{0x0014, 5},{0x000D, 5},{0x002A, 6},
+    {0x0014, 7},{0x068B,11},{0x1A2A,13},{0x0008, 4},
+    {0x000B, 5},{0x002B, 6},{0x000B, 6},{0x0069, 7},
+    {0x0035, 6},{0x0008, 6},{0x0007, 4},{0x000C, 5}
+  },
+  {
+    {0x000A, 4},{0x003C, 6},{0x0032, 7},{0x0030, 7},
+    {0x00C5, 9},{0x0621,12},{0x0620,12},{0x001F, 5},
+    {0x0033, 7},{0x0001, 3},{0x0000, 3},{0x000E, 4},
+    {0x000D, 4},{0x000C, 4},{0x0004, 4},{0x000D, 5},
+    {0x0026, 6},{0x0027, 6},{0x0014, 6},{0x0063, 8},
+    {0x0189,10},{0x0623,12},{0x0622,12},{0x000B, 4},
+    {0x0012, 5},{0x003D, 6},{0x0022, 6},{0x0015, 6},
+    {0x000B, 5},{0x0023, 6},{0x0007, 4},{0x0010, 5}
+  },
+  {
+    {0x000F, 4},{0x000C, 5},{0x0043, 7},{0x0010, 6},
+    {0x0044, 8},{0x0114,10},{0x0455,12},{0x0018, 5},
+    {0x0023, 7},{0x0001, 3},{0x0000, 3},{0x000E, 4},
+    {0x000D, 4},{0x0009, 4},{0x0019, 5},{0x0009, 5},
+    {0x0017, 6},{0x0016, 6},{0x0042, 7},{0x008B, 9},
+    {0x0454,12},{0x0457,12},{0x0456,12},{0x000B, 4},
+    {0x0015, 5},{0x000A, 5},{0x0029, 6},{0x0020, 6},
+    {0x000D, 5},{0x0028, 6},{0x0007, 4},{0x0011, 5}
+  },
+  {
+    {0x0001, 3},{0x001A, 5},{0x0029, 6},{0x002A, 6},
+    {0x00A0, 8},{0x0285,10},{0x1425,13},{0x0002, 5},
+    {0x0000, 7},{0x0002, 3},{0x0003, 3},{0x000C, 4},
+    {0x000B, 4},{0x0008, 4},{0x0012, 5},{0x0001, 6},
+    {0x0051, 7},{0x0001, 7},{0x0143, 9},{0x0508,11},
+    {0x1424,13},{0x1427,13},{0x1426,13},{0x000F, 4},
+    {0x001C, 5},{0x0003, 5},{0x0037, 6},{0x002B, 6},
+    {0x0013, 5},{0x0036, 6},{0x001D, 5},{0x0001, 5}
+  },
+  {
+    {0x0004, 3},{0x001F, 5},{0x003D, 6},{0x0006, 5},
+    {0x0016, 7},{0x0053, 9},{0x014A,11},{0x0034, 6},
+    {0x002A, 8},{0x0002, 3},{0x0003, 3},{0x000B, 4},
+    {0x000C, 4},{0x001C, 5},{0x0037, 6},{0x0017, 7},
+    {0x002B, 8},{0x0028, 8},{0x00A4,10},{0x052D,13},
+    {0x052C,13},{0x052F,13},{0x052E,13},{0x0000, 3},
+    {0x001D, 5},{0x0007, 5},{0x0004, 5},{0x0035, 6},
+    {0x0014, 5},{0x0036, 6},{0x0015, 5},{0x003C, 6}
+  },
+  {
+    {0x0004, 3},{0x000A, 4},{0x0007, 5},{0x001D, 5},
+    {0x0009, 6},{0x01F3, 9},{0x07C7,11},{0x0008, 6},
+    {0x01F0, 9},{0x0003, 3},{0x0002, 3},{0x000D, 4},
+    {0x000C, 4},{0x0017, 5},{0x007D, 7},{0x01F2, 9},
+    {0x07C6,11},{0x07C5,11},{0x1F12,13},{0x3E27,14},
+    {0x3E26,14},{0x1F11,13},{0x1F10,13},{0x0000, 3},
+    {0x001E, 5},{0x0006, 5},{0x0039, 6},{0x0038, 6},
+    {0x003F, 6},{0x002C, 6},{0x0005, 5},{0x002D, 6}
+  },
+  {
+    {0x0002, 3},{0x0007, 4},{0x0018, 5},{0x0003, 4},
+    {0x0005, 5},{0x0035, 7},{0x004F, 9},{0x0012, 7},
+    {0x04E5,13},{0x0005, 3},{0x0004, 3},{0x000D, 4},
+    {0x000E, 4},{0x0033, 6},{0x0026, 8},{0x009D,10},
+    {0x04E4,13},{0x04E7,13},{0x04E6,13},{0x04E1,13},
+    {0x04E0,13},{0x04E3,13},{0x04E2,13},{0x0000, 3},
+    {0x001F, 5},{0x000C, 5},{0x003D, 6},{0x003C, 6},
+    {0x0032, 6},{0x0034, 7},{0x001B, 6},{0x0008, 6}
+  },
+  {
+    {0x0000, 3},{0x0004, 4},{0x001C, 5},{0x000F, 4},
+    {0x0002, 4},{0x0007, 5},{0x0075, 7},{0x00E8, 8},
+    {0x1D2A,13},{0x0005, 3},{0x0004, 3},{0x000D, 4},
+    {0x000C, 4},{0x0077, 7},{0x0E96,12},{0x3A57,14},
+    {0x3A56,14},{0x3A5D,14},{0x3A5C,14},{0x3A5F,14},
+    {0x3A5E,14},{0x1D29,13},{0x1D28,13},{0x0003, 3},
+    {0x0006, 5},{0x000A, 5},{0x002C, 7},{0x0017, 6},
+    {0x0076, 7},{0x01D3, 9},{0x03A4,10},{0x002D, 7}
+  },
+  {
+    {0x000A, 4},{0x0024, 6},{0x00BF, 8},{0x0085, 8},
+    {0x0211,10},{0x0842,12},{0x1087,13},{0x0018, 5},
+    {0x0020, 6},{0x0001, 3},{0x0002, 3},{0x000E, 4},
+    {0x000D, 4},{0x0007, 4},{0x0013, 5},{0x0025, 6},
+    {0x005E, 7},{0x0043, 7},{0x00BE, 8},{0x0109, 9},
+    {0x1086,13},{0x0841,12},{0x0840,12},{0x000F, 4},
+    {0x0001, 4},{0x0011, 5},{0x0000, 5},{0x002E, 6},
+    {0x0019, 5},{0x0001, 5},{0x0006, 4},{0x0016, 5}
+  },
+  {
+    {0x0002, 3},{0x000F, 5},{0x006F, 7},{0x0061, 7},
+    {0x0374,10},{0x1BA8,13},{0x3753,14},{0x0012, 5},
+    {0x0036, 6},{0x0000, 3},{0x0001, 3},{0x000A, 4},
+    {0x000B, 4},{0x001A, 5},{0x0031, 6},{0x0060, 7},
+    {0x00DC, 8},{0x01BB, 9},{0x06EB,11},{0x1BAB,13},
+    {0x3752,14},{0x3755,14},{0x3754,14},{0x000E, 4},
+    {0x0006, 4},{0x0013, 5},{0x000E, 5},{0x003E, 6},
+    {0x0008, 4},{0x001E, 5},{0x0019, 5},{0x003F, 6}
+  },
+  {
+    {0x0003, 3},{0x001C, 5},{0x0025, 6},{0x0024, 6},
+    {0x01DA, 9},{0x1DBD,13},{0x3B7C,14},{0x003C, 6},
+    {0x003D, 6},{0x0000, 3},{0x0001, 3},{0x000B, 4},
+    {0x000A, 4},{0x000B, 5},{0x0077, 7},{0x00EC, 8},
+    {0x03B6,10},{0x076E,11},{0x1DBF,13},{0x76FB,15},
+    {0x76FA,15},{0x3B79,14},{0x3B78,14},{0x000D, 4},
+    {0x001F, 5},{0x0013, 5},{0x000A, 5},{0x0008, 5},
+    {0x000C, 4},{0x0008, 4},{0x0009, 5},{0x003A, 6}
+  },
+  {
+    {0x0005, 3},{0x0003, 4},{0x0004, 5},{0x0010, 5},
+    {0x008F, 8},{0x0475,11},{0x11D1,13},{0x0079, 7},
+    {0x0027, 6},{0x0002, 3},{0x0003, 3},{0x0001, 4},
+    {0x0000, 4},{0x0026, 6},{0x0046, 7},{0x011C, 9},
+    {0x0477,11},{0x08ED,12},{0x11D0,13},{0x11D3,13},
+    {0x11D2,13},{0x11D9,13},{0x11D8,13},{0x000D, 4},
+    {0x001F, 5},{0x0012, 5},{0x0005, 5},{0x003D, 6},
+    {0x000C, 4},{0x000E, 4},{0x0022, 6},{0x0078, 7}
+  },
+  {
+    {0x0005, 3},{0x000C, 4},{0x001B, 5},{0x0000, 4},
+    {0x0006, 6},{0x03E2,10},{0x3E3D,14},{0x000F, 7},
+    {0x0034, 6},{0x0003, 3},{0x0002, 3},{0x001E, 5},
+    {0x001D, 5},{0x007D, 7},{0x01F0, 9},{0x07C6,11},
+    {0x3E3C,14},{0x3E3F,14},{0x3E3E,14},{0x3E39,14},
+    {0x3E38,14},{0x3E3B,14},{0x3E3A,14},{0x0008, 4},
+    {0x001C, 5},{0x0002, 5},{0x003F, 6},{0x0035, 6},
+    {0x0009, 4},{0x0001, 3},{0x000E, 7},{0x00F9, 8}
+  },
+  {
+    {0x0004, 3},{0x000B, 4},{0x0001, 4},{0x000A, 4},
+    {0x001E, 6},{0x00E0, 9},{0x0E1E,13},{0x0071, 8},
+    {0x0039, 7},{0x0007, 3},{0x0006, 3},{0x000D, 5},
+    {0x000C, 5},{0x0020, 7},{0x01C2,10},{0x1C3F,14},
+    {0x1C3E,14},{0x0E19,13},{0x0E18,13},{0x0E1B,13},
+    {0x0E1A,13},{0x0E1D,13},{0x0E1C,13},{0x0000, 4},
+    {0x0009, 5},{0x001D, 6},{0x001F, 6},{0x0011, 6},
+    {0x0005, 4},{0x0001, 3},{0x0043, 8},{0x0042, 8}
+  },
+  {
+    {0x0004, 3},{0x000D, 4},{0x0007, 4},{0x0002, 3},
+    {0x0014, 5},{0x016C, 9},{0x16D1,13},{0x02DF,10},
+    {0x016E, 9},{0x0000, 2},{0x0007, 3},{0x002C, 6},
+    {0x002B, 6},{0x02DE,10},{0x16D0,13},{0x16D3,13},
+    {0x16D2,13},{0x2DB5,14},{0x2DB4,14},{0x2DB7,14},
+    {0x2DB6,14},{0x16D9,13},{0x16D8,13},{0x000C, 5},
+    {0x002A, 6},{0x005A, 7},{0x001B, 6},{0x001A, 6},
+    {0x0017, 5},{0x000C, 4},{0x05B7,11},{0x05B5,11}
+  },
+  {
+    {0x0002, 2},{0x000F, 4},{0x001C, 5},{0x000C, 4},
+    {0x003B, 6},{0x01AC, 9},{0x1AD8,13},{0x35B3,14},
+    {0x35B2,14},{0x0001, 2},{0x0000, 2},{0x0069, 7},
+    {0x0068, 7},{0x35BD,14},{0x35BC,14},{0x35BF,14},
+    {0x35BE,14},{0x35B9,14},{0x35B8,14},{0x35BB,14},
+    {0x35BA,14},{0x35B5,14},{0x35B4,14},{0x01A9, 9},
+    {0x01A8, 9},{0x035A,10},{0x00D7, 8},{0x00D5, 8},
+    {0x003A, 6},{0x001B, 5},{0x35B7,14},{0x35B6,14}
+  },
+  {
+    {0x0000, 3},{0x0010, 5},{0x0072, 7},{0x0071, 7},
+    {0x0154, 9},{0x0AAB,12},{0x0AA8,12},{0x0014, 5},
+    {0x0070, 7},{0x0002, 3},{0x0003, 3},{0x000C, 4},
+    {0x000B, 4},{0x0003, 4},{0x0011, 5},{0x0073, 7},
+    {0x0054, 7},{0x00AB, 8},{0x02AB,10},{0x1553,13},
+    {0x1552,13},{0x1555,13},{0x1554,13},{0x000D, 4},
+    {0x001E, 5},{0x0012, 5},{0x003E, 6},{0x002B, 6},
+    {0x0002, 4},{0x003F, 6},{0x001D, 5},{0x0013, 5}
+  },
+  {
+    {0x0003, 3},{0x001F, 5},{0x0029, 6},{0x003D, 6},
+    {0x000C, 7},{0x0069,10},{0x0345,13},{0x0002, 5},
+    {0x0028, 6},{0x0002, 3},{0x0001, 3},{0x000E, 4},
+    {0x000C, 4},{0x0015, 5},{0x0007, 6},{0x001B, 8},
+    {0x006B,10},{0x006A,10},{0x0344,13},{0x0347,13},
+    {0x0346,13},{0x01A1,12},{0x01A0,12},{0x000B, 4},
+    {0x001A, 5},{0x0012, 5},{0x0000, 5},{0x003C, 6},
+    {0x0008, 4},{0x001B, 5},{0x0013, 5},{0x0001, 5}
+  },
+  {
+    {0x0004, 3},{0x0004, 4},{0x003F, 6},{0x0014, 5},
+    {0x0056, 7},{0x015C, 9},{0x15D5,13},{0x003C, 6},
+    {0x002A, 6},{0x0000, 3},{0x0001, 3},{0x000E, 4},
+    {0x000D, 4},{0x000C, 5},{0x00AF, 8},{0x02BB,10},
+    {0x15D4,13},{0x15D7,13},{0x15D6,13},{0x15D1,13},
+    {0x15D0,13},{0x15D3,13},{0x15D2,13},{0x000B, 4},
+    {0x0019, 5},{0x000D, 5},{0x003E, 6},{0x0031, 6},
+    {0x0007, 4},{0x0005, 4},{0x003D, 6},{0x0030, 6}
+  },
+  {
+    {0x0005, 3},{0x0008, 4},{0x001A, 5},{0x0000, 4},
+    {0x0036, 6},{0x0011, 8},{0x0106,12},{0x000A, 7},
+    {0x006E, 7},{0x0002, 3},{0x0003, 3},{0x0003, 4},
+    {0x0002, 4},{0x006F, 7},{0x0021, 9},{0x020F,13},
+    {0x020E,13},{0x0101,12},{0x0100,12},{0x0103,12},
+    {0x0102,12},{0x0105,12},{0x0104,12},{0x000C, 4},
+    {0x001E, 5},{0x0003, 5},{0x003E, 6},{0x003F, 6},
+    {0x0009, 4},{0x000E, 4},{0x000B, 7},{0x0009, 7}
+  },
+  {
+    {0x0002, 3},{0x000E, 4},{0x001E, 5},{0x000C, 4},
+    {0x001F, 5},{0x006E, 7},{0x00AD,10},{0x00AF,10},
+    {0x0014, 7},{0x0004, 3},{0x0003, 3},{0x001A, 5},
+    {0x0017, 5},{0x002A, 8},{0x0576,13},{0x0AEF,14},
+    {0x0AEE,14},{0x0571,13},{0x0570,13},{0x0573,13},
+    {0x0572,13},{0x0575,13},{0x0574,13},{0x0003, 4},
+    {0x0016, 5},{0x0004, 5},{0x0036, 6},{0x000B, 6},
+    {0x000A, 4},{0x0000, 3},{0x006F, 7},{0x00AC,10}
+  },
+  {
+    {0x0004, 3},{0x0005, 4},{0x0003, 3},{0x0001, 3},
+    {0x0004, 4},{0x002F, 6},{0x0526,11},{0x1495,13},
+    {0x00A6, 8},{0x0007, 3},{0x0006, 3},{0x002D, 6},
+    {0x002C, 6},{0x1494,13},{0x1497,13},{0x1496,13},
+    {0x1491,13},{0x1490,13},{0x1493,13},{0x1492,13},
+    {0x293D,14},{0x293C,14},{0x293F,14},{0x0000, 3},
+    {0x0028, 6},{0x00A5, 8},{0x0148, 9},{0x00A7, 8},
+    {0x002E, 6},{0x0015, 5},{0x0A4E,12},{0x293E,14}
+  },
+  {
+    {0x0004, 3},{0x0005, 4},{0x0003, 3},{0x0001, 3},
+    {0x0004, 4},{0x002F, 6},{0x0526,11},{0x1495,13},
+    {0x00A6, 8},{0x0007, 3},{0x0006, 3},{0x002D, 6},
+    {0x002C, 6},{0x1494,13},{0x1497,13},{0x1496,13},
+    {0x1491,13},{0x1490,13},{0x1493,13},{0x1492,13},
+    {0x293D,14},{0x293C,14},{0x293F,14},{0x0000, 3},
+    {0x0028, 6},{0x00A5, 8},{0x0148, 9},{0x00A7, 8},
+    {0x002E, 6},{0x0015, 5},{0x0A4E,12},{0x293E,14}
+  },
+  {
+    {0x0004, 3},{0x0005, 4},{0x0003, 3},{0x0001, 3},
+    {0x0004, 4},{0x002F, 6},{0x0526,11},{0x1495,13},
+    {0x00A6, 8},{0x0007, 3},{0x0006, 3},{0x002D, 6},
+    {0x002C, 6},{0x1494,13},{0x1497,13},{0x1496,13},
+    {0x1491,13},{0x1490,13},{0x1493,13},{0x1492,13},
+    {0x293D,14},{0x293C,14},{0x293F,14},{0x0000, 3},
+    {0x0028, 6},{0x00A5, 8},{0x0148, 9},{0x00A7, 8},
+    {0x002E, 6},{0x0015, 5},{0x0A4E,12},{0x293E,14}
+  },
+  {
+    {0x0003, 3},{0x0011, 5},{0x0020, 6},{0x0074, 7},
+    {0x010D, 9},{0x0863,12},{0x0860,12},{0x000A, 5},
+    {0x0075, 7},{0x0001, 3},{0x0000, 3},{0x000B, 4},
+    {0x000A, 4},{0x0018, 5},{0x0038, 6},{0x0042, 7},
+    {0x010F, 9},{0x010E, 9},{0x0219,10},{0x10C3,13},
+    {0x10C2,13},{0x10C5,13},{0x10C4,13},{0x000F, 4},
+    {0x0004, 4},{0x0019, 5},{0x000B, 5},{0x0039, 6},
+    {0x0009, 4},{0x001B, 5},{0x001A, 5},{0x003B, 6}
+  },
+  {
+    {0x0005, 3},{0x0001, 4},{0x003E, 6},{0x0001, 5},
+    {0x00E2, 8},{0x1C6F,13},{0x38D9,14},{0x0039, 6},
+    {0x001F, 6},{0x0002, 3},{0x0001, 3},{0x0009, 4},
+    {0x0008, 4},{0x0000, 5},{0x0070, 7},{0x01C7, 9},
+    {0x038C,10},{0x071A,11},{0x38D8,14},{0x38DB,14},
+    {0x38DA,14},{0x38DD,14},{0x38DC,14},{0x000D, 4},
+    {0x001D, 5},{0x000E, 5},{0x003F, 6},{0x003C, 6},
+    {0x000C, 4},{0x0006, 4},{0x003D, 6},{0x001E, 6}
+  },
+  {
+    {0x0006, 3},{0x000B, 4},{0x0011, 5},{0x001E, 5},
+    {0x0074, 7},{0x03AA,10},{0x1D5C,13},{0x0001, 6},
+    {0x0021, 6},{0x0001, 3},{0x0002, 3},{0x0007, 4},
+    {0x0006, 4},{0x003E, 6},{0x00EB, 8},{0x01D4, 9},
+    {0x0EAF,12},{0x3ABB,14},{0x3ABA,14},{0x1D59,13},
+    {0x1D58,13},{0x1D5B,13},{0x1D5A,13},{0x000A, 4},
+    {0x001C, 5},{0x0001, 5},{0x003F, 6},{0x003B, 6},
+    {0x0001, 4},{0x0009, 4},{0x0020, 6},{0x0000, 6}
+  },
+  {
+    {0x0004, 3},{0x000A, 4},{0x0017, 5},{0x0004, 4},
+    {0x0016, 6},{0x016A, 9},{0x16B1,13},{0x0017, 7},
+    {0x005B, 7},{0x0006, 3},{0x0007, 3},{0x0001, 4},
+    {0x0000, 4},{0x000A, 6},{0x02D7,10},{0x0B5A,12},
+    {0x16B0,13},{0x16B3,13},{0x16B2,13},{0x2D6D,14},
+    {0x2D6C,14},{0x2D6F,14},{0x2D6E,14},{0x0006, 4},
+    {0x000A, 5},{0x0004, 5},{0x002C, 6},{0x0017, 6},
+    {0x0003, 4},{0x0007, 4},{0x0016, 7},{0x00B4, 8}
+  },
+  {
+    {0x0005, 3},{0x000D, 4},{0x0005, 4},{0x0009, 4},
+    {0x0033, 6},{0x0193, 9},{0x192C,13},{0x0061, 8},
+    {0x0031, 7},{0x0000, 2},{0x0007, 3},{0x0010, 5},
+    {0x0011, 5},{0x00C8, 8},{0x192F,13},{0x325B,14},
+    {0x325A,14},{0x1929,13},{0x1928,13},{0x192B,13},
+    {0x192A,13},{0x325D,14},{0x325C,14},{0x0018, 5},
+    {0x001A, 6},{0x001B, 6},{0x0065, 7},{0x0019, 6},
+    {0x0004, 4},{0x0007, 4},{0x0060, 8},{0x0324,10}
+  },
+  {
+    {0x0006, 3},{0x0000, 3},{0x0002, 4},{0x000F, 4},
+    {0x0039, 6},{0x01D9, 9},{0x1D82,13},{0x0761,11},
+    {0x03BE,10},{0x0001, 2},{0x0002, 2},{0x000F, 6},
+    {0x000E, 6},{0x0762,11},{0x3B07,14},{0x3B06,14},
+    {0x3B1D,14},{0x3B1C,14},{0x3B1F,14},{0x3B1E,14},
+    {0x3B19,14},{0x3B18,14},{0x3B1B,14},{0x0038, 6},
+    {0x01DE, 9},{0x00ED, 8},{0x03BF,10},{0x00EE, 8},
+    {0x003A, 6},{0x0006, 5},{0x0EC0,12},{0x3B1A,14}
+  },
+  {
+    {0x0000, 2},{0x0002, 3},{0x000F, 5},{0x0006, 4},
+    {0x001C, 6},{0x01D0,10},{0x0E8C,13},{0x1D1B,14},
+    {0x1D1A,14},{0x0003, 2},{0x0002, 2},{0x00EA, 9},
+    {0x00E9, 9},{0x0E89,13},{0x0E88,13},{0x0E8B,13},
+    {0x0E8A,13},{0x1D65,14},{0x1D64,14},{0x1D67,14},
+    {0x1D66,14},{0x1D61,14},{0x1D60,14},{0x03AD,11},
+    {0x1D63,14},{0x1D62,14},{0x1D1D,14},{0x1D1C,14},
+    {0x003B, 7},{0x01D7,10},{0x1D1F,14},{0x1D1E,14}
+  },
+  {
+    {0x0002, 2},{0x000F, 4},{0x001C, 5},{0x000C, 4},
+    {0x003B, 6},{0x01AC, 9},{0x1AD8,13},{0x35B3,14},
+    {0x35B2,14},{0x0001, 2},{0x0000, 2},{0x0069, 7},
+    {0x0068, 7},{0x35BD,14},{0x35BC,14},{0x35BF,14},
+    {0x35BE,14},{0x35B9,14},{0x35B8,14},{0x35BB,14},
+    {0x35BA,14},{0x35B5,14},{0x35B4,14},{0x01A9, 9},
+    {0x01A8, 9},{0x035A,10},{0x00D7, 8},{0x00D5, 8},
+    {0x003A, 6},{0x001B, 5},{0x35B7,14},{0x35B6,14}
+  }
+};
+
+
+
+/*A Huffman code, aligned for sorting, used when encoding the tree.*/
+typedef struct{
+  /*The bit pattern, left-shifted by shift bits so that the MSB of all
+     patterns in a table is aligned.*/
+  ogg_uint32_t pattern;
+  /*The left shift applied: the longest code length minus this code's.*/
+  int          shift;
+  /*The token this bit pattern represents (its index in the table).*/
+  int          token;
+}oc_huff_entry;
+
+
+
+/*qsort() callback that orders oc_huff_entry structures by bit pattern.
+  _c1: The first entry to compare.
+  _c2: The second entry to compare.
+  Return: -1 if _c1 sorts first, 1 if _c2 sorts first, 0 on a tie.*/
+static int huff_entry_cmp(const void *_c1,const void *_c2){
+  const oc_huff_entry *lhs;
+  const oc_huff_entry *rhs;
+  lhs=(const oc_huff_entry *)_c1;
+  rhs=(const oc_huff_entry *)_c2;
+  return (lhs->pattern>rhs->pattern)-(lhs->pattern<rhs->pattern);
+}
+
+/*Encodes a description of the given Huffman tables.
+  The encoder stores the codes as flat arrays, but the bit stream and the
+   decoder structure them as a tree; this function recovers the tree from
+   the flat arrays and writes it out.
+  Note that the codes MUST form a Huffman code, and not merely a prefix-free
+   code, since the binary tree is assumed to be full.
+  _opb:   The buffer to store the tree in.
+  _codes: The Huffman tables to pack.
+  Return: 0 on success, or TH_EINVAL if a table does not form a full,
+   prefix-free code or its longest code length is outside 0...32 bits (_opb
+   may already have been partially written in that case).*/
+int oc_huff_codes_pack(oggpack_buffer *_opb,
+ const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]){
+  int i;
+  for(i=0;i<TH_NHUFFMAN_TABLES;i++){
+    oc_huff_entry entries[TH_NDCT_TOKENS];
+    ogg_uint32_t  mask;
+    int           bpos;
+    int           maxlen;
+    int           j;
+    /*Find the maximum code length so we can align all the bit patterns.*/
+    maxlen=_codes[i][0].nbits;
+    for(j=1;j<TH_NDCT_TOKENS;j++)maxlen=OC_MAXI(_codes[i][j].nbits,maxlen);
+    /*Aligned patterns are stored in 32 bits: a longer (or negative) maximum
+       length cannot be represented and would make the shifts undefined.*/
+    if(maxlen<0||maxlen>32)return TH_EINVAL;
+    /*Use two unsigned half-shifts, since a single shift by 32 is undefined.*/
+    mask=((ogg_uint32_t)1<<(maxlen>>1)<<((maxlen+1)>>1))-1;
+    /*Copy the codes into our temporary workspace, aligning the bit patterns
+       and recording the token each code came from.*/
+    for(j=0;j<TH_NDCT_TOKENS;j++){
+      entries[j].shift=maxlen-_codes[i][j].nbits;
+      /*A code with no bits aligns to 0; do not shift by the full width.*/
+      entries[j].pattern=entries[j].shift<maxlen?
+       _codes[i][j].pattern<<entries[j].shift&mask:0;
+      entries[j].token=j;
+    }
+    /*Sort into ascending order: the order the tree's leaves are traversed.*/
+    qsort(entries,TH_NDCT_TOKENS,sizeof(entries[0]),huff_entry_cmp);
+    /*For each leaf of the tree:*/
+    bpos=maxlen;
+    for(j=0;j<TH_NDCT_TOKENS;j++){
+      ogg_uint32_t bit;
+      /*If this code has any bits at all.*/
+      if(entries[j].shift<maxlen){
+        /*Descend into the tree, writing a bit for each branch.*/
+        for(;bpos>entries[j].shift;bpos--)oggpackB_write(_opb,0,1);
+        /*Mark this as a leaf node, and write its value.*/
+        oggpackB_write(_opb,1,1);
+        oggpackB_write(_opb,entries[j].token,5);
+        /*Back up the tree past each 1 branch until we reach a 0 branch.*/
+        bit=(ogg_uint32_t)1<<bpos;
+        for(;entries[j].pattern&bit;bpos++)bit<<=1;
+        /*Validate the code.*/
+        if(j+1<TH_NDCT_TOKENS){
+          mask=~(bit-1)<<1;
+          /*The next entry should have a 1 bit where we had a 0, and should
+             match our code above that bit.
+            This verifies both fullness and prefix-freeness simultaneously.*/
+          if(!(entries[j+1].pattern&bit)||
+           (entries[j].pattern&mask)!=(entries[j+1].pattern&mask)){
+            return TH_EINVAL;
+          }
+        }
+        /*If there are no more codes, we should have ascended back to the top
+           of the tree.*/
+        else if(bpos<maxlen)return TH_EINVAL;
+      }
+    }
+  }
+  return 0;
+}

Copied: branches/theora-thusnelda/lib/enc/huffenc.h (from rev 15592, trunk/theora-exp/lib/huffenc.h)
===================================================================
--- branches/theora-thusnelda/lib/enc/huffenc.h	                        (rev 0)
+++ branches/theora-thusnelda/lib/enc/huffenc.h	2009-04-26 14:30:15 UTC (rev 15953)
@@ -0,0 +1,19 @@
+#if !defined(_huffenc_H)
+# define _huffenc_H (1)
+# include "../dec/huffman.h"
+
+
+/*One Huffman table: an array of TH_NDCT_TOKENS codes, indexed by token.*/
+typedef th_huff_code                  th_huff_table[TH_NDCT_TOKENS];
+
+/*A set of TH_NHUFFMAN_TABLES Huffman tables, defined outside this header.
+  NOTE(review): presumably the VP3.1 default codes, going by the name.*/
+extern const th_huff_code
+ TH_VP31_HUFF_CODES[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
+
+/*Packs a description of the given Huffman tables into _opb.
+  Return: 0 on success, or a negative value if a table is not a valid code.*/
+int oc_huff_codes_pack(oggpack_buffer *_opb,
+ const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]);
+
+#endif

Modified: branches/theora-thusnelda/lib/enc/mcenc.c
===================================================================
--- branches/theora-thusnelda/lib/enc/mcenc.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/mcenc.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -152,25 +152,25 @@
  int _mvoffset0,int _mvoffset1,int _goldenp,int _best_err){
   macroblock_t *mb;
   int           err;
-  int           i;
+  int           bi;
   mb=cpi->macro+mbi;
   err=0;
-  for(i=0;i<4;i++){
-    int fi = mb->Ryuv[0][i];
-    ogg_uint32_t base_offset = cpi->frag_buffer_index[fi];
-    const unsigned char *cur = cpi->frame + base_offset;
-    const unsigned char *ref = (_goldenp ? cpi->golden : cpi->lastrecon) + base_offset;
-    fi=mb->Ryuv[0][i];
+  for(bi=0;bi<4;bi++){
+    const unsigned char *cur;
+    const unsigned char *ref;
+    ogg_uint32_t         base_offset;
+    int                  fi;
+    fi=mb->Ryuv[0][bi];
     base_offset=cpi->frag_buffer_index[fi];
     cur=cpi->frame+base_offset;
     ref=(_goldenp?cpi->golden:cpi->lastrecon)+base_offset;
-    err+=dsp_sad8x8_xy2_thres(cpi->dsp,cur,
+    err+=oc_enc_frag_sad2_thresh(cpi,cur,
      ref+_mvoffset0,ref+_mvoffset1,cpi->stride[0],_best_err-err);
   }
   return err;
 }
 
-static int oc_mcenc_ysad_check_mbcandidate_fullpel(CP_INSTANCE *cpi, 
+static int oc_mcenc_ysad_check_mbcandidate_fullpel(CP_INSTANCE *cpi,
  mc_state *_mcenc,int _mbi,int _dx,int _dy,int _goldenp,int _block_err[4]){
   int           stride;
   int           mvoffset;
@@ -183,19 +183,16 @@
   mvoffset=_dx+_dy*stride;
   err=0;
   for(bi=0;bi<4;bi++){
-    int fi;
+    const unsigned char *cur;
+    const unsigned char *ref;
+    ogg_uint32_t         base_offset;
+    int                  fi;
     fi=mb->Ryuv[0][bi];
-    /*Only check valid fragments.*/
-    if(fi<cpi->frag_total){
-      ogg_uint32_t         base_offset;
-      const unsigned char *cur;
-      const unsigned char *ref;
-      base_offset=cpi->frag_buffer_index[fi];
-      cur=cpi->frame+base_offset;
-      ref=(_goldenp?cpi->golden:cpi->lastrecon)+base_offset;
-      _block_err[bi]=dsp_sad8x8_thres(cpi->dsp,cur,ref+mvoffset,stride,16384);
-      err+=_block_err[bi];
-    }
+    base_offset=cpi->frag_buffer_index[fi];
+    cur=cpi->frame+base_offset;
+    ref=(_goldenp?cpi->golden:cpi->lastrecon)+base_offset;
+    _block_err[bi]=oc_enc_frag_sad(cpi,cur,ref+mvoffset,stride);
+    err+=_block_err[bi];
   }
   return err;
 }
@@ -259,7 +256,6 @@
   mb=cpi->macro+_mbi;
   stride=cpi->stride[0];
   fi=mb->Ryuv[0][_bi];
-  if(fi>=cpi->frag_total)return _best_err;
   mvoffset_base=_vec[0]+_vec[1]*stride;
   offset_y[0]=offset_y[1]=offset_y[2]=-stride;
   offset_y[3]=offset_y[5]=0;
@@ -292,7 +288,7 @@
     ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
     mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
     mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
-    err=dsp_sad8x8_xy2_thres(cpi->dsp,cur,
+    err=oc_enc_frag_sad2_thresh(cpi,cur,
      ref+mvoffset0,ref+mvoffset1,stride,_best_err);
     if(err<_best_err){
       _best_err=err;

Modified: branches/theora-thusnelda/lib/enc/mode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/mode.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/mode.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -299,12 +299,12 @@
     int offs[2];
     if(oc_get_mv_offsets(offs,_dx,_dy,
      cpi->stride[_pli],_pli,cpi->info.pixelformat)>1){
-      sad=dsp_sad8x8_xy2_thres(cpi->dsp,b,r+offs[0],r+offs[1],stride,16384);
+      sad=oc_enc_frag_sad2_thresh(cpi,b,r+offs[0],r+offs[1],stride,0x3FC0);
     }
-    else sad=dsp_sad8x8(cpi->dsp,b,r+offs[0],stride);
+    else sad=oc_enc_frag_sad(cpi,b,r+offs[0],stride);
   }
   /*TODO: Is this special case worth it?*/
-  else sad=dsp_sad8x8(cpi->dsp,b,r,stride);
+  else sad=oc_enc_frag_sad(cpi,b,r,stride);
   /*TODO: <<2? Really? Why?*/
   if(_pli)return sad<<2;
   else return sad;
@@ -381,7 +381,7 @@
         int offs;
         int sad;
         offs=cpi->frag_buffer_index[fi];
-        sad=dsp_sad8x8(cpi->dsp,cpi->frame+offs,cpi->lastrecon+offs,stride);
+        sad=oc_enc_frag_sad(cpi,cpi->frame+offs,cpi->lastrecon+offs,stride);
         if(pli)sad<<=2;
         cost+=BINMAP(mode_rate[_qi][pli][0],sad);
       }
@@ -489,11 +489,12 @@
 #include "quant_lookup.h"
 
 static void uncode_frag(CP_INSTANCE *cpi, int fi, int plane){
-  int bi = cpi->frag_buffer_index[fi];
-  int stride = cpi->stride[plane];
-
+  int bi;
+  int stride;
+  bi=cpi->frag_buffer_index[fi];
+  stride=cpi->stride[plane];
   cpi->frag_coded[fi]=0;
-  dsp_copy8x8 (cpi->dsp, cpi->lastrecon+bi, cpi->recon+bi, stride);
+  oc_enc_frag_copy(cpi,cpi->recon+bi,cpi->lastrecon+bi,stride);
 }
 
 typedef struct{
@@ -546,8 +547,8 @@
  token_checkpoint_t **stack){
   const int keyframe = (cpi->FrameType == KEY_FRAME);
   const oc_iquant *iq = ps->iq[mode != CODE_INTRA];
-  ogg_int16_t buffer[64];
-  ogg_int16_t data[64];
+  ogg_int16_t buffer[64]OC_ALIGN16;
+  ogg_int16_t data[64]OC_ALIGN16;
   const int bi = cpi->frag_buffer_index[fi];
   const int stride = cpi->stride[ps->plane];
   const unsigned char *frame_ptr = &cpi->frame[bi];
@@ -561,6 +562,8 @@
   int uncoded_dc=0,coded_dc=0,dc_flag=0;
   int lambda = cpi->lambda;
   token_checkpoint_t *checkpoint=*stack;
+  int mv_offs[2];
+  int nmv_offs;
   int cost;
   int ci;
   int pi;
@@ -578,36 +581,26 @@
 
   /* motion comp */
   switch(mode){
-  case CODE_INTER_PLUS_MV:
-  case CODE_INTER_LAST_MV:
-  case CODE_INTER_PRIOR_LAST:
-  case CODE_GOLDEN_MV:
-  case CODE_INTER_FOURMV:
-
-    {
-      int offs[2];
-      if(oc_get_mv_offsets(offs,_dx,_dy,
-       stride,ps->plane,cpi->info.pixelformat)>1){
-        dsp_copy8x8_half(cpi->dsp,
-         lastrecon+offs[0],lastrecon+offs[1],thisrecon,stride);
-        dsp_sub8x8(cpi->dsp,frame_ptr,thisrecon,data,stride);
+    case CODE_INTRA:{
+      nmv_offs=0;
+      oc_enc_frag_sub_128(cpi,data,frame_ptr,stride);
+    }break;
+    case CODE_USING_GOLDEN:
+    case CODE_INTER_NO_MV:{
+      nmv_offs=1;
+      mv_offs[0]=0;
+      oc_enc_frag_sub(cpi,data,frame_ptr,lastrecon,stride);
+    }break;
+    default:{
+      nmv_offs=oc_get_mv_offsets(mv_offs,_dx,_dy,
+       stride,ps->plane,cpi->info.pixelformat);
+      if(nmv_offs>1){
+        oc_enc_frag_copy2(cpi,thisrecon,
+         lastrecon+mv_offs[0],lastrecon+mv_offs[1],stride);
+        oc_enc_frag_sub(cpi,data,frame_ptr,thisrecon,stride);
       }
-      else{
-        dsp_copy8x8(cpi->dsp,lastrecon+offs[0],thisrecon,stride);
-        dsp_sub8x8(cpi->dsp,frame_ptr,lastrecon+offs[0],data,stride);
-      }
-    }
-    break;
-
-  case CODE_USING_GOLDEN:
-  case CODE_INTER_NO_MV:
-    dsp_copy8x8 (cpi->dsp, lastrecon, thisrecon, stride);
-    dsp_sub8x8(cpi->dsp, frame_ptr, lastrecon, data, stride);
-    break;
-  case CODE_INTRA:
-    dsp_sub8x8_128(cpi->dsp, frame_ptr, data, stride);
-    dsp_set8x8(cpi->dsp, 128, thisrecon, stride);
-    break;
+      else oc_enc_frag_sub(cpi,data,frame_ptr,lastrecon+mv_offs[0],stride);
+    }break;
   }
 
 #ifdef COLLECT_METRICS
@@ -636,7 +629,7 @@
         uncoded_dc += data[pi];
       }
     }else{
-      dsp_sub8x8(cpi->dsp, frame_ptr, cpi->lastrecon+bi, buffer, stride);
+      oc_enc_frag_sub(cpi,buffer,frame_ptr,cpi->lastrecon+bi,stride);
       for(pi=0;pi<64;pi++){
         uncoded_ssd += buffer[pi]*buffer[pi];
         uncoded_dc += buffer[pi];
@@ -646,7 +639,7 @@
   }
 
   /* transform */
-  dsp_fdct_short(cpi->dsp, data, buffer);
+  oc_enc_fdct8x8(cpi,buffer,data);
 
   /* collect rho metrics, quantize */
   {
@@ -693,26 +686,18 @@
   /* tokenize */
   cost = dct_tokenize_AC(cpi, fi, data, dequant, buffer, fi>=cpi->frag_n[0], stack);
 
-  /* reconstruct */
-  switch(nonzero){
-  case 0:
-    IDct1( data, dequant, buffer );
-    break;
-  case 1: case 2:
-    dsp_IDct3(cpi->dsp, data, dequant, buffer );
-    break;
-  case 3:case 4:case 5:case 6:case 7:case 8: case 9:
-    dsp_IDct10(cpi->dsp, data, dequant, buffer );
-    break;
-  default:
-    dsp_IDctSlow(cpi->dsp, data, dequant, buffer );
+  /*Reconstruct.*/
+  oc_enc_dequant_idct8x8(cpi,buffer,data,
+   nonzero+1,nonzero+1,dequant[0],(ogg_uint16_t *)dequant);
+  if(mode==CODE_INTRA)oc_enc_frag_recon_intra(cpi,thisrecon,stride,buffer);
+  else{
+    oc_enc_frag_recon_inter(cpi,thisrecon,
+     nmv_offs==1?lastrecon+mv_offs[0]:thisrecon,stride,buffer);
   }
 
-  dsp_recon8x8 (cpi->dsp, thisrecon, buffer, stride);
-
   if(!keyframe){
     /* in retrospect, should we have skipped this block? */
-    dsp_sub8x8(cpi->dsp, frame_ptr, thisrecon, buffer, stride);
+    oc_enc_frag_sub(cpi,buffer,frame_ptr,thisrecon,stride);
     for(pi=0;pi<64;pi++){
       coded_ssd+=buffer[pi]*buffer[pi];
       coded_dc+=buffer[pi];
@@ -1283,7 +1268,7 @@
 
   for(ti=0;ti<tn;ti++){
     int token = cpi->dct_token[group][ti];
-    int bits = cpi->HuffCodeLengthArray_VP3x[(ti<ty ? huffY : huffC)][token] + cpi->ExtraBitLengths_VP3x[token];
+    int bits = cpi->huff_codes[(ti<ty ? huffY : huffC)][token].nbits + OC_DCT_TOKEN_EXTRA_BITS[token];
 
     if(token>DCT_REPEAT_RUN4_TOKEN){
       /* not an EOB run; this token belongs to a single fragment */

Deleted: branches/theora-thusnelda/lib/enc/reconstruct.c
===================================================================
--- branches/theora-thusnelda/lib/enc/reconstruct.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/reconstruct.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -1,87 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id$
-
- ********************************************************************/
-
-#include "codec_internal.h"
-
-static void copy8x8__c (const unsigned char *src,
-			unsigned char *dest,
-			unsigned int stride)
-{
-  int j;
-  for ( j = 8; j ; --j ){
-    ((ogg_uint32_t*)dest)[0] = ((ogg_uint32_t*)src)[0];
-    ((ogg_uint32_t*)dest)[1] = ((ogg_uint32_t*)src)[1];
-    src+=stride;
-    dest+=stride;
-  }
-}
-
-static void copy8x8_half__c (const unsigned char *src1,
-			     const unsigned char *src2, 
-			     unsigned char *dest,
-			     unsigned int stride)
-{
-  int j;
-
-  for (j = 8; j; --j){
-    dest[0] = ((int)src1[0] + (int)src2[0]) >> 1;
-    dest[1] = ((int)src1[1] + (int)src2[1]) >> 1;
-    dest[2] = ((int)src1[2] + (int)src2[2]) >> 1;
-    dest[3] = ((int)src1[3] + (int)src2[3]) >> 1;
-    dest[4] = ((int)src1[4] + (int)src2[4]) >> 1;
-    dest[5] = ((int)src1[5] + (int)src2[5]) >> 1;
-    dest[6] = ((int)src1[6] + (int)src2[6]) >> 1;
-    dest[7] = ((int)src1[7] + (int)src2[7]) >> 1;
-
-    src1+=stride;
-    src2+=stride;
-    dest+=stride;
-  }
-}
-
-
-static void recon8x8__c (unsigned char *ReconPtr, 
-			 const ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
-{
-  ogg_uint32_t i;
-
-  for (i = 8; i; i--){
-    ReconPtr[0] = clamp255(ReconPtr[0] + ChangePtr[0]);
-    ReconPtr[1] = clamp255(ReconPtr[1] + ChangePtr[1]);
-    ReconPtr[2] = clamp255(ReconPtr[2] + ChangePtr[2]);
-    ReconPtr[3] = clamp255(ReconPtr[3] + ChangePtr[3]);
-    ReconPtr[4] = clamp255(ReconPtr[4] + ChangePtr[4]);
-    ReconPtr[5] = clamp255(ReconPtr[5] + ChangePtr[5]);
-    ReconPtr[6] = clamp255(ReconPtr[6] + ChangePtr[6]);
-    ReconPtr[7] = clamp255(ReconPtr[7] + ChangePtr[7]);
-
-    ChangePtr += 8;
-    ReconPtr += LineStep;
-  }
-}
-
-void dsp_recon_init (DspFunctions *funcs, ogg_uint32_t cpu_flags)
-{
-  funcs->copy8x8 = copy8x8__c;
-  funcs->copy8x8_half = copy8x8_half__c;
-  funcs->recon8x8 = recon8x8__c;
-#if defined(USE_ASM)
-  if (cpu_flags & OC_CPU_X86_MMX) {
-    dsp_mmx_recon_init(funcs);
-  }
-#endif
-}

Deleted: branches/theora-thusnelda/lib/enc/x86/dct_decode_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/dct_decode_mmx.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/x86/dct_decode_mmx.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -1,397 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008                *
- * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: dct_decode_mmx.c 15078 2008-06-27 22:07:19Z xiphmont $
-
- ********************************************************************/
-
-#include <stdlib.h>
-
-#include "../codec_internal.h"
-
-#if defined(USE_ASM)
-
-static const __attribute__((aligned(8),used)) ogg_int64_t OC_V3=
- 0x0003000300030003LL;
-static const __attribute__((aligned(8),used)) ogg_int64_t OC_V4=
- 0x0004000400040004LL;
-
-static void loop_filter_v(unsigned char *_pix,int _ystride,
-			  const ogg_int16_t *_ll){
-  long esi;
-  _pix-=_ystride*2;
-  __asm__ __volatile__(
-    /*mm0=0*/
-    "pxor %%mm0,%%mm0\n\t"
-    /*esi=_ystride*3*/
-    "lea (%[ystride],%[ystride],2),%[s]\n\t"
-    /*mm7=_pix[0...8]*/
-    "movq (%[pix]),%%mm7\n\t"
-    /*mm4=_pix[0...8+_ystride*3]*/
-    "movq (%[pix],%[s]),%%mm4\n\t"
-    /*mm6=_pix[0...8]*/
-    "movq %%mm7,%%mm6\n\t"
-    /*Expand unsigned _pix[0...3] to 16 bits.*/
-    "punpcklbw %%mm0,%%mm6\n\t"
-    "movq %%mm4,%%mm5\n\t"
-    /*Expand unsigned _pix[4...8] to 16 bits.*/
-    "punpckhbw %%mm0,%%mm7\n\t"
-    /*Expand other arrays too.*/
-    "punpcklbw %%mm0,%%mm4\n\t"
-    "punpckhbw %%mm0,%%mm5\n\t"
-    /*mm7:mm6=_p[0...8]-_p[0...8+_ystride*3]:*/
-    "psubw %%mm4,%%mm6\n\t"
-    "psubw %%mm5,%%mm7\n\t"
-    /*mm5=mm4=_pix[0...8+_ystride]*/
-    "movq (%[pix],%[ystride]),%%mm4\n\t"
-    /*mm1=mm3=mm2=_pix[0..8]+_ystride*2]*/
-    "movq (%[pix],%[ystride],2),%%mm2\n\t"
-    "movq %%mm4,%%mm5\n\t"
-    "movq %%mm2,%%mm3\n\t"
-    "movq %%mm2,%%mm1\n\t"
-    /*Expand these arrays.*/
-    "punpckhbw %%mm0,%%mm5\n\t"
-    "punpcklbw %%mm0,%%mm4\n\t"
-    "punpckhbw %%mm0,%%mm3\n\t"
-    "punpcklbw %%mm0,%%mm2\n\t"
-    /*Preload...*/
-    "movq %[OC_V3],%%mm0\n\t"
-    /*mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
-    "psubw %%mm5,%%mm3\n\t"
-    "psubw %%mm4,%%mm2\n\t"
-    /*Scale by 3.*/
-    "pmullw %%mm0,%%mm3\n\t"
-    "pmullw %%mm0,%%mm2\n\t"
-    /*Preload...*/
-    "movq %[OC_V4],%%mm0\n\t"
-    /*f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
-       3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/
-    "paddw %%mm7,%%mm3\n\t"
-    "paddw %%mm6,%%mm2\n\t"
-    /*Add 4.*/
-    "paddw %%mm0,%%mm3\n\t"
-    "paddw %%mm0,%%mm2\n\t"
-    /*"Divide" by 8.*/
-    "psraw $3,%%mm3\n\t"
-    "psraw $3,%%mm2\n\t"
-    /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the sepc.*/
-    /*Free up mm5.*/
-    "packuswb %%mm5,%%mm4\n\t"
-    /*mm0=L L L L*/
-    "movq (%[ll]),%%mm0\n\t"
-    /*if(R_i<-2L||R_i>2L)R_i=0:*/
-    "movq %%mm2,%%mm5\n\t"
-    "pxor %%mm6,%%mm6\n\t"
-    "movq %%mm0,%%mm7\n\t"
-    "psubw %%mm0,%%mm6\n\t"
-    "psllw $1,%%mm7\n\t"
-    "psllw $1,%%mm6\n\t"
-    /*mm2==R_3 R_2 R_1 R_0*/
-    /*mm5==R_3 R_2 R_1 R_0*/
-    /*mm6==-2L -2L -2L -2L*/
-    /*mm7==2L 2L 2L 2L*/
-    "pcmpgtw %%mm2,%%mm7\n\t"
-    "pcmpgtw %%mm6,%%mm5\n\t"
-    "pand %%mm7,%%mm2\n\t"
-    "movq %%mm0,%%mm7\n\t"
-    "pand %%mm5,%%mm2\n\t"
-    "psllw $1,%%mm7\n\t"
-    "movq %%mm3,%%mm5\n\t"
-    /*mm3==R_7 R_6 R_5 R_4*/
-    /*mm5==R_7 R_6 R_5 R_4*/
-    /*mm6==-2L -2L -2L -2L*/
-    /*mm7==2L 2L 2L 2L*/
-    "pcmpgtw %%mm3,%%mm7\n\t"
-    "pcmpgtw %%mm6,%%mm5\n\t"
-    "pand %%mm7,%%mm3\n\t"
-    "movq %%mm0,%%mm7\n\t"
-    "pand %%mm5,%%mm3\n\t"
-    /*if(R_i<-L)R_i'=R_i+2L;
-      if(R_i>L)R_i'=R_i-2L;
-      if(R_i<-L||R_i>L)R_i=-R_i':*/
-    "psraw $1,%%mm6\n\t"
-    "movq %%mm2,%%mm5\n\t"
-    "psllw $1,%%mm7\n\t"
-    /*mm2==R_3 R_2 R_1 R_0*/
-    /*mm5==R_3 R_2 R_1 R_0*/
-    /*mm6==-L -L -L -L*/
-    /*mm0==L L L L*/
-    /*mm5=R_i>L?FF:00*/
-    "pcmpgtw %%mm0,%%mm5\n\t"
-    /*mm6=-L>R_i?FF:00*/
-    "pcmpgtw %%mm2,%%mm6\n\t"
-    /*mm7=R_i>L?2L:0*/
-    "pand %%mm5,%%mm7\n\t"
-    /*mm2=R_i>L?R_i-2L:R_i*/
-    "psubw %%mm7,%%mm2\n\t"
-    "movq %%mm0,%%mm7\n\t"
-    /*mm5=-L>R_i||R_i>L*/
-    "por %%mm6,%%mm5\n\t"
-    "psllw $1,%%mm7\n\t"
-    /*mm7=-L>R_i?2L:0*/
-    "pand %%mm6,%%mm7\n\t"
-    "pxor %%mm6,%%mm6\n\t"
-    /*mm2=-L>R_i?R_i+2L:R_i*/
-    "paddw %%mm7,%%mm2\n\t"
-    "psubw %%mm0,%%mm6\n\t"
-    /*mm5=-L>R_i||R_i>L?-R_i':0*/
-    "pand %%mm2,%%mm5\n\t"
-    "movq %%mm0,%%mm7\n\t"
-    /*mm2=-L>R_i||R_i>L?0:R_i*/
-    "psubw %%mm5,%%mm2\n\t"
-    "psllw $1,%%mm7\n\t"
-    /*mm2=-L>R_i||R_i>L?-R_i':R_i*/
-    "psubw %%mm5,%%mm2\n\t"
-    "movq %%mm3,%%mm5\n\t"
-    /*mm3==R_7 R_6 R_5 R_4*/
-    /*mm5==R_7 R_6 R_5 R_4*/
-    /*mm6==-L -L -L -L*/
-    /*mm0==L L L L*/
-    /*mm6=-L>R_i?FF:00*/
-    "pcmpgtw %%mm3,%%mm6\n\t"
-    /*mm5=R_i>L?FF:00*/
-    "pcmpgtw %%mm0,%%mm5\n\t"
-    /*mm7=R_i>L?2L:0*/
-    "pand %%mm5,%%mm7\n\t"
-    /*mm2=R_i>L?R_i-2L:R_i*/
-    "psubw %%mm7,%%mm3\n\t"
-    "psllw $1,%%mm0\n\t"
-    /*mm5=-L>R_i||R_i>L*/
-    "por %%mm6,%%mm5\n\t"
-    /*mm0=-L>R_i?2L:0*/
-    "pand %%mm6,%%mm0\n\t"
-    /*mm3=-L>R_i?R_i+2L:R_i*/
-    "paddw %%mm0,%%mm3\n\t"
-    /*mm5=-L>R_i||R_i>L?-R_i':0*/
-    "pand %%mm3,%%mm5\n\t"
-    /*mm2=-L>R_i||R_i>L?0:R_i*/
-    "psubw %%mm5,%%mm3\n\t"
-    /*mm2=-L>R_i||R_i>L?-R_i':R_i*/
-    "psubw %%mm5,%%mm3\n\t"
-    /*Unfortunately, there's no unsigned byte+signed byte with unsigned
-       saturation op code, so we have to promote things back 16 bits.*/
-    "pxor %%mm0,%%mm0\n\t"
-    "movq %%mm4,%%mm5\n\t"
-    "punpcklbw %%mm0,%%mm4\n\t"
-    "punpckhbw %%mm0,%%mm5\n\t"
-    "movq %%mm1,%%mm6\n\t"
-    "punpcklbw %%mm0,%%mm1\n\t"
-    "punpckhbw %%mm0,%%mm6\n\t"
-    /*_pix[0...8+_ystride]+=R_i*/
-    "paddw %%mm2,%%mm4\n\t"
-    "paddw %%mm3,%%mm5\n\t"
-    /*_pix[0...8+_ystride*2]-=R_i*/
-    "psubw %%mm2,%%mm1\n\t"
-    "psubw %%mm3,%%mm6\n\t"
-    "packuswb %%mm5,%%mm4\n\t"
-    "packuswb %%mm6,%%mm1\n\t"
-    /*Write it back out.*/
-    "movq %%mm4,(%[pix],%[ystride])\n\t"
-    "movq %%mm1,(%[pix],%[ystride],2)\n\t"
-    :[s]"=&S"(esi)
-    :[pix]"r"(_pix),[ystride]"r"((long)_ystride),[ll]"r"(_ll),
-     [OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
-    :"memory"
-  );
-}
-
-/*This code implements the bulk of loop_filter_h().
-  Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all
-   four p0's to one register we must transpose the values in four mmx regs.
-  When half is done we repeat this for the rest.*/
-static void loop_filter_h4(unsigned char *_pix,long _ystride,
-			   const ogg_int16_t *_ll){
-  long esi;
-  long edi;
-  __asm__ __volatile__(
-    /*x x x x 3 2 1 0*/
-    "movd (%[pix]),%%mm0\n\t"
-    /*esi=_ystride*3*/
-    "lea (%[ystride],%[ystride],2),%[s]\n\t"
-    /*x x x x 7 6 5 4*/
-    "movd (%[pix],%[ystride]),%%mm1\n\t"
-    /*x x x x B A 9 8*/
-    "movd (%[pix],%[ystride],2),%%mm2\n\t"
-    /*x x x x F E D C*/
-    "movd (%[pix],%[s]),%%mm3\n\t"
-    /*mm0=7 3 6 2 5 1 4 0*/
-    "punpcklbw %%mm1,%%mm0\n\t"
-    /*mm2=F B E A D 9 C 8*/
-    "punpcklbw %%mm3,%%mm2\n\t"
-    /*mm1=7 3 6 2 5 1 4 0*/
-    "movq %%mm0,%%mm1\n\t"
-    /*mm0=F B 7 3 E A 6 2*/
-    "punpckhwd %%mm2,%%mm0\n\t"
-    /*mm1=D 9 5 1 C 8 4 0*/
-    "punpcklwd %%mm2,%%mm1\n\t"
-    "pxor %%mm7,%%mm7\n\t"
-    /*mm5=D 9 5 1 C 8 4 0*/
-    "movq %%mm1,%%mm5\n\t"
-    /*mm1=x C x 8 x 4 x 0==pix[0]*/
-    "punpcklbw %%mm7,%%mm1\n\t"
-    /*mm5=x D x 9 x 5 x 1==pix[1]*/
-    "punpckhbw %%mm7,%%mm5\n\t"
-    /*mm3=F B 7 3 E A 6 2*/
-    "movq %%mm0,%%mm3\n\t"
-    /*mm0=x E x A x 6 x 2==pix[2]*/
-    "punpcklbw %%mm7,%%mm0\n\t"
-    /*mm3=x F x B x 7 x 3==pix[3]*/
-    "punpckhbw %%mm7,%%mm3\n\t"
-    /*mm1=mm1-mm3==pix[0]-pix[3]*/
-    "psubw %%mm3,%%mm1\n\t"
-    /*Save a copy of pix[2] for later.*/
-    "movq %%mm0,%%mm4\n\t"
-    /*mm0=mm0-mm5==pix[2]-pix[1]*/
-    "psubw %%mm5,%%mm0\n\t"
-    /*Scale by 3.*/
-    "pmullw %[OC_V3],%%mm0\n\t"
-    /*f=mm1==_pix[0]-_pix[3]+ 3*(_pix[2]-_pix[1])*/
-    "paddw %%mm1,%%mm0\n\t"
-    /*Add 4.*/
-    "paddw %[OC_V4],%%mm0\n\t"
-    /*"Divide" by 8, producing the residuals R_i.*/
-    "psraw $3,%%mm0\n\t"
-    /*Now compute lflim of mm0 cf. Section 7.10 of the sepc.*/
-    /*mm6=L L L L*/
-    "movq (%[ll]),%%mm6\n\t"
-    /*if(R_i<-2L||R_i>2L)R_i=0:*/
-    "movq %%mm0,%%mm1\n\t"
-    "pxor %%mm2,%%mm2\n\t"
-    "movq %%mm6,%%mm3\n\t"
-    "psubw %%mm6,%%mm2\n\t"
-    "psllw $1,%%mm3\n\t"
-    "psllw $1,%%mm2\n\t"
-    /*mm0==R_3 R_2 R_1 R_0*/
-    /*mm1==R_3 R_2 R_1 R_0*/
-    /*mm2==-2L -2L -2L -2L*/
-    /*mm3==2L 2L 2L 2L*/
-    "pcmpgtw %%mm0,%%mm3\n\t"
-    "pcmpgtw %%mm2,%%mm1\n\t"
-    "pand %%mm3,%%mm0\n\t"
-    "pand %%mm1,%%mm0\n\t"
-    /*if(R_i<-L)R_i'=R_i+2L;
-      if(R_i>L)R_i'=R_i-2L;
-      if(R_i<-L||R_i>L)R_i=-R_i':*/
-    "psraw $1,%%mm2\n\t"
-    "movq %%mm0,%%mm1\n\t"
-    "movq %%mm6,%%mm3\n\t"
-    /*mm0==R_3 R_2 R_1 R_0*/
-    /*mm1==R_3 R_2 R_1 R_0*/
-    /*mm2==-L -L -L -L*/
-    /*mm6==L L L L*/
-    /*mm2=-L>R_i?FF:00*/
-    "pcmpgtw %%mm0,%%mm2\n\t"
-    /*mm1=R_i>L?FF:00*/
-    "pcmpgtw %%mm6,%%mm1\n\t"
-    /*mm3=2L 2L 2L 2L*/
-    "psllw $1,%%mm3\n\t"
-    /*mm6=2L 2L 2L 2L*/
-    "psllw $1,%%mm6\n\t"
-    /*mm3=R_i>L?2L:0*/
-    "pand %%mm1,%%mm3\n\t"
-    /*mm6=-L>R_i?2L:0*/
-    "pand %%mm2,%%mm6\n\t"
-    /*mm0=R_i>L?R_i-2L:R_i*/
-    "psubw %%mm3,%%mm0\n\t"
-    /*mm1=-L>R_i||R_i>L*/
-    "por %%mm2,%%mm1\n\t"
-    /*mm0=-L>R_i?R_i+2L:R_i*/
-    "paddw %%mm6,%%mm0\n\t"
-    /*mm1=-L>R_i||R_i>L?R_i':0*/
-    "pand %%mm0,%%mm1\n\t"
-    /*mm0=-L>R_i||R_i>L?0:R_i*/
-    "psubw %%mm1,%%mm0\n\t"
-    /*mm0=-L>R_i||R_i>L?-R_i':R_i*/
-    "psubw %%mm1,%%mm0\n\t"
-    /*_pix[1]+=R_i;*/
-    "paddw %%mm0,%%mm5\n\t"
-    /*_pix[2]-=R_i;*/
-    "psubw %%mm0,%%mm4\n\t"
-    /*mm5=x x x x D 9 5 1*/
-    "packuswb %%mm7,%%mm5\n\t"
-    /*mm4=x x x x E A 6 2*/
-    "packuswb %%mm7,%%mm4\n\t"
-    /*mm5=E D A 9 6 5 2 1*/
-    "punpcklbw %%mm4,%%mm5\n\t"
-    /*edi=6 5 2 1*/
-    "movd %%mm5,%%edi\n\t"
-    "movw %%di,1(%[pix])\n\t"
-    /*Why is there such a big stall here?*/
-    "psrlq $32,%%mm5\n\t"
-    "shrl $16,%%edi\n\t"
-    "movw %%di,1(%[pix],%[ystride])\n\t"
-    /*edi=E D A 9*/
-    "movd %%mm5,%%edi\n\t"
-    "movw %%di,1(%[pix],%[ystride],2)\n\t"
-    "shrl $16,%%edi\n\t"
-    "movw %%di,1(%[pix],%[s])\n\t"
-    :[s]"=&S"(esi),[d]"=&D"(edi),
-     [pix]"+r"(_pix),[ystride]"+r"(_ystride),[ll]"+r"(_ll)
-    :[OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
-    :"memory"
-  );
-}
-
-static void loop_filter_h(unsigned char *_pix,int _ystride,
-			  const ogg_int16_t *_ll){
-  _pix-=2;
-  loop_filter_h4(_pix,_ystride,_ll);
-  loop_filter_h4(_pix+(_ystride<<2),_ystride,_ll);
-}
- 
-static void loop_filter_mmx(CP_INSTANCE *cpi, int FLimit){
-  int j;
-  ogg_int16_t __attribute__((aligned(8)))  ll[4];
-  unsigned char *cp = cpi->frag_coded;
-  ogg_uint32_t *bp = cpi->frag_buffer_index;
-
-  if ( FLimit == 0 ) return;
-  ll[0]=ll[1]=ll[2]=ll[3]=FLimit;
-
-  for ( j = 0; j < 3 ; j++){
-    ogg_uint32_t *bp_begin = bp;
-    ogg_uint32_t *bp_end = bp + cpi->frag_n[j];
-    int stride = cpi->stride[j];
-    int h = cpi->frag_h[j];
-
-    while(bp<bp_end){
-      ogg_uint32_t *bp_left = bp;
-      ogg_uint32_t *bp_right = bp + h;
-      while(bp<bp_right){
-	if(cp[0]){
-	  if(bp>bp_left)
-	    loop_filter_h(&cpi->lastrecon[bp[0]],stride,ll);
-	  if(bp_left>bp_begin)
-	    loop_filter_v(&cpi->lastrecon[bp[0]],stride,ll);
-	  if(bp+1<bp_right && !cp[1])
-	    loop_filter_h(&cpi->lastrecon[bp[0]]+8,stride,ll);
-	  if(bp+h<bp_end && !cp[h])
-	    loop_filter_v(&cpi->lastrecon[bp[h]],stride,ll);
-	}
-	bp++;
-	cp++;
-      }
-    }
-  }
-
-  /*This needs to be removed when decode specific functions are implemented:*/
-  __asm__ __volatile__("emms\n\t");
-}
-
-/* install our implementation in the function table */
-void dsp_mmx_dct_decode_init(DspFunctions *funcs)
-{
-  funcs->LoopFilter = loop_filter_mmx;
-}
-
-#endif /* USE_ASM */

Deleted: branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -1,134 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
-
- ********************************************************************/
-
-#include <stdlib.h>
-
-#include "../codec_internal.h"
-#include "../dsp.h"
-
-#if defined(USE_ASM)
-
-static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x0080008000800080LL;
-
-#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
-#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
-#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
-
-#define SUB_LOOP                                                                 \
-    "  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */                    \
-    "  movq        (%1), %%mm1      \n\t" /* mm1 = ReconPtr */                   \
-    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */ \
-    "  movq        %%mm1, %%mm3     \n\t" /* dup to prepare for up conversion */ \
-    /* convert from UINT8 to INT16 */                                            \
-    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr) */             \
-    "  punpcklbw   %%mm7, %%mm1     \n\t" /* mm1 = INT16(ReconPtr) */            \
-    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr) */             \
-    "  punpckhbw   %%mm7, %%mm3     \n\t" /* mm3 = INT16(ReconPtr) */            \
-    /* start calculation */                                                      \
-    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - ReconPtr */         \
-    "  psubw       %%mm3, %%mm2     \n\t" /* mm2 = FiltPtr - ReconPtr */         \
-    "  movq        %%mm0,  (%2)     \n\t" /* write answer out */                 \
-    "  movq        %%mm2, 8(%2)     \n\t" /* write answer out */                 \
-    /* Increment pointers */                                                     \
-    "  add         $16, %2          \n\t"                                        \
-    "  add         %3, %0           \n\t"                                        \
-    "  add         %3, %1           \n\t"
-
-static void sub8x8__mmx (const unsigned char *FiltPtr, const unsigned char *ReconPtr,
-                         ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine)
-{
-  __asm__ __volatile__ (
-    "  .p2align 4                   \n\t"
-
-    "  pxor        %%mm7, %%mm7     \n\t"
-
-    SUB_LOOP
-    SUB_LOOP
-    SUB_LOOP
-    SUB_LOOP
-    SUB_LOOP
-    SUB_LOOP
-    SUB_LOOP
-    SUB_LOOP
-
-
-     : "+r" (FiltPtr),
-       "+r" (ReconPtr),
-       "+r" (DctInputPtr)
-
-     : "r" ((unsigned long)PixelsPerLine)
-     : "memory"
-  );
-}
-
-#define SUB_128_LOOP                                                             \
-    "  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */                    \
-    "  movq        %%mm0, %%mm2     \n\t" /* dup to prepare for up conversion */ \
-    /* convert from UINT8 to INT16 */                                            \
-    "  punpcklbw   %%mm7, %%mm0     \n\t" /* mm0 = INT16(FiltPtr) */             \
-    "  punpckhbw   %%mm7, %%mm2     \n\t" /* mm2 = INT16(FiltPtr) */             \
-    /* start calculation */                                                      \
-    "  psubw       %%mm1, %%mm0     \n\t" /* mm0 = FiltPtr - 128 */              \
-    "  psubw       %%mm1, %%mm2     \n\t" /* mm2 = FiltPtr - 128 */              \
-    "  movq        %%mm0,  (%1)     \n\t" /* write answer out */                 \
-    "  movq        %%mm2, 8(%1)     \n\t" /* write answer out */                 \
-    /* Increment pointers */                                                     \
-    "  add         $16, %1          \n\t"                                        \
-    "  add         %2, %0           \n\t"
-
-static void sub8x8_128__mmx (const unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
-                             ogg_uint32_t PixelsPerLine)
-{
-
-  __asm__ __volatile__ (
-    "  .p2align 4                   \n\t"
-
-    "  pxor        %%mm7, %%mm7     \n\t"
-    "  movq        %[V128], %%mm1   \n\t"
-
-    SUB_128_LOOP
-    SUB_128_LOOP
-    SUB_128_LOOP
-    SUB_128_LOOP
-    SUB_128_LOOP
-    SUB_128_LOOP
-    SUB_128_LOOP
-    SUB_128_LOOP
-
-     : "+r" (FiltPtr),
-       "+r" (DctInputPtr)
-     : "r" ((unsigned long)PixelsPerLine),
-       [V128] "m" (V128)
-     : "memory"
-  );
-}
-
-static void restore_fpu (void)
-{
-  __asm__ __volatile__ (
-    "  emms                         \n\t"
-  );
-}
-
-void dsp_mmx_init(DspFunctions *funcs)
-{
-  funcs->restore_fpu = restore_fpu;
-  funcs->sub8x8 = sub8x8__mmx;
-  funcs->sub8x8_128 = sub8x8_128__mmx;
-}
-
-#endif /* USE_ASM */

Deleted: branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -1,154 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: dsp_mmxext.c 14579 2008-03-12 06:42:40Z xiphmont $
-
- ********************************************************************/
-
-#include <stdlib.h>
-
-#include "../codec_internal.h"
-#include "../dsp.h"
-
-#if defined(USE_ASM)
-
-#define SAD_LOOP                                                                    \
-    "  movq (%1), %%mm0             \n\t"       /* take 8 bytes */                  \
-    "  movq (%2), %%mm1             \n\t"                                           \
-    "  psadbw %%mm1, %%mm0          \n\t"                                           \
-    "  add %3, %1                   \n\t"       /* Inc pointer into the new data */ \
-    "  paddw %%mm0, %%mm7           \n\t"       /* accumulate difference... */      \
-    "  add %3, %2                   \n\t"       /* Inc pointer into ref data */
-
-static ogg_uint32_t sad8x8__mmxext (const unsigned char *ptr1, const unsigned char *ptr2, 
-                                    ogg_uint32_t stride)
-{
-  ogg_uint32_t  DiffVal;
-
-  __asm__ __volatile__ (
-    "  .p2align 4                   \n\t"
-    "  pxor %%mm7, %%mm7            \n\t"       /* mm7 contains the result */
-
-    SAD_LOOP
-    SAD_LOOP
-    SAD_LOOP
-    SAD_LOOP
-    SAD_LOOP
-    SAD_LOOP
-    SAD_LOOP
-
-    "  movq (%1), %%mm0             \n\t"       /* take 8 bytes */
-    "  movq (%2), %%mm1             \n\t"
-    "  psadbw %%mm1, %%mm0          \n\t"
-    "  paddw %%mm0, %%mm7           \n\t"       /* accumulate difference... */
-    "  movd %%mm7, %0               \n\t"
-
-     : "=r" (DiffVal),
-       "+r" (ptr1), 
-       "+r" (ptr2) 
-     : "r" ((unsigned long)stride)
-     : "memory"
-  );
-
-  return DiffVal;
-}
-
-#define SAD_THRES_LOOP                                                              \
-    "  movq (%1), %%mm0             \n\t"       /* take 8 bytes */                  \
-    "  movq (%2), %%mm1             \n\t"                                           \
-    "  psadbw %%mm1, %%mm0          \n\t"                                           \
-    "  add %3, %1                   \n\t"       /* Inc pointer into the new data */ \
-    "  paddw %%mm0, %%mm7           \n\t"       /* accumulate difference... */      \
-    "  add %3, %2                   \n\t"       /* Inc pointer into ref data */
-
-static ogg_uint32_t sad8x8_thres__mmxext (const unsigned char *ptr1, const unsigned char *ptr2, 
-                                          ogg_uint32_t stride, ogg_uint32_t thres)
-{
-  ogg_uint32_t  DiffVal;
-
-  __asm__ __volatile__ (
-    "  .p2align 4                   \n\t"
-    "  pxor %%mm7, %%mm7            \n\t"       /* mm7 contains the result */
-
-    SAD_THRES_LOOP
-    SAD_THRES_LOOP
-    SAD_THRES_LOOP
-    SAD_THRES_LOOP
-    SAD_THRES_LOOP
-    SAD_THRES_LOOP
-    SAD_THRES_LOOP
-    SAD_THRES_LOOP
-
-    "  movd %%mm7, %0               \n\t"
-
-     : "=r" (DiffVal),
-       "+r" (ptr1), 
-       "+r" (ptr2) 
-     : "r" ((unsigned long)stride)
-     : "memory"
-  );
-
-  return DiffVal;
-}
-
-#define SAD_XY2_THRES_LOOP                                                          \
-    "  movq (%1), %%mm0             \n\t"       /* take 8 bytes */                  \
-    "  movq (%2), %%mm1             \n\t"                                           \
-    "  movq (%3), %%mm2             \n\t"                                           \
-    "  pavgb %%mm2, %%mm1           \n\t"                                           \
-    "  psadbw %%mm1, %%mm0          \n\t"                                           \
-                                                                                    \
-    "  add %4, %1                   \n\t"       /* Inc pointer into the new data */ \
-    "  paddw %%mm0, %%mm7           \n\t"       /* accumulate difference... */      \
-    "  add %4, %2                   \n\t"       /* Inc pointer into ref data */     \
-    "  add %4, %3                   \n\t"       /* Inc pointer into ref data */
-
-static ogg_uint32_t sad8x8_xy2_thres__mmxext (const unsigned char *SrcData, const unsigned char *RefDataPtr1,
-                                              const unsigned char *RefDataPtr2, ogg_uint32_t Stride,
-                                              ogg_uint32_t thres)
-{
-  ogg_uint32_t  DiffVal;
-
-  __asm__ __volatile__ (
-    "  .p2align 4                   \n\t"
-    "  pxor %%mm7, %%mm7            \n\t"       /* mm7 contains the result */
-
-	SAD_XY2_THRES_LOOP
-    SAD_XY2_THRES_LOOP
-	SAD_XY2_THRES_LOOP
-    SAD_XY2_THRES_LOOP
-	SAD_XY2_THRES_LOOP
-    SAD_XY2_THRES_LOOP
-	SAD_XY2_THRES_LOOP
-    SAD_XY2_THRES_LOOP
-
-    "  movd %%mm7, %0               \n\t"
-     : "=m" (DiffVal),
-       "+r" (SrcData),
-       "+r" (RefDataPtr1),
-       "+r" (RefDataPtr2)
-     : "r" ((unsigned long)Stride)
-     : "memory"
-  );
-
-  return DiffVal;
-}
-
-void dsp_mmxext_init(DspFunctions *funcs)
-{
-  funcs->sad8x8 = sad8x8__mmxext;
-  funcs->sad8x8_thres = sad8x8_thres__mmxext;
-  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmxext;
-}
-
-#endif /* USE_ASM */

Modified: branches/theora-thusnelda/lib/enc/x86/dsp_sse2.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/dsp_sse2.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/x86/dsp_sse2.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -14,14 +14,11 @@
   last mod: $Id$
 
  ********************************************************************/
+#include <stddef.h>
+#include "x86enc.h"
 
-#include <stdlib.h>
+#if defined(OC_X86_ASM)
 
-#include "codec_internal.h"
-#include "dsp.h"
-
-#if defined(USE_ASM)
-
 static int find_nonzero__sse2(ogg_int16_t *q, int in){
   int ret,tmp,tmp2;
 

Deleted: branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -1,335 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************/
-
-/* mmx fdct implementation for x86_64 */
-/* $Id: fdct_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $ */
-
-#include "theora/theora.h"
-#include "../codec_internal.h"
-#include "../dsp.h"
-
-#if defined(USE_ASM)
-
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC1S7 = 0x0fb15fb15fb15fb15LL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC2S6 = 0x0ec83ec83ec83ec83LL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC4S4 = 0x0b505b505b505b505LL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC6S2 = 0x061f861f861f861f8LL;
-static const __attribute__ ((aligned(8),used)) ogg_int64_t xC7S1 = 0x031f131f131f131f1LL;
-
-/* execute stage 1 of forward DCT */
-#define Fdct_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,temp)                        \
-  "  movq      " #ip0 ", %%mm0      \n\t"                                     \
-  "  movq      " #ip1 ", %%mm1      \n\t"                                     \
-  "  movq      " #ip3 ", %%mm2      \n\t"                                     \
-  "  movq      " #ip5 ", %%mm3      \n\t"                                     \
-  "  movq        %%mm0, %%mm4       \n\t"                                     \
-  "  movq        %%mm1, %%mm5       \n\t"                                     \
-  "  movq        %%mm2, %%mm6       \n\t"                                     \
-  "  movq        %%mm3, %%mm7       \n\t"                                     \
-                                                                              \
-  "  paddsw    " #ip7 ", %%mm0      \n\t" /* mm0 = ip0 + ip7 = is07 */        \
-  "  paddsw    " #ip2 ", %%mm1      \n\t" /* mm1 = ip1 + ip2 = is12 */        \
-  "  paddsw    " #ip4 ", %%mm2      \n\t" /* mm2 = ip3 + ip4 = is34 */        \
-  "  paddsw    " #ip6 ", %%mm3      \n\t" /* mm3 = ip5 + ip6 = is56 */        \
-  "  psubsw    " #ip7 ", %%mm4      \n\t" /* mm4 = ip0 - ip7 = id07 */        \
-  "  psubsw    " #ip2 ", %%mm5      \n\t" /* mm5 = ip1 - ip2 = id12 */        \
-                                                                              \
-  "  psubsw      %%mm2, %%mm0       \n\t" /* mm0 = is07 - is34 */             \
-                                                                              \
-  "  paddsw      %%mm2, %%mm2       \n\t"                                     \
-                                                                              \
-  "  psubsw    " #ip4 ", %%mm6      \n\t" /* mm6 = ip3 - ip4 = id34 */        \
-                                                                              \
-  "  paddsw      %%mm0, %%mm2       \n\t" /* mm2 = is07 + is34 = is0734 */    \
-  "  psubsw      %%mm3, %%mm1       \n\t" /* mm1 = is12 - is56 */             \
-  "  movq        %%mm0," #temp "    \n\t" /* Save is07 - is34 to free mm0; */ \
-  "  paddsw      %%mm3, %%mm3       \n\t"                                     \
-  "  paddsw      %%mm1, %%mm3       \n\t" /* mm3 = is12 + 1s56  = is1256 */   \
-                                                                              \
-  "  psubsw    " #ip6 ", %%mm7      \n\t" /* mm7 = ip5 - ip6 = id56 */        \
-  /* ------------------------------------------------------------------- */   \
-  "  psubsw      %%mm7, %%mm5       \n\t" /* mm5 = id12 - id56 */             \
-  "  paddsw      %%mm7, %%mm7       \n\t"                                     \
-  "  paddsw      %%mm5, %%mm7       \n\t" /* mm7 = id12 + id56 */             \
-  /* ------------------------------------------------------------------- */   \
-  "  psubsw      %%mm3, %%mm2       \n\t" /* mm2 = is0734 - is1256 */         \
-  "  paddsw      %%mm3, %%mm3       \n\t"                                     \
-                                                                              \
-  "  movq        %%mm2, %%mm0       \n\t" /* make a copy */                   \
-  "  paddsw      %%mm2, %%mm3       \n\t" /* mm3 = is0734 + is1256 */         \
-                                                                              \
-  "  pmulhw      %[xC4S4], %%mm0    \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */ \
-  "  paddw       %%mm2, %%mm0       \n\t" /* mm0 = xC4S4 * ( is0734 - is1256 ) */ \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-  "  paddw       %%mm2, %%mm0       \n\t" /* Truncate mm0, now it is op[4] */ \
-                                                                              \
-  "  movq        %%mm3, %%mm2       \n\t"                                     \
-  "  movq        %%mm0," #ip4 "     \n\t" /* save ip4, now mm0,mm2 are free */ \
-                                                                              \
-  "  movq        %%mm3, %%mm0       \n\t"                                     \
-  "  pmulhw      %[xC4S4], %%mm3    \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
-                                                                              \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-  "  paddw       %%mm0, %%mm3       \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 )    */ \
-  "  paddw       %%mm2, %%mm3       \n\t" /* Truncate mm3, now it is op[0] */ \
-                                                                              \
-  "  movq        %%mm3," #ip0 "     \n\t"                                     \
-  /* ------------------------------------------------------------------- */   \
-  "  movq      " #temp ", %%mm3     \n\t" /* mm3 = irot_input_y */            \
-  "  pmulhw      %[xC2S6], %%mm3     \n\t" /* mm3 = xC2S6 * irot_input_y - irot_input_y */ \
-                                                                              \
-  "  movq      " #temp ", %%mm2     \n\t"                                     \
-  "  movq        %%mm2, %%mm0       \n\t"                                     \
-                                                                              \
-  "  psrlw       $15, %%mm2         \n\t" /* mm3 = xC2S6 * irot_input_y */    \
-  "  paddw       %%mm0, %%mm3       \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
-  "  movq        %%mm5, %%mm0       \n\t"                                     \
-                                                                              \
-  "  movq        %%mm5, %%mm2       \n\t"                                     \
-  "  pmulhw      %[xC6S2], %%mm0    \n\t" /* mm0 = xC6S2 * irot_input_x */    \
-                                                                              \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-  "  paddw       %%mm2, %%mm0       \n\t" /* Truncated */                     \
-                                                                              \
-  "  paddsw      %%mm0, %%mm3       \n\t" /* ip[2] */                         \
-  "  movq        %%mm3," #ip2 "     \n\t" /* Save ip2 */                      \
-                                                                              \
-  "  movq        %%mm5, %%mm0       \n\t"                                     \
-  "  movq        %%mm5, %%mm2       \n\t"                                     \
-                                                                              \
-  "  pmulhw      %[xC2S6], %%mm5     \n\t" /* mm5 = xC2S6 * irot_input_x - irot_input_x */ \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  movq      " #temp ", %%mm3     \n\t"                                     \
-  "  paddw       %%mm0, %%mm5       \n\t" /* mm5 = xC2S6 * irot_input_x */    \
-                                                                              \
-  "  paddw       %%mm2, %%mm5       \n\t" /* Truncated */                     \
-  "  movq        %%mm3, %%mm2       \n\t"                                     \
-                                                                              \
-  "  pmulhw      %[xC6S2], %%mm3    \n\t" /* mm3 = xC6S2 * irot_input_y */    \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
-  "  psubsw      %%mm5, %%mm3       \n\t"                                     \
-                                                                              \
-  "  movq        %%mm3," #ip6 "     \n\t"                                     \
-  /* ------------------------------------------------------------------- */   \
-  "  movq        %[xC4S4], %%mm0    \n\t"                                     \
-  "  movq        %%mm1, %%mm2       \n\t"                                     \
-  "  movq        %%mm1, %%mm3       \n\t"                                     \
-                                                                              \
-  "  pmulhw      %%mm0, %%mm1       \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm3, %%mm1       \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) */ \
-  "  paddw       %%mm2, %%mm1       \n\t" /* Truncate mm1, now it is icommon_product1 */ \
-                                                                              \
-  "  movq        %%mm7, %%mm2       \n\t"                                     \
-  "  movq        %%mm7, %%mm3       \n\t"                                     \
-                                                                              \
-  "  pmulhw      %%mm0, %%mm7       \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm3, %%mm7       \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
-  "  paddw       %%mm2, %%mm7       \n\t" /* Truncate mm7, now it is icommon_product2 */ \
-  /* ------------------------------------------------------------------- */   \
-  "  pxor        %%mm0, %%mm0       \n\t" /* Clear mm0 */                     \
-  "  psubsw      %%mm6, %%mm0       \n\t" /* mm0 = - id34 */                  \
-                                                                              \
-  "  psubsw      %%mm7, %%mm0       \n\t" /* mm0 = - ( id34 + idcommon_product2 ) */ \
-  "  paddsw      %%mm6, %%mm6       \n\t"                                     \
-  "  paddsw      %%mm0, %%mm6       \n\t" /* mm6 = id34 - icommon_product2 */ \
-                                                                              \
-  "  psubsw      %%mm1, %%mm4       \n\t" /* mm4 = id07 - icommon_product1 */ \
-  "  paddsw      %%mm1, %%mm1       \n\t"                                     \
-  "  paddsw      %%mm4, %%mm1       \n\t" /* mm1 = id07 + icommon_product1 */ \
-  /* ------------------------------------------------------------------- */   \
-  "  movq        %[xC1S7], %%mm7    \n\t"                                     \
-  "  movq        %%mm1, %%mm2       \n\t"                                     \
-                                                                              \
-  "  movq        %%mm1, %%mm3       \n\t"                                     \
-  "  pmulhw      %%mm7, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x - irot_input_x */ \
-                                                                              \
-  "  movq        %[xC7S1], %%mm7    \n\t"                                     \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm3, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x */    \
-  "  paddw       %%mm2, %%mm1       \n\t" /* Trucated */                      \
-                                                                              \
-  "  pmulhw      %%mm7, %%mm3       \n\t" /* mm3 = xC7S1 * irot_input_x */    \
-  "  paddw       %%mm2, %%mm3       \n\t" /* Truncated */                     \
-                                                                              \
-  "  movq        %%mm0, %%mm5       \n\t"                                     \
-  "  movq        %%mm0, %%mm2       \n\t"                                     \
-                                                                              \
-  "  movq        %[xC1S7], %%mm7    \n\t"                                     \
-  "  pmulhw      %%mm7, %%mm0       \n\t" /* mm0 = xC1S7 * irot_input_y - irot_input_y */ \
-                                                                              \
-  "  movq        %[xC7S1], %%mm7    \n\t"                                     \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm5, %%mm0       \n\t" /* mm0 = xC1S7 * irot_input_y */    \
-  "  paddw       %%mm2, %%mm0       \n\t" /* Truncated */                     \
-                                                                              \
-  "  pmulhw      %%mm7, %%mm5       \n\t" /* mm5 = xC7S1 * irot_input_y */    \
-  "  paddw       %%mm2, %%mm5       \n\t" /* Truncated */                     \
-                                                                              \
-  "  psubsw      %%mm5, %%mm1       \n\t" /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */ \
-  "  paddsw      %%mm0, %%mm3       \n\t" /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = ip7 */ \
-                                                                              \
-  "  movq        %%mm1," #ip1 "     \n\t"                                     \
-  "  movq        %%mm3," #ip7 "     \n\t"                                     \
-  /* ------------------------------------------------------------------- */   \
-  "  movq        %[xC3S5], %%mm0    \n\t"                                     \
-  "  movq        %[xC5S3], %%mm1    \n\t"                                     \
-                                                                              \
-  "  movq        %%mm6, %%mm5       \n\t"                                     \
-  "  movq        %%mm6, %%mm7       \n\t"                                     \
-                                                                              \
-  "  movq        %%mm4, %%mm2       \n\t"                                     \
-  "  movq        %%mm4, %%mm3       \n\t"                                     \
-                                                                              \
-  "  pmulhw      %%mm0, %%mm4       \n\t" /* mm4 = xC3S5 * irot_input_x - irot_input_x */ \
-  "  pmulhw      %%mm1, %%mm6       \n\t" /* mm6 = xC5S3 * irot_input_y - irot_input_y */ \
-                                                                              \
-  "  psrlw       $15, %%mm2         \n\t"                                     \
-  "  psrlw       $15, %%mm5         \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm3, %%mm4       \n\t" /* mm4 = xC3S5 * irot_input_x */    \
-  "  paddw       %%mm7, %%mm6       \n\t" /* mm6 = xC5S3 * irot_input_y */    \
-                                                                              \
-  "  paddw       %%mm2, %%mm4       \n\t" /* Truncated */                     \
-  "  paddw       %%mm5, %%mm6       \n\t" /* Truncated */                     \
-                                                                              \
-  "  psubsw      %%mm6, %%mm4       \n\t" /* ip3 */                           \
-  "  movq        %%mm4," #ip3 "     \n\t"                                     \
-                                                                              \
-  "  movq        %%mm3, %%mm4       \n\t"                                     \
-  "  movq        %%mm7, %%mm6       \n\t"                                     \
-                                                                              \
-  "  pmulhw      %%mm1, %%mm3       \n\t" /* mm3 = xC5S3 * irot_input_x - irot_input_x */ \
-  "  pmulhw      %%mm0, %%mm7       \n\t" /* mm7 = xC3S5 * irot_input_y - irot_input_y */ \
-                                                                              \
-  "  paddw       %%mm2, %%mm4       \n\t"                                     \
-  "  paddw       %%mm5, %%mm6       \n\t"                                     \
-                                                                              \
-  "  paddw       %%mm4, %%mm3       \n\t" /* mm3 = xC5S3 * irot_input_x */    \
-  "  paddw       %%mm6, %%mm7       \n\t" /* mm7 = xC3S5 * irot_input_y */    \
-                                                                              \
-  "  paddw       %%mm7, %%mm3       \n\t" /* ip5 */                           \
-  "  movq        %%mm3," #ip5 "     \n\t"
-
-#define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,                  \
-                      op0,op1,op2,op3,op4,op5,op6,op7)                  \
-  "  movq      " #ip0 ", %%mm0      \n\t" /* mm0 = a0 a1 a2 a3 */       \
-  "  movq      " #ip4 ", %%mm4      \n\t" /* mm4 = e4 e5 e6 e7 */       \
-  "  movq      " #ip1 ", %%mm1      \n\t" /* mm1 = b0 b1 b2 b3 */       \
-  "  movq      " #ip5 ", %%mm5      \n\t" /* mm5 = f4 f5 f6 f7 */       \
-  "  movq      " #ip2 ", %%mm2      \n\t" /* mm2 = c0 c1 c2 c3 */       \
-  "  movq      " #ip6 ", %%mm6      \n\t" /* mm6 = g4 g5 g6 g7 */       \
-  "  movq      " #ip3 ", %%mm3      \n\t" /* mm3 = d0 d1 d2 d3 */       \
-  "  movq        %%mm1," #op1 "     \n\t" /* save  b0 b1 b2 b3 */       \
-  "  movq      " #ip7 ", %%mm7      \n\t" /* mm7 = h0 h1 h2 h3 */       \
-   /* Transpose 2x8 block */                                            \
-  "  movq        %%mm4, %%mm1       \n\t" /* mm1 = e3 e2 e1 e0 */       \
-  "  punpcklwd   %%mm5, %%mm4       \n\t" /* mm4 = f1 e1 f0 e0 */       \
-  "  movq        %%mm0," #op0 "     \n\t" /* save a3 a2 a1 a0  */       \
-  "  punpckhwd   %%mm5, %%mm1       \n\t" /* mm1 = f3 e3 f2 e2 */       \
-  "  movq        %%mm6, %%mm0       \n\t" /* mm0 = g3 g2 g1 g0 */       \
-  "  punpcklwd   %%mm7, %%mm6       \n\t" /* mm6 = h1 g1 h0 g0 */       \
-  "  movq        %%mm4, %%mm5       \n\t" /* mm5 = f1 e1 f0 e0 */       \
-  "  punpckldq   %%mm6, %%mm4       \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \
-  "  punpckhdq   %%mm6, %%mm5       \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \
-  "  movq        %%mm1, %%mm6       \n\t" /* mm6 = f3 e3 f2 e2 */       \
-  "  movq        %%mm4," #op4 "     \n\t"                               \
-  "  punpckhwd   %%mm7, %%mm0       \n\t" /* mm0 = h3 g3 h2 g2 */       \
-  "  movq        %%mm5," #op5 "     \n\t"                               \
-  "  punpckhdq   %%mm0, %%mm6       \n\t" /* mm6 = h3 g3 f3 e3 = MM7 */ \
-  "  movq      " #op0 ", %%mm4      \n\t" /* mm4 = a3 a2 a1 a0 */       \
-  "  punpckldq   %%mm0, %%mm1       \n\t" /* mm1 = h2 g2 f2 e2 = MM6 */ \
-  "  movq      " #op1 ", %%mm5      \n\t" /* mm5 = b3 b2 b1 b0 */       \
-  "  movq        %%mm4, %%mm0       \n\t" /* mm0 = a3 a2 a1 a0 */       \
-  "  movq        %%mm6," #op7 "     \n\t"                               \
-  "  punpcklwd   %%mm5, %%mm0       \n\t" /* mm0 = b1 a1 b0 a0 */       \
-  "  movq        %%mm1," #op6 "     \n\t"                               \
-  "  punpckhwd   %%mm5, %%mm4       \n\t" /* mm4 = b3 a3 b2 a2 */       \
-  "  movq        %%mm2, %%mm5       \n\t" /* mm5 = c3 c2 c1 c0 */       \
-  "  punpcklwd   %%mm3, %%mm2       \n\t" /* mm2 = d1 c1 d0 c0 */       \
-  "  movq        %%mm0, %%mm1       \n\t" /* mm1 = b1 a1 b0 a0 */       \
-  "  punpckldq   %%mm2, %%mm0       \n\t" /* mm0 = d0 c0 b0 a0 = MM0 */ \
-  "  punpckhdq   %%mm2, %%mm1       \n\t" /* mm1 = d1 c1 b1 a1 = MM1 */ \
-  "  movq        %%mm4, %%mm2       \n\t" /* mm2 = b3 a3 b2 a2 */       \
-  "  movq        %%mm0," #op0 "     \n\t"                               \
-  "  punpckhwd   %%mm3, %%mm5       \n\t" /* mm5 = d3 c3 d2 c2 */       \
-  "  movq        %%mm1," #op1 "     \n\t"                               \
-  "  punpckhdq   %%mm5, %%mm4       \n\t" /* mm4 = d3 c3 b3 a3 = MM3 */ \
-  "  punpckldq   %%mm5, %%mm2       \n\t" /* mm2 = d2 c2 b2 a2 = MM2 */ \
-  "  movq        %%mm4," #op3 "     \n\t"                               \
-  "  movq        %%mm2," #op2 "     \n\t"
-
-
-/* This performs a 2D Forward DCT on an 8x8 block with short
-   coefficients. We try to do the truncation to match the C
-   version. */
-static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData)
-{
-  ogg_int16_t __attribute__((aligned(8))) temp[8*8];
-
-  __asm__ __volatile__ (
-    "  .p2align 4                   \n\t"
-    /*
-     * Input data is an 8x8 block.  To make processing of the data more efficent
-     * we will transpose the block of data to two 4x8 blocks???
-     */
-    Transpose_mmx (  (%0), 16(%0), 32(%0), 48(%0),  8(%0), 24(%0), 40(%0), 56(%0),
-                     (%1), 16(%1), 32(%1), 48(%1),  8(%1), 24(%1), 40(%1), 56(%1))
-    Fdct_mmx      (  (%1), 16(%1), 32(%1), 48(%1),  8(%1), 24(%1), 40(%1), 56(%1), (%2))
-
-    Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
-                   64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
-    Fdct_mmx      (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
-
-    Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
-                    0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
-    Fdct_mmx      ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
-
-    Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
-                    8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
-    Fdct_mmx      ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
-
-    "  emms                         \n\t"
-
-    : "+r" (InputData),
-      "+r" (OutputData)
-    : "r" (temp),
-      [xC1S7] "m" (xC1S7),      /* gcc 3.1+ allows named asm parameters */
-      [xC2S6] "m" (xC2S6),
-      [xC3S5] "m" (xC3S5),
-      [xC4S4] "m" (xC4S4),
-      [xC5S3] "m" (xC5S3),
-      [xC6S2] "m" (xC6S2),
-      [xC7S1] "m" (xC7S1)
-    : "memory"
-  );
-}
-
-/* install our implementation in the function table */
-void dsp_mmx_fdct_init(DspFunctions *funcs)
-{
-  funcs->fdct_short = fdct_short__mmx;
-}
-
-#endif /* USE_ASM */

Deleted: branches/theora-thusnelda/lib/enc/x86/fdct_sse2.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/fdct_sse2.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/x86/fdct_sse2.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -1,569 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************/
-/*SSE2 fDCT implementation for x86_64.*/
-/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
-
-#include "../codec_internal.h"
-#include "../dsp.h"
-
-#if defined(USE_ASM)
-
-# define OC_FDCT8x8 \
- /*Note: xmm15={0,0,0,0,0,0,0,0} and xmm14={-1,-1,-1,-1,-1,-1,-1,-1} */ \
- "#OC_FDCT8\n\t" \
- /*Stage 1:*/ \
- "movdqa %%xmm0,%%xmm11\n\t" \
- "movdqa %%xmm1,%%xmm10\n\t" \
- "movdqa %%xmm2,%%xmm9\n\t" \
- "movdqa %%xmm3,%%xmm8\n\t" \
- /*xmm11=t7'=t0-t7*/ \
- "psubw %%xmm7,%%xmm11\n\t" \
- /*xmm10=t6'=t1-t6*/ \
- "psubw %%xmm6,%%xmm10\n\t" \
- /*xmm9=t5'=t2-t5*/ \
- "psubw %%xmm5,%%xmm9\n\t" \
- /*xmm8=t4'=t3-t4*/ \
- "psubw %%xmm4,%%xmm8\n\t" \
- /*xmm0=t0'=t0+t7*/ \
- "paddw %%xmm7,%%xmm0\n\t" \
- /*xmm1=t1'=t1+t6*/ \
- "paddw %%xmm6,%%xmm1\n\t" \
- /*xmm5=t2'=t2+t5*/ \
- "paddw %%xmm2,%%xmm5\n\t" \
- /*xmm4=t3'=t3+t4*/ \
- "paddw %%xmm3,%%xmm4\n\t" \
- /*xmm2,3,6,7 are now free.*/ \
- /*Stage 2:*/ \
- "movdqa %%xmm0,%%xmm3\n\t" \
- "mov $0x6A0A6A0A,%k[a]\n\t" \
- "movdqa %%xmm1,%%xmm2\n\t" \
- "movd %k[a],%%xmm13\n\t" \
- "movdqa %%xmm10,%%xmm6\n\t" \
- "pshufd $00,%%xmm13,%%xmm13\n\t" \
- /*xmm2=t2''=t1'-t2'*/ \
- "psubw %%xmm5,%%xmm2\n\t" \
- "mov $0xB500,%k[a]\n\t" \
- /*xmm3=t3''=t0'-t3'*/ \
- "psubw %%xmm4,%%xmm3\n\t" \
- "movd %k[a],%%xmm12\n\t" \
- /*xmm10=t5''=t6'-t5'*/ \
- "psubw %%xmm9,%%xmm10\n\t" \
- "pshufd $00,%%xmm12,%%xmm12\n\t" \
- /*xmm4=t0''=t0'+t3'*/ \
- "paddw %%xmm0,%%xmm4\n\t" \
- /*xmm1=t1''=t1'+t2'*/ \
- "paddw %%xmm5,%%xmm1\n\t" \
- /*xmm6=t6''=t6'+t5'*/ \
- "paddw %%xmm9,%%xmm6\n\t" \
- /*xmm0,xmm5,xmm9 are now free.*/ \
- /*Stage 3:*/ \
- /*Just think, the next 12 instructions would be 1 or 2 instructions on a \
-    proper DSP with a signed 16x16->32 MAC.*/ \
- /*xmm10/xmm5=t5''*27146*/ \
- "movdqa %%xmm10,%%xmm5\n\t" \
- "movdqa %%xmm10,%%xmm0\n\t" \
- "pmullw %%xmm13,%%xmm5\n\t" \
- "pmulhw %%xmm13,%%xmm10\n\t" \
- /*xmm7,xmm5=t5''*27146+0xB500*/ \
- "movdqa %%xmm5,%%xmm7\n\t" \
- "punpcklwd %%xmm10,%%xmm5\n\t" \
- "punpckhwd %%xmm10,%%xmm7\n\t" \
- "paddd %%xmm12,%%xmm5\n\t" \
- "paddd %%xmm12,%%xmm7\n\t" \
- /*xmm5=(t5''*27146+0xB500>>16)+t5''*/ \
- "psrad $16,%%xmm5\n\t" \
- "psrad $16,%%xmm7\n\t" \
- "packssdw %%xmm7,%%xmm5\n\t" \
- "paddw %%xmm0,%%xmm5\n\t" \
- /*xmm0=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
- "pcmpeqw %%xmm15,%%xmm0\n\t" \
- "psubw %%xmm14,%%xmm0\n\t" \
- "paddw %%xmm5,%%xmm0\n\t" \
- "movdqa %%xmm8,%%xmm5\n\t" \
- "psraw $1,%%xmm0\n\t" \
- /*xmm5=t5'''=t4''-s*/ \
- "psubw %%xmm0,%%xmm5\n\t" \
- /*xmm8=t4'''=t4''+s*/ \
- "paddw %%xmm0,%%xmm8\n\t" \
- /*xmm0,10 are now free.*/ \
- /*xmm7/xmm9=t6''*27146*/ \
- "movdqa %%xmm6,%%xmm7\n\t" \
- "movdqa %%xmm6,%%xmm9\n\t" \
- "pmulhw %%xmm13,%%xmm7\n\t" \
- "pmullw %%xmm13,%%xmm9\n\t" \
- /*xmm0,xmm9=t6''*27146+0xB500*/ \
- "movdqa %%xmm9,%%xmm0\n\t" \
- "punpcklwd %%xmm7,%%xmm9\n\t" \
- "punpckhwd %%xmm7,%%xmm0\n\t" \
- "paddd %%xmm12,%%xmm9\n\t" \
- "paddd %%xmm12,%%xmm0\n\t" \
- /*xmm9=(t6''*27146+0xB500>>16)+t6''*/ \
- "psrad $16,%%xmm9\n\t" \
- "psrad $16,%%xmm0\n\t" \
- "packssdw %%xmm0,%%xmm9\n\t" \
- "paddw %%xmm6,%%xmm9\n\t" \
- /*xmm9=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
- "pcmpeqw %%xmm15,%%xmm6\n\t" \
- "psubw %%xmm14,%%xmm6\n\t" \
- "paddw %%xmm6,%%xmm9\n\t" \
- "movdqa %%xmm11,%%xmm7\n\t" \
- "psraw $1,%%xmm9\n\t" \
- /*xmm7=t6'''=t7'-s*/ \
- "psubw %%xmm9,%%xmm7\n\t" \
- /*xmm9=t7''=t7'+s*/ \
- "paddw %%xmm11,%%xmm9\n\t" \
- /*xmm6,xmm11 are free.*/ \
- /*Stage 4:*/ \
- /*xmm11/xmm0=t1''*27146*/ \
- "movdqa %%xmm1,%%xmm0\n\t" \
- "movdqa %%xmm1,%%xmm11\n\t" \
- "pmullw %%xmm13,%%xmm0\n\t" \
- "pmulhw %%xmm13,%%xmm11\n\t" \
- /*xmm10,xmm0=t1''*27146+0xB500*/ \
- "movdqa %%xmm0,%%xmm10\n\t" \
- "punpcklwd %%xmm11,%%xmm0\n\t" \
- "punpckhwd %%xmm11,%%xmm10\n\t" \
- "paddd %%xmm12,%%xmm0\n\t" \
- "paddd %%xmm12,%%xmm10\n\t" \
- /*xmm0=(t1''*27146+0xB500>>16)+t1''*/ \
- "psrad $16,%%xmm0\n\t" \
- "psrad $16,%%xmm10\n\t" \
- "packssdw %%xmm10,%%xmm0\n\t" \
- "paddw %%xmm1,%%xmm0\n\t" \
- /*xmm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
- "pcmpeqw %%xmm15,%%xmm1\n\t" \
- "psubw %%xmm14,%%xmm1\n\t" \
- "paddw %%xmm1,%%xmm0\n\t" \
- /*xmm11/xmm4=t0''*27146*/ \
- "movdqa %%xmm4,%%xmm1\n\t" \
- "movdqa %%xmm4,%%xmm11\n\t" \
- "mov $0x4000,%k[a]\n\t" \
- "pmullw %%xmm13,%%xmm4\n\t" \
- "pmulhw %%xmm13,%%xmm11\n\t" \
- "movd %k[a],%%xmm12\n\t" \
- /*xmm10,xmm4=t0''*27146+0x4000*/ \
- "movdqa %%xmm4,%%xmm10\n\t" \
- "pshufd $00,%%xmm12,%%xmm12\n\t" \
- "punpcklwd %%xmm11,%%xmm4\n\t" \
- "punpckhwd %%xmm11,%%xmm10\n\t" \
- "paddd %%xmm12,%%xmm4\n\t" \
- "paddd %%xmm12,%%xmm10\n\t" \
- /*xmm4=(t0''*27146+0x4000>>16)+t0''*/ \
- "psrad $16,%%xmm4\n\t" \
- "psrad $16,%%xmm10\n\t" \
- "mov $0xEC83EC83,%k[a]\n\t" \
- "packssdw %%xmm10,%%xmm4\n\t" \
- "movd %k[a],%%xmm12\n\t" \
- "paddw %%xmm1,%%xmm4\n\t" \
- /*xmm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
- "pcmpeqw %%xmm15,%%xmm1\n\t" \
- "pshufd $00,%%xmm12,%%xmm12\n\t" \
- "psubw %%xmm14,%%xmm1\n\t" \
- "mov $0x61F861F8,%k[a]\n\t" \
- "paddw %%xmm1,%%xmm4\n\t" \
- "movd %k[a],%%xmm13\n\t" \
- /*xmm0=_y[0]=u=r+s>>1*/ \
- "paddw %%xmm4,%%xmm0\n\t" \
- "pshufd $00,%%xmm13,%%xmm13\n\t" \
- "psraw $1,%%xmm0\n\t" \
- /*xmm4=_y[4]=v=r-u*/ \
- "movdqa %%xmm3,%%xmm1\n\t" \
- "psubw %%xmm0,%%xmm4\n\t" \
- /*xmm1,xmm6,xmm10,xmm11 are free.*/ \
- /*xmm11,xmm10=60547*t3'''*/ \
- "movdqa %%xmm3,%%xmm10\n\t" \
- "pmulhw %%xmm12,%%xmm1\n\t" \
- "pmullw %%xmm12,%%xmm10\n\t" \
- "paddw %%xmm3,%%xmm1\n\t" \
- "movdqa %%xmm10,%%xmm11\n\t" \
- "punpcklwd %%xmm1,%%xmm10\n\t" \
- "punpckhwd %%xmm1,%%xmm11\n\t" \
- /*xmm2,xmm1=25080*t2'' \
-   xmm12=t2''*/ \
- "movdqa %%xmm2,%%xmm6\n\t" \
- "movdqa %%xmm2,%%xmm12\n\t" \
- "pmullw %%xmm13,%%xmm2\n\t" \
- "pmulhw %%xmm13,%%xmm6\n\t" \
- "movdqa %%xmm2,%%xmm1\n\t" \
- "mov $0x6CB7,%k[a]\n\t" \
- "punpckhwd %%xmm6,%%xmm2\n\t" \
- "punpcklwd %%xmm6,%%xmm1\n\t" \
- /*xmm11,xmm10=25080*t2''+60547*t3'''+0x6CB7*/ \
- "movd %k[a],%%xmm6\n\t" \
- "paddd %%xmm1,%%xmm10\n\t" \
- "pshufd $00,%%xmm6,%%xmm6\n\t" \
- "paddd %%xmm2,%%xmm11\n\t" \
- "paddd %%xmm6,%%xmm10\n\t" \
- "paddd %%xmm6,%%xmm11\n\t" \
- /*xmm10=u=(25080*t2''+60547*t3'''+0x6CB7>>16)+(t3'''!=0)*/ \
- "psrad $16,%%xmm10\n\t" \
- "pcmpeqw %%xmm15,%%xmm3\n\t" \
- "psrad $16,%%xmm11\n\t" \
- "psubw %%xmm14,%%xmm3\n\t" \
- "packssdw %%xmm11,%%xmm10\n\t" \
- "paddw %%xmm3,%%xmm10\n\t" \
- /*xmm2=_y[2]=u \
-   xmm10=s=(25080*u>>16)-t2''*/ \
- "movdqa %%xmm10,%%xmm2\n\t" \
- "mov $0x54605460,%k[a]\n\t" \
- "pmulhw %%xmm13,%%xmm10\n\t" \
- "movd %k[a],%%xmm13\n\t" \
- "psubw %%xmm12,%%xmm10\n\t" \
- "pshufd $00,%%xmm13,%%xmm13\n\t" \
- /*xmm11/xmm6=s*21600*/ \
- "movdqa %%xmm10,%%xmm6\n\t" \
- "mov $0x2800,%k[a]\n\t" \
- "movdqa %%xmm10,%%xmm11\n\t" \
- "movd %k[a],%%xmm12\n\t" \
- "pmullw %%xmm13,%%xmm6\n\t" \
- "pshufd $00,%%xmm12,%%xmm12\n\t" \
- "pmulhw %%xmm13,%%xmm11\n\t" \
- /*xmm1,xmm6=s*21600+0x2800*/ \
- "movdqa %%xmm6,%%xmm1\n\t" \
- "punpcklwd %%xmm11,%%xmm6\n\t" \
- "punpckhwd %%xmm11,%%xmm1\n\t" \
- "paddd %%xmm12,%%xmm6\n\t" \
- "paddd %%xmm12,%%xmm1\n\t" \
- /*xmm6=(s*21600+0x2800>>18)+s*/ \
- "psrad $18,%%xmm6\n\t" \
- "psrad $18,%%xmm1\n\t" \
- "mov $0xD4DBD4DB,%k[a]\n\t" \
- "packssdw %%xmm1,%%xmm6\n\t" \
- "movd %k[a],%%xmm12\n\t" \
- "paddw %%xmm10,%%xmm6\n\t" \
- "pshufd $00,%%xmm12,%%xmm12\n\t" \
- /*xmm6=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
- "pcmpeqw %%xmm15,%%xmm10\n\t" \
- "mov $0x8E3A8E3A,%k[a]\n\t" \
- "psubw %%xmm14,%%xmm10\n\t" \
- "movd %k[a],%%xmm13\n\t" \
- "paddw %%xmm10,%%xmm6\n\t " \
- "pshufd $00,%%xmm13,%%xmm13\n\t" \
- /*xmm1,xmm3,xmm10,xmm11 are free.*/ \
- /*xmm11,xmm10=54491*t5'''*/ \
- "movdqa %%xmm5,%%xmm1\n\t" \
- "movdqa %%xmm5,%%xmm10\n\t" \
- "pmulhw %%xmm12,%%xmm1\n\t" \
- "pmullw %%xmm12,%%xmm10\n\t" \
- "paddw %%xmm5,%%xmm1\n\t" \
- "movdqa %%xmm10,%%xmm11\n\t" \
- "punpcklwd %%xmm1,%%xmm10\n\t" \
- "punpckhwd %%xmm1,%%xmm11\n\t" \
- /*xmm7,xmm12=36410*t6''' \
-   xmm1=t6'''*/ \
- "movdqa %%xmm7,%%xmm3\n\t" \
- "movdqa %%xmm7,%%xmm1\n\t" \
- "pmulhw %%xmm13,%%xmm3\n\t" \
- "pmullw %%xmm13,%%xmm7\n\t" \
- "paddw %%xmm1,%%xmm3\n\t" \
- "movdqa %%xmm7,%%xmm12\n\t" \
- "mov $0x0E3D,%k[a]\n\t" \
- "punpckhwd %%xmm3,%%xmm7\n\t" \
- "punpcklwd %%xmm3,%%xmm12\n\t" \
- /*xmm11,xmm10=54491*t5'''+36410*t6'''+0x0E3D*/ \
- "movd %k[a],%%xmm3\n\t" \
- "paddd %%xmm12,%%xmm10\n\t" \
- "pshufd $00,%%xmm3,%%xmm3\n\t" \
- "paddd %%xmm7,%%xmm11\n\t" \
- "paddd %%xmm3,%%xmm10\n\t" \
- "paddd %%xmm3,%%xmm11\n\t" \
- /*xmm10=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
- "psrad $16,%%xmm10\n\t" \
- "pcmpeqw %%xmm15,%%xmm5\n\t" \
- "psrad $16,%%xmm11\n\t" \
- "psubw %%xmm14,%%xmm5\n\t" \
- "packssdw %%xmm11,%%xmm10\n\t" \
- "paddw %%xmm5,%%xmm10\n\t" \
- /*xmm5=_y[5]=u \
-   xmm1=s=t6'''-(36410*u>>16)*/ \
- "movdqa %%xmm10,%%xmm5\n\t" \
- "mov $0x67C867C8,%k[a]\n\t" \
- "pmulhw %%xmm13,%%xmm10\n\t" \
- "paddw %%xmm5,%%xmm10\n\t" \
- "movd %k[a],%%xmm13\n\t" \
- "psubw %%xmm10,%%xmm1\n\t" \
- "pshufd $00,%%xmm13,%%xmm13\n\t" \
- /*xmm11/xmm3=s*26568*/ \
- "movdqa %%xmm1,%%xmm3\n\t" \
- "mov $0x3400,%k[a]\n\t" \
- "movdqa %%xmm1,%%xmm11\n\t" \
- "movd %k[a],%%xmm7\n\t" \
- "pmullw %%xmm13,%%xmm3\n\t" \
- "pmulhw %%xmm13,%%xmm11\n\t" \
- /*xmm12,xmm3=s*26568+0x3400*/ \
- "pshufd $00,%%xmm7,%%xmm7\n\t" \
- "movdqa %%xmm3,%%xmm12\n\t" \
- "punpcklwd %%xmm11,%%xmm3\n\t" \
- "punpckhwd %%xmm11,%%xmm12\n\t" \
- "paddd %%xmm7,%%xmm3\n\t" \
- "paddd %%xmm7,%%xmm12\n\t" \
- /*xmm3=(s*26568+0x3400>>17)+s*/ \
- "psrad $17,%%xmm3\n\t" \
- "psrad $17,%%xmm12\n\t" \
- "mov $0xFB15FB15,%k[a]\n\t" \
- "packssdw %%xmm12,%%xmm3\n\t" \
- "movd %k[a],%%xmm12\n\t" \
- "paddw %%xmm1,%%xmm3\n\t" \
- "pshufd $00,%%xmm12,%%xmm12\n\t" \
- /*xmm3=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
- "pcmpeqw %%xmm15,%%xmm1\n\t" \
- "mov $0x31F131F1,%k[a]\n\t" \
- "psubw %%xmm14,%%xmm1\n\t" \
- "movd %k[a],%%xmm13\n\t" \
- "paddw %%xmm1,%%xmm3\n\t " \
- "pshufd $00,%%xmm13,%%xmm13\n\t" \
- /*xmm1,xmm7,xmm10,xmm11 are free.*/ \
- /*xmm11,xmm10=64277*t7''*/ \
- "movdqa %%xmm9,%%xmm1\n\t" \
- "movdqa %%xmm9,%%xmm10\n\t" \
- "pmulhw %%xmm12,%%xmm1\n\t" \
- "pmullw %%xmm12,%%xmm10\n\t" \
- "paddw %%xmm9,%%xmm1\n\t" \
- "movdqa %%xmm10,%%xmm11\n\t" \
- "punpcklwd %%xmm1,%%xmm10\n\t" \
- "punpckhwd %%xmm1,%%xmm11\n\t" \
- /*xmm8,xmm1=12785*t4''' \
-   xmm12=t4'''*/ \
- "movdqa %%xmm8,%%xmm7\n\t" \
- "movdqa %%xmm8,%%xmm12\n\t" \
- "pmullw %%xmm13,%%xmm8\n\t" \
- "pmulhw %%xmm13,%%xmm7\n\t" \
- "movdqa %%xmm8,%%xmm1\n\t" \
- "mov $0x7B1B,%k[a]\n\t" \
- "punpckhwd %%xmm7,%%xmm8\n\t" \
- "punpcklwd %%xmm7,%%xmm1\n\t" \
- /*xmm11,xmm10=12785*t4'''+64277*t7''+0x7B1B*/ \
- "movd %k[a],%%xmm7\n\t" \
- "paddd %%xmm1,%%xmm10\n\t" \
- "pshufd $00,%%xmm7,%%xmm7\n\t" \
- "paddd %%xmm8,%%xmm11\n\t" \
- "paddd %%xmm7,%%xmm10\n\t" \
- "paddd %%xmm7,%%xmm11\n\t" \
- /*xmm10=u=(12785*t4'''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
- "psrad $16,%%xmm10\n\t" \
- "pcmpeqw %%xmm15,%%xmm9\n\t" \
- "psrad $16,%%xmm11\n\t" \
- "psubw %%xmm14,%%xmm9\n\t" \
- "packssdw %%xmm11,%%xmm10\n\t" \
- "paddw %%xmm9,%%xmm10\n\t" \
- /*xmm1=_y[1]=u \
-   xmm10=s=(12785*u>>16)-t4'''*/ \
- "movdqa %%xmm10,%%xmm1\n\t" \
- "mov $0x503B503B,%k[a]\n\t" \
- "pmulhw %%xmm13,%%xmm10\n\t" \
- "movd %k[a],%%xmm13\n\t" \
- "psubw %%xmm12,%%xmm10\n\t" \
- "pshufd $00,%%xmm13,%%xmm13\n\t" \
- /*xmm11/xmm7=s*20539*/ \
- "movdqa %%xmm10,%%xmm7\n\t" \
- "mov $0x3000,%k[a]\n\t" \
- "movdqa %%xmm10,%%xmm11\n\t" \
- "movd %k[a],%%xmm12\n\t" \
- "pmullw %%xmm13,%%xmm7\n\t" \
- "pshufd $00,%%xmm12,%%xmm12\n\t" \
- "pmulhw %%xmm13,%%xmm11\n\t" \
- /*xmm8,xmm7=s*20539+0x3000*/ \
- "movdqa %%xmm7,%%xmm8\n\t" \
- "punpcklwd %%xmm11,%%xmm7\n\t" \
- "punpckhwd %%xmm11,%%xmm8\n\t" \
- "paddd %%xmm12,%%xmm7\n\t" \
- "paddd %%xmm12,%%xmm8\n\t" \
- /*xmm7=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
- "psrad $20,%%xmm7\n\t" \
- "psrad $20,%%xmm8\n\t" \
- "packssdw %%xmm8,%%xmm7\n\t" \
- "paddw %%xmm10,%%xmm7\n\t" \
- "pcmpeqw %%xmm15,%%xmm10\n\t" \
- "psubw %%xmm14,%%xmm10\n\t" \
- "paddw %%xmm10,%%xmm7\n\t " \
-
-# define OC_TRANSPOSE8x8 \
- "#OC_TRANSPOSE8x8\n\t" \
- "movdqa %%xmm4,%%xmm8\n\t" \
- /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
- "punpcklwd %%xmm5,%%xmm4\n\t" \
- /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
- "punpckhwd %%xmm5,%%xmm8\n\t" \
- /*xmm5 is free.*/ \
- "movdqa %%xmm0,%%xmm5\n\t" \
- /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
- "punpcklwd %%xmm1,%%xmm0\n\t" \
- /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
- "punpckhwd %%xmm1,%%xmm5\n\t" \
- /*xmm1 is free.*/ \
- "movdqa %%xmm6,%%xmm1\n\t" \
- /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
- "punpcklwd %%xmm7,%%xmm6\n\t" \
- /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
- "punpckhwd %%xmm7,%%xmm1\n\t" \
- /*xmm7 is free.*/ \
- "movdqa %%xmm2,%%xmm7\n\t" \
- /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
- "punpcklwd %%xmm3,%%xmm7\n\t" \
- /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
- "punpckhwd %%xmm3,%%xmm2\n\t" \
- /*xmm3 is free.*/ \
- "movdqa %%xmm0,%%xmm3\n\t" \
- /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
- "punpckldq %%xmm7,%%xmm0\n\t" \
- /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
- "punpckhdq %%xmm7,%%xmm3\n\t" \
- /*xmm7 is free.*/ \
- "movdqa %%xmm5,%%xmm7\n\t" \
- /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
- "punpckldq %%xmm2,%%xmm5\n\t" \
- /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
- "punpckhdq %%xmm2,%%xmm7\n\t" \
- /*xmm2 is free.*/ \
- "movdqa %%xmm4,%%xmm2\n\t" \
- /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
- "punpckldq %%xmm6,%%xmm2\n\t" \
- /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
- "punpckhdq %%xmm6,%%xmm4\n\t" \
- /*xmm6 is free.*/ \
- "movdqa %%xmm8,%%xmm6\n\t" \
- /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
- "punpckldq %%xmm1,%%xmm6\n\t" \
- /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
- "punpckhdq %%xmm1,%%xmm8\n\t" \
- /*xmm1 is free.*/ \
- "movdqa %%xmm0,%%xmm1\n\t" \
- /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
- "punpcklqdq %%xmm2,%%xmm0\n\t" \
- /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
- "punpckhqdq %%xmm2,%%xmm1\n\t" \
- /*xmm2 is free.*/ \
- "movdqa %%xmm3,%%xmm2\n\t" \
- /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
- "punpcklqdq %%xmm4,%%xmm2\n\t" \
- /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
- "punpckhqdq %%xmm4,%%xmm3\n\t" \
- /*xmm4 is free.*/ \
- "movdqa %%xmm5,%%xmm4\n\t" \
- /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
- "punpcklqdq %%xmm6,%%xmm4\n\t" \
- /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
- "punpckhqdq %%xmm6,%%xmm5\n\t" \
- /*xmm6 is free.*/ \
- "movdqa %%xmm7,%%xmm6\n\t" \
- /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
- "punpcklqdq %%xmm8,%%xmm6\n\t" \
- /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
- "punpckhqdq %%xmm8,%%xmm7\n\t" \
- /*xmm8 is free.*/ \
-
-/*SSE2 implementation of the fDCT for x86-64 only.
-  Because of the 8 extra XMM registers on x86-64, this version can operate
-   without any temporary stack access at all.*/
-static void oc_fdct8x8_x86_64sse2(const ogg_int16_t _x[64],ogg_int16_t _y[64]){
-  ptrdiff_t a;
-  __asm__ __volatile__(
-    /*Load the input.*/
-    "lea %[x],%[a]\n\t"
-    "movdqa 0x00(%[a]),%%xmm0\n\t"
-    "movdqa 0x10(%[a]),%%xmm1\n\t"
-    "movdqa 0x20(%[a]),%%xmm2\n\t"
-    "movdqa 0x30(%[a]),%%xmm3\n\t"
-    "movdqa 0x40(%[a]),%%xmm4\n\t"
-    "movdqa 0x50(%[a]),%%xmm5\n\t"
-    "movdqa 0x60(%[a]),%%xmm6\n\t"
-    "movdqa 0x70(%[a]),%%xmm7\n\t"
-    /*Add two extra bits of working precision to improve accuracy; any more and
-       we could overflow.*/
-    /*We also add biases to correct for some systematic error that remains in
-       the full fDCT->iDCT round trip.*/
-    /*xmm15={0}x8*/
-    "pxor %%xmm15,%%xmm15\n\t"
-    /*xmm14={-1}x8*/
-    "pcmpeqb %%xmm14,%%xmm14\n\t"
-    "psllw $2,%%xmm0\n\t"
-    /*xmm8=xmm0*/
-    "movdqa %%xmm0,%%xmm8\n\t"
-    "psllw $2,%%xmm1\n\t"
-    /*xmm8={_x[7...0]==0}*/
-    "pcmpeqw %%xmm15,%%xmm8\n\t"
-    "psllw $2,%%xmm2\n\t"
-    /*xmm8={_x[7...0]!=0}*/
-    "psubw %%xmm14,%%xmm8\n\t"
-    "psllw $2,%%xmm3\n\t"
-    /*%[a]=1*/
-    "mov $1,%k[a]\n\t"
-    /*xmm8={_x[6]!=0,0,_x[4]!=0,0,_x[2]!=0,0,_x[0]!=0,0}*/
-    "pslld $16,%%xmm8\n\t"
-    "psllw $2,%%xmm4\n\t"
-    /*xmm9={0,0,0,0,0,0,0,1}*/
-    "movd %k[a],%%xmm9\n\t"
-    /*xmm8={0,0,0,0,_x[2]!=0,0,_x[0]!=0,0}*/
-    "pshufhw $0x00,%%xmm8,%%xmm8\n\t"
-    "psllw $2,%%xmm5\n\t"
-    /*%[a]={1}x2*/
-    "mov $0x10001,%k[a]\n\t"
-    /*xmm8={0,0,0,0,0,0,0,_x[0]!=0}*/
-    "pshuflw $0x01,%%xmm8,%%xmm8\n\t"
-    "psllw $2,%%xmm6\n\t"
-    /*xmm10={0,0,0,0,0,0,1,1}*/
-    "movd %k[a],%%xmm10\n\t"
-    /*xmm0=_x[7...0]+{0,0,0,0,0,0,0,_x[0]!=0}*/
-    "paddw %%xmm8,%%xmm0\n\t"
-    "psllw $2,%%xmm7\n\t"
-    /*xmm0=_x[7...0]+{0,0,0,0,0,0,1,(_x[0]!=0)+1}*/
-    "paddw %%xmm10,%%xmm0\n\t"
-    /*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/
-    "psubw %%xmm9,%%xmm1\n\t"
-    /*Transform columns.*/
-    OC_FDCT8x8
-    /*Transform rows.*/
-    OC_TRANSPOSE8x8
-    OC_FDCT8x8
-    /*TODO: zig-zag ordering?*/
-    OC_TRANSPOSE8x8
-    /*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/
-    "psllw $1,%%xmm14\n\t"
-    "psubw %%xmm14,%%xmm0\n\t"
-    "psubw %%xmm14,%%xmm1\n\t"
-    "psraw $2,%%xmm0\n\t"
-    "psubw %%xmm14,%%xmm2\n\t"
-    "psraw $2,%%xmm1\n\t"
-    "psubw %%xmm14,%%xmm3\n\t"
-    "psraw $2,%%xmm2\n\t"
-    "psubw %%xmm14,%%xmm4\n\t"
-    "psraw $2,%%xmm3\n\t"
-    "psubw %%xmm14,%%xmm5\n\t"
-    "psraw $2,%%xmm4\n\t"
-    "psubw %%xmm14,%%xmm6\n\t"
-    "psraw $2,%%xmm5\n\t"
-    "psubw %%xmm14,%%xmm7\n\t"
-    "psraw $2,%%xmm6\n\t"
-    "lea %[y],%[a]\n\t"
-    "psraw $2,%%xmm7\n\t"
-    /*Store the result.*/
-    "movdqa %%xmm0,0x00(%[a])\n\t"
-    "movdqa %%xmm1,0x10(%[a])\n\t"
-    "movdqa %%xmm2,0x20(%[a])\n\t"
-    "movdqa %%xmm3,0x30(%[a])\n\t"
-    "movdqa %%xmm4,0x40(%[a])\n\t"
-    "movdqa %%xmm5,0x50(%[a])\n\t"
-    "movdqa %%xmm6,0x60(%[a])\n\t"
-    "movdqa %%xmm7,0x70(%[a])\n\t"
-    :[a]"=&r"(a)
-    :[x]"m"(*_x),[y]"m"(*_y)
-  );
-}
-
-/*Install our implementation in the function table.*/
-void dsp_sse2_fdct_init(DspFunctions *funcs){
-# if defined(__amd64__)||defined(__x86_64__)
-  funcs->fdct_short=oc_fdct8x8_x86_64sse2;
-# endif
-}
-
-#endif

Deleted: branches/theora-thusnelda/lib/enc/x86/idct_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/idct_mmx.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/x86/idct_mmx.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -1,1124 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: idct_mmx.c 14783 2008-04-22 16:23:11Z xiphmont $
-
- ********************************************************************/
-
-#include "../codec_internal.h"
-
-#if defined(USE_ASM)
-
-#define MaskOffset 0        // 4 masks come in order low word to high
-#define CosineOffset 32     // 7 cosines come in order pi/16 * (1 ... 7)
-#define EightOffset 88
-#define IdctAdjustBeforeShift 8
-
-ogg_uint16_t idctconstants[(4+7+1) * 4] = {
-    65535,     0,     0,     0,     0, 65535,     0,     0,
-        0,     0, 65535,     0,     0,     0,     0, 65535,
-    64277, 64277, 64277, 64277, 60547, 60547, 60547, 60547,
-    54491, 54491, 54491, 54491, 46341, 46341, 46341, 46341,
-    36410, 36410, 36410, 36410, 25080, 25080, 25080, 25080,
-    12785, 12785, 12785, 12785,     8,     8,     8,     8,
-};
-
-/**************************************************************************************
- *
- *      Routine:        BeginIDCT
- *
- *      Description:    The Macro does IDct on 4 1-D Dcts
- *
- *      Input:          None
- *
- *      Output:         None
- *
- *      Return:         None
- *
- *      Special Note:   None
- *
- *      Error:          None
- *
- ***************************************************************************************
- */
-
-#define MtoSTR(s) #s
-
-#define BeginIDCT "#BeginIDCT\n"    \
-                                    \
-    "   movq    "I(3)",%%mm2\n"     \
-                                    \
-    "   movq    "C(3)",%%mm6\n"     \
-    "   movq    %%mm2,%%mm4\n"      \
-    "   movq    "J(5)",%%mm7\n"     \
-    "   pmulhw  %%mm6,%%mm4\n"      \
-    "   movq    "C(5)",%%mm1\n"     \
-    "   pmulhw  %%mm7,%%mm6\n"      \
-    "   movq    %%mm1,%%mm5\n"      \
-    "   pmulhw  %%mm2,%%mm1\n"      \
-    "   movq    "I(1)",%%mm3\n"     \
-    "   pmulhw  %%mm7,%%mm5\n"      \
-    "   movq    "C(1)",%%mm0\n"     \
-    "   paddw   %%mm2,%%mm4\n"      \
-    "   paddw   %%mm7,%%mm6\n"      \
-    "   paddw   %%mm1,%%mm2\n"      \
-    "   movq    "J(7)",%%mm1\n"     \
-    "   paddw   %%mm5,%%mm7\n"      \
-    "   movq    %%mm0,%%mm5\n"      \
-    "   pmulhw  %%mm3,%%mm0\n"      \
-    "   paddsw  %%mm7,%%mm4\n"      \
-    "   pmulhw  %%mm1,%%mm5\n"      \
-    "   movq    "C(7)",%%mm7\n"     \
-    "   psubsw  %%mm2,%%mm6\n"      \
-    "   paddw   %%mm3,%%mm0\n"      \
-    "   pmulhw  %%mm7,%%mm3\n"      \
-    "   movq    "I(2)",%%mm2\n"     \
-    "   pmulhw  %%mm1,%%mm7\n"      \
-    "   paddw   %%mm1,%%mm5\n"      \
-    "   movq    %%mm2,%%mm1\n"      \
-    "   pmulhw  "C(2)",%%mm2\n"     \
-    "   psubsw  %%mm5,%%mm3\n"      \
-    "   movq    "J(6)",%%mm5\n"     \
-    "   paddsw  %%mm7,%%mm0\n"      \
-    "   movq    %%mm5,%%mm7\n"      \
-    "   psubsw  %%mm4,%%mm0\n"      \
-    "   pmulhw  "C(2)",%%mm5\n"     \
-    "   paddw   %%mm1,%%mm2\n"      \
-    "   pmulhw  "C(6)",%%mm1\n"     \
-    "   paddsw  %%mm4,%%mm4\n"      \
-    "   paddsw  %%mm0,%%mm4\n"      \
-    "   psubsw  %%mm6,%%mm3\n"      \
-    "   paddw   %%mm7,%%mm5\n"      \
-    "   paddsw  %%mm6,%%mm6\n"      \
-    "   pmulhw  "C(6)",%%mm7\n"     \
-    "   paddsw  %%mm3,%%mm6\n"      \
-    "   movq    %%mm4,"I(1)"\n"     \
-    "   psubsw  %%mm5,%%mm1\n"      \
-    "   movq    "C(4)",%%mm4\n"     \
-    "   movq    %%mm3,%%mm5\n"      \
-    "   pmulhw  %%mm4,%%mm3\n"      \
-    "   paddsw  %%mm2,%%mm7\n"      \
-    "   movq    %%mm6,"I(2)"\n"     \
-    "   movq    %%mm0,%%mm2\n"      \
-    "   movq    "I(0)",%%mm6\n"     \
-    "   pmulhw  %%mm4,%%mm0\n"      \
-    "   paddw   %%mm3,%%mm5\n"      \
-    "\n"                            \
-    "   movq    "J(4)",%%mm3\n"     \
-    "   psubsw  %%mm1,%%mm5\n"      \
-    "   paddw   %%mm0,%%mm2\n"      \
-    "   psubsw  %%mm3,%%mm6\n"      \
-    "   movq    %%mm6,%%mm0\n"      \
-    "   pmulhw  %%mm4,%%mm6\n"      \
-    "   paddsw  %%mm3,%%mm3\n"      \
-    "   paddsw  %%mm1,%%mm1\n"      \
-    "   paddsw  %%mm0,%%mm3\n"      \
-    "   paddsw  %%mm5,%%mm1\n"      \
-    "   pmulhw  %%mm3,%%mm4\n"      \
-    "   paddsw  %%mm0,%%mm6\n"      \
-    "   psubsw  %%mm2,%%mm6\n"      \
-    "   paddsw  %%mm2,%%mm2\n"      \
-    "   movq    "I(1)",%%mm0\n"     \
-    "   paddsw  %%mm6,%%mm2\n"      \
-    "   paddw   %%mm3,%%mm4\n"      \
-    "   psubsw  %%mm1,%%mm2\n"      \
-    "#end BeginIDCT\n"
-// end BeginIDCT macro (38 cycles).
-
-
-// Two versions of the end of the idct depending on whether we're feeding
-// into a transpose or dividing the final results by 16 and storing them.
-
-/**************************************************************************************
- *
- *      Routine:        RowIDCT
- *
- *      Description:    The Macro does 1-D IDct on 4 Rows
- *
- *      Input:          None
- *
- *      Output:         None
- *
- *      Return:         None
- *
- *      Special Note:   None
- *
- *      Error:          None
- *
- ***************************************************************************************
- */
-
-// RowIDCT gets ready to transpose.
-
-#define RowIDCT "#RowIDCT\n"                              \
-    BeginIDCT                                             \
-    "\n"                                                  \
-    "   movq    "I(2)",%%mm3\n"   /* r3 = D. */           \
-    "   psubsw  %%mm7,%%mm4\n"    /* r4 = E. = E - G */   \
-    "   paddsw  %%mm1,%%mm1\n"    /* r1 = H. + H. */      \
-    "   paddsw  %%mm7,%%mm7\n"    /* r7 = G + G */        \
-    "   paddsw  %%mm2,%%mm1\n"    /* r1 = R1 = A.. + H. */\
-    "   paddsw  %%mm4,%%mm7\n"    /* r7 = G. = E + G */   \
-    "   psubsw  %%mm3,%%mm4\n"    /* r4 = R4 = E. - D. */ \
-    "   paddsw  %%mm3,%%mm3\n"                            \
-    "   psubsw  %%mm5,%%mm6\n"    /* r6 = R6 = F. - B.. */\
-    "   paddsw  %%mm5,%%mm5\n"                            \
-    "   paddsw  %%mm4,%%mm3\n"    /* r3 = R3 = E. + D. */ \
-    "   paddsw  %%mm6,%%mm5\n"    /* r5 = R5 = F. + B.. */\
-    "   psubsw  %%mm0,%%mm7\n"    /* r7 = R7 = G. - C. */ \
-    "   paddsw  %%mm0,%%mm0\n"                            \
-    "   movq    %%mm1,"I(1)"\n"   /* save R1 */           \
-    "   paddsw  %%mm7,%%mm0\n"    /* r0 = R0 = G. + C. */ \
-    "#end RowIDCT"
-
-// end RowIDCT macro (8 + 38 = 46 cycles)
-
-
-/**************************************************************************************
- *
- *      Routine:        ColumnIDCT
- *
- *      Description:    The Macro does 1-D IDct on 4 columns
- *
- *      Input:          None
- *
- *      Output:         None
- *
- *      Return:         None
- *
- *      Special Note:   None
- *
- *      Error:          None
- *
- ***************************************************************************************
- */
-// Column IDCT normalizes and stores final results.
-
-#define ColumnIDCT "#ColumnIDCT\n"                            \
-    BeginIDCT                                                 \
-    "\n"                                                      \
-    "   paddsw  "Eight",%%mm2\n"                              \
-    "   paddsw  %%mm1,%%mm1\n"        /* r1 = H. + H. */      \
-    "   paddsw  %%mm2,%%mm1\n"        /* r1 = R1 = A.. + H. */\
-    "   psraw   ""$4"",%%mm2\n"       /* r2 = NR2 */          \
-    "   psubsw  %%mm7,%%mm4\n"        /* r4 = E. = E - G */   \
-    "   psraw   ""$4"",%%mm1\n"       /* r1 = NR1 */          \
-    "   movq    "I(2)",%%mm3\n"   /* r3 = D. */               \
-    "   paddsw  %%mm7,%%mm7\n"        /* r7 = G + G */        \
-    "   movq    %%mm2,"I(2)"\n"   /* store NR2 at I2 */       \
-    "   paddsw  %%mm4,%%mm7\n"        /* r7 = G. = E + G */   \
-    "   movq    %%mm1,"I(1)"\n"   /* store NR1 at I1 */       \
-    "   psubsw  %%mm3,%%mm4\n"        /* r4 = R4 = E. - D. */ \
-    "   paddsw  "Eight",%%mm4\n"                              \
-    "   paddsw  %%mm3,%%mm3\n"        /* r3 = D. + D. */      \
-    "   paddsw  %%mm4,%%mm3\n"        /* r3 = R3 = E. + D. */ \
-    "   psraw   ""$4"",%%mm4\n"       /* r4 = NR4 */          \
-    "   psubsw  %%mm5,%%mm6\n"        /* r6 = R6 = F. - B.. */\
-    "   psraw   ""$4"",%%mm3\n"       /* r3 = NR3 */          \
-    "   paddsw  "Eight",%%mm6\n"                              \
-    "   paddsw  %%mm5,%%mm5\n"        /* r5 = B.. + B.. */    \
-    "   paddsw  %%mm6,%%mm5\n"        /* r5 = R5 = F. + B.. */\
-    "   psraw   ""$4"",%%mm6\n"       /* r6 = NR6 */          \
-    "   movq    %%mm4,"J(4)"\n"   /* store NR4 at J4 */       \
-    "   psraw   ""$4"",%%mm5\n"       /* r5 = NR5 */          \
-    "   movq    %%mm3,"I(3)"\n"   /* store NR3 at I3 */       \
-    "   psubsw  %%mm0,%%mm7\n"        /* r7 = R7 = G. - C. */ \
-    "   paddsw  "Eight",%%mm7\n"                              \
-    "   paddsw  %%mm0,%%mm0\n"        /* r0 = C. + C. */      \
-    "   paddsw  %%mm7,%%mm0\n"        /* r0 = R0 = G. + C. */ \
-    "   psraw   ""$4"",%%mm7\n"       /* r7 = NR7 */          \
-    "   movq    %%mm6,"J(6)"\n"   /* store NR6 at J6 */       \
-    "   psraw   ""$4"",%%mm0\n"       /* r0 = NR0 */          \
-    "   movq    %%mm5,"J(5)"\n"   /* store NR5 at J5 */       \
-    "   movq    %%mm7,"J(7)"\n"   /* store NR7 at J7 */       \
-    "   movq    %%mm0,"I(0)"\n"   /* store NR0 at I0 */       \
-    "#end ColumnIDCT\n"
-
-// end ColumnIDCT macro (38 + 19 = 57 cycles)
-
-/**************************************************************************************
- *
- *      Routine:        Transpose
- *
- *      Description:    The Macro does two 4x4 transposes in place.
- *
- *      Input:          None
- *
- *      Output:         None
- *
- *      Return:         None
- *
- *      Special Note:   None
- *
- *      Error:          None
- *
- ***************************************************************************************
- */
-
-/* Following macro does two 4x4 transposes in place.
-
-  At entry (we assume):
-
-    r0 = a3 a2 a1 a0
-    I(1) = b3 b2 b1 b0
-    r2 = c3 c2 c1 c0
-    r3 = d3 d2 d1 d0
-
-    r4 = e3 e2 e1 e0
-    r5 = f3 f2 f1 f0
-    r6 = g3 g2 g1 g0
-    r7 = h3 h2 h1 h0
-
-   At exit, we have:
-
-    I(0) = d0 c0 b0 a0
-    I(1) = d1 c1 b1 a1
-    I(2) = d2 c2 b2 a2
-    I(3) = d3 c3 b3 a3
-
-    J(4) = h0 g0 f0 e0
-    J(5) = h1 g1 f1 e1
-    J(6) = h2 g2 f2 e2
-    J(7) = h3 g3 f3 e3
-
-   I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
-   J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.
-
-   Since r1 is free at entry, we calculate the Js first. */
-
-
-#define Transpose "#Transpose\n"              \
-    "   movq        %%mm4,%%mm1\n"            \
-    "   punpcklwd   %%mm5,%%mm4\n"            \
-    "   movq        %%mm0,"I(0)"\n"           \
-    "   punpckhwd   %%mm5,%%mm1\n"            \
-    "   movq        %%mm6,%%mm0\n"            \
-    "   punpcklwd   %%mm7,%%mm6\n"            \
-    "   movq        %%mm4,%%mm5\n"            \
-    "   punpckldq   %%mm6,%%mm4\n"            \
-    "   punpckhdq   %%mm6,%%mm5\n"            \
-    "   movq        %%mm1,%%mm6\n"            \
-    "   movq        %%mm4,"J(4)"\n"           \
-    "   punpckhwd   %%mm7,%%mm0\n"            \
-    "   movq        %%mm5,"J(5)"\n"           \
-    "   punpckhdq   %%mm0,%%mm6\n"            \
-    "   movq        "I(0)",%%mm4\n"           \
-    "   punpckldq   %%mm0,%%mm1\n"            \
-    "   movq        "I(1)",%%mm5\n"           \
-    "   movq        %%mm4,%%mm0\n"            \
-    "   movq        %%mm6,"J(7)"\n"           \
-    "   punpcklwd   %%mm5,%%mm0\n"            \
-    "   movq        %%mm1,"J(6)"\n"           \
-    "   punpckhwd   %%mm5,%%mm4\n"            \
-    "   movq        %%mm2,%%mm5\n"            \
-    "   punpcklwd   %%mm3,%%mm2\n"            \
-    "   movq        %%mm0,%%mm1\n"            \
-    "   punpckldq   %%mm2,%%mm0\n"            \
-    "   punpckhdq   %%mm2,%%mm1\n"            \
-    "   movq        %%mm4,%%mm2\n"            \
-    "   movq        %%mm0,"I(0)"\n"           \
-    "   punpckhwd   %%mm3,%%mm5\n"            \
-    "   movq        %%mm1,"I(1)"\n"           \
-    "   punpckhdq   %%mm5,%%mm4\n"            \
-    "   punpckldq   %%mm5,%%mm2\n"            \
-                                              \
-    "   movq        %%mm4,"I(3)"\n"           \
-                                              \
-    "   movq        %%mm2,"I(2)"\n"           \
-    "#end Transpose\n"
-// end Transpose macro (19 cycles).
-
-/**************************************************************************************
- *
- *      Routine:        MMX_idct
- *
- *      Description:    Perform IDCT on a 8x8 block
- *
- *      Input:          Pointer to input and output buffer
- *
- *      Output:         None
- *
- *      Return:         None
- *
- *      Special Note:   The input coefficients are in ZigZag order
- *
- *      Error:          None
- *
- ***************************************************************************************
- */
-void IDctSlow__mmx(const ogg_int16_t *in,
-                   const ogg_int16_t *q,
-                   ogg_int16_t *out ) {
-
-#   define MID(M,I)     MtoSTR(M+(I)*8)"(%[c])"
-#   define M(I)         MID( MaskOffset , I )
-#   define C(I)         MID( CosineOffset , I-1 )
-#   define Eight        MID(EightOffset,0)
-
-    /* eax = quantized input */
-    /* esi = quantization table */
-    /* edx = destination (= idct buffer) */
-    /* ecx = idctconstants */
-
-
-    __asm__ __volatile__ (
-    "# dequantize, de-zigzag\n"
-    "movq   (%[i]), %%mm0\n"
-    "pmullw (%[q]), %%mm0\n"     /* r0 = 03 02 01 00 */
-    "movq   16(%[i]), %%mm1\n"
-    "pmullw 16(%[q]), %%mm1\n"   /* r1 = 13 12 11 10 */
-    "movq   "M(0)", %%mm2\n"     /* r2 = __ __ __ FF */
-    "movq   %%mm0, %%mm3\n"      /* r3 = 03 02 01 00 */
-    "movq   8(%[i]), %%mm4\n"
-    "psrlq  $16, %%mm0\n"        /* r0 = __ 03 02 01 */
-    "pmullw 8(%[q]), %%mm4\n"    /* r4 = 07 06 05 04 */
-    "pand   %%mm2, %%mm3\n"      /* r3 = __ __ __ 00 */
-    "movq   %%mm0, %%mm5\n"      /* r5 = __ 03 02 01 */
-    "movq   %%mm1, %%mm6\n"      /* r6 = 13 12 11 10 */
-    "pand   %%mm2, %%mm5\n"      /* r5 = __ __ __ 01 */
-    "psllq  $32, %%mm6\n"        /* r6 = 11 10 __ __ */
-    "movq   "M(3)", %%mm7\n"     /* r7 = FF __ __ __ */
-    "pxor   %%mm5, %%mm0\n"      /* r0 = __ 03 02 __ */
-    "pand   %%mm6, %%mm7\n"      /* r7 = 11 __ __ __ */
-    "por    %%mm3, %%mm0\n"      /* r0 = __ 03 02 00 */
-    "pxor   %%mm7, %%mm6\n"      /* r6 = __ 10 __ __ */
-    "por    %%mm7, %%mm0\n"      /* r0 = 11 03 02 00 = R0 */
-    "movq   "M(3)", %%mm7\n"     /* r7 = FF __ __ __ */
-    "movq   %%mm4, %%mm3\n"      /* r3 = 07 06 05 04 */
-    "movq   %%mm0, (%[o])\n"     /* write R0 = r0 */
-    "pand   %%mm2, %%mm3\n"      /* r3 = __ __ __ 04 */
-    "movq   32(%[i]), %%mm0\n"
-    "psllq  $16, %%mm3\n"        /* r3 = __ __ 04 __ */
-    "pmullw 32(%[q]), %%mm0\n"   /* r0 = 23 22 21 20 */
-    "pand   %%mm1, %%mm7\n"      /* r7 = 13 __ __ __ */
-    "por    %%mm3, %%mm5\n"      /* r5 = __ __ 04 01 */
-    "por    %%mm6, %%mm7\n"      /* r7 = 13 10 __ __ */
-    "movq   24(%[i]), %%mm3\n"
-    "por    %%mm5, %%mm7\n"      /* r7 = 13 10 04 01 = R1 */
-    "pmullw 24(%[q]), %%mm3\n"   /* r3 = 17 16 15 14 */
-    "psrlq  $16, %%mm4\n"        /* r4 = __ 07 06 05 */
-    "movq   %%mm7, 16(%[o])\n"   /* write R1 = r7 */
-    "movq   %%mm4, %%mm5\n"      /* r5 = __ 07 06 05 */
-    "movq   %%mm0, %%mm7\n"      /* r7 = 23 22 21 20 */
-    "psrlq  $16, %%mm4\n"        /* r4 = __ __ 07 06 */
-    "psrlq  $48, %%mm7\n"        /* r7 = __ __ __ 23 */
-    "movq   %%mm2, %%mm6\n"      /* r6 = __ __ __ FF */
-    "pand   %%mm2, %%mm5\n"      /* r5 = __ __ __ 05 */
-    "pand   %%mm4, %%mm6\n"      /* r6 = __ __ __ 06 */
-    "movq   %%mm7, 80(%[o])\n"   /* partial R9 = __ __ __ 23 */
-    "pxor   %%mm6, %%mm4\n"      /* r4 = __ __ 07 __ */
-    "psrlq  $32, %%mm1\n"        /* r1 = __ __ 13 12 */
-    "por    %%mm5, %%mm4\n"      /* r4 = __ __ 07 05 */
-    "movq   "M(3)", %%mm7\n"     /* r7 = FF __ __ __ */
-    "pand   %%mm2, %%mm1\n"      /* r1 = __ __ __ 12 */
-    "movq   48(%[i]), %%mm5\n"
-    "psllq  $16, %%mm0\n"        /* r0 = 22 21 20 __ */
-    "pmullw 48(%[q]), %%mm5\n"   /* r5 = 33 32 31 30 */
-    "pand   %%mm0, %%mm7\n"      /* r7 = 22 __ __ __ */
-    "movq   %%mm1, 64(%[o])\n"   /* partial R8 = __ __ __ 12 */
-    "por    %%mm4, %%mm7\n"      /* r7 = 22 __ 07 05 */
-    "movq   %%mm3, %%mm4\n"      /* r4 = 17 16 15 14 */
-    "pand   %%mm2, %%mm3\n"      /* r3 = __ __ __ 14 */
-    "movq   "M(2)", %%mm1\n"     /* r1 = __ FF __ __ */
-    "psllq  $32, %%mm3\n"        /* r3 = __ 14 __ __ */
-    "por    %%mm3, %%mm7\n"      /* r7 = 22 14 07 05 = R2 */
-    "movq   %%mm5, %%mm3\n"      /* r3 = 33 32 31 30 */
-    "psllq  $48, %%mm3\n"        /* r3 = 30 __ __ __ */
-    "pand   %%mm0, %%mm1\n"      /* r1 = __ 21 __ __ */
-    "movq   %%mm7, 32(%[o])\n"   /* write R2 = r7 */
-    "por    %%mm3, %%mm6\n"      /* r6 = 30 __ __ 06 */
-    "movq   "M(1)", %%mm7\n"     /* r7 = __ __ FF __ */
-    "por    %%mm1, %%mm6\n"      /* r6 = 30 21 __ 06 */
-    "movq   56(%[i]), %%mm1\n"
-    "pand   %%mm4, %%mm7\n"      /* r7 = __ __ 15 __ */
-    "pmullw 56(%[q]), %%mm1\n"   /* r1 = 37 36 35 34 */
-    "por    %%mm6, %%mm7\n"      /* r7 = 30 21 15 06 = R3 */
-    "pand   "M(1)", %%mm0\n"     /* r0 = __ __ 20 __ */
-    "psrlq  $32, %%mm4\n"        /* r4 = __ __ 17 16 */
-    "movq   %%mm7, 48(%[o])\n"   /* write R3 = r7 */
-    "movq   %%mm4, %%mm6\n"      /* r6 = __ __ 17 16 */
-    "movq   "M(3)", %%mm7\n"     /* r7 = FF __ __ __ */
-    "pand   %%mm2, %%mm4\n"      /* r4 = __ __ __ 16 */
-    "movq   "M(1)", %%mm3\n"     /* r3 = __ __ FF __ */
-    "pand   %%mm1, %%mm7\n"      /* r7 = 37 __ __ __ */
-    "pand   %%mm5, %%mm3\n"      /* r3 = __ __ 31 __ */
-    "por    %%mm4, %%mm0\n"      /* r0 = __ __ 20 16 */
-    "psllq  $16, %%mm3\n"        /* r3 = __ 31 __ __ */
-    "por    %%mm0, %%mm7\n"      /* r7 = 37 __ 20 16 */
-    "movq   "M(2)", %%mm4\n"     /* r4 = __ FF __ __ */
-    "por    %%mm3, %%mm7\n"      /* r7 = 37 31 20 16 = R4 */
-    "movq   80(%[i]), %%mm0\n"
-    "movq   %%mm4, %%mm3\n"      /* r3 = __ FF __ __ */
-    "pmullw 80(%[q]), %%mm0\n"   /* r0 = 53 52 51 50 */
-    "pand   %%mm5, %%mm4\n"      /* r4 = __ 32 __ __ */
-    "movq   %%mm7, 8(%[o])\n"    /* write R4 = r7 */
-    "por    %%mm4, %%mm6\n"      /* r6 = __ 32 17 16 */
-    "movq   %%mm3, %%mm4\n"      /* r4 = __ FF __ __ */
-    "psrlq  $16, %%mm6\n"        /* r6 = __ __ 32 17 */
-    "movq   %%mm0, %%mm7\n"      /* r7 = 53 52 51 50 */
-    "pand   %%mm1, %%mm4\n"      /* r4 = __ 36 __ __ */
-    "psllq  $48, %%mm7\n"        /* r7 = 50 __ __ __ */
-    "por    %%mm4, %%mm6\n"      /* r6 = __ 36 32 17 */
-    "movq   88(%[i]), %%mm4\n"
-    "por    %%mm6, %%mm7\n"      /* r7 = 50 36 32 17 = R5 */
-    "pmullw 88(%[q]), %%mm4\n"   /* r4 = 57 56 55 54 */
-    "psrlq  $16, %%mm3\n"        /* r3 = __ __ FF __ */
-    "movq   %%mm7, 24(%[o])\n"   /* write R5 = r7 */
-    "pand   %%mm1, %%mm3\n"      /* r3 = __ __ 35 __ */
-    "psrlq  $48, %%mm5\n"        /* r5 = __ __ __ 33 */
-    "pand   %%mm2, %%mm1\n"      /* r1 = __ __ __ 34 */
-    "movq   104(%[i]), %%mm6\n"
-    "por    %%mm3, %%mm5\n"      /* r5 = __ __ 35 33 */
-    "pmullw 104(%[q]), %%mm6\n"  /* r6 = 67 66 65 64 */
-    "psrlq  $16, %%mm0\n"        /* r0 = __ 53 52 51 */
-    "movq   %%mm4, %%mm7\n"      /* r7 = 57 56 55 54 */
-    "movq   %%mm2, %%mm3\n"      /* r3 = __ __ __ FF */
-    "psllq  $48, %%mm7\n"        /* r7 = 54 __ __ __ */
-    "pand   %%mm0, %%mm3\n"      /* r3 = __ __ __ 51 */
-    "pxor   %%mm3, %%mm0\n"      /* r0 = __ 53 52 __ */
-    "psllq  $32, %%mm3\n"        /* r3 = __ 51 __ __ */
-    "por    %%mm5, %%mm7\n"      /* r7 = 54 __ 35 33 */
-    "movq   %%mm6, %%mm5\n"      /* r5 = 67 66 65 64 */
-    "pand   "M(1)", %%mm6\n"     /* r6 = __ __ 65 __ */
-    "por    %%mm3, %%mm7\n"      /* r7 = 54 51 35 33 = R6 */
-    "psllq  $32, %%mm6\n"        /* r6 = 65 __ __ __ */
-    "por    %%mm1, %%mm0\n"      /* r0 = __ 53 52 34 */
-    "movq   %%mm7, 40(%[o])\n"   /* write R6 = r7 */
-    "por    %%mm6, %%mm0\n"      /* r0 = 65 53 52 34 = R7 */
-    "movq   120(%[i]), %%mm7\n"
-    "movq   %%mm5, %%mm6\n"      /* r6 = 67 66 65 64 */
-    "pmullw 120(%[q]), %%mm7\n"  /* r7 = 77 76 75 74 */
-    "psrlq  $32, %%mm5\n"        /* r5 = __ __ 67 66 */
-    "pand   %%mm2, %%mm6\n"      /* r6 = __ __ __ 64 */
-    "movq   %%mm5, %%mm1\n"      /* r1 = __ __ 67 66 */
-    "movq   %%mm0, 56(%[o])\n"   /* write R7 = r0 */
-    "pand   %%mm2, %%mm1\n"      /* r1 = __ __ __ 66 */
-    "movq   112(%[i]), %%mm0\n"
-    "movq   %%mm7, %%mm3\n"      /* r3 = 77 76 75 74 */
-    "pmullw 112(%[q]), %%mm0\n"  /* r0 = 73 72 71 70 */
-    "psllq  $16, %%mm3\n"        /* r3 = 76 75 74 __ */
-    "pand   "M(3)", %%mm7\n"     /* r7 = 77 __ __ __ */
-    "pxor   %%mm1, %%mm5\n"      /* r5 = __ __ 67 __ */
-    "por    %%mm5, %%mm6\n"      /* r6 = __ __ 67 64 */
-    "movq   %%mm3, %%mm5\n"      /* r5 = 76 75 74 __ */
-    "pand   "M(3)", %%mm5\n"     /* r5 = 76 __ __ __ */
-    "por    %%mm1, %%mm7\n"      /* r7 = 77 __ __ 66 */
-    "movq   96(%[i]), %%mm1\n"
-    "pxor   %%mm5, %%mm3\n"      /* r3 = __ 75 74 __ */
-    "pmullw 96(%[q]), %%mm1\n"   /* r1 = 63 62 61 60 */
-    "por    %%mm3, %%mm7\n"      /* r7 = 77 75 74 66 = R15 */
-    "por    %%mm5, %%mm6\n"      /* r6 = 76 __ 67 64 */
-    "movq   %%mm0, %%mm5\n"      /* r5 = 73 72 71 70 */
-    "movq   %%mm7, 120(%[o])\n"  /* store R15 = r7 */
-    "psrlq  $16, %%mm5\n"        /* r5 = __ 73 72 71 */
-    "pand   "M(2)", %%mm5\n"     /* r5 = __ 73 __ __ */
-    "movq   %%mm0, %%mm7\n"      /* r7 = 73 72 71 70 */
-    "por    %%mm5, %%mm6\n"      /* r6 = 76 73 67 64 = R14 */
-    "pand   %%mm2, %%mm0\n"      /* r0 = __ __ __ 70 */
-    "pxor   %%mm0, %%mm7\n"      /* r7 = 73 72 71 __ */
-    "psllq  $32, %%mm0\n"        /* r0 = __ 70 __ __ */
-    "movq   %%mm6, 104(%[o])\n"  /* write R14 = r6 */
-    "psrlq  $16, %%mm4\n"        /* r4 = __ 57 56 55 */
-    "movq   72(%[i]), %%mm5\n"
-    "psllq  $16, %%mm7\n"        /* r7 = 72 71 __ __ */
-    "pmullw 72(%[q]), %%mm5\n"   /* r5 = 47 46 45 44 */
-    "movq   %%mm7, %%mm6\n"      /* r6 = 72 71 __ __ */
-    "movq   "M(2)", %%mm3\n"     /* r3 = __ FF __ __ */
-    "psllq  $16, %%mm6\n"        /* r6 = 71 __ __ __ */
-    "pand   "M(3)", %%mm7\n"     /* r7 = 72 __ __ __ */
-    "pand   %%mm1, %%mm3\n"      /* r3 = __ 62 __ __ */
-    "por    %%mm0, %%mm7\n"      /* r7 = 72 70 __ __ */
-    "movq   %%mm1, %%mm0\n"      /* r0 = 63 62 61 60 */
-    "pand   "M(3)", %%mm1\n"     /* r1 = 63 __ __ __ */
-    "por    %%mm3, %%mm6\n"      /* r6 = 71 62 __ __ */
-    "movq   %%mm4, %%mm3\n"      /* r3 = __ 57 56 55 */
-    "psrlq  $32, %%mm1\n"        /* r1 = __ __ 63 __ */
-    "pand   %%mm2, %%mm3\n"      /* r3 = __ __ __ 55 */
-    "por    %%mm1, %%mm7\n"      /* r7 = 72 70 63 __ */
-    "por    %%mm3, %%mm7\n"      /* r7 = 72 70 63 55 = R13 */
-    "movq   %%mm4, %%mm3\n"      /* r3 = __ 57 56 55 */
-    "pand   "M(1)", %%mm3\n"     /* r3 = __ __ 56 __ */
-    "movq   %%mm5, %%mm1\n"      /* r1 = 47 46 45 44 */
-    "movq   %%mm7, 88(%[o])\n"   /* write R13 = r7 */
-    "psrlq  $48, %%mm5\n"        /* r5 = __ __ __ 47 */
-    "movq   64(%[i]), %%mm7\n"
-    "por    %%mm3, %%mm6\n"      /* r6 = 71 62 56 __ */
-    "pmullw 64(%[q]), %%mm7\n"   /* r7 = 43 42 41 40 */
-    "por    %%mm5, %%mm6\n"      /* r6 = 71 62 56 47 = R12 */
-    "pand   "M(2)", %%mm4\n"     /* r4 = __ 57 __ __ */
-    "psllq  $32, %%mm0\n"        /* r0 = 61 60 __ __ */
-    "movq   %%mm6, 72(%[o])\n"   /* write R12 = r6 */
-    "movq   %%mm0, %%mm6\n"      /* r6 = 61 60 __ __ */
-    "pand   "M(3)", %%mm0\n"     /* r0 = 61 __ __ __ */
-    "psllq  $16, %%mm6\n"        /* r6 = 60 __ __ __ */
-    "movq   40(%[i]), %%mm5\n"
-    "movq   %%mm1, %%mm3\n"      /* r3 = 47 46 45 44 */
-    "pmullw 40(%[q]), %%mm5\n"   /* r5 = 27 26 25 24 */
-    "psrlq  $16, %%mm1\n"        /* r1 = __ 47 46 45 */
-    "pand   "M(1)", %%mm1\n"     /* r1 = __ __ 46 __ */
-    "por    %%mm4, %%mm0\n"      /* r0 = 61 57 __ __ */
-    "pand   %%mm7, %%mm2\n"      /* r2 = __ __ __ 40 */
-    "por    %%mm1, %%mm0\n"      /* r0 = 61 57 46 __ */
-    "por    %%mm2, %%mm0\n"      /* r0 = 61 57 46 40 = R11 */
-    "psllq  $16, %%mm3\n"        /* r3 = 46 45 44 __ */
-    "movq   %%mm3, %%mm4\n"      /* r4 = 46 45 44 __ */
-    "movq   %%mm5, %%mm2\n"      /* r2 = 27 26 25 24 */
-    "movq   %%mm0, 112(%[o])\n"  /* write R11 = r0 */
-    "psrlq  $48, %%mm2\n"        /* r2 = __ __ __ 27 */
-    "pand   "M(2)", %%mm4\n"     /* r4 = __ 45 __ __ */
-    "por    %%mm2, %%mm6\n"      /* r6 = 60 __ __ 27 */
-    "movq   "M(1)", %%mm2\n"     /* r2 = __ __ FF __ */
-    "por    %%mm4, %%mm6\n"      /* r6 = 60 45 __ 27 */
-    "pand   %%mm7, %%mm2\n"      /* r2 = __ __ 41 __ */
-    "psllq  $32, %%mm3\n"        /* r3 = 44 __ __ __ */
-    "por    80(%[o]), %%mm3\n"   /* r3 = 44 __ __ 23 */
-    "por    %%mm2, %%mm6\n"      /* r6 = 60 45 41 27 = R10 */
-    "movq   "M(3)", %%mm2\n"     /* r2 = FF __ __ __ */
-    "psllq  $16, %%mm5\n"        /* r5 = 26 25 24 __ */
-    "movq   %%mm6, 96(%[o])\n"   /* store R10 = r6 */
-    "pand   %%mm5, %%mm2\n"      /* r2 = 26 __ __ __ */
-    "movq   "M(2)", %%mm6\n"     /* r6 = __ FF __ __ */
-    "pxor   %%mm2, %%mm5\n"      /* r5 = __ 25 24 __ */
-    "pand   %%mm7, %%mm6\n"      /* r6 = __ 42 __ __ */
-    "psrlq  $32, %%mm2\n"        /* r2 = __ __ 26 __ */
-    "pand   "M(3)", %%mm7\n"     /* r7 = 43 __ __ __ */
-    "por    %%mm2, %%mm3\n"      /* r3 = 44 __ 26 23 */
-    "por    64(%[o]), %%mm7\n"   /* r7 = 43 __ __ 12 */
-    "por    %%mm3, %%mm6\n"      /* r6 = 44 42 26 23 = R9 */
-    "por    %%mm5, %%mm7\n"      /* r7 = 43 25 24 12 = R8 */
-    "movq   %%mm6, 80(%[o])\n"   /* store R9 = r6 */
-    "movq   %%mm7, 64(%[o])\n"   /* store R8 = r7 */
-    /* 123c  ( / 64 coeffs  < 2c / coeff) */
-
-/* Done w/dequant + descramble + partial transpose; now do the idct itself. */
-
-#   define I( K)    MtoSTR((K*16))"(%[o])"
-#   define J( K)    MtoSTR(((K - 4)*16)+8)"(%[o])"
-
-    RowIDCT         /* 46 c */
-    Transpose       /* 19 c */
-
-#   undef I
-#   undef J
-#   define I( K)    MtoSTR((K*16)+64)"(%[o])"
-#   define J( K)    MtoSTR(((K-4)*16)+72)"(%[o])"
-
-    RowIDCT         /* 46 c */
-    Transpose       /* 19 c */
-
-#   undef I
-#   undef J
-#   define I( K)    MtoSTR((K * 16))"(%[o])"
-#   define J( K)    I( K)
-
-    ColumnIDCT      /* 57 c */
-
-#   undef I
-#   undef J
-#   define I( K)    MtoSTR((K*16)+8)"(%[o])"
-#   define J( K)    I( K)
-
-    ColumnIDCT      /* 57 c */
-
-#   undef I
-#   undef J
-    /* 368 cycles  ( / 64 coeff  <  6 c / coeff) */
-
-    "emms\n"
-    :
-    :[i]"r"(in),[q]"r"(q),[o]"r"(out),[c]"r"(idctconstants)
-   );
-}
-
-/**************************************************************************************
- *
- *      Routine:        MMX_idct10
- *
- *      Description:    Perform IDCT on a 8x8 block with at most 10 nonzero coefficients
- *
- *      Input:          Pointer to input and output buffer
- *
- *      Output:         None
- *
- *      Return:         None
- *
- *      Special Note:   The input coefficients are in transposed ZigZag order
- *
- *      Error:          None
- *
- ***************************************************************************************
- */
-/* --------------------------------------------------------------- */
-// BeginIDCT_10 does four 4-sample one-dimensional idcts in parallel (one per
-// 16-bit MMX lane).  Inputs 4 thru 7 are assumed to be zero.
-#define BeginIDCT_10 "#BeginIDCT_10\n"    \
-    "   movq    "I(3)",%%mm2\n"           /* r2 = I3 */ \
-                                          \
-    "   movq    "C(3)",%%mm6\n"           /* r6 = C3 */ \
-    "   movq    %%mm2,%%mm4\n"            /* r4 = I3 */ \
-                                          \
-    "   movq    "C(5)",%%mm1\n"           /* r1 = C5 */ \
-    "   pmulhw  %%mm6,%%mm4\n"            /* r4 = C3 * I3 - I3 */ \
-                                          \
-    "   movq    "I(1)",%%mm3\n"           /* r3 = I1 */ \
-    "   pmulhw  %%mm2,%%mm1\n"            /* r1 = C5 * I3 - I3 */ \
-                                          \
-    "   movq    "C(1)",%%mm0\n"           /* r0 = C1 */ \
-    "   paddw   %%mm2,%%mm4\n"            /* r4 = C3 * I3 = C (I5 is zero) */ \
-                                          \
-    "   pxor    %%mm6,%%mm6\n"            /* r6 = 0 */ \
-    "   paddw   %%mm1,%%mm2\n"            /* r2 = C5 * I3 */ \
-                                          \
-    "   movq    "I(2)",%%mm5\n"           /* r5 = I2 */ \
-    "   pmulhw  %%mm3,%%mm0\n"            /* r0 = C1 * I1 - I1 */ \
-                                          \
-    "   movq    %%mm5,%%mm1\n"            /* r1 = I2 */ \
-    "   paddw   %%mm3,%%mm0\n"            /* r0 = C1 * I1 = A (I7 is zero) */ \
-                                          \
-    "   pmulhw  "C(7)",%%mm3\n"           /* r3 = C7 * I1 = B */ \
-    "   psubsw  %%mm2,%%mm6\n"            /* r6 = -(C5 * I3) = D */ \
-                                          \
-    "   pmulhw  "C(2)",%%mm5\n"           /* r5 = C2 * I2 - I2 */ \
-    "   psubsw  %%mm4,%%mm0\n"            /* r0 = A - C */ \
-                                          \
-    "   movq    "I(2)",%%mm7\n"           /* r7 = I2 */ \
-    "   paddsw  %%mm4,%%mm4\n"            /* r4 = C + C */ \
-                                          \
-    "   paddw   %%mm5,%%mm7\n"            /* r7 = C2 * I2 = G (I6 is zero) */ \
-    "   paddsw  %%mm0,%%mm4\n"            /* r4 = C. = A + C */ \
-                                          \
-    "   pmulhw  "C(6)",%%mm1\n"           /* r1 = C6 * I2 = H */ \
-    "   psubsw  %%mm6,%%mm3\n"            /* r3 = B - D */ \
-                                          \
-    "   movq    %%mm4,"I(1)"\n"           /* save C. at I(1) */ \
-    "   paddsw  %%mm6,%%mm6\n"            /* r6 = D + D */ \
-                                          \
-    "   movq    "C(4)",%%mm4\n"           /* r4 = C4 */ \
-    "   paddsw  %%mm3,%%mm6\n"            /* r6 = D. = B + D */ \
-                                          \
-    "   movq    %%mm3,%%mm5\n"            /* r5 = B - D */ \
-    "   pmulhw  %%mm4,%%mm3\n"            /* r3 = C4 * (B - D) - (B - D) */ \
-                                          \
-    "   movq    %%mm6,"I(2)"\n"           /* save D. at I(2) */ \
-    "   movq    %%mm0,%%mm2\n"            /* r2 = A - C */ \
-                                          \
-    "   movq    "I(0)",%%mm6\n"           /* r6 = I0 */ \
-    "   pmulhw  %%mm4,%%mm0\n"            /* r0 = C4 * (A - C) - (A - C) */ \
-                                          \
-    "   paddw   %%mm3,%%mm5\n"            /* r5 = B. = C4 * (B - D) */ \
-    "   paddw   %%mm0,%%mm2\n"            /* r2 = A. = C4 * (A - C) */ \
-                                          \
-    "   psubsw  %%mm1,%%mm5\n"            /* r5 = B.. = B. - H */ \
-    "   pmulhw  %%mm4,%%mm6\n"            /* r6 = C4 * I0 - I0 */ \
-                                          \
-    "   paddw   "I(0)",%%mm6\n"           /* r6 = C4 * I0 = E = F (I4 is zero) */ \
-    "   paddsw  %%mm1,%%mm1\n"            /* r1 = H + H */ \
-                                          \
-    "   movq    %%mm6,%%mm4\n"            /* r4 = E */ \
-    "   paddsw  %%mm5,%%mm1\n"            /* r1 = H. = B. + H */ \
-                                          \
-    "   psubsw  %%mm2,%%mm6\n"            /* r6 = F. = F - A. */ \
-    "   paddsw  %%mm2,%%mm2\n"            /* r2 = A. + A. */ \
-                                          \
-    "   movq    "I(1)",%%mm0\n"           /* r0 = C. */ \
-    "   paddsw  %%mm6,%%mm2\n"            /* r2 = A.. = F + A. */ \
-                                          \
-    "   psubsw  %%mm1,%%mm2\n"            /* r2 = R2 = A.. - H. */ \
-    "#end BeginIDCT_10\n"
-// end BeginIDCT_10 macro (25 cycles).  On exit: r0 = C., r1 = H., r2 = R2, r4 = E, r5 = B.., r6 = F., r7 = G; I(1) = C., I(2) = D.
-// RowIDCT_10: row pass.  Finishes BeginIDCT_10, leaving R0..R7 in r0..r7 (R1 is also saved at I(1)).
-#define RowIDCT_10 "#RowIDCT_10\n"                            \
-    BeginIDCT_10                                              \
-    "\n"                                                      \
-    "   movq    "I(2)",%%mm3\n"   /* r3 = D. */               \
-    "   psubsw  %%mm7,%%mm4\n"        /* r4 = E. = E - G */   \
-    "   paddsw  %%mm1,%%mm1\n"        /* r1 = H. + H. */      \
-    "   paddsw  %%mm7,%%mm7\n"        /* r7 = G + G */        \
-    "   paddsw  %%mm2,%%mm1\n"        /* r1 = R1 = A.. + H. */\
-    "   paddsw  %%mm4,%%mm7\n"        /* r7 = G. = E + G */   \
-    "   psubsw  %%mm3,%%mm4\n"        /* r4 = R4 = E. - D. */ \
-    "   paddsw  %%mm3,%%mm3\n"        /* r3 = D. + D. */      \
-    "   psubsw  %%mm5,%%mm6\n"        /* r6 = R6 = F. - B.. */\
-    "   paddsw  %%mm5,%%mm5\n"        /* r5 = B.. + B.. */    \
-    "   paddsw  %%mm4,%%mm3\n"        /* r3 = R3 = E. + D. */ \
-    "   paddsw  %%mm6,%%mm5\n"        /* r5 = R5 = F. + B.. */\
-    "   psubsw  %%mm0,%%mm7\n"        /* r7 = R7 = G. - C. */ \
-    "   paddsw  %%mm0,%%mm0\n"        /* r0 = C. + C. */      \
-    "   movq    %%mm1,"I(1)"\n"   /* save R1 */               \
-    "   paddsw  %%mm7,%%mm0\n"        /* r0 = R0 = G. + C. */ \
-    "#end RowIDCT_10\n"
-// end RowIDCT_10 macro (original note, apparently copied from RowIDCT: 8 + 38 = 46 cycles; the call site says 33 c)
-
-// ColumnIDCT_10: column pass.  Adds Eight, shifts right by 4 (NRk) and stores all eight results via I()/J().
-
-#define ColumnIDCT_10 "#ColumnIDCT_10\n"                  \
-    BeginIDCT_10                                          \
-    "\n"                                                  \
-    "   paddsw  "Eight",%%mm2\n"  /* r2 = R2 + Eight, ahead of the >> 4 */ \
-    "   paddsw  %%mm1,%%mm1\n"    /* r1 = H. + H. */      \
-    "   paddsw  %%mm2,%%mm1\n"    /* r1 = R1 = A.. + H. */\
-    "   psraw   ""$4"",%%mm2\n"       /* r2 = NR2 */      \
-    "   psubsw  %%mm7,%%mm4\n"    /* r4 = E. = E - G */   \
-    "   psraw   ""$4"",%%mm1\n"       /* r1 = NR1 */      \
-    "   movq    "I(2)",%%mm3\n"   /* r3 = D. */           \
-    "   paddsw  %%mm7,%%mm7\n"    /* r7 = G + G */        \
-    "   movq    %%mm2,"I(2)"\n"   /* store NR2 at I2 */   \
-    "   paddsw  %%mm4,%%mm7\n"    /* r7 = G. = E + G */   \
-    "   movq    %%mm1,"I(1)"\n"   /* store NR1 at I1 */   \
-    "   psubsw  %%mm3,%%mm4\n"    /* r4 = R4 = E. - D. */ \
-    "   paddsw  "Eight",%%mm4\n"  /* r4 = R4 + Eight, ahead of the >> 4 */ \
-    "   paddsw  %%mm3,%%mm3\n"    /* r3 = D. + D. */      \
-    "   paddsw  %%mm4,%%mm3\n"    /* r3 = R3 = E. + D. */ \
-    "   psraw   ""$4"",%%mm4\n"       /* r4 = NR4 */      \
-    "   psubsw  %%mm5,%%mm6\n"    /* r6 = R6 = F. - B.. */\
-    "   psraw   ""$4"",%%mm3\n"       /* r3 = NR3 */      \
-    "   paddsw  "Eight",%%mm6\n"  /* r6 = R6 + Eight, ahead of the >> 4 */ \
-    "   paddsw  %%mm5,%%mm5\n"    /* r5 = B.. + B.. */    \
-    "   paddsw  %%mm6,%%mm5\n"    /* r5 = R5 = F. + B.. */\
-    "   psraw   ""$4"",%%mm6\n"       /* r6 = NR6 */      \
-    "   movq    %%mm4,"J(4)"\n"   /* store NR4 at J4 */   \
-    "   psraw   ""$4"",%%mm5\n"       /* r5 = NR5 */      \
-    "   movq    %%mm3,"I(3)"\n"   /* store NR3 at I3 */   \
-    "   psubsw  %%mm0,%%mm7\n"    /* r7 = R7 = G. - C. */ \
-    "   paddsw  "Eight",%%mm7\n"  /* r7 = R7 + Eight, ahead of the >> 4 */ \
-    "   paddsw  %%mm0,%%mm0\n"    /* r0 = C. + C. */      \
-    "   paddsw  %%mm7,%%mm0\n"    /* r0 = R0 = G. + C. */ \
-    "   psraw   ""$4"",%%mm7\n"       /* r7 = NR7 */      \
-    "   movq    %%mm6,"J(6)"\n"   /* store NR6 at J6 */   \
-    "   psraw   ""$4"",%%mm0\n"       /* r0 = NR0 */      \
-    "   movq    %%mm5,"J(5)"\n"   /* store NR5 at J5 */   \
-                                                          \
-    "   movq    %%mm7,"J(7)"\n"   /* store NR7 at J7 */   \
-                                                          \
-    "   movq    %%mm0,"I(0)"\n"   /* store NR0 at I0 */   \
-    "#end ColumnIDCT_10\n"
-// end ColumnIDCT_10 macro (original note, apparently copied from ColumnIDCT: 38 + 19 = 57 cycles; the call sites say 44 c)
-/* --------------------------------------------------------------- */
-
-
-/* IDct10__mmx: multiply in[0..11] by q[0..11], scatter ten of the products into the first 8 */
-/* bytes of out rows 0..3 (16 bytes per row), then run RowIDCT_10, Transpose and two ColumnIDCT_10 passes. */
-void IDct10__mmx( const ogg_int16_t *in,
-                  const ogg_int16_t *q,
-                  ogg_int16_t *out ) {
-
-    __asm__ __volatile__ (
-
-    "movq   (%[i]), %%mm0\n"
-    "pmullw (%[q]), %%mm0\n"     /* r0 = 03 02 01 00 */
-    "movq   16(%[i]), %%mm1\n"
-    "pmullw 16(%[q]), %%mm1\n"   /* r1 = 13 12 11 10 */
-    "movq   "M(0)", %%mm2\n"     /* r2 = __ __ __ FF */
-    "movq   %%mm0, %%mm3\n"      /* r3 = 03 02 01 00 */
-    "movq   8(%[i]), %%mm4\n"
-    "psrlq  $16, %%mm0\n"        /* r0 = __ 03 02 01 */
-    "pmullw 8(%[q]), %%mm4\n"    /* r4 = 07 06 05 04 */
-    "pand   %%mm2, %%mm3\n"      /* r3 = __ __ __ 00 */
-    "movq   %%mm0, %%mm5\n"      /* r5 = __ 03 02 01 */
-    "pand   %%mm2, %%mm5\n"      /* r5 = __ __ __ 01 */
-    "psllq  $32, %%mm1\n"        /* r1 = 11 10 __ __ (12 and 13 are shifted out) */
-    "movq   "M(3)", %%mm7\n"     /* r7 = FF __ __ __ */
-    "pxor   %%mm5, %%mm0\n"      /* r0 = __ 03 02 __ */
-    "pand   %%mm1, %%mm7\n"      /* r7 = 11 __ __ __ */
-    "por    %%mm3, %%mm0\n"      /* r0 = __ 03 02 00 */
-    "pxor   %%mm7, %%mm1\n"      /* r1 = __ 10 __ __ */
-    "por    %%mm7, %%mm0\n"      /* r0 = 11 03 02 00 = R0 */
-    "movq   %%mm4, %%mm3\n"      /* r3 = 07 06 05 04 */
-    "movq   %%mm0, (%[o])\n"     /* write R0 = r0 */
-    "pand   %%mm2, %%mm3\n"      /* r3 = __ __ __ 04 */
-    "psllq  $16, %%mm3\n"        /* r3 = __ __ 04 __ */
-    "por    %%mm3, %%mm5\n"      /* r5 = __ __ 04 01 */
-    "por    %%mm5, %%mm1\n"      /* r1 = __ 10 04 01 = R1 */
-    "psrlq  $16, %%mm4\n"        /* r4 = __ 07 06 05 */
-    "movq   %%mm1, 16(%[o])\n"   /* write R1 = r1 */
-    "movq   %%mm4, %%mm5\n"      /* r5 = __ 07 06 05 */
-    "psrlq  $16, %%mm4\n"        /* r4 = __ __ 07 06 */
-    "movq   %%mm2, %%mm6\n"      /* r6 = __ __ __ FF */
-    "pand   %%mm2, %%mm5\n"      /* r5 = __ __ __ 05 */
-    "pand   %%mm4, %%mm6\n"      /* r6 = __ __ __ 06 */
-    "pxor   %%mm6, %%mm4\n"      /* r4 = __ __ 07 __ */
-    "por    %%mm5, %%mm4\n"      /* r4 = __ __ 07 05 */
-    "movq   %%mm4, 32(%[o])\n"   /* write R2 = r4 */
-    "movq   %%mm6, 48(%[o])\n"   /* write R3 = r6 */
-
-#   define I( K)    MtoSTR((K*16))"(%[o])"
-#   define J( K)    MtoSTR(((K - 4) * 16)+8)"(%[o])"
-
-    RowIDCT_10      /* 33 c */
-    Transpose       /* 19 c */
-
-#   undef I
-#   undef J
-
-#   define I( K)    MtoSTR((K * 16))"(%[o])"
-#   define J( K)    I( K)
-
-    ColumnIDCT_10       /* 44 c; first pass, I/J at offset K*16 */
-
-#   undef I
-#   undef J
-#   define I( K)    MtoSTR((K * 16)+8)"(%[o])"
-#   define J( K)    I( K)
-
-    ColumnIDCT_10       /* 44 c; second pass, I/J at offset K*16+8 */
-
-#   undef I
-#   undef J
-
-    "emms\n"
-    :                            /* no output operands are declared */
-    :[i]"r"(in),[q]"r"(q),[o]"r"(out),[c]"r"(idctconstants) /* NOTE(review): the asm stores through %[o] and uses mm0-mm7, but no "memory" or MMX clobbers are listed */
-    );
-}
-
-/**************************************************************************************
- *
- *      Routine:        MMX_idct3
- *
- *      Description:    Perform IDCT on a 8x8 block with at most 3 nonzero coefficients
- *
- *      Input:          Pointer to input and output buffer
- *
- *      Output:         None
- *
- *      Return:         None
- *
- *      Special Note:   Only works for three nonzero coefficients.
- *
- *      Error:          None
- *
- ***************************************************************************************
- */
-/***************************************************************************************
-    In IDCT 3, we are dealing with only three nonzero coefficients in the 8x8 block.
-    When we work in the order RowIDCT -> ColumnIDCT, we only have to do 1-D row
-    IDCTs on the first two rows; the remaining six rows stay zero anyway.
-    After the row IDCTs, every column could have nonzero coefficients, so we need to
-    do eight 1-D column IDCTs. However, each column has at most two nonzero
-    coefficients, coefficient 0 and coefficient 1. The same holds for the inputs of
-    the two 1-D row IDCTs. For this reason, the process of a 1-D IDCT is simplified
-
-    from a full version:
-
-    A = (C1 * I1) + (C7 * I7)       B = (C7 * I1) - (C1 * I7)
-    C = (C3 * I3) + (C5 * I5)       D = (C3 * I5) - (C5 * I3)
-    A. = C4 * (A - C)               B. = C4 * (B - D)
-    C. = A + C                      D. = B + D
-
-    E = C4 * (I0 + I4)              F = C4 * (I0 - I4)
-    G = (C2 * I2) + (C6 * I6)       H = (C6 * I2) - (C2 * I6)
-    E. = E - G
-    G. = E + G
-
-    A.. = F + A.                    B.. = B. - H
-    F.  = F - A.                    H.  = B. + H
-
-    R0 = G. + C.    R1 = A.. + H.   R3 = E. + D.    R5 = F. + B..
-    R7 = G. - C.    R2 = A.. - H.   R4 = E. - D.    R6 = F. - B..
-
-    To:
-
-
-    A = (C1 * I1)                   B = (C7 * I1)
-    C = 0                           D = 0
-    A. = C4 * A                     B. = C4 * B
-    C. = A                          D. = B
-
-    E = C4 * I0                     F = E
-    G = 0                           H = 0
-    E. = E
-    G. = E
-
-    A.. = E + A.                    B.. = B.
-    F.  = E - A.                    H.  = B.
-
-    R0 = E + A      R1 = E + A. + B.    R3 = E + B      R5 = E - A. + B.
-    R7 = E - A      R2 = E + A. - B.    R4 = E - B      R6 = E - A. - B.
-
-******************************************************************************************/
-// RowIDCT_3: row pass using only inputs I0 and I1; leaves R0..R7 in r0..r7 and also saves R1 at I(1).
-#define RowIDCT_3 "#RowIDCT_3\n"\
-    "   movq        "I(1)",%%mm7\n"   /* r7 = I1                      */  \
-    "   movq        "C(1)",%%mm0\n"   /* r0 = C1                      */  \
-    "   movq        "C(7)",%%mm3\n"   /* r3 = C7                      */  \
-    "   pmulhw      %%mm7,%%mm0\n"    /* r0 = C1 * I1 - I1            */  \
-    "   pmulhw      %%mm7,%%mm3\n"    /* r3 = C7 * I1 = B, D.         */  \
-    "   movq        "I(0)",%%mm6\n"   /* r6 = I0                      */  \
-    "   movq        "C(4)",%%mm4\n"   /* r4 = C4                      */  \
-    "   paddw       %%mm7,%%mm0\n"    /* r0 = C1 * I1 = A, C.         */  \
-    "   movq        %%mm6,%%mm1\n"    /* make a copy of I0            */  \
-    "   pmulhw      %%mm4,%%mm6\n"    /* r6 = C4 * I0 - I0            */  \
-    "   movq        %%mm0,%%mm2\n"    /* make a copy of A             */  \
-    "   movq        %%mm3,%%mm5\n"    /* make a copy of B             */  \
-    "   pmulhw      %%mm4,%%mm2\n"    /* r2 = C4 * A - A              */  \
-    "   pmulhw      %%mm4,%%mm5\n"    /* r5 = C4 * B - B              */  \
-    "   paddw       %%mm1,%%mm6\n"    /* r6 = C4 * I0 = E, F          */  \
-    "   movq        %%mm6,%%mm4\n"    /* r4 = E                       */  \
-    "   paddw       %%mm0,%%mm2\n"    /* r2 = A.                      */  \
-    "   paddw       %%mm3,%%mm5\n"    /* r5 = B.                      */  \
-    "   movq        %%mm6,%%mm7\n"    /* r7 = E                       */  \
-    "   movq        %%mm5,%%mm1\n"    /* r1 = B.                      */  \
-    /*  r0 = A      */   \
-    /*  r3 = B      */   \
-    /*  r2 = A.     */   \
-    /*  r5 = B.     */   \
-    /*  r6 = E      */   \
-    /*  r4 = E      */   \
-    /*  r7 = E      */   \
-    /*  r1 = B.     */   \
-    "   psubw       %%mm2,%%mm6\n"    /* r6 = E - A.                  */  \
-    "   psubw       %%mm3,%%mm4\n"    /* r4 = E - B ----R4            */  \
-    "   psubw       %%mm0,%%mm7\n"    /* r7 = E - A ----R7            */  \
-    "   paddw       %%mm2,%%mm2\n"    /* r2 = A. + A.                 */  \
-    "   paddw       %%mm3,%%mm3\n"    /* r3 = B + B                   */  \
-    "   paddw       %%mm0,%%mm0\n"    /* r0 = A + A                   */  \
-    "   paddw       %%mm6,%%mm2\n"    /* r2 = E + A.                  */  \
-    "   paddw       %%mm4,%%mm3\n"    /* r3 = E + B ----R3            */  \
-    "   psubw       %%mm1,%%mm2\n"    /* r2 = E + A. - B. ----R2      */  \
-    "   psubw       %%mm5,%%mm6\n"    /* r6 = E - A. - B. ----R6      */  \
-    "   paddw       %%mm1,%%mm1\n"    /* r1 = B. + B.                 */  \
-    "   paddw       %%mm5,%%mm5\n"    /* r5 = B. + B.                 */  \
-    "   paddw       %%mm7,%%mm0\n"    /* r0 = E + A ----R0            */  \
-    "   paddw       %%mm2,%%mm1\n"    /* r1 = E + A. + B. -----R1     */  \
-    "   movq        %%mm1,"I(1)"\n"   /* save r1                      */  \
-    "   paddw       %%mm6,%%mm5\n"    /* r5 = E - A. + B. -----R5     */  \
-    "#end RowIDCT_3\n"
-//End of RowIDCT_3
-// ColumnIDCT_3: column pass using only inputs I0 and I1; adds Eight to E, shifts every result right by 4, stores R0..R3 via I() and R4..R7 via J().
-#define ColumnIDCT_3 "#ColumnIDCT_3\n"\
-    "   movq        "I(1)",%%mm7\n"   /* r7 = I1                      */  \
-    "   movq        "C(1)",%%mm0\n"   /* r0 = C1                      */  \
-    "   movq        "C(7)",%%mm3\n"   /* r3 = C7                      */  \
-    "   pmulhw      %%mm7,%%mm0\n"    /* r0 = C1 * I1 - I1            */  \
-    "   pmulhw      %%mm7,%%mm3\n"    /* r3 = C7 * I1 = B, D.         */  \
-    "   movq        "I(0)",%%mm6\n"   /* r6 = I0                      */  \
-    "   movq        "C(4)",%%mm4\n"   /* r4 = C4                      */  \
-    "   paddw       %%mm7,%%mm0\n"    /* r0 = C1 * I1 = A, C.         */  \
-    "   movq        %%mm6,%%mm1\n"    /* make a copy of I0            */  \
-    "   pmulhw      %%mm4,%%mm6\n"    /* r6 = C4 * I0 - I0            */  \
-    "   movq        %%mm0,%%mm2\n"    /* make a copy of A             */  \
-    "   movq        %%mm3,%%mm5\n"    /* make a copy of B             */  \
-    "   pmulhw      %%mm4,%%mm2\n"    /* r2 = C4 * A - A              */  \
-    "   pmulhw      %%mm4,%%mm5\n"    /* r5 = C4 * B - B              */  \
-    "   paddw       %%mm1,%%mm6\n"    /* r6 = C4 * I0 = E, F          */  \
-    "   movq        %%mm6,%%mm4\n"    /* r4 = E                       */  \
-    "   paddw       "Eight",%%mm6\n"  /* +8 for shift                 */  \
-    "   paddw       "Eight",%%mm4\n"  /* +8 for shift                 */  \
-    "   paddw       %%mm0,%%mm2\n"    /* r2 = A.                      */  \
-    "   paddw       %%mm3,%%mm5\n"    /* r5 = B.                      */  \
-    "   movq        %%mm6,%%mm7\n"    /* r7 = E (already biased)      */  \
-    "   movq        %%mm5,%%mm1\n"    /* r1 = B.                      */  \
-/*  r0 = A      */   \
-/*  r3 = B      */   \
-/*  r2 = A.     */   \
-/*  r5 = B.     */   \
-/*  r6 = E      */   \
-/*  r4 = E      */   \
-/*  r7 = E      */   \
-/*  r1 = B.     */   \
-    "   psubw       %%mm2,%%mm6\n"    /* r6 = E - A.                  */  \
-    "   psubw       %%mm3,%%mm4\n"    /* r4 = E - B ----R4            */  \
-    "   psubw       %%mm0,%%mm7\n"    /* r7 = E - A ----R7            */  \
-    "   paddw       %%mm2,%%mm2\n"    /* r2 = A. + A.                 */  \
-    "   paddw       %%mm3,%%mm3\n"    /* r3 = B + B                   */  \
-    "   paddw       %%mm0,%%mm0\n"    /* r0 = A + A                   */  \
-    "   paddw       %%mm6,%%mm2\n"    /* r2 = E + A.                  */  \
-    "   paddw       %%mm4,%%mm3\n"    /* r3 = E + B ----R3            */  \
-    "   psraw        $4,%%mm4\n"      /* shift                        */  \
-    "   movq        %%mm4,"J(4)"\n"   /* store R4 at J4               */  \
-    "   psraw       $4,%%mm3\n"       /* shift                        */  \
-    "   movq        %%mm3,"I(3)"\n"   /* store R3 at I3               */  \
-    "   psubw       %%mm1,%%mm2\n"    /* r2 = E + A. - B. ----R2      */  \
-    "   psubw       %%mm5,%%mm6\n"    /* r6 = E - A. - B. ----R6      */  \
-    "   paddw       %%mm1,%%mm1\n"    /* r1 = B. + B.                 */  \
-    "   paddw       %%mm5,%%mm5\n"    /* r5 = B. + B.                 */  \
-    "   paddw       %%mm7,%%mm0\n"    /* r0 = E + A ----R0            */  \
-    "   paddw       %%mm2,%%mm1\n"    /* r1 = E + A. + B. -----R1     */  \
-    "   psraw       $4,%%mm7\n"       /* shift                        */  \
-    "   psraw       $4,%%mm2\n"       /* shift                        */  \
-    "   psraw       $4,%%mm0\n"       /* shift                        */  \
-    "   psraw       $4,%%mm1\n"       /* shift                        */  \
-    "   movq        %%mm7,"J(7)"\n"   /* store R7 to J7               */  \
-    "   movq        %%mm0,"I(0)"\n"   /* store R0 to I0               */  \
-    "   movq        %%mm1,"I(1)"\n"   /* store R1 to I1               */  \
-    "   movq        %%mm2,"I(2)"\n"   /* store R2 to I2               */  \
-    "   movq        %%mm1,"I(1)"\n"   /* redundant: R1 was stored two lines up */  \
-    "   paddw       %%mm6,%%mm5\n"    /* r5 = E - A. + B. -----R5     */  \
-    "   psraw       $4,%%mm5\n"       /* shift                        */  \
-    "   movq        %%mm5,"J(5)"\n"   /* store R5 at J5               */  \
-    "   psraw       $4,%%mm6\n"       /* shift                        */  \
-    "   movq        %%mm6,"J(6)"\n"   /* store R6 at J6               */  \
-    "#end ColumnIDCT_3\n"
-//End of ColumnIDCT_3
-/* IDct3__mmx: multiply in[0..3] by q[0..3], place the products in the first 8 bytes of out rows 0 and 1, then run RowIDCT_3, Transpose and two ColumnIDCT_3 passes. */
-void IDct3__mmx( const ogg_int16_t *in,
-                 const ogg_int16_t *q,
-                 ogg_int16_t *out ) {
-
-    __asm__ __volatile__ (
-
-    "movq   (%[i]), %%mm0\n"
-    "pmullw (%[q]), %%mm0\n"     /* r0 = 03 02 01 00 */
-    "movq   "M(0)", %%mm2\n"     /* r2 = __ __ __ FF */
-    "movq   %%mm0, %%mm3\n"      /* r3 = 03 02 01 00 */
-    "psrlq  $16, %%mm0\n"        /* r0 = __ 03 02 01 */
-    "pand   %%mm2, %%mm3\n"      /* r3 = __ __ __ 00 */
-    "movq   %%mm0, %%mm5\n"      /* r5 = __ 03 02 01 */
-    "pand   %%mm2, %%mm5\n"      /* r5 = __ __ __ 01 */
-    "pxor   %%mm5, %%mm0\n"      /* r0 = __ 03 02 __ */
-    "por    %%mm3, %%mm0\n"      /* r0 = __ 03 02 00 */
-    "movq   %%mm0, (%[o])\n"     /* write R0 = r0 */
-    "movq   %%mm5, 16(%[o])\n"   /* write R1 = r5 */
-
-/* Done partial transpose; now do the idct itself. */
-
-#   define I( K)    MtoSTR((K*16))"(%[o])"
-#   define J( K)    MtoSTR(((K - 4)*16)+8)"(%[o])"
-
-    RowIDCT_3       /* 33 c */
-    Transpose       /* 19 c */
-
-#   undef I
-#   undef J
-
-#   define I( K)    MtoSTR((K * 16))"(%[o])"
-#   define J( K)    I( K)
-
-    ColumnIDCT_3    /* 44 c; first pass, I/J at offset K*16 */
-
-#   undef I
-#   undef J
-#   define I( K)    MtoSTR((K*16)+8)"(%[o])"
-#   define J( K)    I( K)
-
-    ColumnIDCT_3    /* 44 c; second pass, I/J at offset K*16+8 */
-
-#   undef I
-#   undef J
-
-    "emms\n"
-    :                            /* no output operands are declared */
-    :[i]"r"(in),[q]"r"(q),[o]"r"(out),[c]"r"(idctconstants) /* NOTE(review): the asm stores through %[o] and uses MMX registers, but no "memory" or MMX clobbers are listed */
-    );
-
-}
-
-/* Install the MMX IDCT routines of this file into the three IDCT slots of the function table funcs. */
-void dsp_mmx_idct_init(DspFunctions *funcs)
-{
-  funcs->IDctSlow = IDctSlow__mmx;
-  funcs->IDct10 = IDct10__mmx;       /* variant for at most 10 nonzero coefficients */
-  funcs->IDct3 = IDct3__mmx;         /* variant for at most 3 nonzero coefficients */
-}
-
-#endif /* USE_ASM */

Copied: branches/theora-thusnelda/lib/enc/x86/mmxenc.c (from rev 15940, branches/theora-thusnelda/lib/enc/x86/dct_decode_mmx.c)
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/mmxenc.c	                        (rev 0)
+++ branches/theora-thusnelda/lib/enc/x86/mmxenc.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -0,0 +1,64 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: dct_decode_mmx.c 15078 2008-06-27 22:07:19Z xiphmont $
+
+ ********************************************************************/
+#include <string.h>
+#include "x86enc.h"
+#include "../../dec/x86/mmxloop.h"
+
+#if defined(OC_X86_ASM)
+
+/*Apply the loop filter around every fragment flagged in cpi->frag_coded, using addresses inside cpi->lastrecon; returns at once when _flimit is 0.*/
+void oc_enc_loop_filter_mmx(CP_INSTANCE *cpi,int _flimit){
+  unsigned char OC_ALIGN8  ll[8];  /*Filter limit, one copy per byte, handed to the filter macros.*/
+  unsigned char           *cp;     /*Cursor into cpi->frag_coded.*/
+  ogg_uint32_t            *bp;     /*Cursor into cpi->frag_buffer_index, advanced together with cp.*/
+  int                      pli;    /*Plane index, 0..2.*/
+  cp=cpi->frag_coded;
+  bp=cpi->frag_buffer_index;
+  if(_flimit==0)return;
+  memset(ll,_flimit,sizeof(ll));   /*memset stores _flimit converted to unsigned char.*/
+  for(pli=0;pli<3;pli++){
+    ogg_uint32_t *bp_begin;        /*First entry of this plane.*/
+    ogg_uint32_t *bp_end;          /*One past the last entry of this plane.*/
+    int           stride;
+    int           h;               /*Row length in entries: the inner loop consumes h entries per row.*/
+    bp_begin=bp;
+    bp_end=bp+cpi->frag_n[pli];
+    stride=cpi->stride[pli];
+    h=cpi->frag_h[pli];
+    while(bp<bp_end){
+      ogg_uint32_t *bp_left;       /*First entry of the current row.*/
+      ogg_uint32_t *bp_right;      /*One past the last entry of the current row.*/
+      bp_left=bp;
+      bp_right=bp+h;
+      for(;bp<bp_right;bp++,cp++)if(*cp){  /*Only flagged fragments trigger filtering.*/
+        if(bp>bp_left)OC_LOOP_FILTER_H_MMX(cpi->lastrecon+bp[0],stride,ll);  /*H filter at this fragment, skipped for the first entry of a row.*/
+        if(bp_left>bp_begin){  /*V filter at this fragment, skipped for the first row of the plane.*/
+          OC_LOOP_FILTER_V_MMX(cpi->lastrecon+bp[0],stride,ll);
+        }
+        if(bp+1<bp_right&&!cp[1]){  /*The next entry in the row exists and is not flagged, so it will not filter this boundary itself.*/
+          OC_LOOP_FILTER_H_MMX(cpi->lastrecon+bp[0]+8,stride,ll);  /*+8: presumably the fragment width in pixels; confirm.*/
+        }
+        if(bp+h<bp_end&&!cp[h]){  /*The entry one row down is inside this plane and is not flagged.*/
+          OC_LOOP_FILTER_V_MMX(cpi->lastrecon+bp[h],stride,ll);
+        }
+      }
+    }
+  }
+  __asm__ __volatile__("emms\n\t");  /*Clear the MMX state after the filter macros.*/
+}
+
+#endif

Copied: branches/theora-thusnelda/lib/enc/x86/mmxencfrag.c (from rev 15940, branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c)
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/mmxencfrag.c	                        (rev 0)
+++ branches/theora-thusnelda/lib/enc/x86/mmxencfrag.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -0,0 +1,429 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
+
+ ********************************************************************/
+#include <stddef.h>
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+/*Computes the sum of absolute differences (SAD) between two 8x8 blocks of
+   bytes using psadbw.
+  _src:     The first row of the source block.
+  _ref:     The first row of the reference block.
+  _ystride: The offset in bytes between rows; it is applied to both blocks.
+  Return: The sum of |_src[]-_ref[]| over all 64 pixels.
+  No emms is issued here, so the MMX state is left dirty for the caller.*/
+unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride){
+  ptrdiff_t ystride3;
+  ptrdiff_t ret;
+  __asm__ __volatile__(
+    /*Load the first 4 rows of each block.*/
+    "movq (%[src]),%%mm0\n\t"
+    "movq (%[ref]),%%mm1\n\t"
+    "movq (%[src],%[ystride]),%%mm2\n\t"
+    "movq (%[ref],%[ystride]),%%mm3\n\t"
+    "lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
+    "movq (%[src],%[ystride],2),%%mm4\n\t"
+    "movq (%[ref],%[ystride],2),%%mm5\n\t"
+    "movq (%[src],%[ystride3]),%%mm6\n\t"
+    "movq (%[ref],%[ystride3]),%%mm7\n\t"
+    /*Compute their SADs and add them in %%mm0*/
+    "psadbw %%mm1,%%mm0\n\t"
+    "psadbw %%mm3,%%mm2\n\t"
+    "lea (%[src],%[ystride],4),%[src]\n\t"
+    "paddw %%mm2,%%mm0\n\t"
+    "lea (%[ref],%[ystride],4),%[ref]\n\t"
+    /*Load the next 3 rows as registers become available.*/
+    "movq (%[src]),%%mm2\n\t"
+    "movq (%[ref]),%%mm3\n\t"
+    "psadbw %%mm5,%%mm4\n\t"
+    "psadbw %%mm7,%%mm6\n\t"
+    "paddw %%mm4,%%mm0\n\t"
+    "movq (%[ref],%[ystride]),%%mm5\n\t"
+    "movq (%[src],%[ystride]),%%mm4\n\t"
+    "paddw %%mm6,%%mm0\n\t"
+    "movq (%[ref],%[ystride],2),%%mm7\n\t"
+    "movq (%[src],%[ystride],2),%%mm6\n\t"
+    /*Start adding their SADs to %%mm0*/
+    "psadbw %%mm3,%%mm2\n\t"
+    "psadbw %%mm5,%%mm4\n\t"
+    "paddw %%mm2,%%mm0\n\t"
+    "psadbw %%mm7,%%mm6\n\t"
+    /*Load last row as registers become available.*/
+    "movq (%[src],%[ystride3]),%%mm2\n\t"
+    "movq (%[ref],%[ystride3]),%%mm3\n\t"
+    /*And finish adding up their SADs.*/
+    "paddw %%mm4,%%mm0\n\t"
+    "psadbw %%mm3,%%mm2\n\t"
+    "paddw %%mm6,%%mm0\n\t"
+    "paddw %%mm2,%%mm0\n\t"
+    "movd %%mm0,%[ret]\n\t"
+    /*%[src] and %[ref] are advanced by the lea instructions above, so they
+       must be read-write operands rather than plain inputs.*/
+    :[ret]"=r"(ret),[src]"+r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3)
+    :[ystride]"r"((ptrdiff_t)_ystride)
+  );
+  return (unsigned)ret;
+}
+
+/*Threshold variant of oc_enc_frag_sad_mmxext().
+  _thresh is accepted but ignored: the full SAD is always computed, with no
+   early termination.*/
+unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh){
+  unsigned sad;
+  sad=oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
+  return sad;
+}
+
+/*Adds to %[ret] the SAD of two rows of %[src] against the truncating
+   average of the corresponding rows of %[ref1] and %[ref2].
+  Assumes the first two rows of %[ref1] and %[ref2] are in %%mm0...%%mm3, the
+   first two rows of %[src] are in %%mm4,%%mm5, and {1}x8 is in %%mm7.
+  We pre-load the next two rows of data as registers become available; doing
+   so advances %[src], %[ref1] and %[ref2] by two rows each.
+  %%mm6 is used as scratch.*/
+#define OC_SAD2_LOOP \
+ "#OC_SAD2_LOOP\n\t" \
+ /*We want to compute (%%mm0+%%mm1>>1) on unsigned bytes without overflow, but \
+    pavgb computes (%%mm0+%%mm1+1>>1). \
+   The latter is exactly 1 too large when the low bit of two corresponding \
+    bytes is only set in one of them. \
+   Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
+    correct the output of pavgb.*/ \
+ "movq %%mm0,%%mm6\n\t" \
+ "lea (%[ref1],%[ystride],2),%[ref1]\n\t" \
+ "pxor %%mm1,%%mm0\n\t" \
+ "pavgb %%mm1,%%mm6\n\t" \
+ "lea (%[ref2],%[ystride],2),%[ref2]\n\t" \
+ "movq %%mm2,%%mm1\n\t" \
+ "pand %%mm7,%%mm0\n\t" \
+ "pavgb %%mm3,%%mm2\n\t" \
+ "pxor %%mm3,%%mm1\n\t" \
+ "movq (%[ref2],%[ystride]),%%mm3\n\t" \
+ "psubb %%mm0,%%mm6\n\t" \
+ "movq (%[ref1]),%%mm0\n\t" \
+ "pand %%mm7,%%mm1\n\t" \
+ "psadbw %%mm6,%%mm4\n\t" \
+ "movd %[ret],%%mm6\n\t" \
+ "psubb %%mm1,%%mm2\n\t" \
+ "movq (%[ref2]),%%mm1\n\t" \
+ "lea (%[src],%[ystride],2),%[src]\n\t" \
+ "psadbw %%mm2,%%mm5\n\t" \
+ "movq (%[ref1],%[ystride]),%%mm2\n\t" \
+ "paddw %%mm4,%%mm5\n\t" \
+ "movq (%[src]),%%mm4\n\t" \
+ "paddw %%mm5,%%mm6\n\t" \
+ "movq (%[src],%[ystride]),%%mm5\n\t" \
+ "movd %%mm6,%[ret]\n\t" \
+
+/*Same as above, but does not pre-load the next two rows, and therefore
+   leaves %[src], %[ref1] and %[ref2] unchanged.
+  The SAD of the two rows currently in registers is added to %[ret].*/
+#define OC_SAD2_TAIL \
+ "#OC_SAD2_TAIL\n\t" \
+ "movq %%mm0,%%mm6\n\t" \
+ "pavgb %%mm1,%%mm0\n\t" \
+ "pxor %%mm1,%%mm6\n\t" \
+ "movq %%mm2,%%mm1\n\t" \
+ "pand %%mm7,%%mm6\n\t" \
+ "pavgb %%mm3,%%mm2\n\t" \
+ "pxor %%mm3,%%mm1\n\t" \
+ "psubb %%mm6,%%mm0\n\t" \
+ "pand %%mm7,%%mm1\n\t" \
+ "psadbw %%mm0,%%mm4\n\t" \
+ "psubb %%mm1,%%mm2\n\t" \
+ "movd %[ret],%%mm6\n\t" \
+ "psadbw %%mm2,%%mm5\n\t" \
+ "paddw %%mm4,%%mm5\n\t" \
+ "paddw %%mm5,%%mm6\n\t" \
+ "movd %%mm6,%[ret]\n\t" \
+
+/*Computes the SAD between an 8x8 source block and the average of two
+   reference blocks.
+  The average is (_ref1[]+_ref2[]>>1), i.e., rounded down; see OC_SAD2_LOOP.
+  _src:     The first row of the source block.
+  _ref1:    The first row of the first reference block.
+  _ref2:    The first row of the second reference block.
+  _ystride: The offset in bytes between rows, shared by all three blocks.
+  _thresh:  Ignored; the full SAD is always computed.
+  Return: The SAD over all 64 pixels.*/
+unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh){
+  ptrdiff_t ret;
+  __asm__ __volatile__(
+    /*Load the first two rows of everything, as OC_SAD2_LOOP expects.*/
+    "movq (%[ref1]),%%mm0\n\t"
+    "movq (%[ref2]),%%mm1\n\t"
+    "movq (%[ref1],%[ystride]),%%mm2\n\t"
+    "movq (%[ref2],%[ystride]),%%mm3\n\t"
+    /*Clear the running total.*/
+    "xor %[ret],%[ret]\n\t"
+    "movq (%[src]),%%mm4\n\t"
+    /*mm7=0-{-1}x8={1}x8.*/
+    "pxor %%mm7,%%mm7\n\t"
+    "pcmpeqb %%mm6,%%mm6\n\t"
+    "movq (%[src],%[ystride]),%%mm5\n\t"
+    "psubb %%mm6,%%mm7\n\t"
+    /*Three passes that pre-load the following rows, then the last pair.*/
+    OC_SAD2_LOOP
+    OC_SAD2_LOOP
+    OC_SAD2_LOOP
+    OC_SAD2_TAIL
+    /*OC_SAD2_LOOP advances %[src], %[ref1] and %[ref2], so they must be
+       read-write operands rather than plain inputs.*/
+    :[ret]"=&r"(ret),[src]"+r"(_src),[ref1]"+r"(_ref1),[ref2]"+r"(_ref2)
+    :[ystride]"r"((ptrdiff_t)_ystride)
+  );
+  return (unsigned)ret;
+}
+
+/*Computes the 8x8 residue _src-_ref as 16-bit values.
+  _residue: Receives the 64 differences, one row of 8 after another.
+  _src:     The first row of the source block.
+  _ref:     The first row of the reference block.
+  _ystride: The offset in bytes between rows of both _src and _ref.*/
+void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
+ const unsigned char *_src,const unsigned char *_ref,int _ystride){
+  int i;
+  /*mm7=0, used to zero-extend bytes to words in the loop below.*/
+  __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
+  /*Each pass handles two rows, so four passes cover the block.*/
+  for(i=4;i-->0;){
+    __asm__ __volatile__(
+      /*mm0=[src]*/
+      "movq (%[src]),%%mm0\n\t"
+      /*mm1=[ref]*/
+      "movq (%[ref]),%%mm1\n\t"
+      /*mm4=[src+ystride]*/
+      "movq (%[src],%[ystride]),%%mm4\n\t"
+      /*mm5=[ref+ystride]*/
+      "movq (%[ref],%[ystride]),%%mm5\n\t"
+      /*Compute [src]-[ref].*/
+      "movq %%mm0,%%mm2\n\t"
+      "punpcklbw %%mm7,%%mm0\n\t"
+      "movq %%mm1,%%mm3\n\t"
+      "punpckhbw %%mm7,%%mm2\n\t"
+      "punpcklbw %%mm7,%%mm1\n\t"
+      "punpckhbw %%mm7,%%mm3\n\t"
+      "psubw %%mm1,%%mm0\n\t"
+      "psubw %%mm3,%%mm2\n\t"
+      /*Compute [src+ystride]-[ref+ystride].*/
+      "movq %%mm4,%%mm1\n\t"
+      "punpcklbw %%mm7,%%mm4\n\t"
+      "movq %%mm5,%%mm3\n\t"
+      "punpckhbw %%mm7,%%mm1\n\t"
+      "lea (%[src],%[ystride],2),%[src]\n\t"
+      "punpcklbw %%mm7,%%mm5\n\t"
+      "lea (%[ref],%[ystride],2),%[ref]\n\t"
+      "punpckhbw %%mm7,%%mm3\n\t"
+      "psubw %%mm5,%%mm4\n\t"
+      "psubw %%mm3,%%mm1\n\t"
+      /*Write the answer out.*/
+      "movq %%mm0,0x00(%[residue])\n\t"
+      "movq %%mm2,0x08(%[residue])\n\t"
+      "movq %%mm4,0x10(%[residue])\n\t"
+      "movq %%mm1,0x18(%[residue])\n\t"
+      "lea 0x20(%[residue]),%[residue]\n\t"
+      /*All three pointers are advanced inside the asm and the new values
+         are needed by the next iteration, so they must be read-write
+         operands; as plain inputs the compiler may reload the originals.*/
+      :[residue]"+r"(_residue),[src]"+r"(_src),[ref]"+r"(_ref)
+      :[ystride]"r"((ptrdiff_t)_ystride)
+      :"memory"
+    );
+  }
+}
+
+/*Computes the 8x8 residue _src-128 as 16-bit values.
+  _residue: Receives the 64 differences, one row of 8 after another.
+  _src:     The first row of the source block.
+  _ystride: The offset in bytes between rows of _src.*/
+void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
+ const unsigned char *_src,int _ystride){
+  ptrdiff_t ystride3;
+  __asm__ __volatile__(
+    /*mm0=[src]*/
+    "movq (%[src]),%%mm0\n\t"
+    /*mm1=[src+ystride]*/
+    "movq (%[src],%[ystride]),%%mm1\n\t"
+    /*mm6={-1}x4*/
+    "pcmpeqw %%mm6,%%mm6\n\t"
+    /*mm2=[src+2*ystride]*/
+    "movq (%[src],%[ystride],2),%%mm2\n\t"
+    /*[ystride3]=3*[ystride]*/
+    "lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
+    /*mm6={0x8000}x4*/
+    "psllw $15,%%mm6\n\t"
+    /*mm3=[src+3*ystride]*/
+    "movq (%[src],%[ystride3]),%%mm3\n\t"
+    /*mm6={128}x4*/
+    "psrlw $8,%%mm6\n\t"
+    /*mm7=0*/
+    "pxor %%mm7,%%mm7\n\t"
+    /*[src]=[src]+4*[ystride]*/
+    "lea (%[src],%[ystride],4),%[src]\n\t"
+    /*Compute [src]-128 and [src+ystride]-128*/
+    "movq %%mm0,%%mm4\n\t"
+    "punpcklbw %%mm7,%%mm0\n\t"
+    "movq %%mm1,%%mm5\n\t"
+    "punpckhbw %%mm7,%%mm4\n\t"
+    "psubw %%mm6,%%mm0\n\t"
+    "punpcklbw %%mm7,%%mm1\n\t"
+    "psubw %%mm6,%%mm4\n\t"
+    "punpckhbw %%mm7,%%mm5\n\t"
+    "psubw %%mm6,%%mm1\n\t"
+    "psubw %%mm6,%%mm5\n\t"
+    /*Write the answer out.*/
+    "movq %%mm0,0x00(%[residue])\n\t"
+    "movq %%mm4,0x08(%[residue])\n\t"
+    "movq %%mm1,0x10(%[residue])\n\t"
+    "movq %%mm5,0x18(%[residue])\n\t"
+    /*mm0=[src+4*ystride]*/
+    "movq (%[src]),%%mm0\n\t"
+    /*mm1=[src+5*ystride]*/
+    "movq (%[src],%[ystride]),%%mm1\n\t"
+    /*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
+    "movq %%mm2,%%mm4\n\t"
+    "punpcklbw %%mm7,%%mm2\n\t"
+    "movq %%mm3,%%mm5\n\t"
+    "punpckhbw %%mm7,%%mm4\n\t"
+    "psubw %%mm6,%%mm2\n\t"
+    "punpcklbw %%mm7,%%mm3\n\t"
+    "psubw %%mm6,%%mm4\n\t"
+    "punpckhbw %%mm7,%%mm5\n\t"
+    "psubw %%mm6,%%mm3\n\t"
+    "psubw %%mm6,%%mm5\n\t"
+    /*Write the answer out.*/
+    "movq %%mm2,0x20(%[residue])\n\t"
+    "movq %%mm4,0x28(%[residue])\n\t"
+    "movq %%mm3,0x30(%[residue])\n\t"
+    "movq %%mm5,0x38(%[residue])\n\t"
+    /*mm2=[src+6*ystride]*/
+    "movq (%[src],%[ystride],2),%%mm2\n\t"
+    /*mm3=[src+7*ystride]*/
+    "movq (%[src],%[ystride3]),%%mm3\n\t"
+    /*Compute [src+4*ystride]-128 and [src+5*ystride]-128*/
+    "movq %%mm0,%%mm4\n\t"
+    "punpcklbw %%mm7,%%mm0\n\t"
+    "movq %%mm1,%%mm5\n\t"
+    "punpckhbw %%mm7,%%mm4\n\t"
+    "psubw %%mm6,%%mm0\n\t"
+    "punpcklbw %%mm7,%%mm1\n\t"
+    "psubw %%mm6,%%mm4\n\t"
+    "punpckhbw %%mm7,%%mm5\n\t"
+    "psubw %%mm6,%%mm1\n\t"
+    "psubw %%mm6,%%mm5\n\t"
+    /*Write the answer out.*/
+    "movq %%mm0,0x40(%[residue])\n\t"
+    "movq %%mm4,0x48(%[residue])\n\t"
+    "movq %%mm1,0x50(%[residue])\n\t"
+    "movq %%mm5,0x58(%[residue])\n\t"
+    /*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
+    "movq %%mm2,%%mm4\n\t"
+    "punpcklbw %%mm7,%%mm2\n\t"
+    "movq %%mm3,%%mm5\n\t"
+    "punpckhbw %%mm7,%%mm4\n\t"
+    "psubw %%mm6,%%mm2\n\t"
+    "punpcklbw %%mm7,%%mm3\n\t"
+    "psubw %%mm6,%%mm4\n\t"
+    "punpckhbw %%mm7,%%mm5\n\t"
+    "psubw %%mm6,%%mm3\n\t"
+    "psubw %%mm6,%%mm5\n\t"
+    /*Write the answer out.*/
+    "movq %%mm2,0x60(%[residue])\n\t"
+    "movq %%mm4,0x68(%[residue])\n\t"
+    "movq %%mm3,0x70(%[residue])\n\t"
+    "movq %%mm5,0x78(%[residue])\n\t"
+    /*%[src] is advanced by the lea above, so it must be a read-write
+       operand rather than a plain input.*/
+    :[src]"+r"(_src),[ystride3]"=&r"(ystride3)
+    :[residue]"r"(_residue),[ystride]"r"((ptrdiff_t)_ystride)
+    :"memory"
+  );
+}
+
+/*Writes the truncating average of two 8x8 blocks, (_src1[]+_src2[]>>1), to
+   _dst.
+  pavgb rounds up, so each result is corrected by subtracting the low bit of
+   the xor of the two inputs.
+  _dst:     The first row of the destination block.
+  _src1:    The first row of the first source block.
+  _src2:    The first row of the second source block.
+  _ystride: The offset in bytes between rows; the same value is used for
+             _dst, _src1 and _src2.*/
+void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride){
+  ptrdiff_t ystride3;
+  __asm__ __volatile__(
+    /*Load the first 3 rows.*/
+    "movq (%[src1]),%%mm0\n\t"
+    "movq (%[src2]),%%mm1\n\t"
+    "movq (%[src1],%[ystride]),%%mm2\n\t"
+    "movq (%[src2],%[ystride]),%%mm3\n\t"
+    "pxor %%mm7,%%mm7\n\t"
+    "movq (%[src1],%[ystride],2),%%mm4\n\t"
+    "pcmpeqb %%mm6,%%mm6\n\t"
+    "movq (%[src2],%[ystride],2),%%mm5\n\t"
+    /*mm7={1}x8.*/
+    "psubb %%mm6,%%mm7\n\t"
+    /*ystride3=ystride*3.*/
+    "lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
+    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
+    "movq %%mm0,%%mm6\n\t"
+    "pxor %%mm1,%%mm0\n\t"
+    "pavgb %%mm1,%%mm6\n\t"
+    /*%%mm1 is free, start averaging %%mm3 into %%mm2 using %%mm1.*/
+    "movq %%mm2,%%mm1\n\t"
+    "pand %%mm7,%%mm0\n\t"
+    "pavgb %%mm3,%%mm2\n\t"
+    "pxor %%mm3,%%mm1\n\t"
+    /*%%mm3 is free.*/
+    "psubb %%mm0,%%mm6\n\t"
+    /*%%mm0 is free, start loading the next row.*/
+    "movq (%[src1],%[ystride3]),%%mm0\n\t"
+    /*Start averaging %%mm5 and %%mm4 using %%mm3.*/
+    "movq %%mm4,%%mm3\n\t"
+    /*%%mm6 (row 0) is done; write it out.*/
+    "movq %%mm6,(%[dst])\n\t"
+    "pand %%mm7,%%mm1\n\t"
+    "pavgb %%mm5,%%mm4\n\t"
+    "psubb %%mm1,%%mm2\n\t"
+    /*%%mm1 is free, continue loading the next row.*/
+    "movq (%[src2],%[ystride3]),%%mm1\n\t"
+    "pxor %%mm5,%%mm3\n\t"
+    /*Advance %[src1]*/
+    "lea (%[src1],%[ystride],4),%[src1]\n\t"
+    /*%%mm2 (row 1) is done; write it out.*/
+    "movq %%mm2,(%[dst],%[ystride])\n\t"
+    "pand %%mm7,%%mm3\n\t"
+    /*Start loading the next row.*/
+    "movq (%[src1]),%%mm2\n\t"
+    "psubb %%mm3,%%mm4\n\t"
+    /*Advance %[src2]*/
+    "lea (%[src2],%[ystride],4),%[src2]\n\t"
+    /*%%mm4 (row 2) is done; write it out.*/
+    "movq %%mm4,(%[dst],%[ystride],2)\n\t"
+    /*Continue loading the next row.*/
+    "movq (%[src2]),%%mm3\n\t"
+    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
+    "movq %%mm0,%%mm6\n\t"
+    "pxor %%mm1,%%mm0\n\t"
+    /*Start loading the next row.*/
+    "movq (%[src1],%[ystride]),%%mm4\n\t"
+    "pavgb %%mm1,%%mm6\n\t"
+    /*%%mm1 is free; start averaging %%mm3 into %%mm2 using %%mm1.*/
+    "movq %%mm2,%%mm1\n\t"
+    "pand %%mm7,%%mm0\n\t"
+    /*Continue loading the next row.*/
+    "movq (%[src2],%[ystride]),%%mm5\n\t"
+    "pavgb %%mm3,%%mm2\n\t"
+    "pxor %%mm3,%%mm1\n\t"
+    /*%%mm3 is free.*/
+    "psubb %%mm0,%%mm6\n\t"
+    /*%%mm0 is free, start loading the next row.*/
+    "movq (%[src1],%[ystride],2),%%mm0\n\t"
+    /*Start averaging %%mm5 into %%mm4 using %%mm3.*/
+    "movq %%mm4,%%mm3\n\t"
+    /*%%mm6 (row 3) is done; write it out.*/
+    "movq %%mm6,(%[dst],%[ystride3])\n\t"
+    "pand %%mm7,%%mm1\n\t"
+    "pavgb %%mm5,%%mm4\n\t"
+    /*Advance %[dst]*/
+    "lea (%[dst],%[ystride],4),%[dst]\n\t"
+    "psubb %%mm1,%%mm2\n\t"
+    /*%%mm1 is free; continue loading the next row.*/
+    "movq (%[src2],%[ystride],2),%%mm1\n\t"
+    "pxor %%mm5,%%mm3\n\t"
+    /*%%mm2 (row 4) is done; write it out.*/
+    "movq %%mm2,(%[dst])\n\t"
+    "pand %%mm7,%%mm3\n\t"
+    /*Start loading the next row.*/
+    "movq (%[src1],%[ystride3]),%%mm2\n\t"
+    "psubb %%mm3,%%mm4\n\t"
+    /*Start averaging %%mm0 and %%mm1 into %%mm6.*/
+    "movq %%mm0,%%mm6\n\t"
+    /*Continue loading the next row.*/
+    "movq (%[src2],%[ystride3]),%%mm3\n\t"
+    /*%%mm4 (row 5) is done; write it out.*/
+    "movq %%mm4,(%[dst],%[ystride])\n\t"
+    "pxor %%mm1,%%mm0\n\t"
+    "pavgb %%mm1,%%mm6\n\t"
+    /*%%mm4 is free; start averaging %%mm3 into %%mm2 using %%mm4.*/
+    "movq %%mm2,%%mm4\n\t"
+    "pand %%mm7,%%mm0\n\t"
+    "pavgb %%mm3,%%mm2\n\t"
+    "pxor %%mm3,%%mm4\n\t"
+    "psubb %%mm0,%%mm6\n\t"
+    "pand %%mm7,%%mm4\n\t"
+    /*%%mm6 (row 6) is done, write it out.*/
+    "movq %%mm6,(%[dst],%[ystride],2)\n\t"
+    "psubb %%mm4,%%mm2\n\t"
+    /*%%mm2 (row 7) is done, write it out.*/
+    "movq %%mm2,(%[dst],%[ystride3])\n\t"
+    /*%[dst], %[src1] and %[src2] are all advanced by lea instructions above,
+       so they must be read-write operands rather than plain inputs.*/
+    :[dst]"+r"(_dst),[src1]"+r"(_src1),[src2]"+r"(_src2),
+     [ystride3]"=&r"(ystride3)
+    :[ystride]"r"((ptrdiff_t)_ystride)
+    :"memory"
+  );
+}
+
+#endif

Copied: branches/theora-thusnelda/lib/enc/x86/mmxfdct.c (from rev 15940, branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c)
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/mmxfdct.c	                        (rev 0)
+++ branches/theora-thusnelda/lib/enc/x86/mmxfdct.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -0,0 +1,660 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************/
+/*MMX fDCT implementation for x86_32*/
+/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+/*Performs the first butterfly stage of the fDCT on four 16-bit lanes.
+  On input, mm0...mm7 hold t0...t7.
+  On output, mm0=t7', mm1=t6', mm2=t5', mm3=t4', mm4=t3', mm5=t2', mm6=t1',
+   mm7=t0'.
+  Each difference is formed first; the subtrahend is then doubled and the
+   difference added back to it, giving the sum without a temporary.*/
+# define OC_FDCT_STAGE1_8x4 \
+ "#OC_FDCT_STAGE1_8x4\n\t" \
+ /*Stage 1:*/ \
+ /*mm0=t7'=t0-t7*/ \
+ "psubw %%mm7,%%mm0\n\t" \
+ "paddw %%mm7,%%mm7\n\t" \
+ /*mm1=t6'=t1-t6*/ \
+ "psubw %%mm6,%%mm1\n\t" \
+ "paddw %%mm6,%%mm6\n\t" \
+ /*mm2=t5'=t2-t5*/ \
+ "psubw %%mm5,%%mm2\n\t" \
+ "paddw %%mm5,%%mm5\n\t" \
+ /*mm3=t4'=t3-t4*/ \
+ "psubw %%mm4,%%mm3\n\t" \
+ "paddw %%mm4,%%mm4\n\t" \
+ /*mm7=t0'=t0+t7*/ \
+ "paddw %%mm0,%%mm7\n\t" \
+ /*mm6=t1'=t1+t6*/ \
+ "paddw %%mm1,%%mm6\n\t" \
+ /*mm5=t2'=t2+t5*/ \
+ "paddw %%mm2,%%mm5\n\t" \
+ /*mm4=t3'=t3+t4*/ \
+ "paddw %%mm3,%%mm4\n\t" \
+
+/*Performs stages 2 through 4 of the fDCT on four 16-bit lanes in parallel.
+  _r0..._r7 are string literals giving byte offsets from %[y]; those eight
+   quadwords are used as scratch space while the transform runs.
+  On input, the stage 1 outputs are expected in registers: mm0=t7', mm1=t6',
+   mm2=t5', mm3=t4', mm4=t3', mm5=t2', mm6=t1', mm7=t0'.
+  On output, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7],
+   while _y[1] and _y[3] are left in memory at _r1(%[y]) and _r3(%[y]).
+  %[a] is clobbered: it is used to build the multiplier constants.*/
+# define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \
+ "#OC_FDCT8x4\n\t" \
+ /*Stage 2:*/ \
+ /*mm7=t3''=t0'-t3'*/ \
+ "psubw %%mm4,%%mm7\n\t" \
+ "paddw %%mm4,%%mm4\n\t" \
+ /*mm6=t2''=t1'-t2'*/ \
+ "psubw %%mm5,%%mm6\n\t" \
+ "movq %%mm7,"_r6"(%[y])\n\t" \
+ "paddw %%mm5,%%mm5\n\t" \
+ /*mm1=t5''=t6'-t5'*/ \
+ "psubw %%mm2,%%mm1\n\t" \
+ "movq %%mm6,"_r2"(%[y])\n\t" \
+ /*mm4=t0''=t0'+t3'*/ \
+ "paddw %%mm7,%%mm4\n\t" \
+ "paddw %%mm2,%%mm2\n\t" \
+ /*mm5=t1''=t1'+t2'*/ \
+ "movq %%mm4,"_r0"(%[y])\n\t" \
+ "paddw %%mm6,%%mm5\n\t" \
+ /*mm2=t6''=t6'+t5'*/ \
+ "paddw %%mm1,%%mm2\n\t" \
+ "movq %%mm5,"_r4"(%[y])\n\t" \
+ /*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \
+ /*mm4, mm5, mm6, mm7 are free.*/ \
+ /*Stage 3:*/ \
+ /*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \
+ "mov $0x5A806A0A,%[a]\n\t" \
+ "pcmpeqb %%mm6,%%mm6\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "psrlw $15,%%mm6\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "paddw %%mm6,%%mm6\n\t" \
+ /*mm0=0, mm2={-1}x4 \
+   mm5:mm4=t5''*27146+0xB500*/ \
+ "movq %%mm1,%%mm4\n\t" \
+ "movq %%mm1,%%mm5\n\t" \
+ "punpcklwd %%mm6,%%mm4\n\t" \
+ "movq %%mm2,"_r3"(%[y])\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "movq %%mm0,"_r7"(%[y])\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" \
+ "pxor %%mm0,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "pcmpeqb %%mm2,%%mm2\n\t" \
+ /*mm2=t6'', mm1=t5''+(t5''!=0) \
+   mm4=(t5''*27146+0xB500>>16)*/ \
+ "pcmpeqw %%mm1,%%mm0\n\t" \
+ "psrad $16,%%mm4\n\t" \
+ "psubw %%mm2,%%mm0\n\t" \
+ "movq "_r3"(%[y]),%%mm2\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "paddw %%mm0,%%mm1\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ /*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
+ "paddw %%mm1,%%mm4\n\t" \
+ "movq "_r7"(%[y]),%%mm0\n\t" \
+ "psraw $1,%%mm4\n\t" \
+ "movq %%mm3,%%mm1\n\t" \
+ /*mm3=t4''=t4'+s*/ \
+ "paddw %%mm4,%%mm3\n\t" \
+ /*mm1=t5'''=t4'-s*/ \
+ "psubw %%mm4,%%mm1\n\t" \
+ /*mm1=0, mm3={-1}x4 \
+   mm5:mm4=t6''*27146+0xB500*/ \
+ "movq %%mm2,%%mm4\n\t" \
+ "movq %%mm2,%%mm5\n\t" \
+ "punpcklwd %%mm6,%%mm4\n\t" \
+ "movq %%mm1,"_r5"(%[y])\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "movq %%mm3,"_r1"(%[y])\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" \
+ "pxor %%mm1,%%mm1\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "pcmpeqb %%mm3,%%mm3\n\t" \
+ /*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \
+ "psrad $16,%%mm4\n\t" \
+ "pcmpeqw %%mm2,%%mm1\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "psubw %%mm3,%%mm1\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "paddw %%mm1,%%mm2\n\t" \
+ /*mm1=t1'' \
+   mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
+ "paddw %%mm2,%%mm4\n\t" \
+ "movq "_r4"(%[y]),%%mm1\n\t" \
+ "psraw $1,%%mm4\n\t" \
+ "movq %%mm0,%%mm2\n\t" \
+ /*mm7={54491-0x7FFF,0x7FFF}x2 \
+   mm0=t7''=t7'+s*/ \
+ "paddw %%mm4,%%mm0\n\t" \
+ /*mm2=t6'''=t7'-s*/ \
+ "psubw %%mm4,%%mm2\n\t" \
+ /*Stage 4:*/ \
+ /*mm0=0, mm2=t0'' \
+   mm5:mm4=t1''*27146+0xB500*/ \
+ "movq %%mm1,%%mm4\n\t" \
+ "movq %%mm1,%%mm5\n\t" \
+ "punpcklwd %%mm6,%%mm4\n\t" \
+ "movq %%mm2,"_r3"(%[y])\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "movq "_r0"(%[y]),%%mm2\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" \
+ "movq %%mm0,"_r7"(%[y])\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "pxor %%mm0,%%mm0\n\t" \
+ /*mm7={27146,0x4000>>1}x2 \
+   mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
+ "psrad $16,%%mm4\n\t" \
+ "mov $0x20006A0A,%[a]\n\t" \
+ "pcmpeqw %%mm1,%%mm0\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "psubw %%mm3,%%mm0\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "paddw %%mm1,%%mm0\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "paddw %%mm4,%%mm0\n\t" \
+ /*mm6={0x00000E3D}x2 \
+   mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \
+ "movq %%mm2,%%mm4\n\t" \
+ "movq %%mm2,%%mm5\n\t" \
+ "punpcklwd %%mm6,%%mm4\n\t" \
+ "mov $0x0E3D,%[a]\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" \
+ "movd %[a],%%mm6\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "pxor %%mm1,%%mm1\n\t" \
+ "punpckldq %%mm6,%%mm6\n\t" \
+ "pcmpeqw %%mm2,%%mm1\n\t" \
+ /*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
+ "psrad $16,%%mm4\n\t" \
+ "psubw %%mm3,%%mm1\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "paddw %%mm1,%%mm2\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "movq "_r5"(%[y]),%%mm1\n\t" \
+ "paddw %%mm2,%%mm4\n\t" \
+ /*mm2=t6'', mm0=_y[0]=u=r+s>>1*/ \
+ "mov $0x7FFF54DC,%[a]\n\t" \
+ "paddw %%mm4,%%mm0\n\t" \
+ "movq "_r3"(%[y]),%%mm2\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "psraw $1,%%mm0\n\t" \
+ /*mm7={54491-0x7FFF,0x7FFF}x2 \
+   mm4=_y[4]=v=r-u*/ \
+ "psubw %%mm0,%%mm4\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "movq %%mm4,"_r4"(%[y])\n\t" \
+ /*mm0=0, mm7={36410}x4 \
+   mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \
+ "movq %%mm1,%%mm4\n\t" \
+ "movq %%mm1,%%mm5\n\t" \
+ "punpcklwd %%mm1,%%mm4\n\t" \
+ "mov $0x8E3A8E3A,%[a]\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "movq %%mm0,"_r0"(%[y])\n\t" \
+ "punpckhwd %%mm1,%%mm5\n\t" \
+ "pxor %%mm0,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "pcmpeqw %%mm0,%%mm1\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "psubw %%mm3,%%mm1\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "paddd %%mm6,%%mm4\n\t" \
+ "paddd %%mm6,%%mm5\n\t" \
+ /*mm0=0 \
+   mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \
+ "movq %%mm2,%%mm6\n\t" \
+ "movq %%mm2,%%mm3\n\t" \
+ "pmulhw %%mm7,%%mm6\n\t" \
+ "paddw %%mm2,%%mm1\n\t" \
+ "pmullw %%mm7,%%mm3\n\t" \
+ "pxor %%mm0,%%mm0\n\t" \
+ "paddw %%mm1,%%mm6\n\t" \
+ "movq %%mm3,%%mm1\n\t" \
+ "punpckhwd %%mm6,%%mm3\n\t" \
+ "punpcklwd %%mm6,%%mm1\n\t" \
+ /*mm3={-1}x4, mm6={1}x4 \
+   mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
+ "paddd %%mm3,%%mm5\n\t" \
+ "paddd %%mm1,%%mm4\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "pxor %%mm6,%%mm6\n\t" \
+ "psrad $16,%%mm4\n\t" \
+ "pcmpeqb %%mm3,%%mm3\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "psubw %%mm3,%%mm6\n\t" \
+ /*mm1=t7'', mm7={26568,0x3400}x2 \
+   mm2=s=t6'''-(36410*u>>16)*/ \
+ "movq %%mm4,%%mm1\n\t" \
+ "mov $0x340067C8,%[a]\n\t" \
+ "pmulhw %%mm7,%%mm4\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "movq %%mm1,"_r5"(%[y])\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "paddw %%mm1,%%mm4\n\t" \
+ "movq "_r7"(%[y]),%%mm1\n\t" \
+ "psubw %%mm4,%%mm2\n\t" \
+ /*mm6={0x00007B1B}x2 \
+   mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \
+ "movq %%mm2,%%mm4\n\t" \
+ "movq %%mm2,%%mm5\n\t" \
+ "punpcklwd %%mm6,%%mm4\n\t" \
+ "pcmpeqw %%mm2,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "mov $0x7B1B,%[a]\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" \
+ "movd %[a],%%mm6\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "psubw %%mm3,%%mm0\n\t" \
+ "punpckldq %%mm6,%%mm6\n\t" \
+ /*mm7={64277-0x7FFF,0x7FFF}x2 \
+   mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
+ "psrad $17,%%mm4\n\t" \
+ "paddw %%mm0,%%mm2\n\t" \
+ "psrad $17,%%mm5\n\t" \
+ "mov $0x7FFF7B16,%[a]\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "paddw %%mm4,%%mm2\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ /*mm0=0, mm7={12785}x4 \
+   mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \
+ "movq %%mm1,%%mm4\n\t" \
+ "movq %%mm1,%%mm5\n\t" \
+ "movq %%mm2,"_r3"(%[y])\n\t" \
+ "punpcklwd %%mm1,%%mm4\n\t" \
+ "movq "_r1"(%[y]),%%mm2\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "mov $0x31F131F1,%[a]\n\t" \
+ "punpckhwd %%mm1,%%mm5\n\t" \
+ "pxor %%mm0,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "pcmpeqw %%mm0,%%mm1\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "psubw %%mm3,%%mm1\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "paddd %%mm6,%%mm4\n\t" \
+ "paddd %%mm6,%%mm5\n\t" \
+ /*mm3:mm1=12785*t4'''+((t7''!=0)<<16)*/ \
+ "movq %%mm2,%%mm6\n\t" \
+ "movq %%mm2,%%mm3\n\t" \
+ "pmulhw %%mm7,%%mm6\n\t" \
+ "pmullw %%mm7,%%mm3\n\t" \
+ "paddw %%mm1,%%mm6\n\t" \
+ "movq %%mm3,%%mm1\n\t" \
+ "punpckhwd %%mm6,%%mm3\n\t" \
+ "punpcklwd %%mm6,%%mm1\n\t" \
+ /*mm3={-1}x4, mm6={1}x4 \
+   mm4=_y[1]=u=(12785*t4'''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
+ "paddd %%mm3,%%mm5\n\t" \
+ "paddd %%mm1,%%mm4\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "pxor %%mm6,%%mm6\n\t" \
+ "psrad $16,%%mm4\n\t" \
+ "pcmpeqb %%mm3,%%mm3\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "psubw %%mm3,%%mm6\n\t" \
+ /*mm1=t3'', mm7={20539,0x3000}x2 \
+   mm4=s=(12785*u>>16)-t4''*/ \
+ "movq %%mm4,"_r1"(%[y])\n\t" \
+ "pmulhw %%mm7,%%mm4\n\t" \
+ "mov $0x3000503B,%[a]\n\t" \
+ "movq "_r6"(%[y]),%%mm1\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "psubw %%mm2,%%mm4\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ /*mm6={0x00006CB7}x2 \
+   mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \
+ "movq %%mm4,%%mm5\n\t" \
+ "movq %%mm4,%%mm2\n\t" \
+ "punpcklwd %%mm6,%%mm4\n\t" \
+ "pcmpeqw %%mm2,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "mov $0x6CB7,%[a]\n\t" \
+ "punpckhwd %%mm6,%%mm5\n\t" \
+ "movd %[a],%%mm6\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "psubw %%mm3,%%mm0\n\t" \
+ "punpckldq %%mm6,%%mm6\n\t" \
+ /*mm7={60547-0x7FFF,0x7FFF}x2 \
+   mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
+ "psrad $20,%%mm4\n\t" \
+ "paddw %%mm0,%%mm2\n\t" \
+ "psrad $20,%%mm5\n\t" \
+ "mov $0x7FFF6C84,%[a]\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "paddw %%mm4,%%mm2\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ /*mm0=0, mm7={25080}x4 \
+   mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \
+ "movq %%mm1,%%mm4\n\t" \
+ "movq %%mm1,%%mm5\n\t" \
+ "movq %%mm2,"_r7"(%[y])\n\t" \
+ "punpcklwd %%mm1,%%mm4\n\t" \
+ "movq "_r2"(%[y]),%%mm2\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "mov $0x61F861F8,%[a]\n\t" \
+ "punpckhwd %%mm1,%%mm5\n\t" \
+ "pxor %%mm0,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm5\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "pcmpeqw %%mm0,%%mm1\n\t" \
+ "psubw %%mm3,%%mm1\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "paddd %%mm6,%%mm4\n\t" \
+ "paddd %%mm6,%%mm5\n\t" \
+ /*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \
+ "movq %%mm2,%%mm6\n\t" \
+ "movq %%mm2,%%mm3\n\t" \
+ "pmulhw %%mm7,%%mm6\n\t" \
+ "pmullw %%mm7,%%mm3\n\t" \
+ "paddw %%mm1,%%mm6\n\t" \
+ "movq %%mm3,%%mm1\n\t" \
+ "punpckhwd %%mm6,%%mm3\n\t" \
+ "punpcklwd %%mm6,%%mm1\n\t" \
+ /*mm1={-1}x4 \
+   mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
+ "paddd %%mm3,%%mm5\n\t" \
+ "paddd %%mm1,%%mm4\n\t" \
+ "psrad $16,%%mm5\n\t" \
+ "mov $0x28005460,%[a]\n\t" \
+ "psrad $16,%%mm4\n\t" \
+ "pcmpeqb %%mm1,%%mm1\n\t" \
+ "packssdw %%mm5,%%mm4\n\t" \
+ /*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \
+   mm4=s=(25080*u>>16)-t2''*/ \
+ "movq %%mm4,%%mm6\n\t" \
+ "pmulhw %%mm7,%%mm4\n\t" \
+ "pxor %%mm5,%%mm5\n\t" \
+ "movd %[a],%%mm7\n\t" \
+ "psubw %%mm1,%%mm5\n\t" \
+ "punpckldq %%mm7,%%mm7\n\t" \
+ "psubw %%mm2,%%mm4\n\t" \
+ /*mm2=s+(s!=0) \
+   mm4:mm3=s*21600+0x2800*/ \
+ "movq %%mm4,%%mm3\n\t" \
+ "movq %%mm4,%%mm2\n\t" \
+ "punpckhwd %%mm5,%%mm4\n\t" \
+ "pcmpeqw %%mm2,%%mm0\n\t" \
+ "pmaddwd %%mm7,%%mm4\n\t" \
+ "psubw %%mm1,%%mm0\n\t" \
+ "punpcklwd %%mm5,%%mm3\n\t" \
+ "paddw %%mm0,%%mm2\n\t" \
+ "pmaddwd %%mm7,%%mm3\n\t" \
+ /*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \
+   mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
+ "movq "_r4"(%[y]),%%mm0\n\t" \
+ "psrad $18,%%mm4\n\t" \
+ "movq "_r5"(%[y]),%%mm5\n\t" \
+ "psrad $18,%%mm3\n\t" \
+ "movq "_r7"(%[y]),%%mm1\n\t" \
+ "packssdw %%mm4,%%mm3\n\t" \
+ "movq "_r0"(%[y]),%%mm4\n\t" \
+ "paddw %%mm2,%%mm3\n\t" \
+
+/*Transposes an 8x4 block of 16-bit values as two independent 4x4 transposes.
+  On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7];
+   _y[1] and _y[3] are loaded from _r1(%[y]) and _r3(%[y]).
+  On output, {_y[4],mm1,mm2,mm3} contains the transpose of _y[4...7] and
+   {mm4,mm5,mm6,mm7} contains the transpose of _y[0...3].
+  Here _y[4] is the quadword at _r4(%[y]), the only store this macro makes.*/
+# define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \
+ "#OC_TRANSPOSE8x4\n\t" \
+ /*First 4x4 transpose:*/ \
+ /*mm0 = e3 e2 e1 e0 \
+   mm5 = f3 f2 f1 f0 \
+   mm3 = g3 g2 g1 g0 \
+   mm1 = h3 h2 h1 h0*/ \
+ "movq %%mm0,%%mm2\n\t" \
+ "punpcklwd %%mm5,%%mm0\n\t" \
+ "punpckhwd %%mm5,%%mm2\n\t" \
+ "movq %%mm3,%%mm5\n\t" \
+ "punpcklwd %%mm1,%%mm3\n\t" \
+ "punpckhwd %%mm1,%%mm5\n\t" \
+ /*mm0 = f1 e1 f0 e0 \
+   mm2 = f3 e3 f2 e2 \
+   mm3 = h1 g1 h0 g0 \
+   mm5 = h3 g3 h2 g2*/ \
+ "movq %%mm0,%%mm1\n\t" \
+ "punpckldq %%mm3,%%mm0\n\t" \
+ "movq %%mm0,"_r4"(%[y])\n\t" \
+ "punpckhdq %%mm3,%%mm1\n\t" \
+ "movq "_r1"(%[y]),%%mm0\n\t" \
+ "movq %%mm2,%%mm3\n\t" \
+ "punpckldq %%mm5,%%mm2\n\t" \
+ "punpckhdq %%mm5,%%mm3\n\t" \
+ "movq "_r3"(%[y]),%%mm5\n\t" \
+ /*_y[4] = h0 g0 f0 e0 \
+    mm1  = h1 g1 f1 e1 \
+    mm2  = h2 g2 f2 e2 \
+    mm3  = h3 g3 f3 e3*/ \
+ /*Second 4x4 transpose:*/ \
+ /*mm4 = a3 a2 a1 a0 \
+   mm0 = b3 b2 b1 b0 \
+   mm6 = c3 c2 c1 c0 \
+   mm5 = d3 d2 d1 d0*/ \
+ "movq %%mm4,%%mm7\n\t" \
+ "punpcklwd %%mm0,%%mm4\n\t" \
+ "punpckhwd %%mm0,%%mm7\n\t" \
+ "movq %%mm6,%%mm0\n\t" \
+ "punpcklwd %%mm5,%%mm6\n\t" \
+ "punpckhwd %%mm5,%%mm0\n\t" \
+ /*mm4 = b1 a1 b0 a0 \
+   mm7 = b3 a3 b2 a2 \
+   mm6 = d1 c1 d0 c0 \
+   mm0 = d3 c3 d2 c2*/ \
+ "movq %%mm4,%%mm5\n\t" \
+ "punpckldq %%mm6,%%mm4\n\t" \
+ "punpckhdq %%mm6,%%mm5\n\t" \
+ "movq %%mm7,%%mm6\n\t" \
+ "punpckhdq %%mm0,%%mm7\n\t" \
+ "punpckldq %%mm0,%%mm6\n\t" \
+ /*mm4 = d0 c0 b0 a0 \
+   mm5 = d1 c1 b1 a1 \
+   mm6 = d2 c2 b2 a2 \
+   mm7 = d3 c3 b3 a3*/ \
+
+/*MMX implementation of the fDCT.*/
+void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  ptrdiff_t a;
+  __asm__ __volatile__(
+    /*Add two extra bits of working precision to improve accuracy; any more and
+       we could overflow.*/
+    /*We also add biases to correct for some systematic error that remains in
+       the full fDCT->iDCT round trip.*/
+    "movq 0x00(%[x]),%%mm0\n\t"
+    "movq 0x10(%[x]),%%mm1\n\t"
+    "movq 0x20(%[x]),%%mm2\n\t"
+    "movq 0x30(%[x]),%%mm3\n\t"
+    "pcmpeqb %%mm4,%%mm4\n\t"
+    "pxor %%mm7,%%mm7\n\t"
+    "movq %%mm0,%%mm5\n\t"
+    "psllw $2,%%mm0\n\t"
+    "pcmpeqw %%mm7,%%mm5\n\t"
+    "movq 0x70(%[x]),%%mm7\n\t"
+    "psllw $2,%%mm1\n\t"
+    "psubw %%mm4,%%mm5\n\t"
+    "psllw $2,%%mm2\n\t"
+    "mov $1,%[a]\n\t"
+    "pslld $16,%%mm5\n\t"
+    "movd %[a],%%mm6\n\t"
+    "psllq $16,%%mm5\n\t"
+    "mov $0x10001,%[a]\n\t"
+    "psllw $2,%%mm3\n\t"
+    "movd %[a],%%mm4\n\t"
+    "punpckhwd %%mm6,%%mm5\n\t"
+    "psubw %%mm6,%%mm1\n\t"
+    "movq 0x60(%[x]),%%mm6\n\t"
+    "paddw %%mm5,%%mm0\n\t"
+    "movq 0x50(%[x]),%%mm5\n\t"
+    "paddw %%mm4,%%mm0\n\t"
+    "movq 0x40(%[x]),%%mm4\n\t"
+    /*We inline stage1 of the transform here so we can get better instruction
+       scheduling with the shifts.*/
+    /*mm0=t7'=t0-t7*/
+    "psllw $2,%%mm7\n\t"
+    "psubw %%mm7,%%mm0\n\t"
+    "psllw $2,%%mm6\n\t"
+    "paddw %%mm7,%%mm7\n\t"
+    /*mm1=t6'=t1-t6*/
+    "psllw $2,%%mm5\n\t"
+    "psubw %%mm6,%%mm1\n\t"
+    "psllw $2,%%mm4\n\t"
+    "paddw %%mm6,%%mm6\n\t"
+    /*mm2=t5'=t2-t5*/
+    "psubw %%mm5,%%mm2\n\t"
+    "paddw %%mm5,%%mm5\n\t"
+    /*mm3=t4'=t3-t4*/
+    "psubw %%mm4,%%mm3\n\t"
+    "paddw %%mm4,%%mm4\n\t"
+    /*mm7=t0'=t0+t7*/
+    "paddw %%mm0,%%mm7\n\t"
+    /*mm6=t1'=t1+t6*/
+    "paddw %%mm1,%%mm6\n\t"
+    /*mm5=t2'=t2+t5*/
+    "paddw %%mm2,%%mm5\n\t"
+    /*mm4=t3'=t3+t4*/
+    "paddw %%mm3,%%mm4\n\t"
+    OC_FDCT8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")
+    OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")
+    /*Swap out this 8x4 block for the next one.*/
+    "movq 0x08(%[x]),%%mm0\n\t"
+    "movq %%mm7,0x30(%[y])\n\t"
+    "movq 0x78(%[x]),%%mm7\n\t"
+    "movq %%mm1,0x50(%[y])\n\t"
+    "movq 0x18(%[x]),%%mm1\n\t"
+    "movq %%mm6,0x20(%[y])\n\t"
+    "movq 0x68(%[x]),%%mm6\n\t"
+    "movq %%mm2,0x60(%[y])\n\t"
+    "movq 0x28(%[x]),%%mm2\n\t"
+    "movq %%mm5,0x10(%[y])\n\t"
+    "movq 0x58(%[x]),%%mm5\n\t"
+    "movq %%mm3,0x70(%[y])\n\t"
+    "movq 0x38(%[x]),%%mm3\n\t"
+    /*And increase its working precision, too.*/
+    "psllw $2,%%mm0\n\t"
+    "movq %%mm4,0x00(%[y])\n\t"
+    "psllw $2,%%mm7\n\t"
+    "movq 0x48(%[x]),%%mm4\n\t"
+    /*We inline stage1 of the transform here so we can get better instruction
+       scheduling with the shifts.*/
+    /*mm0=t7'=t0-t7*/
+    "psubw %%mm7,%%mm0\n\t"
+    "psllw $2,%%mm1\n\t"
+    "paddw %%mm7,%%mm7\n\t"
+    "psllw $2,%%mm6\n\t"
+    /*mm1=t6'=t1-t6*/
+    "psubw %%mm6,%%mm1\n\t"
+    "psllw $2,%%mm2\n\t"
+    "paddw %%mm6,%%mm6\n\t"
+    "psllw $2,%%mm5\n\t"
+    /*mm2=t5'=t2-t5*/
+    "psubw %%mm5,%%mm2\n\t"
+    "psllw $2,%%mm3\n\t"
+    "paddw %%mm5,%%mm5\n\t"
+    "psllw $2,%%mm4\n\t"
+    /*mm3=t4'=t3-t4*/
+    "psubw %%mm4,%%mm3\n\t"
+    "paddw %%mm4,%%mm4\n\t"
+    /*mm7=t0'=t0+t7*/
+    "paddw %%mm0,%%mm7\n\t"
+    /*mm6=t1'=t1+t6*/
+    "paddw %%mm1,%%mm6\n\t"
+    /*mm5=t2'=t2+t5*/
+    "paddw %%mm2,%%mm5\n\t"
+    /*mm4=t3'=t3+t4*/
+    "paddw %%mm3,%%mm4\n\t"
+    OC_FDCT8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")
+    OC_TRANSPOSE8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")
+    /*Here the first 4x4 block of output from the last transpose is the second
+       4x4 block of input for the next transform.
+      We have cleverly arranged that it already be in the appropriate place,
+       so we only have to do half the stores and loads.*/
+    "movq 0x00(%[y]),%%mm0\n\t"
+    "movq %%mm1,0x58(%[y])\n\t"
+    "movq 0x10(%[y]),%%mm1\n\t"
+    "movq %%mm2,0x68(%[y])\n\t"
+    "movq 0x20(%[y]),%%mm2\n\t"
+    "movq %%mm3,0x78(%[y])\n\t"
+    "movq 0x30(%[y]),%%mm3\n\t"
+    OC_FDCT_STAGE1_8x4
+    OC_FDCT8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
+    OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
+    /*mm0={-2}x4*/
+    "pcmpeqw %%mm0,%%mm0\n\t"
+    "paddw %%mm0,%%mm0\n\t"
+    /*Round the results.*/
+    "psubw %%mm0,%%mm1\n\t"
+    "psubw %%mm0,%%mm2\n\t"
+    "psraw $2,%%mm1\n\t"
+    "psubw %%mm0,%%mm3\n\t"
+    "movq %%mm1,0x18(%[y])\n\t"
+    "psraw $2,%%mm2\n\t"
+    "psubw %%mm0,%%mm4\n\t"
+    "movq 0x08(%[y]),%%mm1\n\t"
+    "psraw $2,%%mm3\n\t"
+    "psubw %%mm0,%%mm5\n\t"
+    "psraw $2,%%mm4\n\t"
+    "psubw %%mm0,%%mm6\n\t"
+    "psraw $2,%%mm5\n\t"
+    "psubw %%mm0,%%mm7\n\t"
+    "psraw $2,%%mm6\n\t"
+    "psubw %%mm0,%%mm1\n\t"
+    "psraw $2,%%mm7\n\t"
+    "movq 0x40(%[y]),%%mm0\n\t"
+    "psraw $2,%%mm1\n\t"
+    "movq %%mm7,0x30(%[y])\n\t"
+    "movq 0x78(%[y]),%%mm7\n\t"
+    "movq %%mm1,0x08(%[y])\n\t"
+    "movq 0x50(%[y]),%%mm1\n\t"
+    "movq %%mm6,0x20(%[y])\n\t"
+    "movq 0x68(%[y]),%%mm6\n\t"
+    "movq %%mm2,0x28(%[y])\n\t"
+    "movq 0x60(%[y]),%%mm2\n\t"
+    "movq %%mm5,0x10(%[y])\n\t"
+    "movq 0x58(%[y]),%%mm5\n\t"
+    "movq %%mm3,0x38(%[y])\n\t"
+    "movq 0x70(%[y]),%%mm3\n\t"
+    "movq %%mm4,0x00(%[y])\n\t"
+    "movq 0x48(%[y]),%%mm4\n\t"
+    OC_FDCT_STAGE1_8x4
+    OC_FDCT8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
+    OC_TRANSPOSE8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
+    /*mm0={-2}x4*/
+    "pcmpeqw %%mm0,%%mm0\n\t"
+    "paddw %%mm0,%%mm0\n\t"
+    /*Round the results.*/
+    "psubw %%mm0,%%mm1\n\t"
+    "psubw %%mm0,%%mm2\n\t"
+    "psraw $2,%%mm1\n\t"
+    "psubw %%mm0,%%mm3\n\t"
+    "movq %%mm1,0x58(%[y])\n\t"
+    "psraw $2,%%mm2\n\t"
+    "psubw %%mm0,%%mm4\n\t"
+    "movq 0x48(%[y]),%%mm1\n\t"
+    "psraw $2,%%mm3\n\t"
+    "psubw %%mm0,%%mm5\n\t"
+    "movq %%mm2,0x68(%[y])\n\t"
+    "psraw $2,%%mm4\n\t"
+    "psubw %%mm0,%%mm6\n\t"
+    "movq %%mm3,0x78(%[y])\n\t"
+    "psraw $2,%%mm5\n\t"
+    "psubw %%mm0,%%mm7\n\t"
+    "movq %%mm4,0x40(%[y])\n\t"
+    "psraw $2,%%mm6\n\t"
+    "psubw %%mm0,%%mm1\n\t"
+    "movq %%mm5,0x50(%[y])\n\t"
+    "psraw $2,%%mm7\n\t"
+    "movq %%mm6,0x60(%[y])\n\t"
+    "psraw $2,%%mm1\n\t"
+    "movq %%mm7,0x70(%[y])\n\t"
+    "movq %%mm1,0x48(%[y])\n\t"
+    :[a]"=&r"(a)
+    :[y]"r"(_y),[x]"r"(_x)
+    :"memory"
+  );
+}
+
+#endif

Deleted: branches/theora-thusnelda/lib/enc/x86/recon_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/recon_mmx.c	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/enc/x86/recon_mmx.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -1,119 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-  last mod: $Id: recon_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
-
- ********************************************************************/
-
-#include "../codec_internal.h"
-#include <stddef.h>
-
-#if defined(USE_ASM)
-
-/*TODO: This is basically oc_state_frag_copy_mmx() without the enclosing loop.
-  Seems like one of these two should share the other's code.*/
-static void oc_copy8x8_mmx(const unsigned char *_src,unsigned char *_dst,
- ogg_uint32_t _ystride){
-  ptrdiff_t esi;
-  __asm__ __volatile__(
-    /*src+0*src_ystride*/
-    "movq (%[src]),%%mm0\n\t"
-    /*esi=src_ystride*3*/
-    "lea (%[ystride],%[ystride],2),%[s]\n\t"
-    /*src+1*src_ystride*/
-    "movq (%[src],%[ystride]),%%mm1\n\t"
-    /*src+2*src_ystride*/
-    "movq (%[src],%[ystride],2),%%mm2\n\t"
-    /*src+3*src_ystride*/
-    "movq (%[src],%[s]),%%mm3\n\t"
-    /*dst+0*dst_ystride*/
-    "movq %%mm0,(%[dst])\n\t"
-    /*dst+1*dst_ystride*/
-    "movq %%mm1,(%[dst],%[ystride])\n\t"
-    /*Pointer to next 4.*/
-    "lea (%[src],%[ystride],4),%[src]\n\t" 
-    /*dst+2*dst_ystride*/
-    "movq %%mm2,(%[dst],%[ystride],2)\n\t"
-    /*dst+3*dst_ystride*/
-    "movq %%mm3,(%[dst],%[s])\n\t"
-    /*Pointer to next 4.*/
-    "lea (%[dst],%[ystride],4),%[dst]\n\t" 
-    /*src+0*src_ystride*/
-    "movq (%[src]),%%mm0\n\t"
-    /*src+1*src_ystride*/
-    "movq (%[src],%[ystride]),%%mm1\n\t"
-    /*src+2*src_ystride*/
-    "movq (%[src],%[ystride],2),%%mm2\n\t"
-    /*src+3*src_ystride*/
-    "movq (%[src],%[s]),%%mm3\n\t"
-    /*dst+0*dst_ystride*/
-    "movq %%mm0,(%[dst])\n\t"
-    /*dst+1*dst_ystride*/
-    "movq %%mm1,(%[dst],%[ystride])\n\t"
-    /*dst+2*dst_ystride*/
-    "movq %%mm2,(%[dst],%[ystride],2)\n\t"
-    /*dst+3*dst_ystride*/
-    "movq %%mm3,(%[dst],%[s])\n\t"
-    :[s]"=&S"(esi)
-    :[dst]"r"(_dst),[src]"r"(_src),[ystride]"r"((ptrdiff_t)_ystride)
-    :"memory"
-  );
-}
-
-/*TODO: There isn't much penalty to just re-using
-   oc_frag_recon_inter_mmx() from the decoder here; we should do that.*/
-static void oc_recon8x8_mmx(unsigned char *_dst,const ogg_int16_t *_residue,
- ogg_uint32_t _ystride){
-  ptrdiff_t s;
-  int       i;
-  /*Zero mm0.*/
-  __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
-  for(i=8;i-->0;){
-    __asm__ __volatile__(
-      /*Load mm2 with _src*/
-      "movq (%[dst]),%%mm2\n\t"
-      /*Load mm4 with low part of residue.*/
-      "movq (%[res]),%%mm4\n\t"
-      /*Load mm5 with high part of residue.*/
-      "movq 8(%[res]),%%mm5\n\t"
-      /*Copy mm2 to mm3.*/
-      "movq %%mm2,%%mm3\n\t"
-      /*Expand low part of _src to 16 bits.*/
-      "punpcklbw %%mm0,%%mm2\n\t"
-      /*Expand high part of _src to 16 bits.*/
-      "punpckhbw %%mm0,%%mm3\n\t"
-      /*Add low part with low part of residue.*/
-      "paddsw %%mm4,%%mm2\n\t"
-      /*High with high.*/
-      "paddsw %%mm5,%%mm3\n\t"
-      /*Pack and saturate to mm2.*/
-      "packuswb %%mm3,%%mm2\n\t"
-      /*_residue+=16*/
-      "lea 16(%[res]),%[res]\n\t"
-      /*Put mm2 to dest.*/
-      "movq %%mm2,(%[dst])\n\t"
-      /*_dst+=_dst_ystride*/
-      "lea (%[dst],%[ystride]),%[dst]\n\t"
-      :[dst]"+r"(_dst),[res]"+r"(_residue)
-      :[ystride]"r"((ptrdiff_t)_ystride)
-      :"memory"
-    );
-  }
-}
-
-void dsp_mmx_recon_init(DspFunctions *_funcs){
-  _funcs->copy8x8=oc_copy8x8_mmx;
-  _funcs->recon8x8=oc_recon8x8_mmx;
-}
-
-#endif /* USE_ASM */

Copied: branches/theora-thusnelda/lib/enc/x86/sse2fdct.c (from rev 15940, branches/theora-thusnelda/lib/enc/x86/fdct_sse2.c)
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/sse2fdct.c	                        (rev 0)
+++ branches/theora-thusnelda/lib/enc/x86/sse2fdct.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -0,0 +1,518 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************/
+/*SSE2 fDCT implementation for x86_64.*/
+/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
+#include <stddef.h>
+#include "x86enc.h"
+
+#if defined(OC_X86_64_ASM)
+
+# define OC_FDCT8x8 \
+ /*One 8-point fDCT pass over the rows held in xmm0...xmm7, performed \
+    independently in each of the eight 16-bit lanes. \
+   The inputs t0...t7 are read from xmm0...xmm7 and the outputs \
+    _y[0]..._y[7] are left in xmm0...xmm7. \
+   xmm8...xmm13 and %[a] are overwritten as scratch. \
+   Note: xmm15={0}x8 and xmm14={-1}x8 must be set up by the caller; they are \
+    only read here.*/ \
+ "#OC_FDCT8x8\n\t" \
+ /*Stage 1:*/ \
+ "movdqa %%xmm0,%%xmm11\n\t" \
+ "movdqa %%xmm1,%%xmm10\n\t" \
+ "movdqa %%xmm2,%%xmm9\n\t" \
+ "movdqa %%xmm3,%%xmm8\n\t" \
+ /*xmm11=t7'=t0-t7*/ \
+ "psubw %%xmm7,%%xmm11\n\t" \
+ /*xmm10=t6'=t1-t6*/ \
+ "psubw %%xmm6,%%xmm10\n\t" \
+ /*xmm9=t5'=t2-t5*/ \
+ "psubw %%xmm5,%%xmm9\n\t" \
+ /*xmm8=t4'=t3-t4*/ \
+ "psubw %%xmm4,%%xmm8\n\t" \
+ /*xmm0=t0'=t0+t7*/ \
+ "paddw %%xmm7,%%xmm0\n\t" \
+ /*xmm1=t1'=t1+t6*/ \
+ "paddw %%xmm6,%%xmm1\n\t" \
+ /*xmm5=t2'=t2+t5*/ \
+ "paddw %%xmm2,%%xmm5\n\t" \
+ /*xmm4=t3'=t3+t4*/ \
+ "paddw %%xmm3,%%xmm4\n\t" \
+ /*xmm2,3,6,7 are now free.*/ \
+ /*Stage 2:*/ \
+ "movdqa %%xmm0,%%xmm3\n\t" \
+ "mov $0x5A806A0A,%[a]\n\t" \
+ "movdqa %%xmm1,%%xmm2\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "movdqa %%xmm10,%%xmm6\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ /*xmm2=t2''=t1'-t2'*/ \
+ "psubw %%xmm5,%%xmm2\n\t" \
+ "pxor %%xmm12,%%xmm12\n\t" \
+ /*xmm3=t3''=t0'-t3'*/ \
+ "psubw %%xmm4,%%xmm3\n\t" \
+ "psubw %%xmm14,%%xmm12\n\t" \
+ /*xmm10=t5''=t6'-t5'*/ \
+ "psubw %%xmm9,%%xmm10\n\t" \
+ "paddw %%xmm12,%%xmm12\n\t" \
+ /*xmm4=t0''=t0'+t3'*/ \
+ "paddw %%xmm0,%%xmm4\n\t" \
+ /*xmm1=t1''=t1'+t2'*/ \
+ "paddw %%xmm5,%%xmm1\n\t" \
+ /*xmm6=t6''=t6'+t5'*/ \
+ "paddw %%xmm9,%%xmm6\n\t" \
+ /*xmm0,xmm5,xmm9 are now free.*/ \
+ /*Stage 3:*/ \
+ /*xmm10:xmm5=t5''*27146+0xB500 \
+   xmm0=t5'' \
+   Each word of t5'' is interleaved with a word of xmm12={2}x8, so pmaddwd \
+    by xmm13={0x6A0A,0x5A80}x4 yields t5''*27146+2*0x5A80 in every dword.*/ \
+ "movdqa %%xmm10,%%xmm5\n\t" \
+ "movdqa %%xmm10,%%xmm0\n\t" \
+ "punpckhwd %%xmm12,%%xmm10\n\t" \
+ "pmaddwd %%xmm13,%%xmm10\n\t" \
+ "punpcklwd %%xmm12,%%xmm5\n\t" \
+ "pmaddwd %%xmm13,%%xmm5\n\t" \
+ /*xmm5=(t5''*27146+0xB500>>16)+t5''*/ \
+ "psrad $16,%%xmm10\n\t" \
+ "psrad $16,%%xmm5\n\t" \
+ "packssdw %%xmm10,%%xmm5\n\t" \
+ "paddw %%xmm0,%%xmm5\n\t" \
+ /*xmm0=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1 \
+   pcmpeqw against xmm15={0}x8 gives -1 where t5''==0; subtracting \
+    xmm14={-1}x8 then leaves 0 there and 1 elsewhere, i.e. (t5''!=0).*/ \
+ "pcmpeqw %%xmm15,%%xmm0\n\t" \
+ "psubw %%xmm14,%%xmm0\n\t" \
+ "paddw %%xmm5,%%xmm0\n\t" \
+ "movdqa %%xmm8,%%xmm5\n\t" \
+ "psraw $1,%%xmm0\n\t" \
+ /*xmm5=t5'''=t4'-s*/ \
+ "psubw %%xmm0,%%xmm5\n\t" \
+ /*xmm8=t4''=t4'+s*/ \
+ "paddw %%xmm0,%%xmm8\n\t" \
+ /*xmm0,xmm7,xmm9,xmm10 are free.*/ \
+ /*xmm7:xmm9=t6''*27146+0xB500*/ \
+ "movdqa %%xmm6,%%xmm7\n\t" \
+ "movdqa %%xmm6,%%xmm9\n\t" \
+ "punpckhwd %%xmm12,%%xmm7\n\t" \
+ "pmaddwd %%xmm13,%%xmm7\n\t" \
+ "punpcklwd %%xmm12,%%xmm9\n\t" \
+ "pmaddwd %%xmm13,%%xmm9\n\t" \
+ /*xmm9=(t6''*27146+0xB500>>16)+t6''*/ \
+ "psrad $16,%%xmm7\n\t" \
+ "psrad $16,%%xmm9\n\t" \
+ "packssdw %%xmm7,%%xmm9\n\t" \
+ "paddw %%xmm6,%%xmm9\n\t" \
+ /*xmm9=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
+ "pcmpeqw %%xmm15,%%xmm6\n\t" \
+ "psubw %%xmm14,%%xmm6\n\t" \
+ "paddw %%xmm6,%%xmm9\n\t" \
+ "movdqa %%xmm11,%%xmm7\n\t" \
+ "psraw $1,%%xmm9\n\t" \
+ /*xmm7=t6'''=t7'-s*/ \
+ "psubw %%xmm9,%%xmm7\n\t" \
+ /*xmm9=t7''=t7'+s*/ \
+ "paddw %%xmm11,%%xmm9\n\t" \
+ /*xmm0,xmm6,xmm10,xmm11 are free.*/ \
+ /*Stage 4:*/ \
+ /*xmm10:xmm0=t1''*27146+0xB500*/ \
+ "movdqa %%xmm1,%%xmm0\n\t" \
+ "movdqa %%xmm1,%%xmm10\n\t" \
+ "punpcklwd %%xmm12,%%xmm0\n\t" \
+ "pmaddwd %%xmm13,%%xmm0\n\t" \
+ "punpckhwd %%xmm12,%%xmm10\n\t" \
+ "pmaddwd %%xmm13,%%xmm10\n\t" \
+ /*xmm0=(t1''*27146+0xB500>>16)+t1''*/ \
+ "psrad $16,%%xmm0\n\t" \
+ "psrad $16,%%xmm10\n\t" \
+ "mov $0x20006A0A,%[a]\n\t" \
+ "packssdw %%xmm10,%%xmm0\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "paddw %%xmm1,%%xmm0\n\t" \
+ /*xmm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
+ "pcmpeqw %%xmm15,%%xmm1\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "psubw %%xmm14,%%xmm1\n\t" \
+ "paddw %%xmm1,%%xmm0\n\t" \
+ /*xmm10:xmm4=t0''*27146+0x4000*/ \
+ "movdqa %%xmm4,%%xmm1\n\t" \
+ "movdqa %%xmm4,%%xmm10\n\t" \
+ "punpcklwd %%xmm12,%%xmm4\n\t" \
+ "pmaddwd %%xmm13,%%xmm4\n\t" \
+ "punpckhwd %%xmm12,%%xmm10\n\t" \
+ "pmaddwd %%xmm13,%%xmm10\n\t" \
+ /*xmm4=(t0''*27146+0x4000>>16)+t0''*/ \
+ "psrad $16,%%xmm4\n\t" \
+ "psrad $16,%%xmm10\n\t" \
+ "mov $0x6CB7,%[a]\n\t" \
+ "packssdw %%xmm10,%%xmm4\n\t" \
+ "movd %[a],%%xmm12\n\t" \
+ "paddw %%xmm1,%%xmm4\n\t" \
+ /*xmm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
+ "pcmpeqw %%xmm15,%%xmm1\n\t" \
+ "pshufd $00,%%xmm12,%%xmm12\n\t" \
+ "psubw %%xmm14,%%xmm1\n\t" \
+ "mov $0x7FFF6C84,%[a]\n\t" \
+ "paddw %%xmm1,%%xmm4\n\t" \
+ /*xmm0=_y[0]=u=r+s>>1*/ \
+ "paddw %%xmm4,%%xmm0\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "psraw $1,%%xmm0\n\t" \
+ /*xmm4=_y[4]=v=r-u*/ \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "psubw %%xmm0,%%xmm4\n\t" \
+ /*xmm1,xmm6,xmm10,xmm11 are free.*/ \
+ /*xmm6:xmm10=60547*t3''+0x6CB7 \
+   60547 does not fit in a signed word, so t3'' is paired with itself and \
+    multiplied by {0x6C84,0x7FFF}, whose sum is 60547.*/ \
+ "movdqa %%xmm3,%%xmm10\n\t" \
+ "movdqa %%xmm3,%%xmm6\n\t" \
+ "punpcklwd %%xmm3,%%xmm10\n\t" \
+ "pmaddwd %%xmm13,%%xmm10\n\t" \
+ "mov $0x61F861F8,%[a]\n\t" \
+ "punpckhwd %%xmm3,%%xmm6\n\t" \
+ "pmaddwd %%xmm13,%%xmm6\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "paddd %%xmm12,%%xmm10\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "paddd %%xmm12,%%xmm6\n\t" \
+ /*xmm1:xmm2=25080*t2'' \
+   xmm12=t2''*/ \
+ "movdqa %%xmm2,%%xmm11\n\t" \
+ "movdqa %%xmm2,%%xmm12\n\t" \
+ "pmullw %%xmm13,%%xmm2\n\t" \
+ "pmulhw %%xmm13,%%xmm11\n\t" \
+ "movdqa %%xmm2,%%xmm1\n\t" \
+ "punpcklwd %%xmm11,%%xmm2\n\t" \
+ "punpckhwd %%xmm11,%%xmm1\n\t" \
+ /*xmm10=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
+ "paddd %%xmm2,%%xmm10\n\t" \
+ "paddd %%xmm1,%%xmm6\n\t" \
+ "psrad $16,%%xmm10\n\t" \
+ "pcmpeqw %%xmm15,%%xmm3\n\t" \
+ "psrad $16,%%xmm6\n\t" \
+ "psubw %%xmm14,%%xmm3\n\t" \
+ "packssdw %%xmm6,%%xmm10\n\t" \
+ "paddw %%xmm3,%%xmm10\n\t" \
+ /*xmm2=_y[2]=u \
+   xmm10=s=(25080*u>>16)-t2''*/ \
+ "movdqa %%xmm10,%%xmm2\n\t" \
+ "pmulhw %%xmm13,%%xmm10\n\t" \
+ "psubw %%xmm12,%%xmm10\n\t" \
+ /*xmm1:xmm6=s*21600+0x2800*/ \
+ "pxor %%xmm12,%%xmm12\n\t" \
+ "psubw %%xmm14,%%xmm12\n\t" \
+ "mov $0x28005460,%[a]\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "movdqa %%xmm10,%%xmm6\n\t" \
+ "movdqa %%xmm10,%%xmm1\n\t" \
+ "punpcklwd %%xmm12,%%xmm6\n\t" \
+ "pmaddwd %%xmm13,%%xmm6\n\t" \
+ "mov $0x0E3D,%[a]\n\t" \
+ "punpckhwd %%xmm12,%%xmm1\n\t" \
+ "pmaddwd %%xmm13,%%xmm1\n\t" \
+ /*xmm6=(s*21600+0x2800>>18)+s*/ \
+ "psrad $18,%%xmm6\n\t" \
+ "psrad $18,%%xmm1\n\t" \
+ "movd %[a],%%xmm12\n\t" \
+ "packssdw %%xmm1,%%xmm6\n\t" \
+ "pshufd $00,%%xmm12,%%xmm12\n\t" \
+ "paddw %%xmm10,%%xmm6\n\t" \
+ /*xmm6=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
+ "mov $0x7FFF54DC,%[a]\n\t" \
+ "pcmpeqw %%xmm15,%%xmm10\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "psubw %%xmm14,%%xmm10\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "paddw %%xmm10,%%xmm6\n\t " \
+ /*xmm1,xmm3,xmm10,xmm11 are free.*/ \
+ /*xmm11:xmm10=54491*t5'''+0x0E3D*/ \
+ "movdqa %%xmm5,%%xmm10\n\t" \
+ "movdqa %%xmm5,%%xmm11\n\t" \
+ "punpcklwd %%xmm5,%%xmm10\n\t" \
+ "pmaddwd %%xmm13,%%xmm10\n\t" \
+ "mov $0x8E3A8E3A,%[a]\n\t" \
+ "punpckhwd %%xmm5,%%xmm11\n\t" \
+ "pmaddwd %%xmm13,%%xmm11\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "paddd %%xmm12,%%xmm10\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "paddd %%xmm12,%%xmm11\n\t" \
+ /*xmm7:xmm12=36410*t6''' \
+   xmm1=t6''' \
+   0x8E3A is negative as a signed word (36410-65536), so t6''' is added to \
+    the high half of the product to compensate.*/ \
+ "movdqa %%xmm7,%%xmm3\n\t" \
+ "movdqa %%xmm7,%%xmm1\n\t" \
+ "pmulhw %%xmm13,%%xmm3\n\t" \
+ "pmullw %%xmm13,%%xmm7\n\t" \
+ "paddw %%xmm1,%%xmm3\n\t" \
+ "movdqa %%xmm7,%%xmm12\n\t" \
+ "punpckhwd %%xmm3,%%xmm7\n\t" \
+ "punpcklwd %%xmm3,%%xmm12\n\t" \
+ /*xmm10=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
+ "paddd %%xmm12,%%xmm10\n\t" \
+ "paddd %%xmm7,%%xmm11\n\t" \
+ "psrad $16,%%xmm10\n\t" \
+ "pcmpeqw %%xmm15,%%xmm5\n\t" \
+ "psrad $16,%%xmm11\n\t" \
+ "psubw %%xmm14,%%xmm5\n\t" \
+ "packssdw %%xmm11,%%xmm10\n\t" \
+ "pxor %%xmm12,%%xmm12\n\t" \
+ "paddw %%xmm5,%%xmm10\n\t" \
+ /*xmm5=_y[5]=u \
+   xmm1=s=t6'''-(36410*u>>16)*/ \
+ "psubw %%xmm14,%%xmm12\n\t" \
+ "movdqa %%xmm10,%%xmm5\n\t" \
+ "mov $0x340067C8,%[a]\n\t" \
+ "pmulhw %%xmm13,%%xmm10\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "paddw %%xmm5,%%xmm10\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "psubw %%xmm10,%%xmm1\n\t" \
+ /*xmm11:xmm3=s*26568+0x3400*/ \
+ "movdqa %%xmm1,%%xmm3\n\t" \
+ "movdqa %%xmm1,%%xmm11\n\t" \
+ "punpcklwd %%xmm12,%%xmm3\n\t" \
+ "pmaddwd %%xmm13,%%xmm3\n\t" \
+ "mov $0x7B1B,%[a]\n\t" \
+ "punpckhwd %%xmm12,%%xmm11\n\t" \
+ "pmaddwd %%xmm13,%%xmm11\n\t" \
+ /*xmm3=(s*26568+0x3400>>17)+s*/ \
+ "psrad $17,%%xmm3\n\t" \
+ "psrad $17,%%xmm11\n\t" \
+ "movd %[a],%%xmm12\n\t" \
+ "packssdw %%xmm11,%%xmm3\n\t" \
+ "pshufd $00,%%xmm12,%%xmm12\n\t" \
+ "paddw %%xmm1,%%xmm3\n\t" \
+ /*xmm3=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
+ "mov $0x7FFF7B16,%[a]\n\t" \
+ "pcmpeqw %%xmm15,%%xmm1\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "psubw %%xmm14,%%xmm1\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "paddw %%xmm1,%%xmm3\n\t " \
+ /*xmm1,xmm7,xmm10,xmm11 are free.*/ \
+ /*xmm11:xmm10=64277*t7''+0x7B1B*/ \
+ "movdqa %%xmm9,%%xmm10\n\t" \
+ "movdqa %%xmm9,%%xmm11\n\t" \
+ "punpcklwd %%xmm9,%%xmm10\n\t" \
+ "pmaddwd %%xmm13,%%xmm10\n\t" \
+ "mov $0x31F131F1,%[a]\n\t" \
+ "punpckhwd %%xmm9,%%xmm11\n\t" \
+ "pmaddwd %%xmm13,%%xmm11\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "paddd %%xmm12,%%xmm10\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ "paddd %%xmm12,%%xmm11\n\t" \
+ /*xmm12:xmm7=12785*t4''*/ \
+ "movdqa %%xmm8,%%xmm7\n\t" \
+ "movdqa %%xmm8,%%xmm1\n\t" \
+ "pmullw %%xmm13,%%xmm7\n\t" \
+ "pmulhw %%xmm13,%%xmm1\n\t" \
+ "movdqa %%xmm7,%%xmm12\n\t" \
+ "punpcklwd %%xmm1,%%xmm7\n\t" \
+ "punpckhwd %%xmm1,%%xmm12\n\t" \
+ /*xmm10=u=(12785*t4''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
+ "paddd %%xmm7,%%xmm10\n\t" \
+ "paddd %%xmm12,%%xmm11\n\t" \
+ "psrad $16,%%xmm10\n\t" \
+ "pcmpeqw %%xmm15,%%xmm9\n\t" \
+ "psrad $16,%%xmm11\n\t" \
+ "psubw %%xmm14,%%xmm9\n\t" \
+ "packssdw %%xmm11,%%xmm10\n\t" \
+ "pxor %%xmm12,%%xmm12\n\t" \
+ "paddw %%xmm9,%%xmm10\n\t" \
+ /*xmm1=_y[1]=u \
+   xmm10=s=(12785*u>>16)-t4''*/ \
+ "psubw %%xmm14,%%xmm12\n\t" \
+ "movdqa %%xmm10,%%xmm1\n\t" \
+ "mov $0x3000503B,%[a]\n\t" \
+ "pmulhw %%xmm13,%%xmm10\n\t" \
+ "movd %[a],%%xmm13\n\t" \
+ "psubw %%xmm8,%%xmm10\n\t" \
+ "pshufd $00,%%xmm13,%%xmm13\n\t" \
+ /*xmm8:xmm7=s*20539+0x3000*/ \
+ "movdqa %%xmm10,%%xmm7\n\t" \
+ "movdqa %%xmm10,%%xmm8\n\t" \
+ "punpcklwd %%xmm12,%%xmm7\n\t" \
+ "pmaddwd %%xmm13,%%xmm7\n\t" \
+ "punpckhwd %%xmm12,%%xmm8\n\t" \
+ "pmaddwd %%xmm13,%%xmm8\n\t" \
+ /*xmm7=(s*20539+0x3000>>20)+s*/ \
+ "psrad $20,%%xmm7\n\t" \
+ "psrad $20,%%xmm8\n\t" \
+ "packssdw %%xmm8,%%xmm7\n\t" \
+ "paddw %%xmm10,%%xmm7\n\t" \
+ /*xmm7=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
+ "pcmpeqw %%xmm15,%%xmm10\n\t" \
+ "psubw %%xmm14,%%xmm10\n\t" \
+ "paddw %%xmm10,%%xmm7\n\t " \
+
+# define OC_TRANSPOSE8x8 /*Transposes the 8x8 matrix of 16-bit words whose \
+   rows a...h are held in xmm0...xmm7. \
+  On exit xmmN holds element N of every row (h in the highest word, a in the \
+   lowest), and xmm8 has been overwritten as scratch.*/ \
+ "#OC_TRANSPOSE8x8\n\t" \
+ "movdqa %%xmm4,%%xmm8\n\t" \
+ /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
+ "punpcklwd %%xmm5,%%xmm4\n\t" \
+ /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
+ "punpckhwd %%xmm5,%%xmm8\n\t" \
+ /*xmm5 is free.*/ \
+ "movdqa %%xmm0,%%xmm5\n\t" \
+ /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
+ "punpcklwd %%xmm1,%%xmm0\n\t" \
+ /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
+ "punpckhwd %%xmm1,%%xmm5\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm6,%%xmm1\n\t" \
+ /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
+ "punpcklwd %%xmm7,%%xmm6\n\t" \
+ /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
+ "punpckhwd %%xmm7,%%xmm1\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm2,%%xmm7\n\t" \
+ /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
+ "punpcklwd %%xmm3,%%xmm7\n\t" \
+ /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
+ "punpckhwd %%xmm3,%%xmm2\n\t" \
+ /*xmm3 is free.*/ \
+ "movdqa %%xmm0,%%xmm3\n\t" \
+ /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
+ "punpckldq %%xmm7,%%xmm0\n\t" \
+ /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
+ "punpckhdq %%xmm7,%%xmm3\n\t" \
+ /*xmm7 is free.*/ \
+ "movdqa %%xmm5,%%xmm7\n\t" \
+ /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
+ "punpckldq %%xmm2,%%xmm5\n\t" \
+ /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
+ "punpckhdq %%xmm2,%%xmm7\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm4,%%xmm2\n\t" \
+ /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
+ "punpckldq %%xmm6,%%xmm2\n\t" \
+ /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
+ "punpckhdq %%xmm6,%%xmm4\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm8,%%xmm6\n\t" \
+ /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
+ "punpckldq %%xmm1,%%xmm6\n\t" \
+ /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
+ "punpckhdq %%xmm1,%%xmm8\n\t" \
+ /*xmm1 is free.*/ \
+ "movdqa %%xmm0,%%xmm1\n\t" \
+ /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
+ "punpcklqdq %%xmm2,%%xmm0\n\t" \
+ /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
+ "punpckhqdq %%xmm2,%%xmm1\n\t" \
+ /*xmm2 is free.*/ \
+ "movdqa %%xmm3,%%xmm2\n\t" \
+ /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
+ "punpcklqdq %%xmm4,%%xmm2\n\t" \
+ /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
+ "punpckhqdq %%xmm4,%%xmm3\n\t" \
+ /*xmm4 is free.*/ \
+ "movdqa %%xmm5,%%xmm4\n\t" \
+ /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
+ "punpcklqdq %%xmm6,%%xmm4\n\t" \
+ /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
+ "punpckhqdq %%xmm6,%%xmm5\n\t" \
+ /*xmm6 is free.*/ \
+ "movdqa %%xmm7,%%xmm6\n\t" \
+ /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
+ "punpcklqdq %%xmm8,%%xmm6\n\t" \
+ /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
+ "punpckhqdq %%xmm8,%%xmm7\n\t" \
+ /*xmm8 is free.*/ \
+
+/*SSE2 implementation of the fDCT for x86-64 only.
+  Because of the 8 extra XMM registers on x86-64, this version can operate
+   without any temporary stack access at all.
+  _y: The buffer that receives the 64 transformed values.
+  _x: The 64 input values to transform.
+  Both buffers are accessed with movdqa, so they must be 16-byte aligned.
+  NOTE(review): The asm statement writes xmm0...xmm15 but lists only "memory"
+   as a clobber; confirm this is safe for every ABI this file is built for.*/
+void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
+  ptrdiff_t a;
+  __asm__ __volatile__(
+    /*Load the input.*/
+    "movdqa 0x00(%[x]),%%xmm0\n\t"
+    "movdqa 0x10(%[x]),%%xmm1\n\t"
+    "movdqa 0x20(%[x]),%%xmm2\n\t"
+    "movdqa 0x30(%[x]),%%xmm3\n\t"
+    "movdqa 0x40(%[x]),%%xmm4\n\t"
+    "movdqa 0x50(%[x]),%%xmm5\n\t"
+    "movdqa 0x60(%[x]),%%xmm6\n\t"
+    "movdqa 0x70(%[x]),%%xmm7\n\t"
+    /*Add two extra bits of working precision to improve accuracy; any more and
+       we could overflow.*/
+    /*We also add a few biases to correct for some systematic error that
+       remains in the full fDCT->iDCT round trip.*/
+    /*xmm15={0}x8*/
+    "pxor %%xmm15,%%xmm15\n\t"
+    /*xmm14={-1}x8*/
+    "pcmpeqb %%xmm14,%%xmm14\n\t"
+    "psllw $2,%%xmm0\n\t"
+    /*xmm8=xmm0*/
+    "movdqa %%xmm0,%%xmm8\n\t"
+    "psllw $2,%%xmm1\n\t"
+    /*xmm8={_x[7...0]==0}*/
+    "pcmpeqw %%xmm15,%%xmm8\n\t"
+    "psllw $2,%%xmm2\n\t"
+    /*xmm8={_x[7...0]!=0}*/
+    "psubw %%xmm14,%%xmm8\n\t"
+    "psllw $2,%%xmm3\n\t"
+    /*%[a]=1*/
+    "mov $1,%[a]\n\t"
+    /*xmm8={_x[6]!=0,0,_x[4]!=0,0,_x[2]!=0,0,_x[0]!=0,0}*/
+    "pslld $16,%%xmm8\n\t"
+    "psllw $2,%%xmm4\n\t"
+    /*xmm9={0,0,0,0,0,0,0,1}*/
+    "movd %[a],%%xmm9\n\t"
+    /*xmm8={0,0,0,0,_x[2]!=0,0,_x[0]!=0,0}*/
+    "pshufhw $0x00,%%xmm8,%%xmm8\n\t"
+    "psllw $2,%%xmm5\n\t"
+    /*%[a]={1}x2*/
+    "mov $0x10001,%[a]\n\t"
+    /*xmm8={0,0,0,0,0,0,0,_x[0]!=0}*/
+    "pshuflw $0x01,%%xmm8,%%xmm8\n\t"
+    "psllw $2,%%xmm6\n\t"
+    /*xmm10={0,0,0,0,0,0,1,1}*/
+    "movd %[a],%%xmm10\n\t"
+    /*xmm0=_x[7...0]+{0,0,0,0,0,0,0,_x[0]!=0}*/
+    "paddw %%xmm8,%%xmm0\n\t"
+    "psllw $2,%%xmm7\n\t"
+    /*xmm0=_x[7...0]+{0,0,0,0,0,0,1,(_x[0]!=0)+1}*/
+    "paddw %%xmm10,%%xmm0\n\t"
+    /*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/
+    "psubw %%xmm9,%%xmm1\n\t"
+    /*Transform columns.*/
+    OC_FDCT8x8
+    /*Transform rows.*/
+    OC_TRANSPOSE8x8
+    OC_FDCT8x8
+    /*TODO: zig-zag ordering?*/
+    OC_TRANSPOSE8x8
+    /*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}
+      Subtracting it adds a rounding offset of 2 to each value before the
+       right shift by 2 removes the two extra bits of working precision.*/
+    "paddw %%xmm14,%%xmm14\n\t"
+    "psubw %%xmm14,%%xmm0\n\t"
+    "psubw %%xmm14,%%xmm1\n\t"
+    "psraw $2,%%xmm0\n\t"
+    "psubw %%xmm14,%%xmm2\n\t"
+    "psraw $2,%%xmm1\n\t"
+    "psubw %%xmm14,%%xmm3\n\t"
+    "psraw $2,%%xmm2\n\t"
+    "psubw %%xmm14,%%xmm4\n\t"
+    "psraw $2,%%xmm3\n\t"
+    "psubw %%xmm14,%%xmm5\n\t"
+    "psraw $2,%%xmm4\n\t"
+    "psubw %%xmm14,%%xmm6\n\t"
+    "psraw $2,%%xmm5\n\t"
+    "psubw %%xmm14,%%xmm7\n\t"
+    "psraw $2,%%xmm6\n\t"
+    "psraw $2,%%xmm7\n\t"
+    /*Store the result.*/
+    "movdqa %%xmm0,0x00(%[y])\n\t"
+    "movdqa %%xmm1,0x10(%[y])\n\t"
+    "movdqa %%xmm2,0x20(%[y])\n\t"
+    "movdqa %%xmm3,0x30(%[y])\n\t"
+    "movdqa %%xmm4,0x40(%[y])\n\t"
+    "movdqa %%xmm5,0x50(%[y])\n\t"
+    "movdqa %%xmm6,0x60(%[y])\n\t"
+    "movdqa %%xmm7,0x70(%[y])\n\t"
+    :[a]"=&r"(a)
+    :[y]"r"(_y),[x]"r"(_x)
+    :"memory"
+  );
+}
+#endif

Added: branches/theora-thusnelda/lib/enc/x86/x86enc.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/x86enc.c	                        (rev 0)
+++ branches/theora-thusnelda/lib/enc/x86/x86enc.c	2009-04-26 14:30:15 UTC (rev 15953)
@@ -0,0 +1,50 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86state.c 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+#include "x86enc.h"
+
+#if defined(OC_X86_ASM)
+
+#include "../../cpu.c"
+
+void oc_enc_vtable_init_x86(CP_INSTANCE *_cpi){/*Fills in _cpi->opt_vtable
+   with x86-specific routines according to the flags returned by
+   oc_cpu_flags_get().
+  oc_enc_vtable_init_c() runs first (presumably installing the portable C
+   versions; confirm against its definition), then entries are overridden as
+   MMX, MMXEXT and SSE2 support is detected.
+  The tests are applied in that order, so the SSE2 fdct8x8 replaces the MMX
+   one when both are available; it is only built when OC_X86_64_ASM is
+   defined.
+  _cpi: The encoder instance whose opt_vtable is filled in.*/
+  ogg_uint32_t cpu_flags;
+  cpu_flags=oc_cpu_flags_get();
+  oc_enc_vtable_init_c(_cpi);
+  if(cpu_flags&OC_CPU_X86_MMX){
+    _cpi->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
+    _cpi->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
+    _cpi->opt_vtable.frag_copy=oc_frag_copy_mmx;
+    _cpi->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
+    _cpi->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
+    _cpi->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmx;
+    _cpi->opt_vtable.dequant_idct8x8=oc_dequant_idct8x8_mmx;
+    _cpi->opt_vtable.enc_loop_filter=oc_enc_loop_filter_mmx;
+    _cpi->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
+  }
+  if(cpu_flags&OC_CPU_X86_MMXEXT){
+    _cpi->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
+    _cpi->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
+    _cpi->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
+    _cpi->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
+  }
+  if(cpu_flags&OC_CPU_X86_SSE2){
+# if defined(OC_X86_64_ASM)
+    _cpi->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;
+# endif
+  }
+}
+#endif

Added: branches/theora-thusnelda/lib/enc/x86/x86enc.h
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/x86enc.h	                        (rev 0)
+++ branches/theora-thusnelda/lib/enc/x86/x86enc.h	2009-04-26 14:30:15 UTC (rev 15953)
@@ -0,0 +1,42 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: x86int.h 15675 2009-02-06 09:43:27Z tterribe $
+
+ ********************************************************************/
+
+#if !defined(_x86_x86enc_H)
+# define _x86_x86enc_H (1)
+# include "../codec_internal.h"
+# include "../../dec/x86/x86int.h"
+
+void oc_enc_vtable_init_x86(CP_INSTANCE *_cpi);
+
+unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride);
+unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref,int _ystride,unsigned _thresh);
+unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
+ const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
+ unsigned _thresh);
+void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
+ const unsigned char *_x,const unsigned char *_y,int _stride);
+void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
+ const unsigned char *_x,int _stride);
+void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
+ const unsigned char *_src1,const unsigned char *_src2,int _ystride);
+void oc_enc_fdct8x8_mmx(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
+void oc_enc_loop_filter_mmx(CP_INSTANCE *_cpi,int _flimit);
+
+#endif

Modified: branches/theora-thusnelda/lib/internal.h
===================================================================
--- branches/theora-thusnelda/lib/internal.h	2009-04-24 11:46:40 UTC (rev 15952)
+++ branches/theora-thusnelda/lib/internal.h	2009-04-26 14:30:15 UTC (rev 15953)
@@ -34,6 +34,33 @@
 #  pragma warning(disable:4799)
 # endif
 
+/*Some assembly constructs require aligned operands.*/
+# if defined(OC_X86_ASM)
+#  if defined(__GNUC__)
+#   define OC_ALIGN8 __attribute__((aligned(8)))
+#   define OC_ALIGN16 __attribute__((aligned(16)))
+#  endif
+# endif
+# if !defined(OC_ALIGN8)
+#  define OC_ALIGN8
+# endif
+# if !defined(OC_ALIGN16)
+#  define OC_ALIGN16
+# endif
+
+
+
+typedef struct oc_sb                    oc_sb;
+typedef struct oc_mb                    oc_mb;
+typedef struct oc_border_info           oc_border_info;
+typedef struct oc_fragment              oc_fragment;
+typedef struct oc_fragment_plane        oc_fragment_plane;
+typedef struct oc_base_opt_vtable       oc_base_opt_vtable;
+typedef struct oc_state_dispatch_vtable oc_state_dispatch_vtable;
+typedef struct oc_theora_state          oc_theora_state;
+
+
+
 /*This library's version.*/
 # define OC_VENDOR_STRING "Xiph.Org libThusnelda I 20090403"
 
@@ -124,10 +151,6 @@
 
 
 
-typedef struct oc_theora_state oc_theora_state;
-
-
-
 /*A map from a super block to fragment numbers.*/
 typedef int         oc_sb_map[4][4];
 /*A map from a macro block to fragment numbers.*/
@@ -150,12 +173,12 @@
    are called "fragments".
   Fragments are indexed in image order, left to right, then bottom to top,
    from Y plane to Cb plane to Cr plane.*/
-typedef struct{
+struct oc_sb{
   unsigned  coded_fully:1;
   unsigned  coded_partially:1;
   unsigned  quad_valid:4;
   oc_sb_map map;
-}oc_sb;
+};
 
 
 
@@ -166,7 +189,7 @@
    contains between 6 and 12 fragments, depending on the pixel format.
   Therefore macro block information is kept in a separate array from super
    blocks, to avoid unused space in the other planes.*/
-typedef struct{
+struct oc_mb{
   /*The current macro block mode.
     A negative number indicates the macro block lies entirely outside the
      coded frame.*/
@@ -180,7 +203,7 @@
     When chroma components are decimated, the extra fragments have an index of
      -1.*/
   oc_mb_map     map;
-}oc_mb;
+};
 
 
 
@@ -189,20 +212,20 @@
   This marks which pixels belong to the displayable region, and is used to
    ensure that pixels outside of this region are never referenced.
   This allows applications to pass in buffers that are really the size of the
-   displayable region without causing a seg fault.*/
-typedef struct{
+   displayable region without causing a segfault.*/
+struct oc_border_info{
   /*A bit mask marking which pixels are in the displayable region.
     Pixel (x,y) corresponds to bit (y<<3|x).*/
   ogg_int64_t mask;
   /*The number of pixels in the displayable region.
     This is always positive, and always less than 64.*/
   int         npixels;
-}oc_border_info;
+};
 
 
 
 /*Fragment information.*/
-typedef struct{
+struct oc_fragment{
   /*A flag indicating whether or not this fragment is coded.*/
   unsigned        coded:1;
   /*A flag indicating that all of this fragment lies outside the displayable
@@ -233,12 +256,12 @@
   oc_border_info *border;
   /*The motion vector used for this fragment.*/
   oc_mv           mv;
-}oc_fragment;
+};
 
 
 
 /*A description of each fragment plane.*/
-typedef struct{
+struct oc_fragment_plane{
   /*The number of fragments in the horizontal direction.*/
   int nhfrags;
   /*The number of fragments in the vertical direction.*/
@@ -255,28 +278,32 @@
   int sboffset;
   /*The total number of super blocks in the plane.*/
   int nsbs;
-}oc_fragment_plane;
+};
 
 
 
 /*The shared (encoder and decoder) functions that have accelerated variants.*/
-typedef struct{
-  void (*frag_recon_intra)(unsigned char *_dst,int _dst_ystride,
-   const ogg_int16_t *_residue);
-  void (*frag_recon_inter)(unsigned char *_dst,int _dst_ystride,
-   const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue);
-  void (*frag_recon_inter2)(unsigned char *_dst,int _dst_ystride,
-   const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
-   int _src2_ystride,const ogg_int16_t *_residue);
-  void (*state_frag_copy)(const oc_theora_state *_state,
+struct oc_base_opt_vtable{
+  void (*frag_copy)(unsigned char *_dst,
+   const unsigned char *_src,int _ystride);
+  void (*frag_recon_intra)(unsigned char *_dst,int _ystride,
+   const ogg_int16_t _residue[64]);
+  void (*frag_recon_inter)(unsigned char *_dst,
+   const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
+  void (*frag_recon_inter2)(unsigned char *_dst,const unsigned char *_src1,
+   const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
+  void (*dequant_idct8x8)(ogg_int16_t _y[64],const ogg_int16_t _x[64],
+   int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
+   const ogg_uint16_t _ac_quant[64]);
+  void (*state_frag_recon)(const oc_theora_state *_state,oc_fragment *_frag,
+   int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
+   ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
+  void (*state_frag_copy_list)(const oc_theora_state *_state,
    const int *_fragis,int _nfragis,int _dst_frame,int _src_frame,int _pli);
-  void (*state_frag_recon)(oc_theora_state *_state,oc_fragment *_frag,
-   int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
-   ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
+  void (*state_loop_filter_frag_rows)(const oc_theora_state *_state,int *_bv,
+   int _refi,int _pli,int _fragy0,int _fragy_end);  
   void (*restore_fpu)(void);
-  void (*state_loop_filter_frag_rows)(oc_theora_state *_state,int *_bv,
-   int _refi,int _pli,int _fragy0,int _fragy_end);  
-}oc_base_opt_vtable;
+};
 
 
 
@@ -349,7 +376,8 @@
   int                 qis[3];
   /*The number of quality indices used in the current frame.*/
   int                 nqis;
-  /*The dequantization tables.*/
+  /*The dequantization tables.
+    Note that these are stored in zig-zag order.*/
   oc_quant_table     *dequant_tables[2][3];
   oc_quant_tables     dequant_table_data[2][3];
   /*Loop filter strength parameters.*/
@@ -419,7 +447,7 @@
 void oc_state_fill_buffer_ptrs(oc_theora_state *_state,int _buf_idx,
  th_ycbcr_buffer _img);
 int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby);
-int oc_state_get_mv_offsets(oc_theora_state *_state,int *_offsets,
+int oc_state_get_mv_offsets(const oc_theora_state *_state,int *_offsets,
  int _dx,int _dy,int _ystride,int _pli);
 
 int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv);
@@ -430,38 +458,45 @@
 #endif
 
 /*Shared accelerated functions.*/
+void oc_frag_copy(const oc_theora_state *_state,unsigned char *_dst,
+ const unsigned char *_src,int _ystride);
 void oc_frag_recon_intra(const oc_theora_state *_state,
- unsigned char *_dst,int _dst_ystride,const ogg_int16_t *_residue);
-void oc_frag_recon_inter(const oc_theora_state *_state,
- unsigned char *_dst,int _dst_ystride,
- const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue);
+ unsigned char *_dst,int _dst_ystride,const ogg_int16_t _residue[64]);
+void oc_frag_recon_inter(const oc_theora_state *_state,unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
 void oc_frag_recon_inter2(const oc_theora_state *_state,
- unsigned char *_dst,int _dst_ystride,
- const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
- int _src2_ystride,const ogg_int16_t *_residue);
-void oc_state_frag_copy(const oc_theora_state *_state,const int *_fragis,
+ unsigned char *_dst,const unsigned char *_src1,const unsigned char *_src2,
+ int _ystride,const ogg_int16_t _residue[64]);
+void oc_dequant_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
+ const ogg_int16_t _x[64],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
+void oc_state_frag_recon(const oc_theora_state *_state,oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
+void oc_state_frag_copy_list(const oc_theora_state *_state,const int *_fragis,
  int _nfragis,int _dst_frame,int _src_frame,int _pli);
-void oc_state_frag_recon(oc_theora_state *_state,oc_fragment *_frag,
- int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
-void oc_state_loop_filter_frag_rows(oc_theora_state *_state,int *_bv,
+void oc_state_loop_filter_frag_rows(const oc_theora_state *_state,int *_bv,
  int _refi,int _pli,int _fragy0,int _fragy_end);
 void oc_restore_fpu(const oc_theora_state *_state);
 
 /*Default pure-C implementations.*/
+void oc_frag_copy_c(unsigned char *_dst,
+ const unsigned char *_src,int _src_ystride);
 void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride,
- const ogg_int16_t *_residue);
-void oc_frag_recon_inter_c(unsigned char *_dst,int _dst_ystride,
- const unsigned char *_src,int _src_ystride,const ogg_int16_t *_residue);
-void oc_frag_recon_inter2_c(unsigned char *_dst,int _dst_ystride,
- const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
- int _src2_ystride,const ogg_int16_t *_residue);
-void oc_state_frag_copy_c(const oc_theora_state *_state,const int *_fragis,
+ const ogg_int16_t _residue[64]);
+void oc_frag_recon_inter_c(unsigned char *_dst,
+ const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
+void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
+void oc_dequant_idct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64],
+ int _last_zzi,int _ncoefs,ogg_uint16_t _dc_quant,
+ const ogg_uint16_t _ac_quant[64]);
+void oc_state_frag_recon_c(const oc_theora_state *_state,oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
+void oc_state_frag_copy_list_c(const oc_theora_state *_state,const int *_fragis,
  int _nfragis,int _dst_frame,int _src_frame,int _pli);
-void oc_state_frag_recon_c(oc_theora_state *_state,oc_fragment *_frag,
- int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
-void oc_state_loop_filter_frag_rows_c(oc_theora_state *_state,int *_bv,
+void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,int *_bv,
  int _refi,int _pli,int _fragy0,int _fragy_end);
 void oc_restore_fpu_c(void);
 
@@ -480,9 +515,8 @@
 typedef double (*oc_state_granule_time_func)(theora_state *_th,
  ogg_int64_t _granulepos);
 
-typedef struct oc_state_dispatch_vtbl oc_state_dispatch_vtbl;
 
-struct oc_state_dispatch_vtbl{
+struct oc_state_dispatch_vtable{
   oc_state_clear_func         clear;
   oc_state_control_func       control;
   oc_state_granule_frame_func granule_frame;



More information about the commits mailing list