[xiph-commits] r15675 - in branches/theora-thusnelda: examples include/theora lib lib/dec lib/dec/x86 lib/enc lib/enc/x86

Fri Feb 6 01:43:28 PST 2009

Author: tterribe
Date: 2009-02-06 01:43:27 -0800 (Fri, 06 Feb 2009)
New Revision: 15675

Added:
   branches/theora-thusnelda/include/theora/theoraenc.h
   branches/theora-thusnelda/lib/dec/bitpack.c
   branches/theora-thusnelda/lib/dec/bitpack.h
   branches/theora-thusnelda/lib/dec/x86_vc/
   branches/theora-thusnelda/lib/enc/encapiwrapper.c
Removed:
   branches/theora-thusnelda/lib/dec/enquant.h
Modified:
   branches/theora-thusnelda/examples/Makefile.am
   branches/theora-thusnelda/examples/dump_video.c
   branches/theora-thusnelda/examples/encoder_example.c
   branches/theora-thusnelda/examples/player_example.c
   branches/theora-thusnelda/examples/png2theora.c
   branches/theora-thusnelda/examples/splayer.c
   branches/theora-thusnelda/include/theora/Makefile.am
   branches/theora-thusnelda/include/theora/codec.h
   branches/theora-thusnelda/include/theora/theora.h
   branches/theora-thusnelda/include/theora/theoradec.h
   branches/theora-thusnelda/lib/Makefile.am
   branches/theora-thusnelda/lib/Version_script
   branches/theora-thusnelda/lib/Version_script-dec
   branches/theora-thusnelda/lib/Version_script-enc
   branches/theora-thusnelda/lib/cpu.c
   branches/theora-thusnelda/lib/cpu.h
   branches/theora-thusnelda/lib/dec/apiwrapper.c
   branches/theora-thusnelda/lib/dec/apiwrapper.h
   branches/theora-thusnelda/lib/dec/dct.h
   branches/theora-thusnelda/lib/dec/decapiwrapper.c
   branches/theora-thusnelda/lib/dec/decinfo.c
   branches/theora-thusnelda/lib/dec/decint.h
   branches/theora-thusnelda/lib/dec/decode.c
   branches/theora-thusnelda/lib/dec/dequant.c
   branches/theora-thusnelda/lib/dec/dequant.h
   branches/theora-thusnelda/lib/dec/fragment.c
   branches/theora-thusnelda/lib/dec/huffdec.c
   branches/theora-thusnelda/lib/dec/huffdec.h
   branches/theora-thusnelda/lib/dec/huffman.h
   branches/theora-thusnelda/lib/dec/idct.c
   branches/theora-thusnelda/lib/dec/idct.h
   branches/theora-thusnelda/lib/dec/info.c
   branches/theora-thusnelda/lib/dec/internal.c
   branches/theora-thusnelda/lib/dec/ocintrin.h
   branches/theora-thusnelda/lib/dec/quant.c
   branches/theora-thusnelda/lib/dec/quant.h
   branches/theora-thusnelda/lib/dec/state.c
   branches/theora-thusnelda/lib/dec/x86/mmxfrag.c
   branches/theora-thusnelda/lib/dec/x86/mmxidct.c
   branches/theora-thusnelda/lib/dec/x86/mmxstate.c
   branches/theora-thusnelda/lib/dec/x86/x86int.h
   branches/theora-thusnelda/lib/dec/x86/x86state.c
   branches/theora-thusnelda/lib/enc/dct_decode.c
   branches/theora-thusnelda/lib/enc/dsp.c
   branches/theora-thusnelda/lib/enc/encoder_quant.c
   branches/theora-thusnelda/lib/enc/encoder_toplevel.c
   branches/theora-thusnelda/lib/enc/mode.c
   branches/theora-thusnelda/lib/enc/x86/dct_decode_mmx.c
   branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c
   branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c
   branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c
   branches/theora-thusnelda/lib/enc/x86/idct_mmx.c
   branches/theora-thusnelda/lib/enc/x86/recon_mmx.c
   branches/theora-thusnelda/lib/internal.h
Log:
Forward-port changes from r14138 through r15674 from trunk.


Modified: branches/theora-thusnelda/examples/Makefile.am
===================================================================

--- branches/theora-thusnelda/examples/Makefile.am	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/examples/Makefile.am	2009-02-06 09:43:27 UTC (rev 15675)
@@ -9,6 +9,7 @@
 
 AM_CFLAGS = $(OGG_CFLAGS)
 LDADD = ../lib/libtheora.la $(OGG_LIBS)
+LDADDENC = ../lib/libtheoraenc.la ../lib/libtheoradec.la $(OGG_LIBS)
 
 dump_video_SOURCES = dump_video.c
 EXTRA_dump_video_SOURCES = getopt.c getopt1.c getopt.h
@@ -22,7 +23,7 @@
 encoder_example_SOURCES = encoder_example.c
 EXTRA_encoder_example_SOURCES = getopt.c getopt1.c getopt.h
 encoder_example_CFLAGS = $(OGG_CFLAGS) $(VORBIS_CFLAGS)
-encoder_example_LDADD = $(GETOPT_OBJS) $(LDADD) $(VORBIS_LIBS) $(VORBISENC_LIBS)
+encoder_example_LDADD = $(GETOPT_OBJS) $(LDADDENC) $(VORBIS_LIBS) $(VORBISENC_LIBS)
 encoder_example_DEPENDENCIES = $(GETOPT_OBJS)
 
 png2theora_SOURCES = png2theora.c

Modified: branches/theora-thusnelda/examples/dump_video.c
===================================================================
--- branches/theora-thusnelda/examples/dump_video.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/examples/dump_video.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -347,16 +347,16 @@
   /* install signal handler */
   signal (SIGINT, sigint_handler);
 
-  /* Finally the main decode loop. 
+  /* Finally the main decode loop.
 
-     It's one Theora packet per frame, so this is pretty 
+     It's one Theora packet per frame, so this is pretty
      straightforward if we're not trying to maintain sync
      with other multiplexed streams.
 
      the videobuf_ready flag is used to maintain the input
      buffer in the libogg stream state. If there's no output
      frame available at the end of the decode step, we must
-     need more input data. We could simplify this by just 
+     need more input data. We could simplify this by just
      using the return code on ogg_page_packetout(), but the
      flag system extends easily to the case were you care
      about more than one multiplexed stream (like with audio

Modified: branches/theora-thusnelda/examples/encoder_example.c
===================================================================
--- branches/theora-thusnelda/examples/encoder_example.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/examples/encoder_example.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -16,6 +16,9 @@
 
  ********************************************************************/
 
+#if !defined(_REENTRANT)
+#define _REENTRANT
+#endif
 #if !defined(_GNU_SOURCE)
 #define _GNU_SOURCE
 #endif
@@ -29,22 +32,8 @@
 #define _FILE_OFFSET_BITS 64
 #endif
 
-/* Define to give performance data win32 only*/
-//#define THEORA_PERF_DATA 
-#ifdef THEORA_PERF_DATA
-#include <windows.h>
-#endif
-
-#ifdef HAVE_CONFIG_H
-# include <config.h>
-#endif
-
-#ifndef _REENTRANT
-# define _REENTRANT
-#endif
-
 #include <stdio.h>
-#ifndef _WIN32
+#if !defined(_WIN32)
 #include <getopt.h>
 #include <unistd.h>
 #else
@@ -56,14 +45,14 @@
 #include <string.h>
 #include <time.h>
 #include <math.h>
-#include "theora/theora.h"
+#include "theora/theoraenc.h"
 #include "vorbis/codec.h"
 #include "vorbis/vorbisenc.h"
 
 #ifdef _WIN32
-/* supply missing headers and functions to Win32 */
-
+/*supply missing headers and functions to Win32. going to hell, I know*/
 #include <fcntl.h>
+#include <io.h>
 
 static double rint(double x)
 {
@@ -74,22 +63,18 @@
 }
 #endif
 
-const char *optstring = "b:e:o:a:A:v:V:s:S:f:F:n:m:k:";
+const char *optstring = "o:a:A:v:V:s:S:f:F:c";
 struct option options [] = {
-  {"begin-time",required_argument,NULL,'b'},
-  {"end-time",required_argument,NULL,'e'},
   {"output",required_argument,NULL,'o'},
   {"audio-rate-target",required_argument,NULL,'A'},
   {"video-rate-target",required_argument,NULL,'V'},
   {"audio-quality",required_argument,NULL,'a'},
   {"video-quality",required_argument,NULL,'v'},
-  {"aspect-numerator",optional_argument,NULL,'s'},
-  {"aspect-denominator",optional_argument,NULL,'S'},
-  {"framerate-numerator",optional_argument,NULL,'f'},
-  {"framerate-denominator",optional_argument,NULL,'F'},
-  {"noise-sensitivity",required_argument,NULL,'n'},
-  {"sharpness",required_argument,NULL,'m'},
-  {"keyframe-freq",required_argument,NULL,'k'},
+  {"aspect-numerator",required_argument,NULL,'s'},
+  {"aspect-denominator",required_argument,NULL,'S'},
+  {"framerate-numerator",required_argument,NULL,'f'},
+  {"framerate-denominator",required_argument,NULL,'F'},
+  {"vp3-compatible",no_argument,NULL,'c'},
   {NULL,0,NULL,0}
 };
 
@@ -103,29 +88,42 @@
 
 float audio_q=.1;
 int audio_r=-1;
+int vp3_compatible=0;
 
-int video_x=0;
-int video_y=0;
-int frame_x=0;
-int frame_y=0;
-int frame_x_offset=0;
-int frame_y_offset=0;
-int video_hzn=-1;
-int video_hzd=-1;
-int video_an=-1;
-int video_ad=-1;
+int frame_w=0;
+int frame_h=0;
+int pic_w=0;
+int pic_h=0;
+int pic_x=0;
+int pic_y=0;
+int video_fps_n=-1;
+int video_fps_d=-1;
+int video_par_n=-1;
+int video_par_d=-1;
+char interlace;
+int src_c_dec_h=2;
+int src_c_dec_v=2;
+int dst_c_dec_h=2;
+int dst_c_dec_v=2;
+char chroma_type[16];
 
+/*The size of each converted frame buffer.*/
+size_t y4m_dst_buf_sz;
+/*The amount to read directly into the converted frame buffer.*/
+size_t y4m_dst_buf_read_sz;
+/*The size of the auxiliary buffer.*/
+size_t y4m_aux_buf_sz;
+/*The amount to read into the auxiliary buffer.*/
+size_t y4m_aux_buf_read_sz;
+
+/*The function used to perform chroma conversion.*/
+typedef void (*y4m_convert_func)(unsigned char *_dst,unsigned char *_aux);
+
+y4m_convert_func y4m_convert=NULL;
+
 int video_r=-1;
-int video_q=16;
-int noise_sensitivity=1;
-int sharpness=0;
-int keyframe_frequency=64;
+int video_q=48;
 
-long begin_sec=-1;
-long begin_usec=0;
-long end_sec=-1;
-long end_usec=0;
-
 static void usage(void){
   fprintf(stderr,
           "Usage: encoder_example [options] [audio_file] video_file\n\n"
@@ -157,31 +155,496 @@
           "                                 from YUV input file. ex: 1000000\n"
           "                                 The frame rate nominator divided by this\n"
           "                                 determinates the frame rate in units per tick\n"
-          "   -n --noise-sensitivity <n>    Theora noise sensitivity selector from 0\n"
-          "                                 to 6 (0 yields best quality but larger\n"
-          "                                 files, defaults to 1)\n"
-          "   -m --sharpness <n>            Theora sharpness selector from 0 to 2\n"
-          "                                 (0 yields crispest video at the cost of\n"
-          "                                 larger files, selecting 2 can greatly\n"
-          "                                 reduce file size but resulting video\n"
-          "                                 is blurrier, defaults to 0)\n"
-          "   -k --keyframe-freq <n>        Keyframe frequency from 8 to 1000\n"
-	  "   -b --begin-time <h:m:s.f>     Begin encoding at offset into input\n"
-	  "   -e --end-time <h:m:s.f>       End encoding at offset into input\n"
           "encoder_example accepts only uncompressed RIFF WAV format audio and\n"
           "YUV4MPEG2 uncompressed video.\n\n");
   exit(1);
 }
 
+static int y4m_parse_tags(char *_tags){
+  int   got_w;
+  int   got_h;
+  int   got_fps;
+  int   got_interlace;
+  int   got_par;
+  int   got_chroma;
+  int   tmp_video_fps_n;
+  int   tmp_video_fps_d;
+  int   tmp_video_par_n;
+  int   tmp_video_par_d;
+  char *p;
+  char *q;
+  got_w=got_h=got_fps=got_interlace=got_par=got_chroma=0;
+  for(p=_tags;;p=q){
+    /*Skip any leading spaces.*/
+    while(*p==' ')p++;
+    /*If that's all we have, stop.*/
+    if(p[0]=='\0')break;
+    /*Find the end of this tag.*/
+    for(q=p+1;*q!='\0'&&*q!=' ';q++);
+    /*Process the tag.*/
+    switch(p[0]){
+      case 'W':{
+        if(sscanf(p+1,"%d",&pic_w)!=1)return -1;
+        got_w=1;
+      }break;
+      case 'H':{
+        if(sscanf(p+1,"%d",&pic_h)!=1)return -1;
+        got_h=1;
+      }break;
+      case 'F':{
+        if(sscanf(p+1,"%d:%d",&tmp_video_fps_n,&tmp_video_fps_d)!=2)return -1;
+        got_fps=1;
+      }break;
+      case 'I':{
+        interlace=p[1];
+        got_interlace=1;
+      }break;
+      case 'A':{
+        if(sscanf(p+1,"%d:%d",&tmp_video_par_n,&tmp_video_par_d)!=2)return -1;
+        got_par=1;
+      }break;
+      case 'C':{
+        if(q-p>16)return -1;
+        memcpy(chroma_type,p+1,q-p-1);
+        chroma_type[q-p-1]='\0';
+        got_chroma=1;
+      }break;
+      /*Ignore unknown tags.*/
+    }
+  }
+  if(!got_w||!got_h||!got_fps||!got_interlace||!got_par)return -1;
+  /*Chroma-type is not specified in older files, e.g., those generated by
+     mplayer.*/
+  if(!got_chroma)strcpy(chroma_type,"420");
+  /*Update fps and aspect ratio globals if not specified in the command line.*/
+  if(video_fps_n==-1)video_fps_n=tmp_video_fps_n;
+  if(video_fps_d==-1)video_fps_d=tmp_video_fps_d;
+  if(video_par_n==-1)video_par_n=tmp_video_par_n;
+  if(video_par_d==-1)video_par_d=tmp_video_par_d;
+  return 0;
+}
+
+/*All anti-aliasing filters in the following conversion functions are based on
+   one of two window functions:
+  The 6-tap Lanczos window (for down-sampling and shifts):
+   sinc(\pi*t)*sinc(\pi*t/3), |t|<3  (sinc(t)==sin(t)/t)
+   0,                         |t|>=3
+  The 4-tap Mitchell window (for up-sampling):
+   7|t|^3-12|t|^2+16/3,             |t|<1
+   -(7/3)|t|^3+12|t|^2-20|t|+32/3,  1<=|t|<2
+   0,                               |t|>=2
+  The number of taps is intentionally kept small to reduce computational
+   overhead and limit ringing.
+
+  The taps from these filters are scaled so that their sum is 1, and the result
+   is scaled by 128 and rounded to integers to create a filter whose
+   intermediate values fit inside 16 bits.
+  Coefficients are rounded in such a way as to ensure their sum is still 128,
+   which is usually equivalent to normal rounding.*/
+
+#define OC_MINI(_a,_b)      ((_a)>(_b)?(_b):(_a))
+#define OC_MAXI(_a,_b)      ((_a)<(_b)?(_b):(_a))
+#define OC_CLAMPI(_a,_b,_c) (OC_MAXI(_a,OC_MINI(_b,_c)))
+
+/*420jpeg chroma samples are sited like:
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+  420mpeg2 chroma samples are sited like:
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  BR      |       BR      |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  BR      |       BR      |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+  We use a resampling filter to shift the site locations one quarter pixel (at
+   the chroma plane's resolution) to the right.
+  The 4:2:2 modes look exactly the same, except there are twice as many chroma
+   lines, and they are vertically co-sited with the luma samples in both the
+   mpeg2 and jpeg cases (thus requiring no vertical resampling).*/
+static void y4m_convert_42xmpeg2_42xjpeg(unsigned char *_dst,
+ unsigned char *_aux){
+  int c_w;
+  int c_h;
+  int pli;
+  int y;
+  int x;
+  /*Skip past the luma data.*/
+  _dst+=pic_w*pic_h;
+  /*Compute the size of each chroma plane.*/
+  c_w=(pic_w+dst_c_dec_h-1)/dst_c_dec_h;
+  c_h=(pic_h+dst_c_dec_v-1)/dst_c_dec_v;
+  for(pli=1;pli<3;pli++){
+    for(y=0;y<c_h;y++){
+      /*Filter: [4 -17 114 35 -9 1]/128, derived from a 6-tap Lanczos
+         window.*/
+      for(x=0;x<OC_MINI(c_w,2);x++){
+        _dst[x]=(unsigned char)OC_CLAMPI(0,4*_aux[0]-17*_aux[OC_MAXI(x-1,0)]+
+         114*_aux[x]+35*_aux[OC_MINI(x+1,c_w-1)]-9*_aux[OC_MINI(x+2,c_w-1)]+
+         _aux[OC_MINI(x+3,c_w-1)]+64>>7,255);
+      }
+      for(;x<c_w-3;x++){
+        _dst[x]=(unsigned char)OC_CLAMPI(0,4*_aux[x-2]-17*_aux[x-1]+
+         114*_aux[x]+35*_aux[x+1]-9*_aux[x+2]+_aux[x+3]+64>>7,255);
+      }
+      for(;x<c_w;x++){
+        _dst[x]=(unsigned char)OC_CLAMPI(0,4*_aux[x-2]-17*_aux[x-1]+
+         114*_aux[x]+35*_aux[OC_MINI(x+1,c_w-1)]-9*_aux[OC_MINI(x+2,c_w-1)]+
+         _aux[c_w-1]+64>>7,255);
+      }
+      _dst+=c_w;
+      _aux+=c_w;
+    }
+  }
+}
+
+/*This format is only used for interlaced content, but is included for
+   completeness.
+
+  420jpeg chroma samples are sited like:
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |   BR  |       |   BR  |
+  |       |       |       |
+  Y-------Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+  420paldv chroma samples are sited like:
+  YR------Y-------YR------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YB------Y-------YB------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YR------Y-------YR------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YB------Y-------YB------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+  We use a resampling filter to shift the site locations one quarter pixel (at
+   the chroma plane's resolution) to the right.
+  Then we use another filter to move the C_r location down one quarter pixel,
+   and the C_b location up one quarter pixel.*/
+static void y4m_convert_42xpaldv_42xjpeg(unsigned char *_dst,
+ unsigned char *_aux){
+  unsigned char *tmp;
+  int            c_w;
+  int            c_h;
+  int            c_sz;
+  int            pli;
+  int            y;
+  int            x;
+  /*Skip past the luma data.*/
+  _dst+=pic_w*pic_h;
+  /*Compute the size of each chroma plane.*/
+  c_w=(pic_w+1)/2;
+  c_h=(pic_h+dst_c_dec_h-1)/dst_c_dec_h;
+  c_sz=c_w*c_h;
+  /*First do the horizontal re-sampling.
+    This is the same as the mpeg2 case, except that after the horizontal pass,
+     we need to apply a second, vertical filter.*/
+  tmp=_aux+2*c_sz;
+  for(pli=1;pli<3;pli++){
+    for(y=0;y<c_h;y++){
+      /*Filter: [4 -17 114 35 -9 1]/128, derived from a 6-tap Lanczos
+         window.*/
+      for(x=0;x<OC_MINI(c_w,2);x++){
+        tmp[x]=(unsigned char)OC_CLAMPI(0,4*_aux[0]-17*_aux[OC_MAXI(x-1,0)]+
+         114*_aux[x]+35*_aux[OC_MINI(x+1,c_w-1)]-9*_aux[OC_MINI(x+2,c_w-1)]+
+         _aux[OC_MINI(x+3,c_w-1)]+64>>7,255);
+      }
+      for(;x<c_w-3;x++){
+        tmp[x]=(unsigned char)OC_CLAMPI(0,4*_aux[x-2]-17*_aux[x-1]+
+         114*_aux[x]+35*_aux[x+1]-9*_aux[x+2]+_aux[x+3]+64>>7,255);
+      }
+      for(;x<c_w;x++){
+        tmp[x]=(unsigned char)OC_CLAMPI(0,4*_aux[x-2]-17*_aux[x-1]+
+         114*_aux[x]+35*_aux[OC_MINI(x+1,c_w-1)]-9*_aux[OC_MINI(x+2,c_w-1)]+
+         _aux[c_w-1]+64>>7,255);
+      }
+      tmp+=c_w;
+      _aux+=c_w;
+    }
+    switch(pli){
+      case 1:{
+        tmp-=c_sz;
+        /*Slide C_b up a quarter-pel.
+          This is the same filter used above, but in the other order.*/
+        for(x=0;x<c_w;x++){
+          for(y=0;y<OC_MINI(c_h,3);y++){
+            _dst[y*c_w]=(unsigned char)OC_CLAMPI(0,tmp[0]-
+             9*tmp[OC_MAXI(y-2,0)*c_w]+35*tmp[OC_MAXI(y-1,0)*c_w]+
+             114*tmp[y*c_w]-17*tmp[OC_MINI(y+1,c_h-1)*c_w]+
+             4*tmp[OC_MINI(y+2,c_h-1)*c_w]+64>>7,255);
+          }
+          for(;y<c_h-2;y++){
+            _dst[y*c_w]=(unsigned char)OC_CLAMPI(0,tmp[(y-3)*c_w]-
+             9*tmp[(y-2)*c_w]+35*tmp[(y-1)*c_w]+114*tmp[y*c_w]-
+             17*tmp[(y+1)*c_w]+4*tmp[(y+2)*c_w]+64>>7,255);
+          }
+          for(;y<c_h;y++){
+            _dst[y*c_w]=(unsigned char)OC_CLAMPI(0,tmp[(y-3)*c_w]-
+             9*tmp[(y-2)*c_w]+35*tmp[(y-1)*c_w]+114*tmp[y*c_w]-
+             17*tmp[OC_MINI(y+1,c_h-1)*c_w]+4*tmp[(c_h-1)*c_w]+64>>7,255);
+          }
+          _dst++;
+          tmp++;
+        }
+        _dst+=c_sz-c_w;
+        tmp-=c_w;
+      }break;
+      case 2:{
+        tmp-=c_sz;
+        /*Slide C_r down a quarter-pel.
+          This is the same as the horizontal filter.*/
+        for(x=0;x<c_w;x++){
+          for(y=0;y<OC_MINI(c_h,2);y++){
+            _dst[y*c_w]=(unsigned char)OC_CLAMPI(0,4*tmp[0]-
+             17*tmp[OC_MAXI(y-1,0)*c_w]+114*tmp[y*c_w]+
+             35*tmp[OC_MINI(y+1,c_h-1)*c_w]-9*tmp[OC_MINI(y+2,c_h-1)*c_w]+
+             tmp[OC_MINI(y+3,c_h-1)*c_w]+64>>7,255);
+          }
+          for(;y<c_h-3;y++){
+            _dst[y*c_w]=(unsigned char)OC_CLAMPI(0,4*tmp[(y-2)*c_w]-
+             17*tmp[(y-1)*c_w]+114*tmp[y*c_w]+35*tmp[(y+1)*c_w]-
+             9*tmp[(y+2)*c_w]+tmp[(y+3)*c_w]+64>>7,255);
+          }
+          for(;y<c_h;y++){
+            _dst[y*c_w]=(unsigned char)OC_CLAMPI(0,4*tmp[(y-2)*c_w]-
+             17*tmp[(y-1)*c_w]+114*tmp[y*c_w]+35*tmp[OC_MINI(y+1,c_h-1)*c_w]-
+             9*tmp[OC_MINI(y+2,c_h-1)*c_w]+tmp[(c_h-1)*c_w]+64>>7,255);
+          }
+          _dst++;
+          tmp++;
+        }
+      }break;
+    }
+    /*For actual interlaced material, this would have to be done separately on
+       each field, and the shift amounts would be different.
+      C_r moves down 1/8, C_b up 3/8 in the top field, and C_r moves down 3/8,
+       C_b up 1/8 in the bottom field.
+      The corresponding filters would be:
+       Down 1/8 (reverse order for up): [3 -11 125 15 -4 0]/128
+       Down 3/8 (reverse order for up): [4 -19 98 56 -13 2]/128*/
+  }
+}
+
+/*422jpeg chroma samples are sited like:
+  Y---BR--Y-------Y---BR--Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y---BR--Y-------Y---BR--Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y---BR--Y-------Y---BR--Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  Y---BR--Y-------Y---BR--Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+  411 chroma samples are sited like:
+  YBR-----Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YBR-----Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YBR-----Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+  YBR-----Y-------Y-------Y-------
+  |       |       |       |
+  |       |       |       |
+  |       |       |       |
+
+  We use a filter to resample at site locations one eighth pixel (at the source
+   chroma plane's horizontal resolution) and five eighths of a pixel to the
+   right.*/
+static void y4m_convert_411_422jpeg(unsigned char *_dst,
+ unsigned char *_aux){
+  int c_w;
+  int dst_c_w;
+  int c_h;
+  int pli;
+  int y;
+  int x;
+  /*Skip past the luma data.*/
+  _dst+=pic_w*pic_h;
+  /*Compute the size of each chroma plane.*/
+  c_w=(pic_w+src_c_dec_h-1)/src_c_dec_h;
+  dst_c_w=(pic_w+dst_c_dec_h-1)/dst_c_dec_h;
+  c_h=(pic_h+dst_c_dec_v-1)/dst_c_dec_v;
+  for(pli=1;pli<3;pli++){
+    for(y=0;y<c_h;y++){
+      /*Filters: [1 110 18 -1]/128 and [-3 50 86 -5]/128, both derived from a
+         4-tap Mitchell window.*/
+      for(x=0;x<OC_MINI(c_w,1);x++){
+        _dst[x<<1]=(unsigned char)OC_CLAMPI(0,111*_aux[0]+
+         18*_aux[OC_MINI(1,c_w-1)]-_aux[OC_MINI(2,c_w-1)]+64>>7,255);
+        _dst[x<<1|1]=(unsigned char)OC_CLAMPI(0,47*_aux[0]+
+         86*_aux[OC_MINI(1,c_w-1)]-5*_aux[OC_MINI(2,c_w-1)]+64>>7,255);
+      }
+      for(;x<c_w-2;x++){
+        _dst[x<<1]=(unsigned char)OC_CLAMPI(0,_aux[x-1]+110*_aux[x]+
+         18*_aux[x+1]-_aux[x+2]+64>>7,255);
+        _dst[x<<1|1]=(unsigned char)OC_CLAMPI(0,-3*_aux[x-1]+50*_aux[x]+
+         86*_aux[x+1]-5*_aux[x+2]+64>>7,255);
+      }
+      for(;x<c_w;x++){
+        _dst[x<<1]=(unsigned char)OC_CLAMPI(0,_aux[x-1]+110*_aux[x]+
+         18*_aux[OC_MINI(x+1,c_w-1)]-_aux[c_w-1]+64>>7,255);
+        if((x<<1|1)<dst_c_w){
+          _dst[x<<1|1]=(unsigned char)OC_CLAMPI(0,-3*_aux[x-1]+50*_aux[x]+
+           86*_aux[OC_MINI(x+1,c_w-1)]-5*_aux[c_w-1]+64>>7,255);
+        }
+      }
+      _dst+=dst_c_w;
+      _aux+=c_w;
+    }
+  }
+}
+
+/*The image is padded with empty chroma components at 4:2:0.
+  This costs about 17 bits a frame to code.*/
+static void y4m_convert_mono_420jpeg(unsigned char *_dst,
+ unsigned char *_aux){
+  int c_sz;
+  _dst+=pic_w*pic_h;
+  c_sz=((pic_w+dst_c_dec_h-1)/dst_c_dec_h)*((pic_h+dst_c_dec_v-1)/dst_c_dec_v);
+  memset(_dst,128,c_sz*2);
+}
+
+#if 0
+/*Right now just 444 to 420.
+  Not too hard to generalize.*/
+static void y4m_convert_4xxjpeg_42xjpeg(unsigned char *_dst,
+ unsigned char *_aux){
+  unsigned char *tmp;
+  int            c_w;
+  int            c_h;
+  int            pic_sz;
+  int            tmp_sz;
+  int            c_sz;
+  int            pli;
+  int            y;
+  int            x;
+  /*Compute the size of each chroma plane.*/
+  c_w=(pic_w+dst_c_dec_h-1)/dst_c_dec_h;
+  c_h=(pic_h+dst_c_dec_v-1)/dst_c_dec_v;
+  pic_sz=pic_w*pic_h;
+  tmp_sz=c_w*pic_h;
+  c_sz=c_w*c_h;
+  _dst+=pic_sz;
+  for(pli=1;pli<3;pli++){
+    tmp=_aux+pic_sz;
+    /*In reality, the horizontal and vertical steps could be pipelined, for
+       less memory consumption and better cache performance, but we do them
+       separately for simplicity.*/
+    /*First do horizontal filtering (convert to 4:2:2)*/
+    /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/
+    for(y=0;y<pic_h;y++){
+      for(x=0;x<OC_MINI(pic_w,2);x+=2){
+        tmp[x>>1]=OC_CLAMPI(0,64*_aux[0]+78*_aux[OC_MINI(1,pic_w-1)]-
+         17*_aux[OC_MINI(2,pic_w-1)]+3*_aux[OC_MINI(3,pic_w-1)]+64>>7,255);
+      }
+      for(;x<pic_w-3;x+=2){
+        tmp[x>>1]=OC_CLAMPI(0,3*(_aux[x-2]+_aux[x+3])-17*(_aux[x-1]+_aux[x+2])+
+         78*(_aux[x]+_aux[x+1])+64>>7,255);
+      }
+      for(;x<pic_w;x+=2){
+        tmp[x>>1]=OC_CLAMPI(0,3*(_aux[x-2]+_aux[pic_w-1])-
+         17*(_aux[x-1]+_aux[OC_MINI(x+2,pic_w-1)])+
+         78*(_aux[x]+_aux[OC_MINI(x+1,pic_w-1)])+64>>7,255);
+      }
+      tmp+=c_w;
+      _aux+=pic_w;
+    }
+    _aux-=pic_sz;
+    tmp-=tmp_sz;
+    /*Now do the vertical filtering.*/
+    for(x=0;x<c_w;x++){
+      for(y=0;y<OC_MINI(pic_h,2);y+=2){
+        _dst[(y>>1)*c_w]=OC_CLAMPI(0,64*tmp[0]+78*tmp[OC_MINI(1,pic_h-1)*c_w]-
+         17*tmp[OC_MINI(2,pic_h-1)*c_w]+3*tmp[OC_MINI(3,pic_h-1)*c_w]+
+         64>>7,255);
+      }
+      for(;y<pic_h-3;y+=2){
+        _dst[(y>>1)*c_w]=OC_CLAMPI(0,3*(tmp[(y-2)*c_w]+tmp[(y+3)*c_w])-
+         17*(tmp[(y-1)*c_w]+tmp[(y+2)*c_w])+78*(tmp[y*c_w]+tmp[(y+1)*c_w])+
+         64>>7,255);
+      }
+      for(;y<pic_h;y+=2){
+        _dst[(y>>1)*c_w]=OC_CLAMPI(0,3*(tmp[(y-2)*c_w]+tmp[(pic_h-1)*c_w])-
+         17*(tmp[(y-1)*c_w]+tmp[OC_MINI(y+2,pic_h-1)*c_w])+
+         78*(tmp[y*c_w]+tmp[OC_MINI(y+1,pic_h-1)*c_w])+64>>7,255);
+      }
+      tmp++;
+      _dst++;
+    }
+    _dst-=c_w;
+  }
+}
+#endif
+
+
+/*No conversion function needed.*/
+static void y4m_convert_null(unsigned char *_dst,
+ unsigned char *_aux){
+}
+
 static void id_file(char *f){
   FILE *test;
   unsigned char buffer[80];
   int ret;
-  int tmp_video_hzn = -1,
-      tmp_video_hzd = -1,
-      tmp_video_an = -1,
-      tmp_video_ad = -1;
-  int extra_hdr_bytes;
 
   /* open it, look for magic */
 
@@ -228,9 +691,6 @@
           ret=fread(buffer,1,20,test);
           if(ret<20)goto riff_err;
 
-          extra_hdr_bytes = (buffer[0]  + (buffer[1] << 8) +
-                            (buffer[2] << 16) + (buffer[3] << 24)) - 16;
-
           if(memcmp(buffer+4,"\001\000",2)){
             fprintf(stderr,"The WAV file %s is in a compressed format; "
                     "can't read it.\n",f);
@@ -247,18 +707,6 @@
             exit(1);
           }
 
-          /* read past extra header bytes */
-          while(extra_hdr_bytes){
-            int read_size = (extra_hdr_bytes > sizeof(buffer)) ?
-             sizeof(buffer) : extra_hdr_bytes;
-            ret = fread(buffer, 1, read_size, test);
-
-            if (ret < read_size)
-              goto riff_err;
-            else
-              extra_hdr_bytes -= read_size;
-          }
-
           /* Now, align things to the beginning of the data */
           /* Look for 'dataxxxx' */
           while(!feof(test)){
@@ -286,8 +734,7 @@
   if(!memcmp(buffer,"YUV4",4)){
     /* possible YUV2MPEG2 format file */
     /* read until newline, or 80 cols, whichever happens first */
-    /* NB the mjpegtools spec doesn't define a length limit */
-    int i,j;
+    int i;
     for(i=0;i<79;i++){
       ret=fread(buffer+i,1,1,test);
       if(ret<1)goto yuv_err;
@@ -299,7 +746,6 @@
     buffer[i]='\0';
 
     if(!memcmp(buffer,"MPEG",4)){
-      char interlace = '?';
 
       if(video){
         /* umm, we already have one */
@@ -311,72 +757,94 @@
         fprintf(stderr,"Incorrect YUV input file version; YUV4MPEG2 required.\n");
       }
 
-      /* parse the frame header */
-      j = 5;
-      while (j < i) {
-        if ((buffer[j] != ' ') && (buffer[j-1] == ' ')) 
-          switch (buffer[j]) {
-            case 'W': frame_x = atoi((char*)&buffer[j+1]); break;
-            case 'H': frame_y = atoi((char*)&buffer[j+1]); break;
-            case 'C': /* chroma subsampling */ break;
-            case 'I': interlace = buffer[j+1]; break;
-            case 'F': /* frame rate ratio */
-              tmp_video_hzn = atoi((char*)&buffer[j+1]);
-	      while ((buffer[j] != ':') && (j < i)) j++;
-              tmp_video_hzd = atoi((char*)&buffer[j+1]);
-              break;
-            case 'A': /* sample aspect ratio */
-              tmp_video_an = atoi((char*)&buffer[j+1]);
-	      while ((buffer[j] != ':') && (j < i)) j++;
-              tmp_video_ad = atoi((char*)&buffer[j+1]);
-              break;
-            case 'X': /* metadata */ break;
-            default:
-              fprintf(stderr, "unrecognized stream header tag '%c'\n", buffer[j]);
-              break;
-          }
-        j++;
-      }
-      /* verify data from the stream header */
-      if (frame_x <= 0) {
-        fprintf(stderr,"Error parsing YUV4MPEG2 header:"
-                " missing width tag in file %s.\n", f);
+      ret=y4m_parse_tags((char *)buffer+5);
+      if(ret<0){
+        fprintf(stderr,"Error parsing YUV4MPEG2 header in file %s.\n",f);
         exit(1);
       }
-      if (frame_y <= 0) {
-        fprintf(stderr,"Error parsing YUV4MPEG2 header:"
-                " missing height tag in file %s.\n", f);
+
+      if(interlace!='p'){
+        fprintf(stderr,"Input video is interlaced; Theora handles only progressive scan\n");
         exit(1);
       }
-      if (tmp_video_hzn < 0 || tmp_video_hzd < 0) {
-	/* default to 30 fps */
-	tmp_video_hzn = 30; tmp_video_hzd = 1;
-        fprintf(stderr,"Warning: no framerate defined in file %s.\n", f);
+
+      if(strcmp(chroma_type,"420")==0||strcmp(chroma_type,"420jpeg")==0){
+        src_c_dec_h=dst_c_dec_h=src_c_dec_v=dst_c_dec_v=2;
+        y4m_dst_buf_read_sz=pic_w*pic_h+2*((pic_w+1)/2)*((pic_h+1)/2);
+        y4m_aux_buf_sz=y4m_aux_buf_read_sz=0;
+        y4m_convert=y4m_convert_null;
       }
-      if (tmp_video_an < 0 || tmp_video_ad < 0) {
-	/* default to unknown */
-	tmp_video_an = 0; tmp_video_ad = 0;
+      else if(strcmp(chroma_type,"420mpeg2")==0){
+        src_c_dec_h=dst_c_dec_h=src_c_dec_v=dst_c_dec_v=2;
+        y4m_dst_buf_read_sz=pic_w*pic_h;
+        /*Chroma filter required: read into the aux buf first.*/
+        y4m_aux_buf_sz=y4m_aux_buf_read_sz=2*((pic_w+1)/2)*((pic_h+1)/2);
+        y4m_convert=y4m_convert_42xmpeg2_42xjpeg;
       }
-
-      /* update fps and aspect ratio globals if not specified in the command line */
-      if (video_hzn==-1) video_hzn = tmp_video_hzn;
-      if (video_hzd==-1) video_hzd = tmp_video_hzd;
-      if (video_an==-1) video_an = tmp_video_an;
-      if (video_ad==-1) video_ad = tmp_video_ad;
-
-      if(interlace=='?'){
-        fprintf(stderr,"Warning: input video isn't marked for interlacing;"
-          " treating this\nas progressive scan video."
-          " Deinterlace first if you get poor results.\n");
-      }else if(interlace!='p'){
-        fprintf(stderr,"Input video is interlaced; Theora handles only progressive scan\n");
+      else if(strcmp(chroma_type,"420paldv")==0){
+        src_c_dec_h=dst_c_dec_h=src_c_dec_v=dst_c_dec_v=2;
+        y4m_dst_buf_read_sz=pic_w*pic_h;
+        /*Chroma filter required: read into the aux buf first.
+          We need to make two filter passes, so we need some extra space in the
+           aux buffer.*/
+        y4m_aux_buf_sz=3*((pic_w+1)/2)*((pic_h+1)/2);
+        y4m_aux_buf_read_sz=2*((pic_w+1)/2)*((pic_h+1)/2);
+        y4m_convert=y4m_convert_42xpaldv_42xjpeg;
+      }
+      else if(strcmp(chroma_type,"422")==0){
+        src_c_dec_h=dst_c_dec_h=2;
+        src_c_dec_v=dst_c_dec_v=1;
+        y4m_dst_buf_read_sz=pic_w*pic_h;
+        /*Chroma filter required: read into the aux buf first.*/
+        y4m_aux_buf_sz=y4m_aux_buf_read_sz=2*((pic_w+1)/2)*pic_h;
+        y4m_convert=y4m_convert_42xmpeg2_42xjpeg;
+      }
+      else if(strcmp(chroma_type,"411")==0){
+        src_c_dec_h=4;
+        /*We don't want to introduce any additional sub-sampling, so we
+           promote 4:1:1 material to 4:2:2, as the closest format Theora can
+           handle.*/
+        dst_c_dec_h=2;
+        src_c_dec_v=dst_c_dec_v=1;
+        y4m_dst_buf_read_sz=pic_w*pic_h;
+        /*Chroma filter required: read into the aux buf first.*/
+        y4m_aux_buf_sz=y4m_aux_buf_read_sz=2*((pic_w+3)/4)*pic_h;
+        y4m_convert=y4m_convert_411_422jpeg;
+      }
+      else if(strcmp(chroma_type,"444")==0){
+        src_c_dec_h=dst_c_dec_h=src_c_dec_v=dst_c_dec_v=1;
+        y4m_dst_buf_read_sz=pic_w*pic_h*3;
+        y4m_aux_buf_sz=y4m_aux_buf_read_sz=0;
+        y4m_convert=y4m_convert_null;
+      }
+      else if(strcmp(chroma_type,"444alpha")==0){
+        src_c_dec_h=dst_c_dec_h=src_c_dec_v=dst_c_dec_v=1;
+        y4m_dst_buf_read_sz=pic_w*pic_h*3;
+        /*Read the extra alpha plane into the aux buf.
+          It will be discarded.*/
+        y4m_aux_buf_sz=y4m_aux_buf_read_sz=pic_w*pic_h;
+        y4m_convert=y4m_convert_null;
+      }
+      else if(strcmp(chroma_type,"mono")==0){
+        src_c_dec_h=src_c_dec_v=0;
+        dst_c_dec_h=dst_c_dec_v=2;
+        y4m_dst_buf_read_sz=pic_w*pic_h;
+        y4m_aux_buf_sz=y4m_aux_buf_read_sz=0;
+        y4m_convert=y4m_convert_mono_420jpeg;
+      }
+      else{
+        fprintf(stderr,"Unknown chroma sampling type: %s\n",chroma_type);
         exit(1);
       }
+      /*The size of the final frame buffers is always computed from the
+         destination chroma decimation type.*/
+      y4m_dst_buf_sz=pic_w*pic_h+2*((pic_w+dst_c_dec_h-1)/dst_c_dec_h)*
+       ((pic_h+dst_c_dec_v-1)/dst_c_dec_v);
 
       video=test;
 
-      fprintf(stderr,"File %s is %dx%d %.02f fps YUV12 video.\n",
-              f,frame_x,frame_y,(double)video_hzn/video_hzd);
+      fprintf(stderr,"File %s is %dx%d %.02f fps %s video.\n",
+              f,pic_w,pic_h,(double)video_fps_n/video_fps_d,chroma_type);
 
       return;
     }
@@ -406,11 +874,8 @@
                             vorbis_dsp_state *vd,
                             vorbis_block *vb,
                             int audioflag){
-  static ogg_int64_t samples_sofar=0;
   ogg_packet op;
   int i,j;
-  ogg_int64_t beginsample = audio_hz*begin_sec + audio_hz*begin_usec/1000000;
-  ogg_int64_t endsample = audio_hz*end_sec + audio_hz*end_usec/1000000;
 
   while(audio && !audioflag){
     /* process any audio already buffered */
@@ -421,63 +886,43 @@
     {
       /* read and process more audio */
       signed char readbuffer[4096];
-      signed char *readptr=readbuffer;
       int toread=4096/2/audio_ch;
       int bytesread=fread(readbuffer,1,toread*2*audio_ch,audio);
       int sampread=bytesread/2/audio_ch;
       float **vorbis_buffer;
       int count=0;
 
-      if(bytesread<=0 || 
-	 (samples_sofar>=endsample && endsample>0)){
+      if(bytesread<=0){
         /* end of file.  this can be done implicitly, but it's
            easier to see here in non-clever fashion.  Tell the
            library we're at end of stream so that it can handle the
            last frame and mark end of stream in the output properly */
         vorbis_analysis_wrote(vd,0);
       }else{
-	if(samples_sofar < beginsample){
-	  if(samples_sofar+sampread > beginsample){
-	    readptr += (beginsample-samples_sofar)*2*audio_ch;
-	    sampread += samples_sofar-beginsample;
-	    samples_sofar = sampread+beginsample;
-	  }else{
-	    samples_sofar += sampread;
-	    sampread = 0;
-	  }
-	}else{
-	  samples_sofar += sampread;
-	}
+        vorbis_buffer=vorbis_analysis_buffer(vd,sampread);
+        /* uninterleave samples */
+        for(i=0;i<sampread;i++){
+          for(j=0;j<audio_ch;j++){
+            vorbis_buffer[j][i]=((readbuffer[count+1]<<8)|
+                                 (0x00ff&(int)readbuffer[count]))/32768.f;
+            count+=2;
+          }
+        }
 
-	if(samples_sofar > endsample && endsample > 0)
-	  sampread-= (samples_sofar - endsample);
-	
-	if(sampread>0){
+        vorbis_analysis_wrote(vd,sampread);
 
-	  vorbis_buffer=vorbis_analysis_buffer(vd,sampread);
-	  /* uninterleave samples */
-	  for(i=0;i<sampread;i++){
-	    for(j=0;j<audio_ch;j++){
-	      vorbis_buffer[j][i]=((readptr[count+1]<<8)|
-				   (0x00ff&(int)readptr[count]))/32768.f;
-	      count+=2;
-	    }
-	  }
-        
-	  vorbis_analysis_wrote(vd,sampread);
-        }
       }
 
       while(vorbis_analysis_blockout(vd,vb)==1){
-        
+
         /* analysis, assume we want to use bitrate management */
         vorbis_analysis(vb,NULL);
         vorbis_bitrate_addblock(vb);
-        
+
         /* weld packets into the bitstream */
         while(vorbis_bitrate_flushpacket(vd,&op))
           ogg_stream_packetin(vo,&op);
-        
+
       }
     }
   }
@@ -487,31 +932,34 @@
 
 int fetch_and_process_video(FILE *video,ogg_page *videopage,
                             ogg_stream_state *to,
-                            theora_state *td,
+                            th_enc_ctx *td,
                             int videoflag){
   /* You'll go to Hell for using static variables */
-  static ogg_int64_t frames=0;
-  static int          state=-1;
-  static unsigned char *yuvframe[2];
-  unsigned char        *line;
-  yuv_buffer          yuv;
-  ogg_packet          op;
-  int e;
-  ogg_int64_t beginframe = (video_hzn*begin_sec + video_hzn*begin_usec/1000000)/video_hzd;
-  ogg_int64_t endframe = (video_hzn*end_sec + video_hzn*end_usec/1000000)/video_hzd;
+  static int                 state=-1;
+  static unsigned char      *yuvframe[3];
+  static th_ycbcr_buffer     ycbcr;
+  ogg_packet                 op;
+  int                        pic_sz;
+  int                        frame_c_w;
+  int                        frame_c_h;
+  int                        c_w;
+  int                        c_h;
+  int                        c_sz;
+  int                        i;
 
+  pic_sz=pic_w*pic_h;
+  frame_c_w=frame_w/dst_c_dec_h;
+  frame_c_h=frame_h/dst_c_dec_v;
+  c_w=(pic_w+dst_c_dec_h-1)/dst_c_dec_h;
+  c_h=(pic_h+dst_c_dec_v-1)/dst_c_dec_v;
+  c_sz=c_w*c_h;
+
   if(state==-1){
         /* initialize the double frame buffer */
-    yuvframe[0]=malloc(video_x*video_y*3/2);
-    yuvframe[1]=malloc(video_x*video_y*3/2);
+    yuvframe[0]=(unsigned char *)malloc(y4m_dst_buf_sz);
+    yuvframe[1]=(unsigned char *)malloc(y4m_dst_buf_sz);
+    yuvframe[2]=(unsigned char *)malloc(y4m_aux_buf_sz);
 
-        /* clear initial frame as it may be larger than actual video data */
-        /* fill Y plane with 0x10 and UV planes with 0X80, for black data */
-    memset(yuvframe[0],0x10,video_x*video_y);
-    memset(yuvframe[0]+video_x*video_y,0x80,video_x*video_y/2);
-    memset(yuvframe[1],0x10,video_x*video_y);
-    memset(yuvframe[1]+video_x*video_y,0x80,video_x*video_y/2);
-
     state=0;
   }
 
@@ -533,11 +981,11 @@
          proceeding.  after first pass and until eos, one will
          always be full when we get here */
 
-      for(;state<2 && (frames<endframe || endframe<0);){
+      for(i=state;i<2;i++){
         char c,frame[6];
         int ret=fread(frame,1,6,video);
-        
-	/* match and skip the frame header */
+
+        /* match and skip the frame header */
         if(ret<6)break;
         if(memcmp(frame,"FRAME",5)){
           fprintf(stderr,"Loss of framing in YUV input data\n");
@@ -552,35 +1000,21 @@
             exit(1);
           }
         }
-
-        /* read the Y plane into our frame buffer with centering */
-        line=yuvframe[state]+video_x*frame_y_offset+frame_x_offset;
-        for(e=0;e<frame_y;e++){
-          ret=fread(line,1,frame_x,video);
-            if(ret!=frame_x) break;
-          line+=video_x;
+        /*Read the frame data that needs no conversion.*/
+        if(fread(yuvframe[i],1,y4m_dst_buf_read_sz,video)!=
+         y4m_dst_buf_read_sz){
+          fprintf(stderr,"Error reading YUV frame data.\n");
+          exit(1);
         }
-        /* now get U plane*/
-        line=yuvframe[state]+(video_x*video_y)
-          +(video_x/2)*(frame_y_offset/2)+frame_x_offset/2;
-        for(e=0;e<frame_y/2;e++){
-          ret=fread(line,1,frame_x/2,video);
-            if(ret!=frame_x/2) break;
-          line+=video_x/2;
+        /*Read the frame data that does need conversion.*/
+        if(fread(yuvframe[2],1,y4m_aux_buf_read_sz,video)!=
+         y4m_aux_buf_read_sz){
+          fprintf(stderr,"Error reading YUV frame data.\n");
+          exit(1);
         }
-        /* and the V plane*/
-        line=yuvframe[state]+(video_x*video_y*5/4)
-                  +(video_x/2)*(frame_y_offset/2)+frame_x_offset/2;
-        for(e=0;e<frame_y/2;e++){
-          ret=fread(line,1,frame_x/2,video);
-            if(ret!=frame_x/2) break;
-          line+=video_x/2;
-        }
-
-	frames++;
-	if(frames>=beginframe)
-	  state++;
-	
+        /*Now convert the just read frame.*/
+        (*y4m_convert)(yuvframe[i],yuvframe[2]);
+        state++;
       }
 
       if(state<1){
@@ -592,30 +1026,33 @@
       /* Theora is a one-frame-in,one-frame-out system; submit a frame
          for compression and pull out the packet */
 
-      {
-        yuv.y_width=video_x;
-        yuv.y_height=video_y;
-        yuv.y_stride=video_x;
+      /*We submit the buffer to the library as if it were padded, but we do not
+         actually allocate space for the padding.
+        This is okay, because the library will never read data from the padded
+         region.
+        This is only currently true of the experimental encoder; do NOT do this
+         with the reference encoder.*/
+      ycbcr[0].width=frame_w;
+      ycbcr[0].height=frame_h;
+      ycbcr[0].stride=pic_w;
+      ycbcr[0].data=yuvframe[0]-pic_x-pic_y*pic_w;
+      ycbcr[1].width=frame_c_w;
+      ycbcr[1].height=frame_c_h;
+      ycbcr[1].stride=c_w;
+      ycbcr[1].data=yuvframe[0]+pic_sz-(pic_x/dst_c_dec_h)-
+       (pic_y/dst_c_dec_v)*c_w;
+      ycbcr[2].width=frame_c_w;
+      ycbcr[2].height=frame_c_h;
+      ycbcr[2].stride=c_w;
+      ycbcr[2].data=ycbcr[1].data+c_sz;
 
-        yuv.uv_width=video_x/2;
-        yuv.uv_height=video_y/2;
-        yuv.uv_stride=video_x/2;
+      th_encode_ycbcr_in(td,ycbcr);
 
-        yuv.y= yuvframe[0];
-        yuv.u= yuvframe[0]+ video_x*video_y;
-        yuv.v= yuvframe[0]+ video_x*video_y*5/4 ;
+      /* if there's only one frame, it's the last in the stream */
+      while(th_encode_packetout(td,state<2,&op)){
+        ogg_stream_packetin(to,&op);
       }
 
-      theora_encode_YUVin(td,&yuv);
-
-      /* if there's only one frame, it's the last in the stream */
-      if(state<2)
-        theora_encode_packetout(td,1,&op);
-      else
-        theora_encode_packetout(td,0,&op);
-
-      ogg_stream_packetin(to,&op);
-
       {
         unsigned char *temp=yuvframe[0];
         yuvframe[0]=yuvframe[1];
@@ -628,7 +1065,7 @@
   return videoflag;
 }
 
-int main(int argc,char *const *argv){
+int main(int argc,char *argv[]){
   int c,long_option_index,ret;
 
   ogg_stream_state to; /* take physical pages, weld into a logical
@@ -638,9 +1075,9 @@
   ogg_page         og; /* one Ogg bitstream page.  Vorbis packets are inside */
   ogg_packet       op; /* one raw packet of data for decode */
 
-  theora_state     td;
-  theora_info      ti;
-  theora_comment   tc;
+  th_enc_ctx      *td;
+  th_info          ti;
+  th_comment       tc;
 
   vorbis_info      vi; /* struct that stores all the static vorbis bitstream
                           settings */
@@ -658,30 +1095,15 @@
   ogg_int64_t video_bytesout=0;
   double timebase;
 
-
   FILE *outfile = stdout;
 
-#ifdef _WIN32 
-# ifdef THEORA_PERF_DATA
-    LARGE_INTEGER start_time;
-    LARGE_INTEGER final_time;
-
-    LONGLONG elapsed_ticks;
-    LARGE_INTEGER ticks_per_second;
-    
-    LONGLONG elapsed_secs;
-    LONGLONG elapsed_sec_mod;
-    double elapsed_secs_dbl ;
-# endif
-  /* We need to set stdin/stdout to binary mode. Damn windows. */
+#ifdef _WIN32 /* We need to set stdin/stdout to binary mode. Damn windows. */
   /* if we were reading/writing a file, it would also need to in
      binary mode, eg, fopen("file.wav","wb"); */
   /* Beware the evil ifdef. We avoid these where we can, but this one we
      cannot. Don't add any more, you'll probably go to hell if you do. */
   _setmode( _fileno( stdin ), _O_BINARY );
   _setmode( _fileno( stdout ), _O_BINARY );
-
-
 #endif
 
   while((c=getopt_long(argc,argv,optstring,options,&long_option_index))!=EOF){
@@ -695,7 +1117,7 @@
       break;;
 
     case 'a':
-      audio_q=atof(optarg)*.099;
+      audio_q=(float)(atof(optarg)*.099);
       if(audio_q<-.1 || audio_q>1){
         fprintf(stderr,"Illegal audio quality (choose -1 through 10)\n");
         exit(1);
@@ -704,7 +1126,7 @@
       break;
 
     case 'v':
-      video_q=rint(atof(optarg));
+      video_q=(int)rint(atof(optarg));
       if(video_q<0 || video_q>63){
         fprintf(stderr,"Illegal video quality (choose 0 through 10)\n");
         exit(1);
@@ -713,7 +1135,7 @@
       break;
 
     case 'A':
-      audio_r=atof(optarg)*1000;
+      audio_r=(int)(atof(optarg)*1000);
       if(audio_q<0){
         fprintf(stderr,"Illegal audio quality (choose > 0 please)\n");
         exit(1);
@@ -722,7 +1144,7 @@
       break;
 
     case 'V':
-      video_r=rint(atof(optarg)*1000);
+      video_r=(int)rint(atof(optarg)*1000);
       if(video_r<0){
         fprintf(stderr,"Illegal video bitrate (choose > 0 please)\n");
         exit(1);
@@ -734,91 +1156,25 @@
      break;
 
     case 's':
-      video_an=rint(atof(optarg));
+      video_par_n=(int)rint(atof(optarg));
       break;
 
     case 'S':
-      video_ad=rint(atof(optarg));
+      video_par_d=(int)rint(atof(optarg));
       break;
 
     case 'f':
-      video_hzn=rint(atof(optarg));
+      video_fps_n=(int)rint(atof(optarg));
       break;
 
     case 'F':
-      video_hzd=rint(atof(optarg));
+      video_fps_d=(int)rint(atof(optarg));
       break;
 
-    case 'n':
-      noise_sensitivity=rint(atof(optarg));
-      if(noise_sensitivity<0 || noise_sensitivity>6){
-        fprintf(stderr,"Illegal noise sensitivity (choose 0 through 6)\n");
-        exit(1);
-      }
+    case 'c':
+      vp3_compatible=1;
       break;
 
-    case 'm':
-      sharpness=rint(atof(optarg));
-      if(sharpness<0 || sharpness>2){
-        fprintf(stderr,"Illegal sharpness (choose 0 through 2)\n");
-        exit(1);
-      }
-      break;
-
-    case 'k':
-      keyframe_frequency=rint(atof(optarg));
-      if(keyframe_frequency<8 || keyframe_frequency>1000){
-        fprintf(stderr,"Illegal keyframe frequency (choose 8 through 1000)\n");
-        exit(1);
-      }
-      break;
-
-    case 'b':
-      {
-	char *pos=strchr(optarg,':');
-	begin_sec=atol(optarg);
-	if(pos){
-	  char *pos2=strchr(++pos,':');
-	  begin_sec*=60;
-	  begin_sec+=atol(pos);
-	  if(pos2){
-	    pos2++;
-	    begin_sec*=60;
-	    begin_sec+=atol(pos2);
-	  }else{
-	    pos2=pos;
-	  }
-	  pos2=strchr(pos2,'.');
-	  if(pos2){
-	    pos2++;
-	    begin_usec=atol(pos2);
-	  }
-	}
-      }
-      break;
-    case 'e':
-      {
-	char *pos=strchr(optarg,':');
-	end_sec=atol(optarg);
-	if(pos){
-	  char *pos2=strchr(++pos,':');
-	  end_sec*=60;
-	  end_sec+=atol(pos);
-	  if(pos2){
-	    pos2++;
-	    end_sec*=60;
-	    end_sec+=atol(pos2);
-	  }else{
-	    pos2=pos;
-	  }
-	  pos2=strchr(pos2,'.');
-	  if(pos2){
-	    pos2++;
-	    end_usec=atol(pos2);
-	  }
-	}
-      }
-      break;
     default:
       usage();
     }
@@ -830,72 +1186,63 @@
     optind++;
   }
 
-
-
-#ifdef THEORA_PERF_DATA
-# ifdef WIN32
-    QueryPerformanceCounter(&start_time);
-# endif
-#endif
-
-
   /* yayness.  Set up Ogg output stream */
   srand(time(NULL));
-  {
-    /* need two inequal serial numbers */
-    int serial1, serial2;
-    serial1 = rand();
-    serial2 = rand();
-    if (serial1 == serial2) serial2++;
-    ogg_stream_init(&to,serial1);
-    ogg_stream_init(&vo,serial2);
-  }
+  if(audio)ogg_stream_init(&vo,rand());
+  ogg_stream_init(&to,rand()); /* oops, add one ot the above */
 
   /* Set up Theora encoder */
   if(!video){
     fprintf(stderr,"No video files submitted for compression?\n");
     exit(1);
   }
-  /* Theora has a divisible-by-sixteen restriction for the encoded video size */
-  /* scale the frame size up to the nearest /16 and calculate offsets */
-  video_x=((frame_x + 15) >>4)<<4;
-  video_y=((frame_y + 15) >>4)<<4;
-  /* We force the offset to be even.
-     This ensures that the chroma samples align properly with the luma
-      samples. */
-  frame_x_offset=((video_x-frame_x)/2)&~1;
-  frame_y_offset=((video_y-frame_y)/2)&~1;
+  /* Theora has a divisible-by-sixteen restriction for the encoded frame size */
+  /* scale the picture size up to the nearest /16 and calculate offsets */
+  frame_w=pic_w+15&~0xF;
+  frame_h=pic_h+15&~0xF;
+  /*Force the offsets to be even so that chroma samples line up like we
+     expect.*/
+  pic_x=frame_w-pic_w>>1&~1;
+  pic_y=frame_h-pic_h>>1&~1;
 
-  theora_info_init(&ti);
-  ti.width=video_x;
-  ti.height=video_y;
-  ti.frame_width=frame_x;
-  ti.frame_height=frame_y;
-  ti.offset_x=frame_x_offset;
-  ti.offset_y=frame_y_offset;
-  ti.fps_numerator=video_hzn;
-  ti.fps_denominator=video_hzd;
-  ti.aspect_numerator=video_an;
-  ti.aspect_denominator=video_ad;
-  ti.colorspace=OC_CS_UNSPECIFIED;
-  ti.pixelformat=OC_PF_420;
+  th_info_init(&ti);
+  ti.frame_width=frame_w;
+  ti.frame_height=frame_h;
+  ti.pic_width=pic_w;
+  ti.pic_height=pic_h;
+  ti.pic_x=pic_x;
+  ti.pic_y=pic_y;
+  ti.fps_numerator=video_fps_n;
+  ti.fps_denominator=video_fps_d;
+  ti.aspect_numerator=video_par_n;
+  ti.aspect_denominator=video_par_d;
+  ti.colorspace=TH_CS_UNSPECIFIED;
   ti.target_bitrate=video_r;
   ti.quality=video_q;
+  ti.keyframe_granule_shift=6;
 
-  ti.dropframes_p=0;
-  ti.quick_p=1;
-  ti.keyframe_auto_p=1;
-  ti.keyframe_frequency=keyframe_frequency;
-  ti.keyframe_frequency_force=keyframe_frequency;
-  ti.keyframe_data_target_bitrate=video_r*1.5;
-  ti.keyframe_auto_threshold=80;
-  ti.keyframe_mindistance=8;
-  ti.noise_sensitivity=noise_sensitivity;
-  ti.sharpness=sharpness;
+  if(dst_c_dec_h==2){
+    if(dst_c_dec_v==2)ti.pixel_fmt=TH_PF_420;
+    else ti.pixel_fmt=TH_PF_422;
+  }
+  else ti.pixel_fmt=TH_PF_444;
 
-  theora_encode_init(&td,&ti);
-  theora_info_clear(&ti);
+  td=th_encode_alloc(&ti);
+  th_info_clear(&ti);
 
+  if(vp3_compatible){
+    ret=th_encode_ctl(td,TH_ENCCTL_SET_VP3_COMPATIBLE,&vp3_compatible,
+     sizeof(vp3_compatible));
+    if(ret<0||!vp3_compatible){
+      fprintf(stderr,"Could not enable strict VP3 compatibility.\n");
+      if(ret>=0){
+        fprintf(stderr,"Ensure your source format is supported by VP3.\n");
+        fprintf(stderr,
+         "(4:2:0 pixel format, width and height multiples of 16).\n");
+      }
+    }
+  }
+
   /* initialize Vorbis too, assuming we have audio to compress. */
   if(audio){
     vorbis_info_init(&vi);
@@ -916,8 +1263,13 @@
 
   /* write the bitstream header packets with proper page interleave */
 
+  th_comment_init(&tc);
+
   /* first packet will get its own page automatically */
-  theora_encode_header(&td,&op);
+  if(th_encode_flushheader(td,&tc,&op)<=0){
+    fprintf(stderr,"Internal Theora library error.\n");
+    exit(1);
+  }
   ogg_stream_packetin(&to,&op);
   if(ogg_stream_pageout(&to,&og)!=1){
     fprintf(stderr,"Internal Ogg library error.\n");
@@ -927,17 +1279,15 @@
   fwrite(og.body,1,og.body_len,outfile);
 
   /* create the remaining theora headers */
-  theora_comment_init(&tc);
-  theora_encode_comment(&tc,&op);
-  ogg_stream_packetin(&to,&op);
-  /*theora_encode_comment() doesn't take a theora_state parameter, so it has to
-     allocate its own buffer to pass back the packet data.
-    If we don't free it here, we'll leak.
-    libogg2 makes this much cleaner: the stream owns the buffer after you call
-     packetin in libogg2, but this is not true in libogg1.*/
-  free(op.packet);
-  theora_encode_tables(&td,&op);
-  ogg_stream_packetin(&to,&op);
+  for(;;){
+    ret=th_encode_flushheader(td,&tc,&op);
+    if(ret<0){
+      fprintf(stderr,"Internal Theora library error.\n");
+      exit(1);
+    }
+    else if(!ret)break;
+    ogg_stream_packetin(&to,&op);
+  }
 
   if(audio){
     ogg_packet header;
@@ -962,7 +1312,7 @@
   /* Flush the rest of our headers. This ensures
      the actual data in each stream will start
      on a new page, as per spec. */
-  while(1){
+  for(;;){
     int result = ogg_stream_flush(&to,&og);
       if(result<0){
         /* can't get here */
@@ -974,7 +1324,7 @@
     fwrite(og.body,1,og.body_len,outfile);
   }
   if(audio){
-    while(1){
+    for(;;){
       int result=ogg_stream_flush(&vo,&og);
       if(result<0){
         /* can't get here */
@@ -989,7 +1339,7 @@
 
   /* setup complete.  Raw processing loop */
   fprintf(stderr,"Compressing....\n");
-  while(1){
+  for(;;){
     ogg_page audiopage;
     ogg_page videopage;
 
@@ -997,7 +1347,7 @@
     audioflag=fetch_and_process_audio(audio,&audiopage,&vo,&vd,&vb,audioflag);
 
     /* is there a video page flushed?  If not, fetch one if possible */
-    videoflag=fetch_and_process_video(video,&videopage,&to,&td,videoflag);
+    videoflag=fetch_and_process_video(video,&videopage,&to,td,videoflag);
 
     /* no pages of either?  Must be end of stream. */
     if(!audioflag && !videoflag)break;
@@ -1009,7 +1359,7 @@
       double audiotime=
         audioflag?vorbis_granule_time(&vd,ogg_page_granulepos(&audiopage)):-1;
       double videotime=
-        videoflag?theora_granule_time(&td,ogg_page_granulepos(&videopage)):-1;
+        videoflag?th_granule_time(td,ogg_page_granulepos(&videopage)):-1;
 
       if(!audioflag){
         audio_or_video=1;
@@ -1028,7 +1378,7 @@
         video_bytesout+=fwrite(videopage.body,1,videopage.body_len,outfile);
         videoflag=0;
         timebase=videotime;
-        
+
       }else{
         /* flush an audio page */
         audio_bytesout+=fwrite(audiopage.header,1,audiopage.header_len,outfile);
@@ -1036,18 +1386,18 @@
         audioflag=0;
         timebase=audiotime;
       }
-
-      if(timebase!=-1.){
-        int hundredths=timebase*100-(long)timebase*100;
+      if(timebase > 0)
+      {
+        int hundredths=(int)(timebase*100-(long)timebase*100);
         int seconds=(long)timebase%60;
         int minutes=((long)timebase/60)%60;
         int hours=(long)timebase/3600;
-        
+
         if(audio_or_video)
-          vkbps=rint(video_bytesout*8./timebase*.001);
+          vkbps=(int)rint(video_bytesout*8./timebase*.001);
         else
-          akbps=rint(audio_bytesout*8./timebase*.001);
-        
+          akbps=(int)rint(audio_bytesout*8./timebase*.001);
+
         fprintf(stderr,
                 "\r      %d:%02d:%02d.%02d audio: %dkbps video: %dkbps                 ",
                 hours,minutes,seconds,hundredths,akbps,vkbps);
@@ -1064,33 +1414,19 @@
     vorbis_dsp_clear(&vd);
     vorbis_comment_clear(&vc);
     vorbis_info_clear(&vi);
+    if(audio!=stdin)fclose(audio);
   }
   if(video){
     ogg_stream_clear(&to);
-    theora_clear(&td);
+    th_encode_free(td);
+    th_comment_clear(&tc);
+    if(video!=stdin)fclose(video);
   }
 
   if(outfile && outfile!=stdout)fclose(outfile);
 
   fprintf(stderr,"\r   \ndone.\n\n");
 
-#ifdef THEORA_PERF_DATA
-# ifdef WIN32
-    QueryPerformanceCounter(&final_time);
-    elapsed_ticks = final_time.QuadPart - start_time.QuadPart;
-    ticks_per_second;
-    QueryPerformanceFrequency(&ticks_per_second);
-    elapsed_secs = elapsed_ticks / ticks_per_second.QuadPart;
-    elapsed_sec_mod = elapsed_ticks % ticks_per_second.QuadPart;
-    elapsed_secs_dbl = elapsed_secs;
-    elapsed_secs_dbl += ((double)elapsed_sec_mod / (double)ticks_per_second.QuadPart);
-    printf("Encode time = %lld ticks\n", elapsed_ticks);
-    printf("~%lld and %lld / %lld seconds\n", elapsed_secs, elapsed_sec_mod, ticks_per_second.QuadPart);
-    printf("~%Lf seconds\n", elapsed_secs_dbl);
-# endif
-
-#endif 
-
   return(0);
 
 }

Modified: branches/theora-thusnelda/examples/player_example.c
===================================================================
--- branches/theora-thusnelda/examples/player_example.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/examples/player_example.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -304,19 +304,23 @@
 }
 
 static void open_video(void){
+  int w;
+  int h;
+  w=(ti.offset_x+ti.frame_width+1&~1)-(ti.offset_x&~1);
+  h=(ti.offset_y+ti.frame_height+1&~1)-(ti.offset_y&~1);
   if ( SDL_Init(SDL_INIT_VIDEO) < 0 ) {
     fprintf(stderr, "Unable to init SDL: %s\n", SDL_GetError());
     exit(1);
   }
 
-  screen = SDL_SetVideoMode(ti.frame_width, ti.frame_height, 0, SDL_SWSURFACE);
+  screen = SDL_SetVideoMode(w, h, 0, SDL_SWSURFACE);
   if ( screen == NULL ) {
     fprintf(stderr, "Unable to set %dx%d video: %s\n",
-            ti.frame_width,ti.frame_height,SDL_GetError());
+            w,h,SDL_GetError());
     exit(1);
   }
 
-  yuv_overlay = SDL_CreateYUVOverlay(ti.frame_width, ti.frame_height,
+  yuv_overlay = SDL_CreateYUVOverlay(w, h,
                                      SDL_YV12_OVERLAY,
                                      screen);
   if ( yuv_overlay == NULL ) {
@@ -326,8 +330,8 @@
   }
   rect.x = 0;
   rect.y = 0;
-  rect.w = ti.frame_width;
-  rect.h = ti.frame_height;
+  rect.w = w;
+  rect.h = h;
 
   SDL_DisplayYUVOverlay(yuv_overlay, &rect);
 }
@@ -349,7 +353,7 @@
   /* reverse u and v for SDL */
   /* and crop input properly, respecting the encoded frame rect */
   /* problems may exist for odd frame rect for some encodings */
-  crop_offset=ti.offset_x+yuv.y_stride*ti.offset_y;
+  crop_offset=(ti.offset_x&~1)+yuv.y_stride*(ti.offset_y&~1);
   for(i=0;i<yuv_overlay->h;i++)
     memcpy(yuv_overlay->pixels[0]+yuv_overlay->pitches[0]*i,
            yuv.y+crop_offset+yuv.y_stride*i,

Modified: branches/theora-thusnelda/examples/png2theora.c
===================================================================
--- branches/theora-thusnelda/examples/png2theora.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/examples/png2theora.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -14,7 +14,7 @@
             file from a sequence of png images
   last mod: $Id$
              based on code from Vegard Nossum
-  
+
  ********************************************************************/
 
 #define _FILE_OFFSET_BITS 64
@@ -60,29 +60,27 @@
 
 static char *input_filter;
 
-const char *optstring = "o:h:v:V:s:S:f:F:";
+const char *optstring = "o:hv:V:s:S:f:F:";
 struct option options [] = {
  {"output",required_argument,NULL,'o'},
- {"help",optional_argument,NULL,'h'},
+ {"help",no_argument,NULL,'h'},
  {"video-rate-target",required_argument,NULL,'V'},
  {"video-quality",required_argument,NULL,'v'},
- {"aspect-numerator",optional_argument,NULL,'s'},
- {"aspect-denominator",optional_argument,NULL,'S'},
- {"framerate-numerator",optional_argument,NULL,'f'},
- {"framerate-denominator",optional_argument,NULL,'F'},
+ {"aspect-numerator",required_argument,NULL,'s'},
+ {"aspect-denominator",required_argument,NULL,'S'},
+ {"framerate-numerator",required_argument,NULL,'f'},
+ {"framerate-denominator",required_argument,NULL,'F'},
  {NULL,0,NULL,0}
 };
 
 static void usage(void){
   fprintf(stderr,
           "%s %s\n"
-          "Usage: %s [options] input\n\n"
-          "Output is parsed by scanf and represents a list of files, i.e.\n"
-          "  file-%%06d.png to look for files file000001.png to file9999999.png \n\n"
+          "Usage: %s [options] <input>\n\n"
+          "The input argument uses C printf format to represent a list of files,\n"
+          "  i.e. file-%%06d.png to look for files file000001.png to file9999999.png \n\n"
           "Options: \n\n"
-          "  -o --output <filename.ogv>     file name for encoded output;\n"
-          "                                 If this option is not given, the\n"
-          "                                 compressed data is sent to stdout.\n\n"
+          "  -o --output <filename.ogv>     file name for encoded output (required);\n"
           "  -V --video-rate-target <n>     bitrate target for Theora video\n\n"
           "  -v --video-quality <n>         Theora quality selector fro 0 to 10\n"
           "                                 (0 yields smallest files but lowest\n"
@@ -159,7 +157,7 @@
 }
 
 static int
-theora_write_frame(unsigned long w, unsigned long h, unsigned char *yuv)
+theora_write_frame(unsigned long w, unsigned long h, unsigned char *yuv, int last)
 {
   yuv_buffer yuv_buf;
   ogg_packet op;
@@ -174,7 +172,7 @@
 
   unsigned int x;
   unsigned int y;
-  
+
   /* Must hold: yuv_w >= w */
   yuv_w = (w + 15) & ~15;
 
@@ -229,7 +227,7 @@
     return 1;
   }
 
-  if(!theora_encode_packetout(&theora_td, 0, &op)) {
+  if(!theora_encode_packetout(&theora_td, last, &op)) {
     fprintf(stderr, "%s: error: could not read packets\n",
       option_output);
     return 1;
@@ -264,14 +262,14 @@
       fwrite(og.header, og.header_len, 1, ogg_fp);
       fwrite(og.body, og.body_len, 1, ogg_fp);
     }
-  
+
     theora_info_clear(&theora_ti);
     theora_clear(&theora_td);
-  
+
     fflush(ogg_fp);
     fclose(ogg_fp);
   }
-  
+
   ogg_stream_clear(&ogg_os);
 }
 
@@ -330,7 +328,17 @@
   png_structp png_ptr;
   png_infop info_ptr;
   png_infop end_ptr;
+  png_bytep row_data;
   png_bytep *row_pointers;
+  png_color_16p bkgd;
+  png_uint_32 width;
+  png_uint_32 height;
+  int bit_depth;
+  int color_type;
+  int interlace_type;
+  int compression_type;
+  int filter_method;
+  png_uint_32 y;
 
   fp = fopen(pathname, "rb");
   if(!fp) {
@@ -374,16 +382,38 @@
 
   png_init_io(png_ptr, fp);
   png_set_sig_bytes(png_ptr, 8);
+  png_read_info(png_ptr, info_ptr);
+  png_get_IHDR(png_ptr, info_ptr, &width, &height, &bit_depth, &color_type,
+   &interlace_type, &compression_type, &filter_method);
+  png_set_expand(png_ptr);
+  if(bit_depth<8)png_set_packing(png_ptr);
+  if(bit_depth==16)png_set_strip_16(png_ptr);
+  if(!(color_type&PNG_COLOR_MASK_COLOR))png_set_gray_to_rgb(png_ptr);
+  if(png_get_bKGD(png_ptr, info_ptr, &bkgd)){
+    png_set_background(png_ptr, bkgd, PNG_BACKGROUND_GAMMA_FILE, 1, 1.0);
+  }
+  /*Note that color_type 2 and 3 can also have alpha, despite not setting the
+     PNG_COLOR_MASK_ALPHA bit.
+    We always strip it to prevent libpng from overrunning our buffer.*/
+  png_set_strip_alpha(png_ptr);
 
-  png_read_png(png_ptr, info_ptr, PNG_TRANSFORM_STRIP_16, NULL);
+  row_data = (png_bytep)png_malloc(png_ptr,
+    3*height*width*png_sizeof(*row_data));
+  row_pointers = (png_bytep *)png_malloc(png_ptr,
+    height*png_sizeof(*row_pointers));
+  for(y = 0; y < height; y++) {
+    row_pointers[y] = row_data + y*(3*width);
+  }
+  png_read_image(png_ptr, row_pointers);
+  png_read_end(png_ptr, end_ptr);
 
-  row_pointers = png_get_rows(png_ptr, info_ptr);
-
-  *w = png_get_image_width(png_ptr, info_ptr);
-  *h = png_get_image_height(png_ptr, info_ptr);
+  *w = width;
+  *h = height;
   *yuv = malloc(*w * *h * 3);
   rgb_to_yuv(row_pointers, *yuv, *w, *h);
 
+  png_free(png_ptr, row_pointers);
+  png_free(png_ptr, row_data);
   png_destroy_read_struct(&png_ptr, &info_ptr, &end_ptr);
 
   fclose(fp);
@@ -408,7 +438,7 @@
   char *input_directory;
   char *scratch;
   struct dirent **png_files;
-  
+
   while(1) {
 
     c=getopt_long(argc,argv,optstring,options,&long_option_index);
@@ -448,7 +478,8 @@
        video_fps_numerator=rint(atof(optarg));
        break;
      case 'F':
-       video_fps_denominator=rint(atof(optarg));   
+       video_fps_denominator=rint(atof(optarg));
+       break;
      default:
         usage();
         break;
@@ -460,6 +491,10 @@
   }
 
   input_mask = argv[optind];
+  if (!input_mask) {
+    fprintf(stderr, "no input files specified; run with -h for help.\n");
+    exit(1);
+  }
   /* dirname and basename must operate on scratch strings */
   scratch = strdup(input_mask);
   input_directory = strdup(dirname(scratch));
@@ -473,20 +508,21 @@
 	input_directory, input_filter);
 #endif
   n = scandir (input_directory, &png_files, include_files, alphasort);
-  for(i=0;i< n;i++) {
+  for(i=0;i<n;i++) {
     unsigned int w;
     unsigned int h;
     unsigned char *yuv;
     char input_png[1024];
+    int last = 0;
 
     sprintf(input_png, "%s/%s", input_directory, png_files[i]->d_name);
-    
+
     if(png_read(input_png, &w, &h, &yuv)) {
       fprintf(stderr, "could not read %s\n", input_png);
       theora_close();
       exit(1);
     }
-    
+
     if(!theora_initialized) {
       theora_info_init(&theora_ti);
 
@@ -522,7 +558,8 @@
       theora_initialized = 1;
     }
 
-    if(theora_write_frame(w, h, yuv)) {
+    if(i >= n-1) last = 1;
+    if(theora_write_frame(w, h, yuv, last)) {
       theora_close();
       free(input_directory);
       free(input_filter);

Modified: branches/theora-thusnelda/examples/splayer.c
===================================================================
--- branches/theora-thusnelda/examples/splayer.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/examples/splayer.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -520,7 +520,7 @@
 PaError err;
 
 /* Ogg and codec state for demux/decode */
-ogg_sync_state   oy; 
+ogg_sync_state   oy;
 ogg_page         og;
 ogg_stream_state vo;
 ogg_stream_state to;
@@ -645,16 +645,16 @@
   
   screen = SDL_SetVideoMode(ti.frame_width, ti.frame_height, 0, SDL_SWSURFACE);
   if ( screen == NULL ) {
-    printf("Unable to set %dx%d video mode: %s\n", 
+    printf("Unable to set %dx%d video mode: %s\n",
            ti.frame_width,ti.frame_height,SDL_GetError());
     exit(1);
   }
-  
+
   yuv_overlay = SDL_CreateYUVOverlay(ti.frame_width, ti.frame_height,
 				     SDL_YV12_OVERLAY,
 				     screen);
   if ( yuv_overlay == NULL ) {
-    printf("SDL: Couldn't create SDL_yuv_overlay: %s\n", 
+    printf("SDL: Couldn't create SDL_yuv_overlay: %s\n",
 	   SDL_GetError());
     exit(1);
   }
@@ -672,29 +672,29 @@
   yuv_buffer yuv;
   int crop_offset;
   theora_decode_YUVout(&td,&yuv);
-  
+
   /* Lock SDL_yuv_overlay */
   if ( SDL_MUSTLOCK(screen) ) {
     if ( SDL_LockSurface(screen) < 0 ) return;
   }
   if (SDL_LockYUVOverlay(yuv_overlay) < 0) return;
-  
+
   /* let's draw the data (*yuv[3]) on a SDL screen (*screen) */
   /* deal with border stride */
   /* reverse u and v for SDL */
   /* and crop input properly, respecting the encoded frame rect */
   crop_offset=ti.offset_x+yuv.y_stride*ti.offset_y;
   for(i=0;i<yuv_overlay->h;i++)
-    memcpy(yuv_overlay->pixels[0]+yuv_overlay->pitches[0]*i, 
-	   yuv.y+crop_offset+yuv.y_stride*i, 
+    memcpy(yuv_overlay->pixels[0]+yuv_overlay->pitches[0]*i,
+	   yuv.y+crop_offset+yuv.y_stride*i,
 	   yuv_overlay->w);
   crop_offset=(ti.offset_x/2)+(yuv.uv_stride)*(ti.offset_y/2);
   for(i=0;i<yuv_overlay->h/2;i++){
-    memcpy(yuv_overlay->pixels[1]+yuv_overlay->pitches[1]*i, 
-	   yuv.v+crop_offset+yuv.uv_stride*i, 
+    memcpy(yuv_overlay->pixels[1]+yuv_overlay->pitches[1]*i,
+	   yuv.v+crop_offset+yuv.uv_stride*i,
 	   yuv_overlay->w/2);
-    memcpy(yuv_overlay->pixels[2]+yuv_overlay->pitches[2]*i, 
-	   yuv.u+crop_offset+yuv.uv_stride*i, 
+    memcpy(yuv_overlay->pixels[2]+yuv_overlay->pitches[2]*i,
+	   yuv.u+crop_offset+yuv.uv_stride*i,
 	   yuv_overlay->w/2);
   }
 
@@ -705,12 +705,12 @@
   }
 
   /* Show, baby, show! */
-  SDL_DisplayYUVOverlay(yuv_overlay, &rect);  
+  SDL_DisplayYUVOverlay(yuv_overlay, &rect);
 }
 
 static void usage(void){
   printf("Usage: splayer <ogg_file>\n"
-#ifdef WIN32  
+#ifdef WIN32
     "\n"
     "or drag and drop an ogg file over the .exe\n\n"
 #endif
@@ -721,7 +721,7 @@
 static int dump_comments(theora_comment *tc){
   int i, len;
   char *value;
-  
+
   printf("Encoded by %s\n",tc->vendor);
   if(tc->comments){
     printf("theora comment header:\n");
@@ -778,7 +778,7 @@
   if(theora_p)ogg_stream_pagein(&to,page);
   if(vorbis_p)ogg_stream_pagein(&vo,page);
   return 0;
-}                                   
+}
 
 void parseHeaders(){
   /* extracted from player_sample.c test file for theora alpha */
@@ -790,7 +790,7 @@
     if(ret==0)break;
     while(ogg_sync_pageout(&oy,&og)>0){
       ogg_stream_state test;
-      
+
       /* is this a mandated initial header? If not, stop parsing */
       if(!ogg_page_bos(&og)){
 	/* don't leak the page; get it into the appropriate stream */
@@ -798,11 +798,11 @@
 	stateflag=1;
 	break;
       }
-      
+
       ogg_stream_init(&test,ogg_page_serialno(&og));
       ogg_stream_pagein(&test,&og);
       ogg_stream_packetout(&test,&op);
-      
+
       /* identify the codec: try theora */
       if(!theora_p && theora_decode_header(&ti,&tc,&op)>=0){
 	/* it is theora */
@@ -837,7 +837,7 @@
       if(theora_p==3)break;
     }
 
-    /* look for more vorbis header packets */  
+    /* look for more vorbis header packets */
     while(vorbis_p && (vorbis_p<3) && (ret=ogg_stream_packetout(&vo,&op))){
       if(ret<0){
 	printf("Error parsing Vorbis stream headers; corrupt stream?\n");
@@ -853,7 +853,7 @@
 
     /* The header pages/packets will arrive before anything else we
        care about, or the stream is not obeying spec */
-    
+
     if(ogg_sync_pageout(&oy,&og)>0){
       queue_page(&og); /* demux into the appropriate stream */
     }else{
@@ -922,7 +922,7 @@
   }
   if(vorbis_p){
     vorbis_synthesis_init(&vd,&vi);
-    vorbis_block_init(&vd,&vb);  
+    vorbis_block_init(&vd,&vb);
     printf("Ogg logical stream %x is Vorbis %d channel %d Hz audio.\n",
 	   vo.serialno,vi.channels,vi.rate);
   }else{
@@ -934,7 +934,7 @@
   if(vorbis_p)open_audio();
   /* open video */
   if(theora_p)open_video();
-  
+
   /* our main loop */
   while(!playbackdone){
 
@@ -984,7 +984,7 @@
 	  audiobuf_granulepos+=i;
 
       }else{
-	
+
 	/* no pending audio; is there a pending packet to decode? */
 	if(ogg_stream_packetout(&vo,&op)>0){
 	  if(vorbis_synthesis(&vb,&op)==0) /* test for success! */
@@ -997,7 +997,7 @@
     while(theora_p && !videobuf_ready){
       /* get one video packet... */
       if(ogg_stream_packetout(&to,&op)>0){
-      
+
         theora_decode_packetin(&td,&op);
 
 	  videobuf_granulepos=td.granulepos;
@@ -1033,13 +1033,13 @@
     }
 
     /* if we're set for the next frame, sleep */
-    if((!theora_p || videobuf_ready) && 
+    if((!theora_p || videobuf_ready) &&
        (!vorbis_p || audiobuf_ready)){
         int ticks = 1.0e3*(videobuf_time-get_time());
 	if(ticks>0)
           SDL_Delay(ticks);
     }
- 
+
     if(videobuf_ready){
       /* time to write our cached frame */
       video_write();
@@ -1063,7 +1063,7 @@
         printf("Ogg buffering stopped, end of file reached.\n");
       }
     }
-    
+
     if (ogg_sync_pageout(&oy,&og)>0){
       queue_page(&og);
     }
@@ -1087,7 +1087,7 @@
     vorbis_block_clear(&vb);
     vorbis_dsp_clear(&vd);
     vorbis_comment_clear(&vc);
-    vorbis_info_clear(&vi); 
+    vorbis_info_clear(&vi);
   }
   if(theora_p){
     ogg_stream_clear(&to);
@@ -1099,7 +1099,7 @@
 
   printf("\r                                                              "
 	 "\nDone.\n");
-	 
+
   SDL_Quit();
 
   return(0);

Modified: branches/theora-thusnelda/include/theora/Makefile.am
===================================================================
--- branches/theora-thusnelda/include/theora/Makefile.am	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/include/theora/Makefile.am	2009-02-06 09:43:27 UTC (rev 15675)
@@ -2,6 +2,6 @@
 
 theoraincludedir = $(includedir)/theora
 
-theorainclude_HEADERS = theora.h
+theorainclude_HEADERS = theora.h theoradec.h theoraenc.h codec.h
 
 noinst_HEADERS = codec.h theoradec.h

Modified: branches/theora-thusnelda/include/theora/codec.h
===================================================================
--- branches/theora-thusnelda/include/theora/codec.h	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/include/theora/codec.h	2009-02-06 09:43:27 UTC (rev 15675)
@@ -32,14 +32,32 @@
  *
  * \subsection Organization
  *
- * The functions documented here are actually subdivided into two separate
- *  libraries:
- * - <tt>libtheoradec</tt>, contains the decoder and shared routines.
- *   You must link to this if you use any of the functions listed in
- *    this API, i.e., those listed in \ref basefuncs and \ref decfuncs.*/
+ * The functions documented here are actually subdivided into three 
+ * separate libraries:
+ * - <tt>libtheoraenc</tt> contains the encoder interface,
+ *   described in \ref encfuncs.
+ * - <tt>libtheoradec</tt> contains the decoder interface and
+ *   routines shared with the encoder.
+ *   You must also link to this if you link to <tt>libtheoraenc</tt>.
+ *   The routines in this library are described in \ref decfuncs and 
+ *   \ref basefuncs.
+ * - <tt>libtheora</tt> contains the \ref oldfuncs.
+ *
+ * New code should link to <tt>libtheoradec</tt> and, if using encoder
+ * features, <tt>libtheoraenc</tt>. Together these two export both
+ * the standard and the legacy API, so this is all that is needed by
+ * any code. The older <tt>libtheora</tt> library is provided just for
+ * compatibility with older build configurations.
+ *
+ * In general the recommended 1.x API symbols can be distinguished
+ * by their <tt>th_</tt> or <tt>TH_</tt> namespace prefix.
+ * The older, legacy API uses <tt>theora_</tt> or <tt>OC_</tt>
+ * prefixes instead.
+ */
 
 /**\file
- * The shared <tt>libtheoradec</tt> and <tt>libtheoraenc</tt> C API.*/
+ * The shared <tt>libtheoradec</tt> and <tt>libtheoraenc</tt> C API.
+ * You don't need to include this directly.*/
 
 #if !defined(_O_THEORA_CODEC_H_)
 # define _O_THEORA_CODEC_H_ (1)
@@ -94,13 +112,18 @@
  *  specification</a>, Section 4.4, for details on the precise sample
  *  locations.*/
 typedef enum{
-  /**Chroma decimation by 2 in both the X and Y directions (4:2:0).*/
+  /**Chroma decimation by 2 in both the X and Y directions (4:2:0).
+     The Cb and Cr chroma planes are half the width and half the height of the
+      luma plane.*/
   TH_PF_420,
   /**Currently reserved.*/
   TH_PF_RSVD,
-  /**Chroma decimation by 2 in the X direction (4:2:2).*/
+  /**Chroma decimation by 2 in the X direction (4:2:2).
+     The Cb and Cr chroma planes are half the width of the luma plane, but full
+      height.*/
   TH_PF_422,
-  /**No chroma decimation (4:4:4).*/
+  /**No chroma decimation (4:4:4).
+     The Cb and Cr chroma planes are full width and full height.*/
   TH_PF_444,
   /**The total number of currently defined pixel formats.*/
   TH_PF_NFORMATS
@@ -112,8 +135,8 @@
  * This contains the image data in a left-to-right, top-down format.
  * Each row of pixels is stored contiguously in memory, but successive rows
  *  need not be.
- * Use \a ystride to compute the offset of the next row.
- * The encoder accepts both positive \a ystride values (top-down in memory) and
+ * Use \a stride to compute the offset of the next row.
+ * The encoder accepts both positive \a stride values (top-down in memory) and
  *  negative (bottom-up in memory).
  * The decoder currently always generates images with positive strides.*/
 typedef struct{
@@ -122,7 +145,7 @@
   /**The height of this plane.*/
   int            height;
   /**The offset in bytes between successive rows.*/
-  int            ystride;
+  int            stride;
   /**A pointer to the beginning of the first row.*/
   unsigned char *data;
 }th_img_plane;
@@ -444,13 +467,14 @@
  * \endcode
  * \return the version number.*/
 extern ogg_uint32_t th_version_number(void);
-/**Converts a granule position to an absolute frame number.
+/**Converts a granule position to an absolute frame index, starting at
+ *  <tt>0</tt>.
  * The granule position is interpreted in the context of a given
  *  #th_enc_ctx or #th_dec_ctx handle (either will suffice).
  * \param _encdec  A previously allocated #th_enc_ctx or #th_dec_ctx
  *                  handle.
  * \param _granpos The granule position to convert.
- * \returns The absolute frame number corresponding to \a _granpos.
+ * \returns The absolute frame index corresponding to \a _granpos.
  * \retval -1 The given granule position was invalid (i.e. negative).*/
 extern ogg_int64_t th_granule_frame(void *_encdec,ogg_int64_t _granpos);
 /**Converts a granule position to an absolute time in seconds.
@@ -460,6 +484,9 @@
  *                  handle.
  * \param _granpos The granule position to convert.
  * \return The absolute time in seconds corresponding to \a _granpos.
+ *         This is the "end time" for the frame, or the latest time it should
+ *          be displayed.
+ *         It is not the presentation time.
  * \retval -1 The given granule position was invalid (i.e. negative).*/
 extern double th_granule_time(void *_encdec,ogg_int64_t _granpos);
 /**Determines whether a Theora packet is a header or not.

Modified: branches/theora-thusnelda/include/theora/theora.h
===================================================================
--- branches/theora-thusnelda/include/theora/theora.h	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/include/theora/theora.h	2009-02-06 09:43:27 UTC (rev 15675)
@@ -25,19 +25,21 @@
 
 #include <stddef.h>	/* for size_t */
 
-#ifndef LIBOGG2
 #include <ogg/ogg.h>
-#else
-#include <ogg2/ogg.h>
-/* This is temporary until libogg2 is more complete */
-ogg_buffer_state *ogg_buffer_create(void);
-#endif
 
+/** \defgroup oldfuncs Legacy pre-1.0 C API */
+/*  @{ */
+
 /** \mainpage
  * 
  * \section intro Introduction
  *
- * This is the documentation for the libtheora C API.
+ * This is the documentation for the libtheora legacy C API, declared in 
+ * the theora.h header, which describes the old interface used before
+ * the 1.0 release. This API was widely deployed for several years and
+ * remains supported, but for new code we recommend the cleaner API 
+ * declared in theoradec.h and theoraenc.h.
+ *
  * libtheora is the reference implementation for
  * <a href="http://www.theora.org/">Theora</a>, a free video codec.
  * Theora is derived from On2's VP3 codec with improved integration for
@@ -125,7 +127,7 @@
  */
 
 /** \file
- * The libtheora C API.
+ * The libtheora pre-1.0 legacy C API.
  */
 
 /**
@@ -313,6 +315,20 @@
  */
 #define TH_DECCTL_SET_PPLEVEL (3)
 
+/**Sets the maximum distance between key frames.
+ * This can be changed during an encode, but will be bounded by
+ *  <tt>1<<th_info#keyframe_granule_shift</tt>.
+ * If it is set before encoding begins, th_info#keyframe_granule_shift will
+ *  be enlarged appropriately.
+ *
+ * \param[in]  buf <tt>ogg_uint32_t</tt>: The maximum distance between key
+ *                   frames.
+ * \param[out] buf <tt>ogg_uint32_t</tt>: The actual maximum distance set.
+ * \retval TH_FAULT  \a theora_state or \a buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a buf_sz is not <tt>sizeof(ogg_uint32_t)</tt>.
+ * \retval TH_IMPL   Not supported by this implementation.*/
+#define TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE (4)
+
 /**Set the granule position.
  * Call this after a seek, to update the internal granulepos
  * in the decoder, to insure that subsequent frames are marked
@@ -332,20 +348,6 @@
  *  \ref decctlcodes "decoder control codes".
  * Keep any experimental or vendor-specific values above \c 0x8000.*/
 /*@{*/
-/**Sets the Huffman tables to use.
- * The tables are copied, not stored by reference, so they can be freed after
- *  this call.
- * <tt>NULL</tt> may be specified to revert to the default tables.
- *
- * \param[in] buf <tt>#th_huff_code[#TH_NHUFFMAN_TABLES][#TH_NDCT_TOKENS]</tt>
- * \retval TH_FAULT  \a theora_state is <tt>NULL</tt>.
- * \retval TH_EINVAL Encoding has already begun or one or more of the given
- *                     tables is not full or prefix-free, \a buf is
- *                     <tt>NULL</tt> and \a buf_sz is not zero, or \a buf is
- *                     non-<tt>NULL</tt> and \a buf_sz is not
- *                     <tt>sizeof(#th_huff_code)*#TH_NHUFFMAN_TABLES*#TH_NDCT_TOKENS</tt>.
- * \retval TH_IMPL   Not supported by this implementation.*/
-#define TH_ENCCTL_SET_HUFFMAN_CODES (0)
 /**Sets the quantization parameters to use.
  * The parameters are copied, not stored by reference, so they can be freed
  *  after this call.
@@ -365,19 +367,6 @@
  *                    <tt>sizeof(#th_quant_info)</tt>.
  * \retval TH_IMPL   Not supported by this implementation.*/
 #define TH_ENCCTL_SET_QUANT_PARAMS (2)
-/**Sets the maximum distance between key frames.
- * This can be changed during an encode, but will be bounded by
- *  <tt>1<<th_info#keyframe_granule_shift</tt>.
- * If it is set before encoding begins, th_info#keyframe_granule_shift will
- *  be enlarged appropriately.
- *
- * \param[in]  buf <tt>ogg_uint32_t</tt>: The maximum distance between key
- *                   frames.
- * \param[out] buf <tt>ogg_uint32_t</tt>: The actual maximum distance set.
- * \retval TH_FAULT  \a theora_state or \a buf is <tt>NULL</tt>.
- * \retval TH_EINVAL \a buf_sz is not <tt>sizeof(ogg_uint32_t)</tt>.
- * \retval TH_IMPL   Not supported by this implementation.*/
-#define TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE (4)
 /**Disables any encoder features that would prevent lossless transcoding back
  *  to VP3.
  * This primarily means disabling block-level QI values and not using 4MV mode
@@ -434,41 +423,6 @@
  * \retval TH_IMPL   Not supported by this implementation in the current
  *                    encoding mode.*/
 #define TH_ENCCTL_SET_SPLEVEL (14)
-/**Puts the encoder in VBR mode.
- * This can be done at any time during the encoding process, with different
- *  configuration parameters, to encode different regions of the video segment
- *  with different qualities.
- * See the #th_info struct documentation for details on how the default
- *  encoding mode is chosen.
- *
- * \param[in] buf <tt>#th_vbr_cfg</tt>: the configuration parameters.
- *                 This may be <tt>NULL</tt>, in which case the current VBR
- *                  configuration is unchanged.
- *                 The default is to use the QI setting passed in via the
- *                  #th_info struct when the encoder was initialized, with a
- *                  full range of admissible quantizers.
- * \retval OC_EFAULT \a theora_state is <tt>NULL</tt>.
- * \retval TH_EINVAL The configuration parameters do not meet one of their
- *                    stated requirements, \a buf is <tt>NULL</tt> and
- *                    \a buf_sz is not zero, or \a buf is non-<tt>NULL</tt>
- *                    and \a buf_sz is not <tt>sizeof(#th_vbr_cfg)</tt>.
- * \retval TH_IMPL   Not supported by this implementation.*/
-#define TH_ENCCTL_SETUP_VBR (16)
-/**Puts the encoder in CQI mode.
- * This can be done at any time during the encoding process, with different QI
- *  values.
- * See the #th_info struct documentation for details on how the default
- *  encoding mode is chosen.
- *
- * \param[in] buf <tt>#th_cqi_cfg</tt>: the configuration parameters.
- *                 This may be <tt>NULL</tt>, in which case the current CQI
- *                  configuration is unchanged.
- *                 The default is to use the QI setting passed in via the
- *                  #th_info struct when the encoder was initialized.
- * \retval OC_EFAULT \a theora_state is <tt>NULL</tt>.
- * \retval TH_EINVAL \a buf_sz is not <tt>sizeof(#th_cqi_cfg)</tt>.
- * \retval TH_IMPL   Not supported by this implementation.*/
-#define TH_ENCCTL_SETUP_CQI (18)
 /*@}*/
 
 #define OC_FAULT       -1       /**< General failure */
@@ -700,12 +654,17 @@
 int theora_granule_shift(theora_info *ti);
 
 /**
- * Convert a granulepos to an absolute frame number. The granulepos is
- * interpreted in the context of a given theora_state handle.
+ * Convert a granulepos to an absolute frame index, starting at 0.
+ * The granulepos is interpreted in the context of a given theora_state handle.
+ * 
+ * Note that while the granulepos encodes the frame count (i.e. starting
+ * from 1) this call returns the frame index, starting from zero. Thus
+ * one can calculate the presentation time by multiplying the index by
+ * the rate.
  *
  * \param th A previously initialized theora_state handle (encode or decode)
  * \param granulepos The granulepos to convert.
- * \returns The frame number corresponding to \a granulepos.
+ * \returns The frame index corresponding to \a granulepos.
  * \retval -1 The given granulepos is undefined (i.e. negative)
  *
  * Thus function was added in the 1.0alpha4 release.
@@ -714,10 +673,15 @@
 
 /**
  * Convert a granulepos to absolute time in seconds. The granulepos is
- * interpreted in the context of a given theora_state handle.
+ * interpreted in the context of a given theora_state handle, and gives
+ * the end time of a frame's presentation as used in Ogg mux ordering.
+ *
  * \param th A previously initialized theora_state handle (encode or decode)
  * \param granulepos The granulepos to convert.
  * \returns The absolute time in seconds corresponding to \a granulepos.
+ *          This is the "end time" for the frame, or the latest time it should
+ *           be displayed.
+ *          It is not the presentation time.
  * \retval -1. The given granulepos is undefined (i.e. negative), or
  * \retval -1. The function has been disabled because floating 
  *              point support is not available.
@@ -823,6 +787,8 @@
  * \param buf_sz The size of the parameter buffer.*/
 extern int theora_control(theora_state *th,int req,void *buf,size_t buf_sz);
 
+/* @} */ /* end oldfuncs doxygen group */
+
 #ifdef __cplusplus
 }
 #endif /* __cplusplus */

Modified: branches/theora-thusnelda/include/theora/theoradec.h
===================================================================
--- branches/theora-thusnelda/include/theora/theoradec.h	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/include/theora/theoradec.h	2009-02-06 09:43:27 UTC (rev 15675)
@@ -20,6 +20,7 @@
 
 #if !defined(_O_THEORA_THEORADEC_H_)
 # define _O_THEORA_THEORADEC_H_ (1)
+# include <stddef.h>
 # include <ogg/ogg.h>
 # include "codec.h"
 
@@ -216,6 +217,22 @@
 extern int th_decode_headerin(th_info *_info,th_comment *_tc,
  th_setup_info **_setup,ogg_packet *_op);
 /**Allocates a decoder instance.
+ *
+ * <b>Security Warning:</b> The Theora format supports very large frame sizes,
+ *  potentially even larger than the address space of a 32-bit machine, and
+ *  creating a decoder context allocates the space for several frames of data.
+ * If the allocation fails here, your program will crash, possibly at some
+ *  future point because the OS kernel returned a valid memory range and will
+ *  only fail when it tries to map the pages in it the first time they are
+ *  used.
+ * Even if it succeeds, you may experience a denial of service if the frame
+ *  size is large enough to cause excessive paging.
+ * If you are integrating libtheora in a larger application where such things
+ *  are undesirable, it is highly recommended that you check the frame size in
+ *  \a _info before calling this function and refuse to decode streams where it
+ *  is larger than some reasonable maximum.
+ * libtheora will not check this for you, because there may be machines that
+ *  can handle such streams and applications that wish to.
  * \param _info  A #th_info struct filled via th_decode_headerin().
  * \param _setup A #th_setup_info handle returned via
  *                th_decode_headerin().
@@ -256,7 +273,7 @@
  *                       The player can skip the call to th_decode_ycbcr_out(),
  *                        as the contents of the decoded frame buffer have not
  *                        changed.
- * \retval TH_EFAULT     \a _dec or _op was <tt>NULL</tt>.
+ * \retval TH_EFAULT     \a _dec or \a _op was <tt>NULL</tt>.
  * \retval TH_EBADPACKET \a _op does not contain encoded video data.
  * \retval TH_EIMPL      The video data uses bitstream features which this
  *                        library does not support.*/

Copied: branches/theora-thusnelda/include/theora/theoraenc.h (from rev 15592, trunk/theora/include/theora/theoraenc.h)
===================================================================
--- branches/theora-thusnelda/include/theora/theoraenc.h	                        (rev 0)
+++ branches/theora-thusnelda/include/theora/theoraenc.h	2009-02-06 09:43:27 UTC (rev 15675)
@@ -0,0 +1,266 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+  last mod: $Id: theora.h,v 1.8 2004/03/15 22:17:32 derf Exp $
+
+ ********************************************************************/
+
+/**\file
+ * The <tt>libtheoraenc</tt> C encoding API.*/
+
+#if !defined(_O_THEORA_THEORAENC_H_)
+# define _O_THEORA_THEORAENC_H_ (1)
+# include <stddef.h>
+# include <ogg/ogg.h>
+# include "codec.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+
+/**\name th_encode_ctl() codes
+ * \anchor encctlcodes
+ * These are the available request codes for th_encode_ctl().
+ * By convention, these are even, to distinguish them from the
+ *  \ref decctlcodes "decoder control codes".
+ * Keep any experimental or vendor-specific values above \c 0x8000.*/
+/*@{*/
+/**Sets the Huffman tables to use.
+ * The tables are copied, not stored by reference, so they can be freed after
+ *  this call.
+ * <tt>NULL</tt> may be specified to revert to the default tables.
+ *
+ * \param[in] _buf <tt>#th_huff_code[#TH_NHUFFMAN_TABLES][#TH_NDCT_TOKENS]</tt>
+ * \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
+ * \retval TH_EINVAL Encoding has already begun or one or more of the given
+ *                     tables is not full or prefix-free, \a _buf is
+ *                     <tt>NULL</tt> and \a _buf_sz is not zero, or \a _buf is
+ *                     non-<tt>NULL</tt> and \a _buf_sz is not
+ *                     <tt>sizeof(#th_huff_code)*#TH_NHUFFMAN_TABLES*#TH_NDCT_TOKENS</tt>.
+ * \retval TH_EIMPL  Not supported by this implementation.*/
+#define TH_ENCCTL_SET_HUFFMAN_CODES (0)
+/**Sets the quantization parameters to use.
+ * The parameters are copied, not stored by reference, so they can be freed
+ *  after this call.
+ * <tt>NULL</tt> may be specified to revert to the default parameters.
+ * For the current encoder, <tt>scale[ci!=0][qi]</tt> must be no greater than
+ *  <tt>scale[ci!=0][qi-1]</tt> and <tt>base[qti][pli][qi][ci]</tt> must be no
+ *  greater than <tt>base[qti][pli][qi-1][ci]</tt>.
+ * These two conditions ensure that the actual quantizer for a given \a qti,
+ *  \a pli, and \a ci does not increase as \a qi increases.
+ *
+ * \param[in] _buf #th_quant_info
+ * \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
+ * \retval TH_EINVAL Encoding has already begun, the quantization parameters
+ *                    do not meet one of the above stated conditions, \a _buf
+ *                    is <tt>NULL</tt> and \a _buf_sz is not zero, or \a _buf
+ *                    is non-<tt>NULL</tt> and \a _buf_sz is not
+ *                    <tt>sizeof(#th_quant_info)</tt>.
+ * \retval TH_EIMPL  Not supported by this implementation.*/
+#define TH_ENCCTL_SET_QUANT_PARAMS (2)
+/**Sets the maximum distance between key frames.
+ * This can be changed during an encode, but will be bounded by
+ *  <tt>1<<th_info#keyframe_granule_shift</tt>.
+ * If it is set before encoding begins, th_info#keyframe_granule_shift will
+ *  be enlarged appropriately.
+ *
+ * \param[in]  _buf <tt>ogg_uint32_t</tt>: The maximum distance between key
+ *                   frames.
+ * \param[out] _buf <tt>ogg_uint32_t</tt>: The actual maximum distance set.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(ogg_uint32_t)</tt>.
+ * \retval TH_EIMPL  Not supported by this implementation.*/
+#define TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE (4)
+/**Disables any encoder features that would prevent lossless transcoding back
+ *  to VP3.
+ * This primarily means disabling block-level QI values and not using 4MV mode
+ *  when any of the luma blocks in a macro block are not coded.
+ * It also includes using the VP3 quantization tables and Huffman codes; if you
+ *  set them explicitly after calling this function, the resulting stream will
+ *  not be VP3-compatible.
+ * If you enable VP3-compatibility when encoding 4:2:2 or 4:4:4 source
+ *  material, or when using a picture region smaller than the full frame (e.g.
+ *  a non-multiple-of-16 width or height), then non-VP3 bitstream features will
+ *  still be disabled, but the stream will still not be VP3-compatible, as VP3
+ *  was not capable of encoding such formats.
+ * If you call this after encoding has already begun, then the quantization
+ *  tables and codebooks cannot be changed, but the frame-level features will
+ *  be enabled or disabled as requested.
+ *
+ * \param[in]  _buf <tt>int</tt>: a non-zero value to enable VP3 compatibility,
+ *                   or 0 to disable it (the default).
+ * \param[out] _buf <tt>int</tt>: 1 if all bitstream features required for
+ *                   VP3-compatibility could be set, and 0 otherwise.
+ *                  The latter will be returned if the pixel format is not
+ *                   4:2:0, the picture region is smaller than the full frame,
+ *                   or if encoding has begun, preventing the quantization
+ *                   tables and codebooks from being set.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
+ * \retval TH_EIMPL  Not supported by this implementation.*/
+#define TH_ENCCTL_SET_VP3_COMPATIBLE (10)
+/**Gets the maximum speed level.
+ * Higher speed levels favor quicker encoding over better quality per bit.
+ * Depending on the encoding mode, and the internal algorithms used, quality
+ *  may actually improve, but in this case bitrate will also likely increase.
+ * In any case, overall rate/distortion performance will probably decrease.
+ * The maximum value, and the meaning of each value, may change depending on
+ *  the current encoding mode (VBR vs. CQI, etc.).
+ *
+ * \param[out] _buf int: The maximum encoding speed level.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
+ * \retval TH_EIMPL  Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_GET_SPLEVEL_MAX (12)
+/**Sets the speed level.
+ * By default, the slowest speed (0) is used.
+ *
+ * \param[in] _buf int: The new encoding speed level.
+ *                      0 is slowest, larger values use less CPU.
+ * \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
+ * \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
+ *                    encoding speed level is out of bounds.
+ *                   The maximum encoding speed level may be
+ *                    implementation- and encoding mode-specific, and can be
+ *                    obtained via #TH_ENCCTL_GET_SPLEVEL_MAX.
+ * \retval TH_EIMPL  Not supported by this implementation in the current
+ *                    encoding mode.*/
+#define TH_ENCCTL_SET_SPLEVEL (14)
+/*@}*/
+
+
+
+/**The quantization parameters used by VP3.*/
+extern const th_quant_info TH_VP31_QUANT_INFO;
+
+/**The Huffman tables used by VP3.*/
+extern const th_huff_code
+ TH_VP31_HUFF_CODES[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
+
+
+
+/**\name Encoder state
+   The following data structure is opaque, and its contents are not publicly
+    defined by this API.
+   Referring to its internals directly is unsupported, and may break without
+    warning.*/
+/*@{*/
+/**The encoder context.*/
+typedef struct th_enc_ctx    th_enc_ctx;
+/*@}*/
+
+
+
+/**\defgroup encfuncs Functions for Encoding*/
+/*@{*/
+/**\name Functions for encoding
+ * You must link to <tt>libtheoraenc</tt> and <tt>libtheoradec</tt>
+ *  if you use any of the functions in this section.
+ *
+ * The functions are listed in the order they are used in a typical encode.
+ * The basic steps are:
+ * - Fill in a #th_info structure with details on the format of the video you
+ *    wish to encode.
+ * - Allocate a #th_enc_ctx handle with th_encode_alloc().
+ * - Perform any additional encoder configuration required with
+ *    th_encode_ctl().
+ * - Repeatedly call th_encode_flushheader() to retrieve all the header
+ *    packets.
+ * - For each uncompressed frame:
+ *   - Submit the uncompressed frame via th_encode_ycbcr_in()
+ *   - Repeatedly call th_encode_packetout() to retrieve any video data packets
+ *      that are ready.
+ * - Call th_encode_free() to release all encoder memory.*/
+/*@{*/
+/**Allocates an encoder instance.
+ * \param _info A #th_info struct filled with the desired encoding parameters.
+ * \return The initialized #th_enc_ctx handle.
+ * \retval NULL If the encoding parameters were invalid.*/
+extern th_enc_ctx *th_encode_alloc(const th_info *_info);
+/**Encoder control function.
+ * This is used to provide advanced control of the encoding process.
+ * \param _enc    A #th_enc_ctx handle.
+ * \param _req    The control code to process.
+ *                See \ref encctlcodes "the list of available control codes"
+ *                 for details.
+ * \param _buf    The parameters for this control code.
+ * \param _buf_sz The size of the parameter buffer.*/
+extern int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz);
+/**Outputs the next header packet.
+ * This should be called repeatedly after encoder initialization until it
+ *  returns 0 in order to get all of the header packets, in order, before
+ *  encoding actual video data.
+ * \param _enc      A #th_enc_ctx handle.
+ * \param _comments The metadata to place in the comment header, when it is
+ *                   encoded.
+ * \param _op       An <tt>ogg_packet</tt> structure to fill.
+ *                  All of the elements of this structure will be set,
+ *                   including a pointer to the header data.
+ *                  The memory for the header data is owned by
+ *                   <tt>libtheoraenc</tt>, and may be invalidated when the
+ *                   next encoder function is called.
+ * \return A positive value indicates that a header packet was successfully
+ *          produced.
+ * \retval 0         No packet was produced, and no more header packets remain.
+ * \retval TH_EFAULT \a _enc, \a _comments, or \a _op was <tt>NULL</tt>.*/
+extern int th_encode_flushheader(th_enc_ctx *_enc,
+ th_comment *_comments,ogg_packet *_op);
+/**Submits an uncompressed frame to the encoder.
+ * \param _enc   A #th_enc_ctx handle.
+ * \param _ycbcr A buffer of Y'CbCr data to encode.
+ * \retval 0         Success.
+ * \retval TH_EFAULT \a _enc or \a _ycbcr is <tt>NULL</tt>.
+ * \retval TH_EINVAL The buffer size does not match the frame size the encoder
+ *                    was initialized with, or encoding has already
+ *                    completed.*/
+extern int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _ycbcr);
+/**Retrieves encoded video data packets.
+ * This should be called repeatedly after each frame is submitted to flush any
+ *  encoded packets, until it returns 0.
+ * The encoder will not buffer these packets as subsequent frames are
+ *  compressed, so a failure to do so will result in lost video data.
+ * \note Currently the encoder operates in a one-frame-in, one-packet-out
+ *        manner.
+ *       However, this may be changed in the future.
+ * \param _enc  A #th_enc_ctx handle.
+ * \param _last Set this flag to a non-zero value if no more uncompressed
+ *               frames will be submitted.
+ *              This ensures that a proper EOS flag is set on the last packet.
+ * \param _op   An <tt>ogg_packet</tt> structure to fill.
+ *              All of the elements of this structure will be set, including a
+ *               pointer to the video data.
+ *              The memory for the video data is owned by
+ *               <tt>libtheoraenc</tt>, and may be invalidated when the next
+ *               encoder function is called.
+ * \return A positive value indicates that a video data packet was successfully
+ *          produced.
+ * \retval 0         No packet was produced, and no more encoded video data
+ *                    remains.
+ * \retval TH_EFAULT \a _enc or \a _op was <tt>NULL</tt>.*/
+extern int th_encode_packetout(th_enc_ctx *_enc,int _last,ogg_packet *_op);
+/**Frees an allocated encoder instance.
+ * \param _enc A #th_enc_ctx handle.*/
+extern void th_encode_free(th_enc_ctx *_enc);
+/*@}*/
+/*@}*/
+
+
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif

Modified: branches/theora-thusnelda/lib/Makefile.am
===================================================================
--- branches/theora-thusnelda/lib/Makefile.am	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/Makefile.am	2009-02-06 09:43:27 UTC (rev 15675)
@@ -1,8 +1,8 @@
-INCLUDES = -I$(top_srcdir)/include -I$(top_srcdir)/lib -I$(top_srcdir)/lib/dec -I$(top_srcdir)/lib/enc
+INCLUDES = -I$(top_srcdir)/include
 AM_CFLAGS = $(OGG_CFLAGS) $(CAIRO_CFLAGS)
-LIBADD = $(OGG_LIBS) 
 
 EXTRA_DIST = \
+	cpu.c \
         enc/x86/dct_decode_mmx.c \
         enc/x86/dsp_mmx.c \
         enc/x86/dsp_mmxext.c \
@@ -14,12 +14,14 @@
         enc/x86_32_vs/recon_mmx.c \
         enc/dct_encode.c \
         enc/encode.c \
-        enc/encoder_toplevel.c
+        enc/encoder_toplevel.c \
+        dec/x86_vc
 
 lib_LTLIBRARIES = libtheoradec.la libtheoraenc.la libtheora.la
 
 if THEORA_DISABLE_ENCODE
 encoder_sources = \
+	enc/encapiwrapper.c \
 	enc/encoder_disabled.c
 else
 encoder_sources = \
@@ -29,6 +31,7 @@
 	enc/encoder_idct.c \
 	enc/encoder_toplevel.c \
 	enc/encoder_quant.c \
+	enc/encapiwrapper.c \
 	enc/dct.c \
 	enc/dct_decode.c \
 	enc/frarray.c \
@@ -58,6 +61,7 @@
 
 decoder_sources = \
 	dec/apiwrapper.c \
+	dec/bitpack.c \
 	dec/decapiwrapper.c \
 	dec/decinfo.c \
 	dec/decode.c \
@@ -97,10 +101,10 @@
 	enc/toplevel_lookup.h \
 	enc/dsp.h \
 	dec/apiwrapper.h \
+	dec/bitpack.h \
 	dec/dct.h \
 	dec/decint.h \
 	dec/dequant.h \
-	dec/enquant.h \
 	dec/huffdec.h \
 	dec/huffman.h \
 	dec/idct.h \
@@ -109,7 +113,6 @@
 	dec/x86/x86int.h
 
 libtheoradec_la_SOURCES = \
-  cpu.c \
 	$(decoder_arch_sources) \
 	$(decoder_sources) \
   Version_script-dec
@@ -118,24 +121,22 @@
   @THEORADEC_LDFLAGS@ @CAIRO_LIBS@
 
 libtheoraenc_la_SOURCES = \
-  cpu.c \
 	$(encoder_arch_sources) \
 	$(encoder_sources) \
   Version_script-enc
 libtheoraenc_la_LDFLAGS = \
   -version-info @THENC_LIB_CURRENT@:@THENC_LIB_REVISION@:@THENC_LIB_AGE@ \
-  @THEORAENC_LDFLAGS@
+  @THEORAENC_LDFLAGS@ $(OGG_LIBS)
 
 libtheora_la_SOURCES = \
-  cpu.c \
 	$(decoder_arch_sources) \
 	$(decoder_sources) \
 	$(encoder_arch_sources) \
 	$(encoder_sources) \
-  Version_script-old
+  Version_script
 libtheora_la_LDFLAGS = \
   -version-info @TH_LIB_CURRENT@:@TH_LIB_REVISION@:@TH_LIB_AGE@ \
-  @THEORA_LDFLAGS@ @CAIRO_LIBS@
+  @THEORA_LDFLAGS@ @CAIRO_LIBS@ $(OGG_LIBS)
 
 debug:
 	$(MAKE) all CFLAGS="@DEBUG@" 

Modified: branches/theora-thusnelda/lib/Version_script
===================================================================
--- branches/theora-thusnelda/lib/Version_script	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/Version_script	2009-02-06 09:43:27 UTC (rev 15675)
@@ -5,7 +5,7 @@
 # applications linking to the libraries.
 #
 
-# We use something that looks like an versioned so filename here 
+# We use something that looks like a versioned so filename here 
 # to define the old API because of a historical confusion. This
 # label must be kept to maintain ABI compatibility.
 

Modified: branches/theora-thusnelda/lib/Version_script-dec
===================================================================
--- branches/theora-thusnelda/lib/Version_script-dec	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/Version_script-dec	2009-02-06 09:43:27 UTC (rev 15675)
@@ -41,7 +41,7 @@
 };
 
 # The deprecated legacy api from the libtheora alpha releases.
-# We use something that looks like an versioned so filename here 
+# We use something that looks like a versioned so filename here 
 # to define the old API because of a historical confusion. This
 # label must be kept to maintain ABI compatibility.
 

Modified: branches/theora-thusnelda/lib/Version_script-enc
===================================================================
--- branches/theora-thusnelda/lib/Version_script-enc	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/Version_script-enc	2009-02-06 09:43:27 UTC (rev 15675)
@@ -24,7 +24,7 @@
 };
 
 # The encoder portion of the deprecated alpha release api.
-# We use something that looks like an versioned so filename here 
+# We use something that looks like a versioned so filename here 
 # to define the old API because of a historical confusion. This
 # label must be kept to maintain ABI compatibility.
 

Modified: branches/theora-thusnelda/lib/cpu.c
===================================================================
--- branches/theora-thusnelda/lib/cpu.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/cpu.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -25,7 +25,6 @@
   return 0;
 }
 #else
-
 # if !defined(_MSC_VER)
 #  if defined(__amd64__)||defined(__x86_64__)
 /*On x86-64, gcc seems to be able to figure out how to save %rbx for us when
@@ -72,10 +71,10 @@
   do{ \
     ogg_uint32_t cpu_info[4]; \
     oc_cpuid_helper(cpu_info,_op); \
-    (_eax) = cpu_info[0]; \
-    (_ebx) = cpu_info[1]; \
-    (_ecx) = cpu_info[2]; \
-    (_edx) = cpu_info[3]; \
+    (_eax)=cpu_info[0]; \
+    (_ebx)=cpu_info[1]; \
+    (_ecx)=cpu_info[2]; \
+    (_edx)=cpu_info[3]; \
   }while(0)
 
 static void oc_detect_cpuid_helper(ogg_uint32_t *_eax,ogg_uint32_t *_ebx){
@@ -90,14 +89,43 @@
     pushfd
     pop eax
     popfd
-    mov [_eax],eax
-    mov [_ebx],ebx
+    mov ecx,_eax
+    mov [ecx],eax
+    mov ecx,_ebx
+    mov [ecx],ebx
   }
 }
 # endif
 
-ogg_uint32_t oc_cpu_flags_get(void){
+static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
   ogg_uint32_t flags;
+  /*If there isn't even MMX, give up.*/
+  if(!(_edx&0x00800000))return 0;
+  flags=OC_CPU_X86_MMX;
+  if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
+  if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
+  if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
+  if(_ecx&0x00000100)flags|=OC_CPU_X86_SSSE3;
+  if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
+  if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
+  return flags;
+}
+
+static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
+  ogg_uint32_t flags;
+  /*If there isn't even MMX, give up.*/
+  if(!(_edx&0x00800000))return 0;
+  flags=OC_CPU_X86_MMX;
+  if(_edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
+  if(_edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
+  if(_edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
+  if(_ecx&0x00000040)flags|=OC_CPU_X86_SSE4A;
+  if(_ecx&0x00000800)flags|=OC_CPU_X86_SSE5;
+  return flags;
+}
+
+static ogg_uint32_t oc_cpu_flags_get(void){
+  ogg_uint32_t flags;
   ogg_uint32_t eax;
   ogg_uint32_t ebx;
   ogg_uint32_t ecx;
@@ -133,12 +161,7 @@
    ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
     /*Intel, Transmeta (tested with Crusoe TM5800):*/
     cpuid(1,eax,ebx,ecx,edx);
-    /*If there isn't even MMX, give up.*/
-    if(!(edx&0x00800000))return 0;
-    flags=OC_CPU_X86_MMX;
-    if(edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
-    if(edx&0x04000000)flags|=OC_CPU_X86_SSE2;
-    if(ecx&0x00000001)flags|=OC_CPU_X86_PNI;
+    flags=oc_parse_intel_flags(edx,ecx);
   }
   /*              D M A c          i t n e          h t u A*/
   else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
@@ -146,50 +169,53 @@
    ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
     /*AMD, Geode:*/
     cpuid(0x80000000,eax,ebx,ecx,edx);
-    if(eax<0x80000001){
-      /*No extended functions supported.
-        Use normal cpuid flags.*/
-      cpuid(1,eax,ebx,ecx,edx);
-      /*If there isn't even MMX, give up.*/
-      if(!(edx&0x00800000))return 0;
-      flags=OC_CPU_X86_MMX;
-      if(edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
-    }
+    if(eax<0x80000001)flags=0;
     else{
       cpuid(0x80000001,eax,ebx,ecx,edx);
-      /*If there isn't even MMX, give up.*/
-      if(!(edx&0x00800000))return 0;
-      flags=OC_CPU_X86_MMX;
-      if(edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
-      if(edx&0x40000000)flags|=OC_CPU_X86_3DNOWEXT;
-      if(edx&0x00400000)flags|=OC_CPU_X86_MMXEXT;
-      /*Also check for SSE.*/
-      cpuid(1,eax,ebx,ecx,edx);
-      if(edx&0x02000000)flags|=OC_CPU_X86_SSE;
+      flags=oc_parse_amd_flags(edx,ecx);
     }
-    if(edx&0x04000000)flags|=OC_CPU_X86_SSE2;
-    if(ecx&0x00000001)flags|=OC_CPU_X86_PNI;
+    /*Also check for SSE.*/
+    cpuid(1,eax,ebx,ecx,edx);
+    flags|=oc_parse_intel_flags(edx,ecx);
   }
+  /*Technically some VIA chips can be configured in the BIOS to return any
+     string here the user wants.
+    There is a special detection method that can be used to identify such
+     processors, but in my opinion, if the user really wants to change it, they
+     deserve what they get.*/
   /*              s l u a          H r u a          t n e C*/
   else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
     /*VIA:*/
-    /*The C7 (and later?) processors support Intel-like cpuid info.*/
-    /*The C3-2 (Nehemiah) cores appear to, as well.*/
+    /*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
+       chips (thanks to the engineers from Centaur Technology who provided it).
+      These chips support Intel-like cpuid info.
+      The C3-2 (Nehemiah) cores appear to, as well.*/
     cpuid(1,eax,ebx,ecx,edx);
-    if(edx&0x00800000){
-      flags=OC_CPU_X86_MMX;
-      if(edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
-      if(edx&0x04000000)flags|=OC_CPU_X86_SSE2;
-      if(ecx&0x00000001)flags|=OC_CPU_X86_PNI;
+    flags=oc_parse_intel_flags(edx,ecx);
+    if(eax>=0x80000001){
+      /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
+        We need to check this even if the Intel test succeeds to pick up 3DNow!
+         support on these processors.
+        Unlike actual AMD processors, we cannot _rely_ on this info, since
+         some cores (e.g., the 693 stepping of the Nehemiah) claim to support
+         this function, yet return edx=0, despite the Intel test indicating
+         MMX support.
+        Therefore the features detected here are strictly added to those
+         detected by the Intel test.*/
+      /*TODO: How about earlier chips?*/
+      cpuid(0x80000001,eax,ebx,ecx,edx);
+      /*Note: As of the C7, this function returns Intel-style extended feature
+         flags, not AMD-style.
+        Currently, this only defines bits 11, 20, and 29 (0x20100800), which
+         do not conflict with any of the AMD flags we inspect.
+        For the remaining bits, Intel tells us, "Do not count on their value",
+         but VIA assures us that they will all be zero (at least on the C7 and
+         Isaiah chips).
+        In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
+         (0xC0C00000) for something else, we will have to add code to detect
+         the model to decide when it is appropriate to inspect them.*/
+      flags|=oc_parse_amd_flags(edx,ecx);
     }
-    else flags=0;
-    /*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
-      We need to check this even if the Intel test succeeds to pick up 3dnow!
-       support on these processors.*/
-    /*TODO: How about earlier chips?*/
-    cpuid(0x80000001,eax,ebx,ecx,edx);
-    if(edx&0x00800000)flags|=OC_CPU_X86_MMX;
-    if(edx&0x80000000)flags|=OC_CPU_X86_3DNOW;
   }
   else{
     /*Implement me.*/

Modified: branches/theora-thusnelda/lib/cpu.h
===================================================================
--- branches/theora-thusnelda/lib/cpu.h	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/cpu.h	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
  function:
@@ -18,14 +18,17 @@
 # define _x86_cpu_H (1)
 #include "internal.h"
 
-#define OC_CPU_X86_MMX    (1<<0)
-#define OC_CPU_X86_3DNOW  (1<<1)
+#define OC_CPU_X86_MMX      (1<<0)
+#define OC_CPU_X86_3DNOW    (1<<1)
 #define OC_CPU_X86_3DNOWEXT (1<<2)
-#define OC_CPU_X86_MMXEXT (1<<3)
-#define OC_CPU_X86_SSE    (1<<4)
-#define OC_CPU_X86_SSE2   (1<<5)
-#define OC_CPU_X86_PNI    (1<<6)
+#define OC_CPU_X86_MMXEXT   (1<<3)
+#define OC_CPU_X86_SSE      (1<<4)
+#define OC_CPU_X86_SSE2     (1<<5)
+#define OC_CPU_X86_PNI      (1<<6)
+#define OC_CPU_X86_SSSE3    (1<<7)
+#define OC_CPU_X86_SSE4_1   (1<<8)
+#define OC_CPU_X86_SSE4_2   (1<<9)
+#define OC_CPU_X86_SSE4A    (1<<10)
+#define OC_CPU_X86_SSE5     (1<<11)
 
-ogg_uint32_t oc_cpu_flags_get(void);
-
 #endif

Modified: branches/theora-thusnelda/lib/dec/apiwrapper.c
===================================================================
--- branches/theora-thusnelda/lib/dec/apiwrapper.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/apiwrapper.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -40,7 +40,7 @@
   memset(_ci,0,sizeof(*_ci));
   if(api!=NULL){
     if(api->clear!=NULL)(*api->clear)(api);
-    free(api);
+    _ogg_free(api);
   }
 }
 

Modified: branches/theora-thusnelda/lib/dec/apiwrapper.h
===================================================================
--- branches/theora-thusnelda/lib/dec/apiwrapper.h	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/apiwrapper.h	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 

Copied: branches/theora-thusnelda/lib/dec/bitpack.c (from rev 15674, trunk/theora/lib/dec/bitpack.c)
===================================================================
--- branches/theora-thusnelda/lib/dec/bitpack.c	                        (rev 0)
+++ branches/theora-thusnelda/lib/dec/bitpack.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -0,0 +1,121 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE OggTheora SOURCE CODE IS (C) COPYRIGHT 1994-2008             *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function: packing variable sized words into an octet stream
+  last mod: $Id$
+
+ ********************************************************************/
+
+/*We're 'MSb' endian; if we write a word but read individual bits,
+   then we'll read the MSb first.*/
+
+#include <string.h>
+#include <stdlib.h>
+#include "bitpack.h"
+
+void theorapackB_readinit(oggpack_buffer *_b,unsigned char *_buf,int _bytes){
+  memset(_b,0,sizeof(*_b));
+  _b->buffer=_b->ptr=_buf;
+  _b->storage=_bytes;
+}
+
+int theorapackB_look1(oggpack_buffer *_b,long *_ret){
+  if(_b->endbyte>=_b->storage){
+    *_ret=0L;
+    return -1;
+  }
+  *_ret=(_b->ptr[0]>>7-_b->endbit)&1;
+  return 0;
+}
+
+void theorapackB_adv1(oggpack_buffer *_b){
+  if(++(_b->endbit)>7){
+    _b->endbit=0;
+    _b->ptr++;
+    _b->endbyte++;
+  }
+}
+
+/*Here we assume that 0<=_bits&&_bits<=32.*/
+int theorapackB_read(oggpack_buffer *_b,int _bits,long *_ret){
+  long ret;
+  long m;
+  long d;
+  int fail;
+  m=32-_bits;
+  _bits+=_b->endbit;
+  d=_b->storage-_b->endbyte;
+  if(d<=4){
+    /*Not the main path.*/
+    if(d*8<_bits){
+      *_ret=0L;
+      fail=-1;
+      goto overflow;
+    }
+    /*Special case to avoid reading _b->ptr[0], which might be past the end of
+       the buffer; also skips some useless accounting.*/
+    else if(!_bits){
+      *_ret=0L;
+      return 0;
+    }
+  }
+  ret=_b->ptr[0]<<24+_b->endbit;
+  if(_bits>8){
+    ret|=_b->ptr[1]<<16+_b->endbit;
+    if(_bits>16){
+      ret|=_b->ptr[2]<<8+_b->endbit;
+      if(_bits>24){
+        ret|=_b->ptr[3]<<_b->endbit;
+        if(_bits>32)ret|=_b->ptr[4]>>8-_b->endbit;
+      }
+    }
+  }
+  *_ret=((ret&0xFFFFFFFFUL)>>(m>>1))>>(m+1>>1);
+  fail=0;
+overflow:
+  _b->ptr+=_bits>>3;
+  _b->endbyte+=_bits>>3;
+  _b->endbit=_bits&7;
+  return fail;
+}
+
+int theorapackB_read1(oggpack_buffer *_b,long *_ret){
+  int fail;
+  if(_b->endbyte>=_b->storage){
+    /*Not the main path.*/
+    *_ret=0L;
+    fail=-1;
+  }
+  else{
+    *_ret=(_b->ptr[0]>>7-_b->endbit)&1;
+    fail=0;
+  }
+  _b->endbit++;
+  if(_b->endbit>7){
+    _b->endbit=0;
+    _b->ptr++;
+    _b->endbyte++;
+  }
+  return fail;
+}
+
+long theorapackB_bytes(oggpack_buffer *_b){
+  return _b->endbyte+(_b->endbit+7>>3);
+}
+
+long theorapackB_bits(oggpack_buffer *_b){
+  return _b->endbyte*8+_b->endbit;
+}
+
+unsigned char *theorapackB_get_buffer(oggpack_buffer *_b){
+  return _b->buffer;
+}

Copied: branches/theora-thusnelda/lib/dec/bitpack.h (from rev 15674, trunk/theora/lib/dec/bitpack.h)
===================================================================
--- branches/theora-thusnelda/lib/dec/bitpack.h	                        (rev 0)
+++ branches/theora-thusnelda/lib/dec/bitpack.h	2009-02-06 09:43:27 UTC (rev 15675)
@@ -0,0 +1,38 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE OggTheora SOURCE CODE IS (C) COPYRIGHT 1994-2008             *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function: packing variable sized words into an octet stream
+  last mod: $Id: bitwise.c 7675 2004-09-01 00:34:39Z xiphmont $
+
+ ********************************************************************/
+#if !defined(_bitpack_H)
+# define _bitpack_H (1)
+# include <ogg/ogg.h>
+
+void theorapackB_readinit(oggpack_buffer *_b,unsigned char *_buf,int _bytes);
+int theorapackB_look1(oggpack_buffer *_b,long *_ret);
+void theorapackB_adv1(oggpack_buffer *_b);
+/*Here we assume 0<=_bits&&_bits<=32.*/
+int theorapackB_read(oggpack_buffer *_b,int _bits,long *_ret);
+int theorapackB_read1(oggpack_buffer *_b,long *_ret);
+long theorapackB_bytes(oggpack_buffer *_b);
+long theorapackB_bits(oggpack_buffer *_b);
+unsigned char *theorapackB_get_buffer(oggpack_buffer *_b);
+
+/*These two functions are implemented locally in huffdec.c*/
+/*Read in bits without advancing the bitptr.
+  Here we assume 0<=_bits&&_bits<=32.*/
+/*static int theorapackB_look(oggpack_buffer *_b,int _bits,long *_ret);*/
+/*static void theorapackB_adv(oggpack_buffer *_b,int _bits);*/
+
+
+#endif

Modified: branches/theora-thusnelda/lib/dec/dct.h
===================================================================
--- branches/theora-thusnelda/lib/dec/dct.h	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/dct.h	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 

Modified: branches/theora-thusnelda/lib/dec/decapiwrapper.c
===================================================================
--- branches/theora-thusnelda/lib/dec/decapiwrapper.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/decapiwrapper.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -93,7 +93,6 @@
   th_api_info    *apiinfo;
   th_api_wrapper *api;
   th_info         info;
-
   api=(th_api_wrapper *)_ci->codec_setup;
   /*Allocate our own combined API wrapper/theora_info struct.
     We put them both in one malloc'd block so that when the API wrapper is
@@ -131,7 +130,6 @@
   th_api_wrapper *api;
   th_info         info;
   int             ret;
-
   api=(th_api_wrapper *)_ci->codec_setup;
   /*Allocate an API wrapper struct on demand, since it will not also include a
      theora_info struct like the ones that are used in a theora_state struct.*/
@@ -164,9 +162,9 @@
   th_api_wrapper *api;
   ogg_int64_t     gp;
   int             ret;
+  if(!_td||!_td->i||!_td->i->codec_setup)return OC_FAULT;
   api=(th_api_wrapper *)_td->i->codec_setup;
   ret=th_decode_packetin(api->decode,_op,&gp);
-
   if(ret<0)return OC_BADPACKET;
   _td->granulepos=gp;
   return 0;
@@ -177,11 +175,11 @@
   th_dec_ctx      *decode;
   th_ycbcr_buffer  buf;
   int              ret;
-
+  if(!_td||!_td->i||!_td->i->codec_setup)return OC_FAULT;
   api=(th_api_wrapper *)_td->i->codec_setup;
-  decode = (th_dec_ctx *)api->decode;
+  decode=(th_dec_ctx *)api->decode;
+  if(!decode)return OC_FAULT;
   ret=th_decode_ycbcr_out(decode,buf);
-
 #ifdef HAVE_CAIRO
   /* If telemetry ioctls are active, we need to draw to the output
      buffer.  Stuff the plane into cairo. */
@@ -505,18 +503,16 @@
     cairo_surface_destroy(cs);
   }
 #endif
-
   if(ret>=0){
     _yuv->y_width=buf[0].width;
     _yuv->y_height=buf[0].height;
-    _yuv->y_stride=buf[0].ystride;
+    _yuv->y_stride=buf[0].stride;
     _yuv->uv_width=buf[1].width;
     _yuv->uv_height=buf[1].height;
-    _yuv->uv_stride=buf[1].ystride;
+    _yuv->uv_stride=buf[1].stride;
     _yuv->y=buf[0].data;
     _yuv->u=buf[1].data;
     _yuv->v=buf[2].data;
   }
-
   return ret;
 }

Modified: branches/theora-thusnelda/lib/dec/decinfo.c
===================================================================
--- branches/theora-thusnelda/lib/dec/decinfo.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/decinfo.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -17,6 +17,7 @@
 
 #include <stdlib.h>
 #include <string.h>
+#include <limits.h>
 #include "decint.h"
 
 
@@ -29,7 +30,7 @@
 static void oc_unpack_octets(oggpack_buffer *_opb,char *_buf,size_t _len){
   while(_len-->0){
     long val;
-    theora_read(_opb,8,&val);
+    theorapackB_read(_opb,8,&val);
     *_buf++=(char)val;
   }
 }
@@ -38,18 +39,18 @@
 static long oc_unpack_length(oggpack_buffer *_opb){
   long ret[4];
   int  i;
-  for(i=0;i<4;i++)theora_read(_opb,8,ret+i);
+  for(i=0;i<4;i++)theorapackB_read(_opb,8,ret+i);
   return ret[0]|ret[1]<<8|ret[2]<<16|ret[3]<<24;
 }
 
 static int oc_info_unpack(oggpack_buffer *_opb,th_info *_info){
   long val;
   /*Check the codec bitstream version.*/
-  theora_read(_opb,8,&val);
+  theorapackB_read(_opb,8,&val);
   _info->version_major=(unsigned char)val;
-  theora_read(_opb,8,&val);
+  theorapackB_read(_opb,8,&val);
   _info->version_minor=(unsigned char)val;
-  theora_read(_opb,8,&val);
+  theorapackB_read(_opb,8,&val);
   _info->version_subminor=(unsigned char)val;
   /*verify we can parse this bitstream version.
      We accept earlier minors and all subminors, by spec*/
@@ -59,77 +60,82 @@
     return TH_EVERSION;
   }
   /*Read the encoded frame description.*/
-  theora_read(_opb,16,&val);
+  theorapackB_read(_opb,16,&val);
   _info->frame_width=(ogg_uint32_t)val<<4;
-  theora_read(_opb,16,&val);
+  theorapackB_read(_opb,16,&val);
   _info->frame_height=(ogg_uint32_t)val<<4;
-  theora_read(_opb,24,&val);
+  theorapackB_read(_opb,24,&val);
   _info->pic_width=(ogg_uint32_t)val;
-  theora_read(_opb,24,&val);
+  theorapackB_read(_opb,24,&val);
   _info->pic_height=(ogg_uint32_t)val;
-  theora_read(_opb,8,&val);
+  theorapackB_read(_opb,8,&val);
   _info->pic_x=(ogg_uint32_t)val;
   /*Note: The sense of pic_y is inverted in what we pass back to the
      application compared to how it is stored in the bitstream.
     This is because the bitstream uses a right-handed coordinate system, while
      applications expect a left-handed one.*/
-  theora_read(_opb,8,&val);
+  theorapackB_read(_opb,8,&val);
   _info->pic_y=_info->frame_height-_info->pic_height-(ogg_uint32_t)val;
-  theora_read32(_opb,&val);
+  theorapackB_read(_opb,32,&val);
   _info->fps_numerator=(ogg_uint32_t)val;
-  theora_read32(_opb,&val);
+  theorapackB_read(_opb,32,&val);
   _info->fps_denominator=(ogg_uint32_t)val;
-  if(_info->frame_width<=0||_info->frame_height<=0||
+  if(_info->frame_width==0||_info->frame_height==0||
    _info->pic_width+_info->pic_x>_info->frame_width||
    _info->pic_height+_info->pic_y>_info->frame_height||
-   _info->fps_numerator<=0||_info->fps_denominator<=0){
+   _info->fps_numerator==0||_info->fps_denominator==0){
     return TH_EBADHEADER;
   }
-  theora_read(_opb,24,&val);
+  theorapackB_read(_opb,24,&val);
   _info->aspect_numerator=(ogg_uint32_t)val;
-  theora_read(_opb,24,&val);
+  theorapackB_read(_opb,24,&val);
   _info->aspect_denominator=(ogg_uint32_t)val;
-  theora_read(_opb,8,&val);
+  theorapackB_read(_opb,8,&val);
   _info->colorspace=(th_colorspace)val;
-  theora_read(_opb,24,&val);
+  theorapackB_read(_opb,24,&val);
   _info->target_bitrate=(int)val;
-  theora_read(_opb,6,&val);
+  theorapackB_read(_opb,6,&val);
   _info->quality=(int)val;
-  theora_read(_opb,5,&val);
+  theorapackB_read(_opb,5,&val);
   _info->keyframe_granule_shift=(int)val;
-  theora_read(_opb,2,&val);
+  theorapackB_read(_opb,2,&val);
   _info->pixel_fmt=(th_pixel_fmt)val;
   if(_info->pixel_fmt==TH_PF_RSVD)return TH_EBADHEADER;
-  if(theora_read(_opb,3,&val)<0||val!=0)return TH_EBADHEADER;
+  if(theorapackB_read(_opb,3,&val)<0||val!=0)return TH_EBADHEADER;
   return 0;
 }
 
 static int oc_comment_unpack(oggpack_buffer *_opb,th_comment *_tc){
   long len;
+  int  i;
   /*Read the vendor string.*/
   len=oc_unpack_length(_opb);
-  if(len<0)return TH_EBADHEADER;
+  if(len<0||theorapackB_bytes(_opb)+len>_opb->storage)return TH_EBADHEADER;
   _tc->vendor=_ogg_malloc((size_t)len+1);
   oc_unpack_octets(_opb,_tc->vendor,len);
   _tc->vendor[len]='\0';
   /*Read the user comments.*/
-  _tc->comments=oc_unpack_length(_opb);
-  if(_tc->comments>=0){
-    int i;
-    _tc->comment_lengths=(int *)_ogg_malloc(
-     _tc->comments*sizeof(_tc->comment_lengths[0]));
-    _tc->user_comments=(char **)_ogg_malloc(
-     _tc->comments*sizeof(_tc->user_comments[0]));
-    for(i=0;i<_tc->comments;i++){
-      len=oc_unpack_length(_opb);
-      if(len<0)return TH_EBADHEADER;
-      _tc->comment_lengths[i]=len;
-      _tc->user_comments[i]=_ogg_malloc((size_t)len+1);
-      oc_unpack_octets(_opb,_tc->user_comments[i],len);
-      _tc->user_comments[i][len]='\0';
+  _tc->comments=(int)oc_unpack_length(_opb);
+  if(_tc->comments<0||_tc->comments>(LONG_MAX>>2)||
+   theorapackB_bytes(_opb)+((long)_tc->comments<<2)>_opb->storage){
+    return TH_EBADHEADER;
+  }
+  _tc->comment_lengths=(int *)_ogg_malloc(
+   _tc->comments*sizeof(_tc->comment_lengths[0]));
+  _tc->user_comments=(char **)_ogg_malloc(
+   _tc->comments*sizeof(_tc->user_comments[0]));
+  for(i=0;i<_tc->comments;i++){
+    len=oc_unpack_length(_opb);
+    if(len<0||theorapackB_bytes(_opb)+len>_opb->storage){
+      _tc->comments=i;
+      return TH_EBADHEADER;
     }
+    _tc->comment_lengths[i]=len;
+    _tc->user_comments[i]=_ogg_malloc((size_t)len+1);
+    oc_unpack_octets(_opb,_tc->user_comments[i],len);
+    _tc->user_comments[i][len]='\0';
   }
-  return theora_read(_opb,0,&len)<0?TH_EBADHEADER:0;
+  return theorapackB_read(_opb,0,&len)<0?TH_EBADHEADER:0;
 }
 
 static int oc_setup_unpack(oggpack_buffer *_opb,th_setup_info *_setup){
@@ -152,7 +158,7 @@
   long val;
   int  packtype;
   int  ret;
-  theora_read(_opb,8,&val);
+  theorapackB_read(_opb,8,&val);
   packtype=(int)val;
   /*If we're at a data packet and we have received all three headers, we're
      done.*/
@@ -177,7 +183,7 @@
       if(_tc==NULL)return TH_EFAULT;
       /*We shoud have already decoded the info header, and should not yet have
          decoded the comment header.*/
-      if(_info->frame_width<=0||_tc->vendor!=NULL)return TH_EBADHEADER;
+      if(_info->frame_width==0||_tc->vendor!=NULL)return TH_EBADHEADER;
       ret=oc_comment_unpack(_opb,_tc);
       if(ret<0)th_comment_clear(_tc);
       else ret=2;
@@ -188,7 +194,7 @@
       if(_tc==NULL||_setup==NULL)return TH_EFAULT;
       /*We should have already decoded the info header and the comment header,
          and should not yet have decoded the setup header.*/
-      if(_info->frame_width<=0||_tc->vendor==NULL||*_setup!=NULL){
+      if(_info->frame_width==0||_tc->vendor==NULL||*_setup!=NULL){
         return TH_EBADHEADER;
       }
       setup=(oc_setup_info *)_ogg_calloc(1,sizeof(*setup));
@@ -220,9 +226,8 @@
   int            ret;
   if(_op==NULL)return TH_EBADHEADER;
   if(_info==NULL)return TH_EFAULT;
-  oggpackB_readinit(&opb,_op->packet,_op->bytes);
+  theorapackB_readinit(&opb,_op->packet,_op->bytes);
   ret=oc_dec_headerin(&opb,_info,_tc,_setup,_op);
-  /*TODO: Clear opb in libogg2.*/
   return ret;
 }
 

Modified: branches/theora-thusnelda/lib/dec/decint.h
===================================================================
--- branches/theora-thusnelda/lib/dec/decint.h	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/decint.h	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -20,6 +20,7 @@
 # define _decint_H (1)
 # include "theora/theoradec.h"
 # include "../internal.h"
+# include "bitpack.h"
 
 typedef struct th_setup_info oc_setup_info;
 typedef struct th_dec_ctx    oc_dec_ctx;
@@ -46,100 +47,54 @@
 
 struct th_dec_ctx{
   /*Shared encoder/decoder state.*/
-  oc_theora_state          state;
+  oc_theora_state      state;
   /*Whether or not packets are ready to be emitted.
     This takes on negative values while there are remaining header packets to
      be emitted, reaches 0 when the codec is ready for input, and goes to 1
      when a frame has been processed and a data packet is ready.*/
-  int                      packet_state;
+  int                  packet_state;
   /*Buffer in which to assemble packets.*/
-  oggpack_buffer           opb;
+  oggpack_buffer       opb;
   /*Huffman decode trees.*/
-  oc_huff_node            *huff_tables[TH_NHUFFMAN_TABLES];
+  oc_huff_node        *huff_tables[TH_NHUFFMAN_TABLES];
   /*The index of one past the last token in each plane for each coefficient.
     The final entries are the total number of tokens for each coefficient.*/
-  int                      ti0[3][64];
+  int                  ti0[3][64];
   /*The index of one past the last extra bits entry in each plane for each
      coefficient.
     The final entries are the total number of extra bits entries for each
      coefficient.*/
-  int                      ebi0[3][64];
+  int                  ebi0[3][64];
   /*The number of outstanding EOB runs at the start of each coefficient in each
      plane.*/
-  int                      eob_runs[3][64];
+  int                  eob_runs[3][64];
   /*The DCT token lists.*/
-  unsigned char          **dct_tokens;
+  unsigned char      **dct_tokens;
   /*The extra bits associated with DCT tokens.*/
-  ogg_uint16_t           **extra_bits;
+  ogg_uint16_t       **extra_bits;
   /*The out-of-loop post-processing level.*/
-  int                      pp_level;
+  int                  pp_level;
   /*The DC scale used for out-of-loop deblocking.*/
-  int                      pp_dc_scale[64];
+  int                  pp_dc_scale[64];
   /*The sharpen modifier used for out-of-loop deringing.*/
-  int                      pp_sharp_mod[64];
+  int                  pp_sharp_mod[64];
   /*The DC quantization index of each block.*/
-  unsigned char           *dc_qis;
+  unsigned char       *dc_qis;
   /*The variance of each block.*/
-  int                     *variances;
+  int                 *variances;
   /*The storage for the post-processed frame buffer.*/
-  unsigned char           *pp_frame_data;
+  unsigned char       *pp_frame_data;
   /*Whether or not the post-processsed frame buffer has space for chroma.*/
-  int                      pp_frame_has_chroma;
+  int                  pp_frame_has_chroma;
   /*The buffer used for the post-processed frame.*/
-  th_ycbcr_buffer          pp_frame_buf;
+  th_ycbcr_buffer      pp_frame_buf;
   /*The striped decode callback function.*/
-  th_stripe_callback       stripe_cb;
-
-  /* output metrics for debugging */
-  int                      telemetry;
-  int                      telemetry_mbmode;
-  int                      telemetry_mv;
-  unsigned char           *telemetry_frame_data;
-
+  th_stripe_callback   stripe_cb;
+  /*Output metrics for debugging.*/
+  int                  telemetry;
+  int                  telemetry_mbmode;
+  int                  telemetry_mv;
+  unsigned char       *telemetry_frame_data;
 };
 
-/*Fix-ups for the libogg1 API, which returns -1 when there are insufficient
-   bits left in the packet as the value read.
-  This has two problems:
-  a) Cannot distinguish between reading 32 1 bits and failing to have
-   sufficient bits left in the packet.
-  b) Returns values that are outside the range [0..(1<<nbits)-1], which can
-   crash code that uses such values as indexes into arrays, etc.
-
-  We solve the first problem by doing two reads and combining the results.
-  We solve the second problem by masking out the result based on the sign bit
-   of the return value.
-  It's a little more work, but branchless, so it should not slow us down much.
-
-  The libogg2 API does not have these problems, and the definitions of the
-   functions below can be replaced by direct libogg2 calls.
-
-  One issue remaining is that in libogg2, the return value and the number of
-   bits parameters are swapped between the read and write functions.
-  This can cause some confusion.
-  We could fix that in our wrapper here, but then we would be swapped from the
-   normal libogg2 calls, which could also cause confusion.
-  For the moment we keep the libogg2 parameter ordering.*/
-
-/*Read 32 bits.
-  *_ret is set to 0 on failure.
-  Return: 0 on success, or a negative value on failure.*/
-extern int theora_read32(oggpack_buffer *_opb,long *_ret);
-/*Read n bits, where n <= 31 for libogg1.
-  *_ret is set to 0 on failure.
-  Return: 0 on success, or a negative value on failure.*/
-extern int theora_read(oggpack_buffer *_opb,int _nbits,long *_ret);
-/*Read 1 bit,
-  *_ret is set to 0 on failure.
-  Return: 0 on success, or a negative value on failure.*/
-extern int theora_read1(oggpack_buffer *_opb,long *_ret);
-/*Look ahead n bits, where n <= 31 for libogg1.
-  In the event that there are some bits remaining, but fewer than n, then the
-   remaining bits are returned, with the missing bits set to 0, and the
-   function succeeds.
-  The stream can be advanced afterwards with oggpackB_adv().
-  *_ret is set to 0 on failure.
-  Return: 0 on success, or a negative value on failure.*/
-extern int theora_look(oggpack_buffer *_opb,int _nbits,long *_ret);
-
 #endif

Modified: branches/theora-thusnelda/lib/dec/decode.c
===================================================================
--- branches/theora-thusnelda/lib/dec/decode.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/decode.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,13 +6,13 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
   function:
     last mod: $Id$
-  
+
  ********************************************************************/
 
 #include <stdlib.h>
@@ -43,72 +43,8 @@
 /*Maximum valid post-processing level.*/
 #define OC_PP_LEVEL_MAX       (7)
 
-/*Read 32 bits.
-  *_ret is set to 0 on failure.
-  Return: 0 on success, or a negative value on failure.*/
-int theora_read32(oggpack_buffer *_opb,long *_ret){
-  long ret1;
-  long ret2;
-  long mask;
-  ret1=oggpackB_read(_opb,16);
-  ret2=oggpackB_read(_opb,16);
-  mask=ret2>>31;
-  *_ret=((ret1<<16)|ret2)&~mask;
-  return (int)mask;
-}
 
-/*Read n bits, where n <= 31 for libogg1.
-  *_ret is set to 0 on failure.
-  Return: 0 on success, or a negative value on failure.*/
-int theora_read(oggpack_buffer *_opb,int _nbits,long *_ret){
-  long mask;
-  *_ret=oggpackB_read(_opb,_nbits);
-  mask=*_ret>>31;
-  *_ret&=~mask;
-  return (int)mask;
-}
 
-/*Read 1 bit,
-  *_ret is set to 0 on failure.
-  Return: 0 on success, or a negative value on failure.*/
-int theora_read1(oggpack_buffer *_opb,long *_ret){
-  int mask;
-  *_ret=oggpackB_read1(_opb);
-  mask=(int)*_ret>>31;
-  *_ret&=~mask;
-  return mask;
-}
-
-/*Look ahead n bits, where n <= 31 for libogg1.
-  In the event that there are some bits remaining, but fewer than n, then the
-   remaining bits are returned, with the missing bits set to 0, and the
-   function succeeds.
-  The stream can be advanced afterwards with oggpackB_adv().
-  *_ret is set to 0 on failure.
-  Return: 0 on success, or a negative value on failure.*/
-int theora_look(oggpack_buffer *_opb,int _nbits,long *_ret){
-  int nbits;
-  *_ret=oggpackB_look(_opb,_nbits);
-  if(*_ret>=0)return 0;
-  /*libogg1 fails if we try to look past the end of the stream.
-    We might be looking ahead more bits than we actually need, however, and so
-     we must return the ones that are actually there.*/
-  /*There's no accessor for the storage field, which we need to figure out
-     how many bits _are_ left in the buffer (without resorting to trial and
-     error, which would be silly).*/
-  nbits=(_opb->storage<<3)-oggpackB_bits(_opb);
-  if(nbits>0){
-    /*If there are some bits left, return them.*/
-    *_ret=oggpackB_look(_opb,nbits)<<_nbits-nbits;
-    /*Success should be guaranteed.*/
-    return 0;
-  }
-  /*If there are no bits left, then we truly should fail.*/
-  *_ret=0;
-  return -1;
-}
-
-
 /*The mode alphabets for the various mode coding schemes.
   Scheme 0 uses a custom alphabet, which is not stored in this table.*/
 static const int OC_MODE_ALPHABETS[7][OC_NMODES]={
@@ -165,26 +101,26 @@
      11110xxx                10-17
      111110xxxx              18-33
      111111xxxxxxxxxxxx      34-4129*/
-  theora_read1(_opb,&bits);
+  theorapackB_read1(_opb,&bits);
   if(bits==0)return 1;
-  theora_read(_opb,2,&bits);
+  theorapackB_read(_opb,2,&bits);
   if((bits&2)==0)return 2+(int)bits;
   else if((bits&1)==0){
-    theora_read1(_opb,&bits);
+    theorapackB_read1(_opb,&bits);
     return 4+(int)bits;
   }
-  theora_read(_opb,3,&bits);
+  theorapackB_read(_opb,3,&bits);
   if((bits&4)==0)return 6+(int)bits;
   else if((bits&2)==0){
     ret=10+((bits&1)<<2);
-    theora_read(_opb,2,&bits);
+    theorapackB_read(_opb,2,&bits);
     return ret+(int)bits;
   }
   else if((bits&1)==0){
-    theora_read(_opb,4,&bits);
+    theorapackB_read(_opb,4,&bits);
     return 18+(int)bits;
   }
-  theora_read(_opb,12,&bits);
+  theorapackB_read(_opb,12,&bits);
   return 34+(int)bits;
 }
 
@@ -199,21 +135,21 @@
      1110xx                  7-10
      11110xx                 11-14
      11111xxxx               15-30*/
-  theora_read(_opb,2,&bits);
+  theorapackB_read(_opb,2,&bits);
   if((bits&2)==0)return 1+(int)bits;
   else if((bits&1)==0){
-    theora_read1(_opb,&bits);
+    theorapackB_read1(_opb,&bits);
     return 3+(int)bits;
   }
-  theora_read(_opb,2,&bits);
+  theorapackB_read(_opb,2,&bits);
   if((bits&2)==0)return 5+(int)bits;
   else if((bits&1)==0){
-    theora_read(_opb,2,&bits);
+    theorapackB_read(_opb,2,&bits);
     return 7+(int)bits;
   }
-  theora_read(_opb,3,&bits);
+  theorapackB_read(_opb,3,&bits);
   if((bits&4)==0)return 11+bits;
-  theora_read(_opb,2,&bits2);
+  theorapackB_read(_opb,2,&bits2);
   return 15+((bits&3)<<2)+bits2;
 }
 
@@ -227,13 +163,14 @@
   int ret;
   ret=oc_state_init(&_dec->state,_info);
   if(ret<0)return ret;
-  oc_huff_trees_copy(_dec->huff_tables,_setup->huff_tables);
+  oc_huff_trees_copy(_dec->huff_tables,
+   (const oc_huff_node *const *)_setup->huff_tables);
   for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
     _dec->state.dequant_tables[qti][pli]=
      _dec->state.dequant_table_data[qti][pli];
   }
   oc_dequant_tables_init(_dec->state.dequant_tables,_dec->pp_dc_scale,
-			 &_setup->qinfo);
+   &_setup->qinfo);
   for(qi=0;qi<64;qi++){
     int qsum;
     qsum=0;
@@ -279,36 +216,34 @@
 
 static int oc_dec_frame_header_unpack(oc_dec_ctx *_dec){
   long val;
-
   /*Check to make sure this is a data packet.*/
-  theora_read1(&_dec->opb,&val);
+  theorapackB_read1(&_dec->opb,&val);
   if(val!=0)return TH_EBADPACKET;
   /*Read in the frame type (I or P).*/
-  theora_read1(&_dec->opb,&val);
+  theorapackB_read1(&_dec->opb,&val);
   _dec->state.frame_type=(int)val;
   /*Read in the current qi.*/
-  theora_read(&_dec->opb,6,&val);
+  theorapackB_read(&_dec->opb,6,&val);
   _dec->state.qis[0]=(int)val;
-  theora_read1(&_dec->opb,&val);
+  theorapackB_read1(&_dec->opb,&val);
   if(!val)_dec->state.nqis=1;
   else{
-    theora_read(&_dec->opb,6,&val);
+    theorapackB_read(&_dec->opb,6,&val);
     _dec->state.qis[1]=(int)val;
-    theora_read1(&_dec->opb,&val);
+    theorapackB_read1(&_dec->opb,&val);
     if(!val)_dec->state.nqis=2;
     else{
-      theora_read(&_dec->opb,6,&val);
+      theorapackB_read(&_dec->opb,6,&val);
       _dec->state.qis[2]=(int)val;
       _dec->state.nqis=3;
     }
   }
-
   if(_dec->state.frame_type==OC_INTRA_FRAME){
     /*Keyframes have 3 unused configuration bits, holdovers from VP3 days.
       Most of the other unused bits in the VP3 headers were eliminated.
       I don't know why these remain.*/
     /* I wanted to eliminate wasted bits, but not all config wiggle room --Monty */
-    theora_read(&_dec->opb,3,&val);
+    theorapackB_read(&_dec->opb,3,&val);
     if(val!=0)return TH_EIMPL;
   }
   return 0;
@@ -364,9 +299,8 @@
   int    flag;
   int    npartial;
   int    run_count;
-  theora_read1(&_dec->opb,&val);
+  theorapackB_read1(&_dec->opb,&val);
   flag=(int)val;
-
   sb=_dec->state.sbs;
   sb_end=sb+_dec->state.nsbs;
   run_count=npartial=0;
@@ -380,10 +314,9 @@
       npartial+=flag;
       sb++;
     }
-
     while(--run_count>0&&sb<sb_end);
     if(full_run&&sb<sb_end){
-      theora_read1(&_dec->opb,&val);
+      theorapackB_read1(&_dec->opb,&val);
       flag=(int)val;
     }
     else flag=!flag;
@@ -408,9 +341,8 @@
   sb_end=sb+_dec->state.nsbs;
   /*Skip partially coded super blocks.*/
   for(;sb->coded_partially;sb++);
-  theora_read1(&_dec->opb,&val);
+  theorapackB_read1(&_dec->opb,&val);
   flag=(int)val;
-
   while(sb<sb_end){
     int full_run;
     run_count=oc_sb_run_unpack(&_dec->opb);
@@ -421,7 +353,7 @@
       sb->coded_fully=flag;
     }
     if(full_run&&sb<sb_end){
-      theora_read1(&_dec->opb,&val);
+      theorapackB_read1(&_dec->opb,&val);
       flag=(int)val;
     }
     else flag=!flag;
@@ -445,7 +377,7 @@
   npartial=oc_dec_partial_sb_flags_unpack(_dec);
   if(npartial<_dec->state.nsbs)oc_dec_coded_sb_flags_unpack(_dec);
   if(npartial>0){
-    theora_read1(&_dec->opb,&val);
+    theorapackB_read1(&_dec->opb,&val);
     flag=!(int)val;
   }
   else flag=0;
@@ -499,7 +431,7 @@
   long val;
   int  i;
   for(i=0;i<7;i++){
-    theora_read1(_opb,&val);
+    theorapackB_read1(_opb,&val);
     if(!val)break;
   }
   return i;
@@ -507,56 +439,50 @@
 
 static int oc_clc_mode_unpack(oggpack_buffer *_opb){
   long val;
-  theora_read(_opb,3,&val);
+  theorapackB_read(_opb,3,&val);
   return (int)val;
 }
 
 /*Unpacks the list of macro block modes for INTER frames.*/
-void oc_dec_mb_modes_unpack(oc_dec_ctx *_dec){
+static void oc_dec_mb_modes_unpack(oc_dec_ctx *_dec){
   oc_mode_unpack_func  mode_unpack;
   oc_mb               *mb;
   oc_mb               *mb_end;
   const int           *alphabet;
-  long                 val,j;
+  long                 val;
   int                  scheme0_alphabet[8];
   int                  mode_scheme;
-  theora_read(&_dec->opb,3,&val);
+  theorapackB_read(&_dec->opb,3,&val);
   mode_scheme=(int)val;
-
   if(mode_scheme==0){
     int mi;
     /*Just in case, initialize the modes to something.
       If the bitstream doesn't contain each index exactly once, it's likely
        corrupt and the rest of the packet is garbage anyway, but this way we
        won't crash, and we'll decode SOMETHING.*/
+    /*LOOP VECTORIZES*/
     for(mi=0;mi<OC_NMODES;mi++)scheme0_alphabet[mi]=OC_MODE_INTER_NOMV;
     for(mi=0;mi<OC_NMODES;mi++){
-      theora_read(&_dec->opb,3,&val);
+      theorapackB_read(&_dec->opb,3,&val);
       scheme0_alphabet[val]=OC_MODE_ALPHABETS[6][mi];
     }
     alphabet=scheme0_alphabet;
-  }else 
-    alphabet=OC_MODE_ALPHABETS[mode_scheme-1];
-  if(mode_scheme==7)
-    mode_unpack=oc_clc_mode_unpack;
-  else 
-    mode_unpack=oc_vlc_mode_unpack;
+  }
+  else alphabet=OC_MODE_ALPHABETS[mode_scheme-1];
+  if(mode_scheme==7)mode_unpack=oc_clc_mode_unpack;
+  else mode_unpack=oc_vlc_mode_unpack;
   mb=_dec->state.mbs;
   mb_end=mb+_dec->state.nmbs;
-
-  for(j=0;mb<mb_end;mb++){
+  for(;mb<mb_end;mb++){
     if(mb->mode!=OC_MODE_INVALID){
       int bi;
       for(bi=0;bi<4;bi++){
-	int fragi;
-	fragi=mb->map[0][bi];
-	if(fragi>=0&&_dec->state.frags[fragi].coded)break;
+        int fragi;
+        fragi=mb->map[0][bi];
+        if(fragi>=0&&_dec->state.frags[fragi].coded)break;
       }
-      if(bi<4){
-	mb->mode=alphabet[(*mode_unpack)(&_dec->opb)];
-	
-      }else 
-	mb->mode=OC_MODE_INTER_NOMV;
+      if(bi<4)mb->mode=alphabet[(*mode_unpack)(&_dec->opb)];
+      else mb->mode=OC_MODE_INTER_NOMV;
     }
   }
 }
@@ -568,34 +494,25 @@
 static int oc_vlc_mv_comp_unpack(oggpack_buffer *_opb){
   long bits;
   int  mvsigned[2];
-  theora_read(_opb,3,&bits);
+  theorapackB_read(_opb,3,&bits);
   switch(bits){
-    case 0:return 0;
-    case 1:return 1;
-    case 2:return -1;
-    case 3:{
-      mvsigned[0]=2;
-      theora_read1(_opb,&bits);
+    case  0:return 0;
+    case  1:return 1;
+    case  2:return -1;
+    case  3:
+    case  4:{
+      mvsigned[0]=(int)(bits-1);
+      theorapackB_read1(_opb,&bits);
     }break;
-    case 4:{
-      mvsigned[0]=3;
-      theora_read1(_opb,&bits);
-    }break;
-    case 5:{
-      theora_read(_opb,3,&bits);
-      mvsigned[0]=4+(bits>>1);
+    /*case  5:
+    case  6:
+    case  7:*/
+    default:{
+      mvsigned[0]=1<<bits-3;
+      theorapackB_read(_opb,bits-2,&bits);
+      mvsigned[0]+=(int)(bits>>1);
       bits&=1;
     }break;
-    case 6:{
-      theora_read(_opb,4,&bits);
-      mvsigned[0]=8+(bits>>1);
-      bits&=1;
-    }break;
-    case 7:{
-      theora_read(_opb,5,&bits);
-      mvsigned[0]=16+(bits>>1);
-      bits&=1;
-    }break;
   }
   mvsigned[1]=-mvsigned[0];
   return mvsigned[bits];
@@ -604,7 +521,7 @@
 static int oc_clc_mv_comp_unpack(oggpack_buffer *_opb){
   long bits;
   int  mvsigned[2];
-  theora_read(_opb,6,&bits);
+  theorapackB_read(_opb,6,&bits);
   mvsigned[0]=bits>>1;
   mvsigned[1]=-mvsigned[0];
   return mvsigned[bits&1];
@@ -623,7 +540,7 @@
   oc_mv                   last_mv[2];
   oc_mv                   cbmvs[4];
   set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_dec->state.info.pixel_fmt];
-  theora_read1(&_dec->opb,&val);
+  theorapackB_read1(&_dec->opb,&val);
   mv_comp_unpack=val?oc_clc_mv_comp_unpack:oc_vlc_mv_comp_unpack;
   map_idxs=OC_MB_MAP_IDXS[_dec->state.info.pixel_fmt];
   map_nidxs=OC_MB_MAP_NIDXS[_dec->state.info.pixel_fmt];
@@ -652,74 +569,62 @@
     if(ncoded<=0)continue;
     mb_mode=mb->mode;
     switch(mb_mode){
-    case OC_MODE_INTER_MV_FOUR:
-      {
-	oc_mv       lbmvs[4];
-	int         bi;
-	/*Mark the tail of the list, so we don't accidentally go past it.*/
-	coded[ncoded]=-1;
-	for(bi=codedi=0;bi<4;bi++){
-	  if(coded[codedi]==bi){
-	    codedi++;
-	    frag=_dec->state.frags+mb->map[0][bi];
-	    frag->mbmode=mb_mode;
-	    frag->mv[0]=lbmvs[bi][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-	    frag->mv[1]=lbmvs[bi][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-	  }
-	  else lbmvs[bi][0]=lbmvs[bi][1]=0;
-	}
-	if(codedi>0){
-	  last_mv[1][0]=last_mv[0][0];
-	  last_mv[1][1]=last_mv[0][1];
-	  last_mv[0][0]=lbmvs[coded[codedi-1]][0];
-	  last_mv[0][1]=lbmvs[coded[codedi-1]][1];
-	}
-	if(codedi<ncoded){
-	  (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs);
-	  for(;codedi<ncoded;codedi++){
-	    mapi=coded[codedi];
-	    bi=mapi&3;
-	    frag=_dec->state.frags+mb->map[mapi>>2][bi];
-	    frag->mbmode=mb_mode;
-	    frag->mv[0]=cbmvs[bi][0];
-	    frag->mv[1]=cbmvs[bi][1];
-	  }
-	}
-      }
-      break;
-    case OC_MODE_INTER_MV:
-      {
-	last_mv[1][0]=last_mv[0][0];
-	last_mv[1][1]=last_mv[0][1];
-	mbmv[0]=last_mv[0][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-	mbmv[1]=last_mv[0][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-      }
-      break;
-    case OC_MODE_INTER_MV_LAST:
-      {
+      case OC_MODE_INTER_MV_FOUR:{
+        oc_mv       lbmvs[4];
+        int         bi;
+        /*Mark the tail of the list, so we don't accidentally go past it.*/
+        coded[ncoded]=-1;
+        for(bi=codedi=0;bi<4;bi++){
+          if(coded[codedi]==bi){
+            codedi++;
+            frag=_dec->state.frags+mb->map[0][bi];
+            frag->mbmode=mb_mode;
+            frag->mv[0]=lbmvs[bi][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+            frag->mv[1]=lbmvs[bi][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+          }
+          else lbmvs[bi][0]=lbmvs[bi][1]=0;
+        }
+        if(codedi>0){
+          last_mv[1][0]=last_mv[0][0];
+          last_mv[1][1]=last_mv[0][1];
+          last_mv[0][0]=lbmvs[coded[codedi-1]][0];
+          last_mv[0][1]=lbmvs[coded[codedi-1]][1];
+        }
+        if(codedi<ncoded){
+          (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs);
+          for(;codedi<ncoded;codedi++){
+            mapi=coded[codedi];
+            bi=mapi&3;
+            frag=_dec->state.frags+mb->map[mapi>>2][bi];
+            frag->mbmode=mb_mode;
+            frag->mv[0]=cbmvs[bi][0];
+            frag->mv[1]=cbmvs[bi][1];
+          }
+        }
+      }break;
+      case OC_MODE_INTER_MV:{
+        last_mv[1][0]=last_mv[0][0];
+        last_mv[1][1]=last_mv[0][1];
+        mbmv[0]=last_mv[0][0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+        mbmv[1]=last_mv[0][1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
+      }break;
+      case OC_MODE_INTER_MV_LAST:{
         mbmv[0]=last_mv[0][0];
         mbmv[1]=last_mv[0][1];
-      }
-      break;
-    case OC_MODE_INTER_MV_LAST2:
-      {
+      }break;
+      case OC_MODE_INTER_MV_LAST2:{
         mbmv[0]=last_mv[1][0];
         mbmv[1]=last_mv[1][1];
         last_mv[1][0]=last_mv[0][0];
         last_mv[1][1]=last_mv[0][1];
         last_mv[0][0]=mbmv[0];
         last_mv[0][1]=mbmv[1];
-      }
-      break;
-    case OC_MODE_GOLDEN_MV:
-      {
+      }break;
+      case OC_MODE_GOLDEN_MV:{
         mbmv[0]=(signed char)(*mv_comp_unpack)(&_dec->opb);
         mbmv[1]=(signed char)(*mv_comp_unpack)(&_dec->opb);
-      }
-      break;
-    default:
-      mbmv[0]=mbmv[1]=0;
-      break;
+      }break;
+      default:mbmv[0]=mbmv[1]=0;break;
     }
     /*4MV mode fills in the fragments itself.
       For all other modes we can use this common code.*/
@@ -756,7 +661,7 @@
   else{
     long val;
     int  flag;
-    int  nqi0;
+    int  nqi1;
     int  run_count;
     /*Otherwise, we decode a qi index for each fragment, using two passes of
       the same binary RLE scheme used for super-block coded bits.
@@ -766,20 +671,20 @@
      At first we just store the qii in the fragment.
      After all the qii's are decoded, we make a final pass to replace them
       with the corresponding qi's for this frame.*/
-    theora_read1(&_dec->opb,&val);
+    theorapackB_read1(&_dec->opb,&val);
     flag=(int)val;
-    run_count=nqi0=0;
+    run_count=nqi1=0;
     while(coded_fragi<coded_fragi_end){
       int full_run;
       run_count=oc_sb_run_unpack(&_dec->opb);
       full_run=run_count>=4129;
       do{
         _dec->state.frags[*coded_fragi++].qi=flag;
-        nqi0+=!flag;
+        nqi1+=flag;
       }
       while(--run_count>0&&coded_fragi<coded_fragi_end);
       if(full_run&&coded_fragi<coded_fragi_end){
-        theora_read1(&_dec->opb,&val);
+        theorapackB_read1(&_dec->opb,&val);
         flag=(int)val;
       }
       else flag=!flag;
@@ -788,11 +693,11 @@
       If it's not, we should issue a warning of some kind.*/
     /*If we have 3 different qi's for this frame, and there was at least one
        fragment with a non-zero qi, make the second pass.*/
-    if(_dec->state.nqis==3&&nqi0<ncoded_fragis){
+    if(_dec->state.nqis==3&&nqi1>0){
       /*Skip qii==0 fragments.*/
       for(coded_fragi=_dec->state.coded_fragis;
        _dec->state.frags[*coded_fragi].qi==0;coded_fragi++);
-      theora_read1(&_dec->opb,&val);
+      theorapackB_read1(&_dec->opb,&val);
       flag=(int)val;
       while(coded_fragi<coded_fragi_end){
         int full_run;
@@ -806,7 +711,7 @@
           frag->qi+=flag;
         }
         if(full_run&&coded_fragi<coded_fragi_end){
-          theora_read1(&_dec->opb,&val);
+          theorapackB_read1(&_dec->opb,&val);
           flag=(int)val;
         }
         else flag=!flag;
@@ -909,7 +814,7 @@
   _token:      The token value to decode.
   _extra_bits: The extra bits attached to this token.
   Return: The decoded coefficient value.*/
-int oc_dct_token_dec1val(int _token,int _extra_bits){
+static int oc_dct_token_dec1val(int _token,int _extra_bits){
   return (*OC_TOKEN_DEC1VAL_TABLE[_token-OC_NDCT_EOB_TOKEN_MAX])(_token,
    _extra_bits);
 }
@@ -959,7 +864,7 @@
       _dec->dct_tokens[0][ti++]=(unsigned char)token;
       neb=OC_DCT_TOKEN_EXTRA_BITS[token];
       if(neb){
-        theora_read(&_dec->opb,neb,&val);
+        theorapackB_read(&_dec->opb,neb,&val);
         eb=(int)val;
         _dec->extra_bits[0][ebi++]=(ogg_uint16_t)eb;
       }
@@ -1031,7 +936,7 @@
       _dec->dct_tokens[_zzi][ti++]=(unsigned char)token;
       neb=OC_DCT_TOKEN_EXTRA_BITS[token];
       if(neb){
-        theora_read(&_dec->opb,neb,&val);
+        theorapackB_read(&_dec->opb,neb,&val);
         eb=(int)val;
         _dec->extra_bits[_zzi][ebi++]=(ogg_uint16_t)eb;
       }
@@ -1099,17 +1004,17 @@
   for(pli=0;pli<3;pli++)for(zzi=0;zzi<64;zzi++){
     ntoks_left[pli][zzi]=_dec->state.ncoded_fragis[pli];
   }
-  theora_read(&_dec->opb,4,&val);
+  theorapackB_read(&_dec->opb,4,&val);
   huffi_y=(int)val;
-  theora_read(&_dec->opb,4,&val);
+  theorapackB_read(&_dec->opb,4,&val);
   huffi_c=(int)val;
   huff_idxs[0]=huffi_y;
   huff_idxs[1]=huff_idxs[2]=huffi_c;
   _dec->eob_runs[0][0]=0;
   eobs=oc_dec_dc_coeff_unpack(_dec,huff_idxs,ntoks_left);
-  theora_read(&_dec->opb,4,&val);
+  theorapackB_read(&_dec->opb,4,&val);
   huffi_y=(int)val;
-  theora_read(&_dec->opb,4,&val);
+  theorapackB_read(&_dec->opb,4,&val);
   huffi_c=(int)val;
   zzi=1;
   for(hgi=1;hgi<5;hgi++){
@@ -1141,7 +1046,7 @@
  ogg_int16_t _dct_coeffs[128],int *_zzi);
 
 /*Expands a zero run token.*/
-void oc_token_expand_zrl(int _token,int _extra_bits,
+static void oc_token_expand_zrl(int _token,int _extra_bits,
  ogg_int16_t _dct_coeffs[128],int *_zzi){
   int zzi;
   zzi=*_zzi;
@@ -1151,38 +1056,39 @@
 }
 
 /*Expands a constant, single-value token.*/
-void oc_token_expand_const(int _token,int _extra_bits,
+static void oc_token_expand_const(int _token,int _extra_bits,
  ogg_int16_t _dct_coeffs[128],int *_zzi){
   _dct_coeffs[(*_zzi)++]=(ogg_int16_t)oc_token_dec1val_const(_token);
 }
 
 /*Expands category 2 single-valued tokens.*/
-void oc_token_expand_cat2(int _token,int _extra_bits,
+static void oc_token_expand_cat2(int _token,int _extra_bits,
  ogg_int16_t _dct_coeffs[128],int *_zzi){
   _dct_coeffs[(*_zzi)++]=
    (ogg_int16_t)oc_token_dec1val_cat2(_token,_extra_bits);
 }
 
 /*Expands category 3 through 8 single-valued tokens.*/
-void oc_token_expand_cati(int _token,int _extra_bits,
+static void oc_token_expand_cati(int _token,int _extra_bits,
  ogg_int16_t _dct_coeffs[128],int *_zzi){
   _dct_coeffs[(*_zzi)++]=
    (ogg_int16_t)oc_token_dec1val_cati(_token,_extra_bits);
 }
 
 /*Expands a category 1a zero run/value combo token.*/
-void oc_token_expand_run_cat1a(int _token,int _extra_bits,
+static void oc_token_expand_run_cat1a(int _token,int _extra_bits,
  ogg_int16_t _dct_coeffs[128],int *_zzi){
   int zzi;
   int rl;
   zzi=*_zzi;
+  /*LOOP VECTORIZES.*/
   for(rl=_token-OC_DCT_RUN_CAT1A+1;rl-->0;)_dct_coeffs[zzi++]=0;
   _dct_coeffs[zzi++]=(ogg_int16_t)(1-(_extra_bits<<1));
   *_zzi=zzi;
 }
 
 /*Expands all other zero run/value combo tokens.*/
-void oc_token_expand_run(int _token,int _extra_bits,
+static void oc_token_expand_run(int _token,int _extra_bits,
  ogg_int16_t _dct_coeffs[128],int *_zzi){
   static const int NZEROS_ADJUST[OC_NDCT_RUN_MAX-OC_DCT_RUN_CAT1B]={
     6,10,1,2
@@ -1208,6 +1114,7 @@
   _token-=OC_DCT_RUN_CAT1B;
   rl=(_extra_bits&NZEROS_MASK[_token])+NZEROS_ADJUST[_token];
   zzi=*_zzi;
+  /*LOOP VECTORIZES.*/
   while(rl-->0)_dct_coeffs[zzi++]=0;
   valsigned[0]=VALUE_ADJUST[_token]+
    (_extra_bits>>VALUE_SHIFT[_token]&VALUE_MASK[_token]);
@@ -1318,13 +1225,13 @@
     if(_dec->pp_level<OC_PP_LEVEL_DEBLOCKC){
       _dec->variances=(int *)_ogg_realloc(_dec->variances,
        _dec->state.fplanes[0].nfrags*sizeof(_dec->variances[0]));
-      _dec->pp_frame_data=(unsigned char *)_ogg_realloc( 
+      _dec->pp_frame_data=(unsigned char *)_ogg_realloc(
        _dec->pp_frame_data,frame_sz*sizeof(_dec->pp_frame_data[0]));
       _dec->pp_frame_buf[0].width=_dec->state.info.frame_width;
       _dec->pp_frame_buf[0].height=_dec->state.info.frame_height;
-      _dec->pp_frame_buf[0].ystride=-_dec->pp_frame_buf[0].width;
+      _dec->pp_frame_buf[0].stride=-_dec->pp_frame_buf[0].width;
       _dec->pp_frame_buf[0].data=_dec->pp_frame_data+
-       (1-_dec->pp_frame_buf[0].height)*_dec->pp_frame_buf[0].ystride;
+       (1-_dec->pp_frame_buf[0].height)*_dec->pp_frame_buf[0].stride;
     }
     else{
       size_t y_sz;
@@ -1338,19 +1245,19 @@
       c_h=_dec->state.info.frame_height>>!(_dec->state.info.pixel_fmt&2);
       c_sz=c_w*c_h;
       frame_sz+=c_sz<<1;
-      _dec->pp_frame_data=(unsigned char *)_ogg_realloc( 
+      _dec->pp_frame_data=(unsigned char *)_ogg_realloc(
        _dec->pp_frame_data,frame_sz*sizeof(_dec->pp_frame_data[0]));
       _dec->pp_frame_buf[0].width=_dec->state.info.frame_width;
       _dec->pp_frame_buf[0].height=_dec->state.info.frame_height;
-      _dec->pp_frame_buf[0].ystride=_dec->pp_frame_buf[0].width;
+      _dec->pp_frame_buf[0].stride=_dec->pp_frame_buf[0].width;
       _dec->pp_frame_buf[0].data=_dec->pp_frame_data;
       _dec->pp_frame_buf[1].width=c_w;
       _dec->pp_frame_buf[1].height=c_h;
-      _dec->pp_frame_buf[1].ystride=_dec->pp_frame_buf[1].width;
+      _dec->pp_frame_buf[1].stride=_dec->pp_frame_buf[1].width;
       _dec->pp_frame_buf[1].data=_dec->pp_frame_buf[0].data+y_sz;
       _dec->pp_frame_buf[2].width=c_w;
       _dec->pp_frame_buf[2].height=c_h;
-      _dec->pp_frame_buf[2].ystride=_dec->pp_frame_buf[2].width;
+      _dec->pp_frame_buf[2].stride=_dec->pp_frame_buf[2].width;
       _dec->pp_frame_buf[2].data=_dec->pp_frame_buf[1].data+c_sz;
       oc_ycbcr_buffer_flip(_dec->pp_frame_buf,_dec->pp_frame_buf);
     }
@@ -1371,7 +1278,7 @@
   int  ti[3][64];
   int  ebi[3][64];
   int  eob_runs[3][64];
-  int  bounding_values[512];
+  int  bounding_values[256];
   int *coded_fragis[3];
   int *uncoded_fragis[3];
   int  fragy0[3];
@@ -1673,48 +1580,48 @@
    (_fragy_end+notdone-_fragy0-notstart)*fplane->nhfrags*sizeof(variance[0]));
   /*Except for the first time, we want to point to the middle of the row.*/
   y=(_fragy0<<3)+(notstart<<2);
-  dst=_dst->data+y*_dst->ystride;
-  src=_src->data+y*_src->ystride;
+  dst=_dst->data+y*_dst->stride;
+  src=_src->data+y*_src->stride;
   for(;y<4;y++){
     memcpy(dst,src,_dst->width*sizeof(dst[0]));
-    dst+=_dst->ystride;
-    src+=_src->ystride;
+    dst+=_dst->stride;
+    src+=_src->stride;
   }
   /*We also want to skip the last row in the frame for this loop.*/
   y_end=_fragy_end-!notdone<<3;
   for(;y<y_end;y+=8){
     qstep=_dec->pp_dc_scale[*dc_qi];
     flimit=(qstep*3)>>2;
-    oc_filter_hedge(dst,_dst->ystride,src-_src->ystride,_src->ystride,
+    oc_filter_hedge(dst,_dst->stride,src-_src->stride,_src->stride,
      qstep,flimit,variance,variance+fplane->nhfrags);
     variance++;
     dc_qi++;
     for(x=8;x<_dst->width;x+=8){
       qstep=_dec->pp_dc_scale[*dc_qi];
       flimit=(qstep*3)>>2;
-      oc_filter_hedge(dst+x,_dst->ystride,src+x-_src->ystride,_src->ystride,
+      oc_filter_hedge(dst+x,_dst->stride,src+x-_src->stride,_src->stride,
        qstep,flimit,variance,variance+fplane->nhfrags);
-      oc_filter_vedge(dst+x-(_dst->ystride<<2)-4,_dst->ystride,
+      oc_filter_vedge(dst+x-(_dst->stride<<2)-4,_dst->stride,
        qstep,flimit,variance-1);
       variance++;
       dc_qi++;
     }
-    dst+=_dst->ystride<<3;
-    src+=_src->ystride<<3;
+    dst+=_dst->stride<<3;
+    src+=_src->stride<<3;
   }
   /*And finally, handle the last row in the frame, if it's in the range.*/
   if(!notdone){
     for(;y<_dst->height;y++){
       memcpy(dst,src,_dst->width*sizeof(dst[0]));
-      dst+=_dst->ystride;
-      src+=_src->ystride;
+      dst+=_dst->stride;
+      src+=_src->stride;
     }
     /*Filter the last row of vertical block edges.*/
     dc_qi++;
     for(x=8;x<_dst->width;x+=8){
       qstep=_dec->pp_dc_scale[*dc_qi++];
       flimit=(qstep*3)>>2;
-      oc_filter_vedge(dst+x-(_dst->ystride<<3)-4,_dst->ystride,
+      oc_filter_vedge(dst+x-(_dst->stride<<3)-4,_dst->stride,
        qstep,flimit,variance++);
     }
   }
@@ -1766,71 +1673,54 @@
   for(by=0;by<8;by++){
     int a;
     int b;
-    int p;
-    int p1;
-    int a1;
+    int w;
     a=128;
     b=64;
-    p=src[0];
-    p1=*(src-!(_b&1));
-    a1=hmod[(bx<<3)+by];
-    a-=a1;
-    b+=a1*p1;
-    p1=psrc[0];
-    a1=vmod[(by<<3)+bx];
-    a-=a1;
-    b+=a1*p1;
-    p1=nsrc[0];
-    a1=vmod[(by+1<<3)+bx];
-    a-=a1;
-    b+=a1*p1;
-    p1=src[1];
-    a1=hmod[(bx+1<<3)+by];
-    a-=a1;
-    b+=a1*p1;
-    dst[0]=OC_CLAMP255(a*p+b>>7);
+    w=hmod[by];
+    a-=w;
+    b+=w**(src-!(_b&1));
+    w=vmod[(by<<3)];
+    a-=w;
+    b+=w*psrc[0];
+    w=vmod[(by+1<<3)];
+    a-=w;
+    b+=w*nsrc[0];
+    w=hmod[(1<<3)+by];
+    a-=w;
+    b+=w*src[1];
+    dst[0]=OC_CLAMP255(a*src[0]+b>>7);
     for(bx=1;bx<7;bx++){
       a=128;
       b=64;
-      p=src[bx];
-      p1=src[bx-1];
-      a1=hmod[(bx<<3)+by];
-      a-=a1;
-      b+=a1*p1;
-      p1=psrc[bx];
-      a1=vmod[(by<<3)+bx];
-      a-=a1;
-      b+=a1*p1;
-      p1=nsrc[bx];
-      a1=vmod[(by+1<<3)+bx];
-      a-=a1;
-      b+=a1*p1;
-      p1=src[bx+1];
-      a1=hmod[(bx+1<<3)+by];
-      a-=a1;
-      b+=a1*p1;
-      dst[bx]=OC_CLAMP255(a*p+b>>7);
+      w=hmod[(bx<<3)+by];
+      a-=w;
+      b+=w*src[bx-1];
+      w=vmod[(by<<3)+bx];
+      a-=w;
+      b+=w*psrc[bx];
+      w=vmod[(by+1<<3)+bx];
+      a-=w;
+      b+=w*nsrc[bx];
+      w=hmod[(bx+1<<3)+by];
+      a-=w;
+      b+=w*src[bx+1];
+      dst[bx]=OC_CLAMP255(a*src[bx]+b>>7);
     }
     a=128;
     b=64;
-    p=src[7];
-    p1=src[6];
-    a1=hmod[(bx<<3)+by];
-    a-=a1;
-    b+=a1*p1;
-    p1=psrc[7];
-    a1=vmod[(by<<3)+bx];
-    a-=a1;
-    b+=a1*p1;
-    p1=nsrc[7];
-    a1=vmod[(by+1<<3)+bx];
-    a-=a1;
-    b+=a1*p1;
-    p1=src[7+!(_b&2)];
-    a1=hmod[(bx+1<<3)+by];
-    a-=a1;
-    b+=a1*p1;
-    dst[7]=OC_CLAMP255(a*p+b>>7);
+    w=hmod[(7<<3)+by];
+    a-=w;
+    b+=w*src[6];
+    w=vmod[(by<<3)+7];
+    a-=w;
+    b+=w*psrc[7];
+    w=vmod[(by+1<<3)+7];
+    a-=w;
+    b+=w*nsrc[7];
+    w=hmod[(8<<3)+by];
+    a-=w;
+    b+=w*src[7+!(_b&2)];
+    dst[7]=OC_CLAMP255(a*src[7]+b>>7);
     dst+=_ystride;
     psrc=src;
     src=nsrc;
@@ -1845,7 +1735,7 @@
 
 static void oc_dec_dering_frag_rows(oc_dec_ctx *_dec,th_img_plane *_img,
  int _pli,int _fragy0,int _fragy_end){
-  th_img_plane  *iplane;
+  th_img_plane      *iplane;
   oc_fragment_plane *fplane;
   oc_fragment       *frag;
   int               *variance;
@@ -1864,7 +1754,7 @@
   strong=_dec->pp_level>=(_pli?OC_PP_LEVEL_SDERINGC:OC_PP_LEVEL_SDERINGY);
   sthresh=_pli?OC_DERING_THRESH4:OC_DERING_THRESH3;
   y=_fragy0<<3;
-  idata=iplane->data+y*iplane->ystride;
+  idata=iplane->data+y*iplane->stride;
   y_end=_fragy_end<<3;
   for(;y<y_end;y+=8){
     for(x=0;x<iplane->width;x+=8){
@@ -1875,30 +1765,30 @@
       var=*variance;
       b=(x<=0)|(x+8>=iplane->width)<<1|(y<=0)<<2|(y+8>=iplane->height)<<3;
       if(strong&&var>sthresh){
-        oc_dering_block(idata+x,iplane->ystride,b,
+        oc_dering_block(idata+x,iplane->stride,b,
          _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1);
-        if(_pli||(b&1)&&*(variance-1)>OC_DERING_THRESH4||
-         (b&2)&&variance[1]>OC_DERING_THRESH4||
-         (b&4)&&*(variance-fplane->nvfrags)>OC_DERING_THRESH4||
-         (b&8)&&variance[fplane->nvfrags]>OC_DERING_THRESH4){
-          oc_dering_block(idata+x,iplane->ystride,b,
+        if(_pli||!(b&1)&&*(variance-1)>OC_DERING_THRESH4||
+         !(b&2)&&variance[1]>OC_DERING_THRESH4||
+         !(b&4)&&*(variance-fplane->nvfrags)>OC_DERING_THRESH4||
+         !(b&8)&&variance[fplane->nvfrags]>OC_DERING_THRESH4){
+          oc_dering_block(idata+x,iplane->stride,b,
            _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1);
-          oc_dering_block(idata+x,iplane->ystride,b,
+          oc_dering_block(idata+x,iplane->stride,b,
            _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1);
         }
       }
       else if(var>OC_DERING_THRESH2){
-        oc_dering_block(idata+x,iplane->ystride,b,
+        oc_dering_block(idata+x,iplane->stride,b,
          _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],1);
       }
       else if(var>OC_DERING_THRESH1){
-        oc_dering_block(idata+x,iplane->ystride,b,
+        oc_dering_block(idata+x,iplane->stride,b,
          _dec->pp_dc_scale[qi],_dec->pp_sharp_mod[qi],0);
       }
       frag++;
       variance++;
     }
-    idata+=iplane->ystride<<3;
+    idata+=iplane->stride<<3;
   }
 }
 
@@ -1991,7 +1881,6 @@
   /*A completely empty packet indicates a dropped frame and is treated exactly
      like an inter frame with no coded blocks.
     Only proceed if we have a non-empty packet.*/
-
   if(_op->bytes!=0){
     oc_dec_pipeline_state pipe;
     th_ycbcr_buffer       stripe_buf;
@@ -2000,7 +1889,7 @@
     int                   pli;
     int                   notstart;
     int                   notdone;
-    oggpackB_readinit(&_dec->opb,_op->packet,_op->bytes);
+    theorapackB_readinit(&_dec->opb,_op->packet,_op->bytes);
     ret=oc_dec_frame_header_unpack(_dec);
     if(ret<0)return ret;
     /*Select a free buffer to use for the reconstructed version of this
@@ -2045,7 +1934,6 @@
     }
     oc_dec_block_qis_unpack(_dec);
     oc_dec_residual_tokens_unpack(_dec);
-
     /*Update granule position.
       This must be done before the striped decode callbacks so that the
        application knows what to do with the frame data.*/
@@ -2155,7 +2043,6 @@
       }
       notstart=1;
     }
-
     /*Finish filling in the reference frame borders.*/
     for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_dec->state,refi,pli);
     /*Update the reference frame indices.*/

Modified: branches/theora-thusnelda/lib/dec/dequant.c
===================================================================
--- branches/theora-thusnelda/lib/dec/dequant.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/dequant.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -36,30 +36,30 @@
   int            qri;
   int            qi;
   int            i;
-  theora_read(_opb,3,&val);
+  theorapackB_read(_opb,3,&val);
   nbits=(int)val;
   for(qi=0;qi<64;qi++){
-    theora_read(_opb,nbits,&val);
+    theorapackB_read(_opb,nbits,&val);
     _qinfo->loop_filter_limits[qi]=(unsigned char)val;
   }
-  theora_read(_opb,4,&val);
+  theorapackB_read(_opb,4,&val);
   nbits=(int)val+1;
   for(qi=0;qi<64;qi++){
-    theora_read(_opb,nbits,&val);
+    theorapackB_read(_opb,nbits,&val);
     _qinfo->ac_scale[qi]=(ogg_uint16_t)val;
   }
-  theora_read(_opb,4,&val);
+  theorapackB_read(_opb,4,&val);
   nbits=(int)val+1;
   for(qi=0;qi<64;qi++){
-    theora_read(_opb,nbits,&val);
+    theorapackB_read(_opb,nbits,&val);
     _qinfo->dc_scale[qi]=(ogg_uint16_t)val;
   }
-  theora_read(_opb,9,&val);
+  theorapackB_read(_opb,9,&val);
   nbase_mats=(int)val+1;
   base_mats=_ogg_malloc(nbase_mats*sizeof(base_mats[0]));
   for(bmi=0;bmi<nbase_mats;bmi++){
     for(ci=0;ci<64;ci++){
-      theora_read(_opb,8,&val);
+      theorapackB_read(_opb,8,&val);
       base_mats[bmi][ci]=(unsigned char)val;
     }
   }
@@ -72,12 +72,12 @@
     pli=i%3;
     qranges=_qinfo->qi_ranges[qti]+pli;
     if(i>0){
-      theora_read1(_opb,&val);
+      theorapackB_read1(_opb,&val);
       if(!val){
         int qtj;
         int plj;
         if(qti>0){
-          theora_read1(_opb,&val);
+          theorapackB_read1(_opb,&val);
           if(val){
             qtj=qti-1;
             plj=pli;
@@ -95,13 +95,13 @@
         continue;
       }
     }
-    theora_read(_opb,nbits,&val);
+    theorapackB_read(_opb,nbits,&val);
     indices[0]=(int)val;
     for(qi=qri=0;qi<63;){
-      theora_read(_opb,oc_ilog(62-qi),&val);
+      theorapackB_read(_opb,oc_ilog(62-qi),&val);
       sizes[qri]=(int)val+1;
       qi+=(int)val+1;
-      theora_read(_opb,nbits,&val);
+      theorapackB_read(_opb,nbits,&val);
       indices[++qri]=(int)val;
     }
     /*Note: The caller is responsible for cleaning up any partially
@@ -169,4 +169,3 @@
     _ogg_free((void *)_qinfo->qi_ranges[qti][pli].base_matrices);
   }
 }
-

Modified: branches/theora-thusnelda/lib/dec/dequant.h
===================================================================
--- branches/theora-thusnelda/lib/dec/dequant.h	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/dequant.h	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 

Deleted: branches/theora-thusnelda/lib/dec/enquant.h
===================================================================
--- branches/theora-thusnelda/lib/dec/enquant.h	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/enquant.h	2009-02-06 09:43:27 UTC (rev 15675)
@@ -1,43 +0,0 @@
-/********************************************************************
- *                                                                  *
- * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
- * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
- * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
- * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
- *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
- *                                                                  *
- ********************************************************************
-
-  function:
-    last mod: $Id$
-
- ********************************************************************/
-
-#if !defined(_enquant_H)
-# define _enquant_H (1)
-# include "quant.h"
-
-/*The amount to scale the forward quantizer value by.*/
-#define OC_FQUANT_SCALE ((ogg_uint32_t)1<<OC_FQUANT_SHIFT)
-/*The amount to add to the scaled forward quantizer for rounding.*/
-#define OC_FQUANT_ROUND (1<<OC_FQUANT_SHIFT-1)
-/*The amount to shift the resulting product by.*/
-#define OC_FQUANT_SHIFT (16)
-
-
-
-/*The default quantization parameters used by VP3.1.*/
-extern const th_quant_info TH_VP31_QUANT_INFO;
-/*Our default quantization parameters.*/
-extern const th_quant_info OC_DEF_QUANT_INFO[4];
-
-
-
-void oc_quant_params_pack(oggpack_buffer *_opb,
- const th_quant_info *_qinfo);
-void oc_enquant_tables_init(oc_quant_table *_dequant[2][3],
- oc_quant_table *_enquant[2][3],const th_quant_info *_qinfo);
-
-#endif

Modified: branches/theora-thusnelda/lib/dec/fragment.c
===================================================================
--- branches/theora-thusnelda/lib/dec/fragment.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/fragment.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -186,6 +186,7 @@
   if(pflags==0)return _pred_last[pred_frame];
   else{
     ret=PRED_SCALE[pflags][0]*p[0];
+    /*LOOP VECTORIZES.*/
     for(i=1;i<np;i++)ret+=PRED_SCALE[pflags][i]*p[i];
     ret=OC_DIV_POW2(ret,PRED_SHIFT[pflags],PRED_RMASK[pflags]);
   }

Modified: branches/theora-thusnelda/lib/dec/huffdec.c
===================================================================
--- branches/theora-thusnelda/lib/dec/huffdec.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/huffdec.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -25,6 +25,52 @@
 #define _ogg_offsetof(_type,_field)\
  ((size_t)((char *)&((_type *)0)->_field-(char *)0))
 
+/*These two functions are really part of the bitpack.c module, but
+  they are only used here. Declaring local static versions so they
+  can be inlined saves considerable function call overhead.*/
+
+/*Read in bits without advancing the bitptr.
+  Here we assume 0<=_bits&&_bits<=32.*/
+static int theorapackB_look(oggpack_buffer *_b,int _bits,long *_ret){
+  long ret;
+  long m;
+  long d;
+  m=32-_bits;
+  _bits+=_b->endbit;
+  d=_b->storage-_b->endbyte;
+  if(d<=4){
+    /*Not the main path.*/
+    if(d<=0){
+      *_ret=0L;
+      return -(_bits>d*8);
+    }
+    /*If we have some bits left, but not enough, return the ones we have.*/
+    if(d*8<_bits)_bits=d*8;
+  }
+  ret=_b->ptr[0]<<24+_b->endbit;
+  if(_bits>8){
+    ret|=_b->ptr[1]<<16+_b->endbit;
+    if(_bits>16){
+      ret|=_b->ptr[2]<<8+_b->endbit;
+      if(_bits>24){
+        ret|=_b->ptr[3]<<_b->endbit;
+        if(_bits>32)ret|=_b->ptr[4]>>8-_b->endbit;
+      }
+    }
+  }
+  *_ret=((ret&0xFFFFFFFF)>>(m>>1))>>(m+1>>1);
+  return 0;
+}
+
+/*Advance the bitptr by _bits bits.*/
+static void theorapackB_adv(oggpack_buffer *_b,int _bits){
+  _bits+=_b->endbit;
+  _b->ptr+=_bits>>3;
+  _b->endbyte+=_bits>>3;
+  _b->endbit=_bits&7;
+}
+
+
 /*The log_2 of the size of a lookup table is allowed to grow relative to
    the number of unique nodes it contains.
   E.g., if OC_HUFF_SLUSH is 2, then at most 75% of the space in the tree is
@@ -63,7 +109,7 @@
   _node: The node to free.
          This may be NULL.*/
 static void oc_huff_node_free(oc_huff_node *_node){
-  free(_node);
+  _ogg_free(_node);
 }
 
 /*Frees the memory used by a Huffman tree.
@@ -77,7 +123,7 @@
     int inext;
     nchildren=1<<_node->nbits;
     for(i=0;i<nchildren;i=inext){
-      inext=i+(1<<_node->nbits-_node->nodes[i]->depth);
+      inext=i+(_node->nodes[i]!=NULL?1<<_node->nbits-_node->nodes[i]->depth:1);
       oc_huff_tree_free(_node->nodes[i]);
     }
   }
@@ -96,7 +142,7 @@
   long          bits;
   /*Prevent infinite recursion.*/
   if(++_depth>32)return TH_EBADHEADER;
-  if(theora_read1(_opb,&bits)<0)return TH_EBADHEADER;
+  if(theorapackB_read1(_opb,&bits)<0)return TH_EBADHEADER;
   /*Read an internal node:*/
   if(!bits){
     int ret;
@@ -106,12 +152,13 @@
     if(ret>=0)ret=oc_huff_tree_unpack(_opb,binode->nodes+1,_depth);
     if(ret<0){
       oc_huff_tree_free(binode);
+      *_binode=NULL;
       return ret;
     }
   }
   /*Read a leaf node:*/
   else{
-    if(theora_read(_opb,OC_NDCT_TOKEN_BITS,&bits)<0)return TH_EBADHEADER;
+    if(theorapackB_read(_opb,OC_NDCT_TOKEN_BITS,&bits)<0)return TH_EBADHEADER;
     binode=oc_huff_node_alloc(0);
     binode->depth=(unsigned char)(_depth>1);
     binode->token=(unsigned char)bits;
@@ -251,7 +298,7 @@
   _dst: The array to store the copy in.
   _src: The array of trees to copy.*/
 void oc_huff_trees_copy(oc_huff_node *_dst[TH_NHUFFMAN_TABLES],
- /*const*/ oc_huff_node *const _src[TH_NHUFFMAN_TABLES]){
+ const oc_huff_node *const _src[TH_NHUFFMAN_TABLES]){
   int i;
   for(i=0;i<TH_NHUFFMAN_TABLES;i++)_dst[i]=oc_huff_tree_copy(_src[i]);
 }
@@ -270,9 +317,9 @@
 int oc_huff_token_decode(oggpack_buffer *_opb,const oc_huff_node *_node){
   long bits;
   while(_node->nbits!=0){
-    theora_look(_opb,_node->nbits,&bits);
+    theorapackB_look(_opb,_node->nbits,&bits);
     _node=_node->nodes[bits];
-    oggpackB_adv(_opb,_node->depth);
+    theorapackB_adv(_opb,_node->depth);
   }
   return _node->token;
 }

Modified: branches/theora-thusnelda/lib/dec/huffdec.h
===================================================================
--- branches/theora-thusnelda/lib/dec/huffdec.h	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/huffdec.h	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -33,7 +33,29 @@
   We do _not_ require that a subtree be complete to be collapsed, but instead
    store duplicate pointers in the table, and record the actual depth of the
    node below its parent.
-  This tells us the number of bits to advance the stream after reaching it.*/
+  This tells us the number of bits to advance the stream after reaching it.
+
+  This turns out to be equivalent to the method described in \cite{Hash95},
+   without the requirement that codewords be sorted by length.
+  If the codewords were sorted by length (so-called ``canonical-codes''), they
+   could be decoded much faster via either Lindell and Moffat's approach or
+   Hashemian's Condensed Huffman Code approach, the latter of which has an
+   extremely small memory footprint.
+  We can't use Choueka et al.'s finite state machine approach, which is
+   extremely fast, because we can't allow multiple symbols to be output at a
+   time; the codebook can and does change between symbols.
+  It also has very large memory requirements, which impairs cache coherency.
+
+  @ARTICLE{Hash95,
+    author="Reza Hashemian",
+    title="Memory Efficient and High-Speed Search {Huffman} Coding",
+    journal="{IEEE} Transactions on Communications",
+    volume=43,
+    number=10,
+    pages="2576--2581",
+    month=Oct,
+    year=1995
+  }*/
 struct oc_huff_node{
   /*The number of bits of the code needed to descend through this node.
     0 indicates a leaf node.
@@ -61,7 +83,7 @@
 int oc_huff_trees_unpack(oggpack_buffer *_opb,
  oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]);
 void oc_huff_trees_copy(oc_huff_node *_dst[TH_NHUFFMAN_TABLES],
- /*const*/ oc_huff_node *const _src[TH_NHUFFMAN_TABLES]);
+ const oc_huff_node *const _src[TH_NHUFFMAN_TABLES]);
 void oc_huff_trees_clear(oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]);
 int oc_huff_token_decode(oggpack_buffer *_opb,const oc_huff_node *_node);
 

Modified: branches/theora-thusnelda/lib/dec/huffman.h
===================================================================
--- branches/theora-thusnelda/lib/dec/huffman.h	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/huffman.h	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 

Modified: branches/theora-thusnelda/lib/dec/idct.c
===================================================================
--- branches/theora-thusnelda/lib/dec/idct.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/idct.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -169,7 +169,6 @@
   _y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
 }
 
-
 /*Performs an inverse 8 point Type-II DCT transform.
   The output is scaled by a factor of 2 relative to the orthonormal version of
    the transform.
@@ -204,7 +203,6 @@
   _y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
 }
 
-
 /*Performs an inverse 8 point Type-II DCT transform.
   The output is scaled by a factor of 2 relative to the orthonormal version of
    the transform.

Modified: branches/theora-thusnelda/lib/dec/idct.h
===================================================================
--- branches/theora-thusnelda/lib/dec/idct.h	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/idct.h	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 

Modified: branches/theora-thusnelda/lib/dec/info.c
===================================================================
--- branches/theora-thusnelda/lib/dec/info.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/info.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 

Modified: branches/theora-thusnelda/lib/dec/internal.c
===================================================================
--- branches/theora-thusnelda/lib/dec/internal.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/internal.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -354,8 +354,8 @@
   for(pli=0;pli<3;pli++){
     _dst[pli].width=_src[pli].width;
     _dst[pli].height=_src[pli].height;
-    _dst[pli].ystride=-_src[pli].ystride;
-    _dst[pli].data=_src[pli].data+(1-_dst[pli].height)*_dst[pli].ystride;
+    _dst[pli].stride=-_src[pli].stride;
+    _dst[pli].data=_src[pli].data+(1-_dst[pli].height)*_dst[pli].stride;
   }
 }
 
@@ -381,4 +381,3 @@
 int th_packet_iskeyframe(ogg_packet *_op){
   return _op->bytes<=0?0:_op->packet[0]&0x80?-1:!(_op->packet[0]&0x40);
 }
-

Modified: branches/theora-thusnelda/lib/dec/ocintrin.h
===================================================================
--- branches/theora-thusnelda/lib/dec/ocintrin.h	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/ocintrin.h	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 

Modified: branches/theora-thusnelda/lib/dec/quant.c
===================================================================
--- branches/theora-thusnelda/lib/dec/quant.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/quant.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -21,8 +21,8 @@
 #include "quant.h"
 #include "decint.h"
 
-unsigned OC_DC_QUANT_MIN[2]={4<<2,8<<2};
-unsigned OC_AC_QUANT_MIN[2]={2<<2,4<<2};
+static const unsigned OC_DC_QUANT_MIN[2]={4<<2,8<<2};
+static const unsigned OC_AC_QUANT_MIN[2]={2<<2,4<<2};
 
 /*Initializes the dequantization tables from a set of quantizer info.
   Currently the dequantizer (and elsewhere enquantizer) tables are expected to
@@ -38,95 +38,84 @@
    matrices being used for the current frame, and to recalculate these as the
    qi values change between frames (this is what VP3 did).*/
 void oc_dequant_tables_init(oc_quant_table *_dequant[2][3],
-			    int _pp_dc_scale[64],
-			    const th_quant_info *_qinfo){
-  int          qti; /* coding mode: intra or inter */
-  int          pli; /* Y U V */
+ int _pp_dc_scale[64],const th_quant_info *_qinfo){
+  /*coding mode: intra or inter.*/
+  int          qti;
+  /*Y', C_b, C_r*/
+  int          pli;
   for(qti=0;qti<2;qti++){
     for(pli=0;pli<3;pli++){
       oc_quant_tables stage;
-
-      int qi;  /* quality index */
-      int qri; /* range iterator */
-      
-      for(qi=0,qri=0; qri<=_qinfo->qi_ranges[qti][pli].nranges; qri++){
-	th_quant_base base;
-	
-	ogg_uint32_t      q;
-	int               qi_start;
-	int               qi_end;
-	int               ci;
-	memcpy(base,_qinfo->qi_ranges[qti][pli].base_matrices[qri],
-	       sizeof(base));
-
-	qi_start=qi;
-	if(qri==_qinfo->qi_ranges[qti][pli].nranges)
-	  qi_end=qi+1;
-	else 
-	  qi_end=qi+_qinfo->qi_ranges[qti][pli].sizes[qri];
-	
-	/* Iterate over quality indicies in this range */
-	for(;;){
-	  
-	  /*In the original VP3.2 code, the rounding offset and the size of the
-	    dead zone around 0 were controlled by a "sharpness" parameter.
-	    The size of our dead zone is now controlled by the per-coefficient
-	    quality thresholds returned by our HVS module.
-	    We round down from a more accurate value when the quality of the
-	    reconstruction does not fall below our threshold and it saves bits.
-	    Hence, all of that VP3.2 code is gone from here, and the remaining
-	    floating point code has been implemented as equivalent integer code
-	    with exact precision.*/
-
-	  /* for postprocess, not dequant */
-	  if(_pp_dc_scale!=NULL)
-	    _pp_dc_scale[qi]=(int)((ogg_uint32_t)_qinfo->dc_scale[qi]*base[0]/160);
-
-	  /*Scale DC the coefficient from the proper table.*/
-	  q=((ogg_uint32_t)_qinfo->dc_scale[qi]*base[0]/100)<<2;
-	  q=OC_CLAMPI(OC_DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
-	  stage[qi][0]=(ogg_uint16_t)q;
-	  
-	  /*Now scale AC coefficients from the proper table.*/
-	  for(ci=1;ci<64;ci++){
-	    q=((ogg_uint32_t)_qinfo->ac_scale[qi]*base[ci]/100)<<2;
-	    q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
-	    stage[qi][ci]=(ogg_uint16_t)q;
-	  }
-	  
-	  if(++qi>=qi_end)break;
-	  
-	  /*Interpolate the next base matrix.*/
-	  for(ci=0;ci<64;ci++){
-	    base[ci]=(unsigned char)
-	      ((2*((qi_end-qi)*_qinfo->qi_ranges[qti][pli].base_matrices[qri][ci]+
-		   (qi-qi_start)*_qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci])
-		+_qinfo->qi_ranges[qti][pli].sizes[qri])/
-	       (2*_qinfo->qi_ranges[qti][pli].sizes[qri]));
-	  }
-	}
+      /*Quality index.*/
+      int qi;
+      /*Range iterator.*/
+      int qri;
+      for(qi=0,qri=0; qri<=_qinfo->qi_ranges[qti][pli].nranges;qri++){
+        th_quant_base base;
+        ogg_uint32_t  q;
+        int           qi_start;
+        int           qi_end;
+        int           ci;
+        memcpy(base,_qinfo->qi_ranges[qti][pli].base_matrices[qri],
+         sizeof(base));
+        qi_start=qi;
+        if(qri==_qinfo->qi_ranges[qti][pli].nranges)qi_end=qi+1;
+        else qi_end=qi+_qinfo->qi_ranges[qti][pli].sizes[qri];
+        /*Iterate over quality indices in this range.*/
+        for(;;){
+          ogg_uint32_t qfac;
+          /*In the original VP3.2 code, the rounding offset and the size of the
+             dead zone around 0 were controlled by a "sharpness" parameter.
+            The size of our dead zone is now controlled by the per-coefficient
+             quality thresholds returned by our HVS module.
+            We round down from a more accurate value when the quality of the
+             reconstruction does not fall below our threshold and it saves bits.
+            Hence, all of that VP3.2 code is gone from here, and the remaining
+             floating point code has been implemented as equivalent integer code
+             with exact precision.*/
+          qfac=(ogg_uint32_t)_qinfo->dc_scale[qi]*base[0];
+          /*For postprocessing, not dequantization.*/
+          if(_pp_dc_scale!=NULL)_pp_dc_scale[qi]=(int)(qfac/160);
+          /*Scale the DC coefficient from the proper table.*/
+          q=(qfac/100)<<2;
+          q=OC_CLAMPI(OC_DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
+          stage[qi][0]=(ogg_uint16_t)q;
+          /*Now scale AC coefficients from the proper table.*/
+          for(ci=1;ci<64;ci++){
+            q=((ogg_uint32_t)_qinfo->ac_scale[qi]*base[ci]/100)<<2;
+            q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
+            stage[qi][ci]=(ogg_uint16_t)q;
+          }
+          if(++qi>=qi_end)break;
+          /*Interpolate the next base matrix.*/
+          for(ci=0;ci<64;ci++){
+            base[ci]=(unsigned char)(
+             (2*((qi_end-qi)*_qinfo->qi_ranges[qti][pli].base_matrices[qri][ci]+
+             (qi-qi_start)*_qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci])
+             +_qinfo->qi_ranges[qti][pli].sizes[qri])/
+             (2*_qinfo->qi_ranges[qti][pli].sizes[qri]));
+          }
+        }
       }
-
-      /* Staging matricies complete; commit to memory only if this
-	 isn't a duplicate of a preceeding plane. This simple check
-	 helps us improve cache coherency later.*/
+      /*Staging matrices complete; commit to memory only if this isn't a
+         duplicate of a preceding plane.
+        This simple check helps us improve cache coherency later.*/
       {
-	int dupe = 0;
-	int i,j;
-	for(i=0;i<=qti;i++){
-	  for(j=0;j<(i<qti?3:pli);j++){
-	    if(!memcmp(stage,_dequant[i][j],sizeof(stage))){
-	      dupe = 1;
-	      break;
-	    }
-	  }
-	  if(dupe)break;
-	}
-	if(dupe){
-	  _dequant[qti][pli]=_dequant[i][j];
-	}else{
-	  memcpy(_dequant[qti][pli],stage,sizeof(stage));
-	}
+        int dupe;
+        int qtj;
+        int plj;
+        dupe=0;
+        for(qtj=0;qtj<=qti;qtj++){
+          for(plj=0;plj<(qtj<qti?3:pli);plj++){
+            if(!memcmp(stage,_dequant[qtj][plj],sizeof(stage))){
+              dupe=1;
+              break;
+            }
+          }
+          if(dupe)break;
+        }
+        if(dupe)_dequant[qti][pli]=_dequant[qtj][plj];
+        else memcpy(_dequant[qti][pli],stage,sizeof(stage));
       }
     }
   }

Modified: branches/theora-thusnelda/lib/dec/quant.h
===================================================================
--- branches/theora-thusnelda/lib/dec/quant.h	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/quant.h	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -24,23 +24,11 @@
 typedef oc_quant_table oc_quant_tables[64];
 
 
-
 /*Maximum scaled quantizer value.*/
 #define OC_QUANT_MAX          (1024<<2)
 
 
-
-/*Minimum scaled DC coefficient frame quantizer value for intra and inter
-   modes.*/
-extern unsigned OC_DC_QUANT_MIN[2];
-/*Minimum scaled AC coefficient frame quantizer value for intra and inter
-   modes.*/
-extern unsigned OC_AC_QUANT_MIN[2];
-
-
-
 void oc_dequant_tables_init(oc_quant_table *_dequant[2][3],
-			    int _pp_dc_scale[64],
-			    const th_quant_info *_qinfo);
+ int _pp_dc_scale[64],const th_quant_info *_qinfo);
 
 #endif

Modified: branches/theora-thusnelda/lib/dec/state.c
===================================================================
--- branches/theora-thusnelda/lib/dec/state.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/state.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -20,8 +20,12 @@
 #include "../internal.h"
 #include "idct.h"
 #if defined(USE_ASM)
+#if defined(_MSC_VER)
+# include "x86_vc/x86int.h"
+#else
 # include "x86/x86int.h"
 #endif
+#endif
 #if defined(OC_DUMP_IMAGES)
 # include <stdio.h>
 # include "png.h"
@@ -486,12 +490,12 @@
   /*Set up the width, height and stride for the image buffers.*/
   _state->ref_frame_bufs[0][0].width=info->frame_width;
   _state->ref_frame_bufs[0][0].height=info->frame_height;
-  _state->ref_frame_bufs[0][0].ystride=yhstride;
+  _state->ref_frame_bufs[0][0].stride=yhstride;
   _state->ref_frame_bufs[0][1].width=_state->ref_frame_bufs[0][2].width=
    info->frame_width>>!(info->pixel_fmt&1);
   _state->ref_frame_bufs[0][1].height=_state->ref_frame_bufs[0][2].height=
    info->frame_height>>!(info->pixel_fmt&2);
-  _state->ref_frame_bufs[0][1].ystride=_state->ref_frame_bufs[0][2].ystride=
+  _state->ref_frame_bufs[0][1].stride=_state->ref_frame_bufs[0][2].stride=
    chstride;
   memcpy(_state->ref_frame_bufs[1],_state->ref_frame_bufs[0],
    sizeof(_state->ref_frame_bufs[0]));
@@ -544,6 +548,7 @@
 
 
 int oc_state_init(oc_theora_state *_state,const th_info *_info){
+  int old_granpos;
   /*First validate the parameters.*/
   if(_info==NULL)return TH_EFAULT;
   /*The width and height of the encoded frame must be multiples of 16.
@@ -579,8 +584,15 @@
   if(_info->keyframe_granule_shift<0||_info->keyframe_granule_shift>31){
     _state->info.keyframe_granule_shift=31;
   }
-  _state->keyframe_num=0;
-  _state->curframe_num=-1;
+  _state->keyframe_num=1;
+  _state->curframe_num=0;
+  /*3.2.0 streams mark the frame index instead of the frame count.
+    This was changed with stream version 3.2.1 to conform to other Ogg
+     codecs.
+    We subtract an extra one from the frame number for old streams.*/
+  old_granpos=!TH_VERSION_CHECK(_info,3,2,1);
+  _state->curframe_num-=old_granpos;
+  _state->keyframe_num-=old_granpos;
   return 0;
 }
 
@@ -607,15 +619,15 @@
   int               hpadding;
   hpadding=OC_UMV_PADDING>>(_pli!=0&&!(_state->info.pixel_fmt&1));
   iplane=_state->ref_frame_bufs[_refi]+_pli;
-  apix=iplane->data+_y0*iplane->ystride;
+  apix=iplane->data+_y0*iplane->stride;
   bpix=apix+iplane->width-1;
-  epix=iplane->data+_yend*iplane->ystride;
+  epix=iplane->data+_yend*iplane->stride;
   /*Note the use of != instead of <, which allows ystride to be negative.*/
   while(apix!=epix){
     memset(apix-hpadding,apix[0],hpadding);
     memset(bpix+1,bpix[0],hpadding);
-    apix+=iplane->ystride;
-    bpix+=iplane->ystride;
+    apix+=iplane->stride;
+    bpix+=iplane->stride;
   }
 }
 
@@ -638,13 +650,13 @@
   iplane=_state->ref_frame_bufs[_refi]+_pli;
   fullw=iplane->width+(hpadding<<1);
   apix=iplane->data-hpadding;
-  bpix=iplane->data+(iplane->height-1)*iplane->ystride-hpadding;
-  epix=apix-iplane->ystride*vpadding;
+  bpix=iplane->data+(iplane->height-1)*iplane->stride-hpadding;
+  epix=apix-iplane->stride*vpadding;
   while(apix!=epix){
-    memcpy(apix-iplane->ystride,apix,fullw);
-    memcpy(bpix+iplane->ystride,bpix,fullw);
-    apix-=iplane->ystride;
-    bpix+=iplane->ystride;
+    memcpy(apix-iplane->stride,apix,fullw);
+    memcpy(bpix+iplane->stride,bpix,fullw);
+    apix-=iplane->stride;
+    bpix+=iplane->stride;
   }
 }
 
@@ -697,7 +709,7 @@
         frag->buffer[_buf_idx]=hpix;
         hpix+=8;
       }
-      vpix+=iplane->ystride<<3;
+      vpix+=iplane->stride<<3;
     }
   }
 }
@@ -713,22 +725,19 @@
 
 /*Determines the offsets in an image buffer to use for motion compensation.
   _state:   The Theora state the offsets are to be computed with.
-  _offset0: Returns the offset for the first buffer.
-  _offset1: Returns the offset for the second buffer, if the motion vector
-             has non-zero fractional components.
+  _offsets: Returns the offset for the buffer(s).
+            _offsets[0] is always set.
+            _offsets[1] is set if the motion vector has non-zero fractional
+             components.
   _dx:      The X component of the motion vector.
   _dy:      The Y component of the motion vector.
   _ystride: The Y stride in the buffer the motion vector points into.
   _pli:     The color plane index.
   Return: The number of offsets returned: 1 or 2.*/
-int oc_state_get_mv_offsets(oc_theora_state *_state,int *_offset0,
- int *_offset1,int _dx,int _dy,int _ystride,int _pli){
-  int offset0;
-  int offset1;
+int oc_state_get_mv_offsets(oc_theora_state *_state,int _offsets[2],
+ int _dx,int _dy,int _ystride,int _pli){
   int xprec;
   int yprec;
-  int xsign;
-  int ysign;
   int xfrac;
   int yfrac;
   /*Here is a brief description of how Theora handles motion vectors:
@@ -747,67 +756,44 @@
      appropriate amount, always truncating _away_ from zero.*/
   /*These two variables decide whether we are in half- or quarter-pixel
      precision in each component.*/
-  xprec=1+(!(_state->info.pixel_fmt&1)&!!_pli);
-  yprec=1+(!(_state->info.pixel_fmt&2)&!!_pli);
-  /*These two variables are either 0 for a non-negative vector or all 1's for
-     a negative one.*/
-  xsign=-(_dx<0);
-  ysign=-(_dy<0);
+  xprec=1+(!(_state->info.pixel_fmt&1)&&_pli);
+  yprec=1+(!(_state->info.pixel_fmt&2)&&_pli);
   /*These two variables are either 0 if all the fractional bits are 0 or 1 if
      any of them are non-zero.*/
   xfrac=!!(_dx&(1<<xprec)-1);
   yfrac=!!(_dy&(1<<yprec)-1);
-  /*This branchless code is equivalent to:
-  if(_dx<0){
-    if(_dy<0){
-      offset0=-(-_dx>>xprec)-(-_dy>>yprec)*_ystride;
-    }
-    else{
-      offset0=-(-_dx>>xprec)+(_dy>>yprec)*_ystride;
-    }
-  }
-  else{
-    if(_dy<0){
-      offset0=(_dx>>xprec)-(-_dy>>yprec)*_ystride;
-    }
-    else{
-      offset0=(_dx>>xprec)+(_dy>>yprec)*_ystride;
-    }
-  }*/
-  *_offset0=offset0=(_dx>>xprec)+(xfrac&xsign)+
-   ((_dy>>yprec)+(yfrac&ysign))*_ystride;
+  _offsets[0]=(_dx>>xprec)+(_dy>>yprec)*_ystride;
   if(xfrac||yfrac){
-    int o[2];
     /*This branchless code is equivalent to:
+    if(_dx<0)_offsets[0]=-(-_dx>>xprec);
+    else _offsets[0]=(_dx>>xprec);
+    if(_dy<0)_offsets[0]-=(-_dy>>yprec)*_ystride;
+    else _offsets[0]+=(_dy>>yprec)*_ystride;
+    _offsets[1]=_offsets[0];
     if(xfrac){
-      if(_dx<0)offset1=offset0-1;
-      else offset1=offset0+1;
+      if(_dx<0)_offsets[1]--;
+      else _offsets[1]++;
     }
-    else offset1=offset0;*/
-    o[0]=offset0;
-    o[1]=offset0+(xsign|1);
-    offset1=o[xfrac];
-    /*This branchless code is equivalent to:
     if(yfrac){
-      if(_dy<0)offset1-=ref_stride;
-      else offset1+=ref_stride;
+      if(_dy<0)_offsets[1]-=_ystride;
+      else _offsets[1]+=_ystride;
     }*/
-    o[0]=offset1;
-    o[1]=offset1+(_ystride&~ysign)-(_ystride&ysign);
-    *_offset1=o[yfrac];
+    _offsets[1]=_offsets[0];
+    _offsets[_dx>=0]+=xfrac;
+    _offsets[_dy>=0]+=_ystride&-yfrac;
     return 2;
   }
-  return 1;
+  else return 1;
 }
 
-void oc_state_frag_recon(oc_theora_state *_state, oc_fragment *_frag,
+void oc_state_frag_recon(oc_theora_state *_state,oc_fragment *_frag,
  int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
   _state->opt_vtable.state_frag_recon(_state,_frag,_pli,_dct_coeffs,
    _last_zzi,_ncoefs,_dc_iquant,_ac_iquant);
 }
 
-void oc_state_frag_recon_c(oc_theora_state *_state, oc_fragment *_frag,
+void oc_state_frag_recon_c(oc_theora_state *_state,oc_fragment *_frag,
  int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant, const ogg_uint16_t _ac_iquant[64]){
   ogg_int16_t dct_buf[64];
@@ -845,8 +831,8 @@
     ogg_int16_t p;
     /*Why is the iquant product rounded in this case and no others?
       Who knows.*/
-
     p=(ogg_int16_t)((ogg_int32_t)_frag->dc*_dc_iquant+15>>5);
+    /*LOOP VECTORIZES.*/
     for(ci=0;ci<64;ci++)res_buf[ci]=p;
   }
   else{
@@ -857,7 +843,6 @@
       ci=OC_FZIG_ZAG[zzi];
       dct_buf[ci]=(ogg_int16_t)((ogg_int32_t)_dct_coeffs[zzi]*_ac_iquant[ci]);
     }
-
     /*Then, fill in the remainder of the coefficients with 0's, and perform
        the iDCT.*/
     if(_last_zzi<10){
@@ -868,11 +853,10 @@
       for(;zzi<64;zzi++)dct_buf[OC_FZIG_ZAG[zzi]]=0;
       oc_idct8x8_c(res_buf,dct_buf);
     }
-
   }
   /*Fill in the target buffer.*/
   dst_framei=_state->ref_frame_idx[OC_FRAME_SELF];
-  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].stride;
   /*For now ystride values in all ref frames assumed to be equal.*/
   if(_frag->mbmode==OC_MODE_INTRA){
     oc_frag_recon_intra(_state,_frag->buffer[dst_framei],dst_ystride,res_buf);
@@ -880,19 +864,18 @@
   else{
     int ref_framei;
     int ref_ystride;
-    int mvoffset0;
-    int mvoffset1;
+    int mvoffsets[2];
     ref_framei=_state->ref_frame_idx[OC_FRAME_FOR_MODE[_frag->mbmode]];
-    ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].ystride;
-    if(oc_state_get_mv_offsets(_state,&mvoffset0,&mvoffset1,_frag->mv[0],
-     _frag->mv[1],ref_ystride,_pli)>1){
+    ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].stride;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_frag->mv[0],_frag->mv[1],
+     ref_ystride,_pli)>1){
       oc_frag_recon_inter2(_state,_frag->buffer[dst_framei],dst_ystride,
-       _frag->buffer[ref_framei]+mvoffset0,ref_ystride,
-       _frag->buffer[ref_framei]+mvoffset1,ref_ystride,res_buf);
+       _frag->buffer[ref_framei]+mvoffsets[0],ref_ystride,
+       _frag->buffer[ref_framei]+mvoffsets[1],ref_ystride,res_buf);
     }
     else{
       oc_frag_recon_inter(_state,_frag->buffer[dst_framei],dst_ystride,
-       _frag->buffer[ref_framei]+mvoffset0,ref_ystride,res_buf);
+       _frag->buffer[ref_framei]+mvoffsets[0],ref_ystride,res_buf);
     }
   }
   oc_restore_fpu(_state);
@@ -921,8 +904,8 @@
   int        src_ystride;
   dst_framei=_state->ref_frame_idx[_dst_frame];
   src_framei=_state->ref_frame_idx[_src_frame];
-  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
-  src_ystride=_state->ref_frame_bufs[src_framei][_pli].ystride;
+  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].stride;
+  src_ystride=_state->ref_frame_bufs[src_framei][_pli].stride;
   fragi_end=_fragis+_nfragis;
   for(fragi=_fragis;fragi<fragi_end;fragi++){
     oc_fragment   *frag;
@@ -946,6 +929,9 @@
   for(y=0;y<8;y++){
     int f;
     f=_pix[0]-_pix[3]+3*(_pix[2]-_pix[1]);
+    /*The _bv array is used to compute the function
+      f=OC_CLAMPI(OC_MINI(-_2flimit-f,0),f,OC_MAXI(_2flimit-f,0));
+      where _2flimit=_state->loop_filter_limits[_state->qis[0]]<<1;*/
     f=*(_bv+(f+4>>3));
     _pix[1]=OC_CLAMP255(_pix[1]+f);
     _pix[2]=OC_CLAMP255(_pix[2]-f);
@@ -959,6 +945,9 @@
   for(y=0;y<8;y++){
     int f;
     f=_pix[0]-_pix[_ystride*3]+3*(_pix[_ystride*2]-_pix[_ystride]);
+    /*The _bv array is used to compute the function
+      f=OC_CLAMPI(OC_MINI(-_2flimit-f,0),f,OC_MAXI(_2flimit-f,0));
+      where _2flimit=_state->loop_filter_limits[_state->qis[0]]<<1;*/
     f=*(_bv+(f+4>>3));
     _pix[_ystride]=OC_CLAMP255(_pix[_ystride]+f);
     _pix[_ystride*2]=OC_CLAMP255(_pix[_ystride*2]-f);
@@ -974,12 +963,12 @@
   int i;
   flimit=_state->loop_filter_limits[_state->qis[0]];
   if(flimit==0)return 1;
-  memset(_bv,0,sizeof(_bv[0])*512);
+  memset(_bv,0,sizeof(_bv[0])*256);
   for(i=0;i<flimit;i++){
-    _bv[256-i-flimit]=i-flimit;
-    _bv[256-i]=-i;
-    _bv[256+i]=i;
-    _bv[256+i+flimit]=flimit-i;
+    if(127-i-flimit>=0)_bv[127-i-flimit]=i-flimit;
+    _bv[127-i]=-i;
+    _bv[127+i]=i;
+    if(127+i+flimit<256)_bv[127+i+flimit]=flimit-i;
   }
   return 0;
 }
@@ -999,7 +988,7 @@
 }
 
 void oc_state_loop_filter_frag_rows_c(oc_theora_state *_state,int *_bv,
- int _refi,int _pli,int _fragy0,int _fragy_end){  
+ int _refi,int _pli,int _fragy0,int _fragy_end){
   th_img_plane      *iplane;
   oc_fragment_plane *fplane;
   oc_fragment       *frag_top;
@@ -1008,10 +997,9 @@
   oc_fragment       *frag_end;
   oc_fragment       *frag0_end;
   oc_fragment       *frag_bot;
-  _bv+=256;
+  _bv+=127;
   iplane=_state->ref_frame_bufs[_refi]+_pli;
   fplane=_state->fplanes+_pli;
-
   /*The following loops are constructed somewhat non-intuitively on purpose.
     The main idea is: if a block boundary has at least one coded fragment on
      it, the filter is applied to it.
@@ -1027,20 +1015,19 @@
     while(frag<frag_end){
       if(frag->coded){
         if(frag>frag0){
-          loop_filter_h(frag->buffer[_refi],iplane->ystride,_bv);
+          loop_filter_h(frag->buffer[_refi],iplane->stride,_bv);
         }
         if(frag0>frag_top){
-          loop_filter_v(frag->buffer[_refi],iplane->ystride,_bv);
+          loop_filter_v(frag->buffer[_refi],iplane->stride,_bv);
         }
         if(frag+1<frag_end&&!(frag+1)->coded){
-          loop_filter_h(frag->buffer[_refi]+8,iplane->ystride,_bv);
+          loop_filter_h(frag->buffer[_refi]+8,iplane->stride,_bv);
         }
         if(frag+fplane->nhfrags<frag_bot&&!(frag+fplane->nhfrags)->coded){
           loop_filter_v((frag+fplane->nhfrags)->buffer[_refi],
-           iplane->ystride,_bv);
+           iplane->stride,_bv);
         }
       }
-
       frag++;
     }
     frag0+=fplane->nhfrags;
@@ -1103,9 +1090,9 @@
   y_row=_state->ref_frame_bufs[framei][0].data;
   u_row=_state->ref_frame_bufs[framei][1].data;
   v_row=_state->ref_frame_bufs[framei][2].data;
-  y_stride=_state->ref_frame_bufs[framei][0].ystride;
-  u_stride=_state->ref_frame_bufs[framei][1].ystride;
-  v_stride=_state->ref_frame_bufs[framei][2].ystride;
+  y_stride=_state->ref_frame_bufs[framei][0].stride;
+  u_stride=_state->ref_frame_bufs[framei][1].stride;
+  v_stride=_state->ref_frame_bufs[framei][2].stride;
   /*Chroma up-sampling is just done with a box filter.
     This is very likely what will actually be used in practice on a real
      display, and also removes one more layer to search in for the source of
@@ -1185,14 +1172,11 @@
     ogg_int64_t pframe;
     iframe=_granpos>>state->info.keyframe_granule_shift;
     pframe=_granpos-(iframe<<state->info.keyframe_granule_shift);
-
-    /* 3.2.0 streams mark the frame index instead of the frame count
-     * this was changed with stream version 3.2.1 */ 
-    if(state->info.version_subminor < 1) {
-      return iframe+pframe + 1;
-    } else {
-      return iframe+pframe;
-    }
+    /*3.2.0 streams store the frame index in the granule position.
+      3.2.1 and later store the frame count.
+      We return the index, so adjust the value if we have a 3.2.1 or later
+       stream.*/
+    return iframe+pframe-TH_VERSION_CHECK(&state->info,3,2,1);
   }
   return -1;
 }
@@ -1201,7 +1185,8 @@
   oc_theora_state *state;
   state=(oc_theora_state *)_encdec;
   if(_granpos>=0){
-      return th_granule_frame(_encdec, _granpos)*((double)state->info.fps_denominator/state->info.fps_numerator);
+    return (th_granule_frame(_encdec, _granpos)+1)*(
+     (double)state->info.fps_denominator/state->info.fps_numerator);
   }
   return -1;
 }

Modified: branches/theora-thusnelda/lib/dec/x86/mmxfrag.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/mmxfrag.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/x86/mmxfrag.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -5,8 +5,8 @@
  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
- * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -16,44 +16,128 @@
  ********************************************************************/
 
 /*MMX acceleration of fragment reconstruction for motion compensation.
-  Originally written by Rudolf Marek.*/
+  Originally written by Rudolf Marek.
+  Additional optimization by Nils Pipenbrinck.
+  Note: Loops are unrolled for best performance.
+  The iteration each instruction belongs to is marked in the comments as #i.*/
 #include "x86int.h"
+#include <stddef.h>
 
 #if defined(USE_ASM)
 
-static const __attribute__((aligned(8),used)) ogg_int64_t OC_V128=
- 0x0080008000800080LL;
-
 void oc_frag_recon_intra_mmx(unsigned char *_dst,int _dst_ystride,
  const ogg_int16_t *_residue){
-  int i;
-  for(i=8;i-->0;){
-    __asm__ __volatile__(
-      /*Set mm0 to 0x0080008000800080.*/
-      "movq %[OC_V128],%%mm0\n\t"
-      /*First four input values*/
-      "movq (%[residue]),%%mm2\n\t"
-      /*Set mm1=mm0.*/
-      "movq %%mm0,%%mm1\n\t"
-      /*Next four input values.*/
-      "movq 8(%[residue]),%%mm3\n\t"
-      /*Add 128 and saturate to 16 bits.*/
-      "paddsw %%mm3,%%mm1\n\t"
-      /*_residue+=16*/
-      "lea 0x10(%[residue]),%[residue]\n\t"
-      /*Add 128 and saturate to 16 bits.*/
-      "paddsw %%mm2,%%mm0\n\t"
-      /*Pack saturate with next(high) four values.*/
-      "packuswb %%mm1,%%mm0\n\t"
-      /*Writeback.*/
-      "movq %%mm0,(%[dst])\n\t"
-      /*_dst+=_dst_ystride*/
-      "lea  (%[dst],%[dst_ystride]),%[dst]\n\t"
-      :[dst]"+r"(_dst),[residue]"+r"(_residue)
-      :[dst_ystride]"r"((long)_dst_ystride),[OC_V128]"m"(OC_V128)
-      :"memory"
-    );
-  }
+  __asm__ __volatile__(
+    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
+    "pcmpeqw %%mm0,%%mm0\n\t"
+    /*#0 Load low residue.*/
+    "movq 0*8(%[residue]),%%mm1\n\t"
+    /*#0 Load high residue.*/
+    "movq 1*8(%[residue]),%%mm2\n\t"
+    /*Set mm0 to 0x8000800080008000.*/
+    "psllw $15,%%mm0\n\t"
+    /*#1 Load low residue.*/
+    "movq 2*8(%[residue]),%%mm3\n\t"
+    /*#1 Load high residue.*/
+    "movq 3*8(%[residue]),%%mm4\n\t"
+    /*Set mm0 to 0x0080008000800080.*/
+    "psrlw $8,%%mm0\n\t"
+    /*#2 Load low residue.*/
+    "movq 4*8(%[residue]),%%mm5\n\t"
+    /*#2 Load high residue.*/
+    "movq 5*8(%[residue]),%%mm6\n\t"
+    /*#0 Bias low  residue.*/
+    "paddsw %%mm0,%%mm1\n\t"
+    /*#0 Bias high residue.*/
+    "paddsw %%mm0,%%mm2\n\t"
+    /*#0 Pack to byte.*/
+    "packuswb %%mm2,%%mm1\n\t"
+    /*#1 Bias low  residue.*/
+    "paddsw %%mm0,%%mm3\n\t"
+    /*#1 Bias high residue.*/
+    "paddsw %%mm0,%%mm4\n\t"
+    /*#1 Pack to byte.*/
+    "packuswb %%mm4,%%mm3\n\t"
+    /*#2 Bias low  residue.*/
+    "paddsw %%mm0,%%mm5\n\t"
+    /*#2 Bias high residue.*/
+    "paddsw %%mm0,%%mm6\n\t"
+    /*#2 Pack to byte.*/
+    "packuswb %%mm6,%%mm5\n\t"
+    /*#0 Write row.*/
+    "movq %%mm1,(%[dst])\n\t"
+    /*#1 Write row.*/
+    "movq %%mm3,(%[dst],%[dst_ystride])\n\t"
+    /*#2 Write row.*/
+    "movq %%mm5,(%[dst],%[dst_ystride],2)\n\t"
+    /*#3 Load low residue.*/
+    "movq 6*8(%[residue]),%%mm1\n\t"
+    /*#3 Load high residue.*/
+    "movq 7*8(%[residue]),%%mm2\n\t"
+    /*#4 Load low residue.*/
+    "movq 8*8(%[residue]),%%mm3\n\t"
+    /*#4 Load high residue.*/
+    "movq 9*8(%[residue]),%%mm4\n\t"
+    /*#5 Load low residue.*/
+    "movq 10*8(%[residue]),%%mm5\n\t"
+    /*#5 Load high residue.*/
+    "movq 11*8(%[residue]),%%mm6\n\t"
+    /*#3 Bias low  residue.*/
+    "paddsw %%mm0,%%mm1\n\t"
+    /*#3 Bias high residue.*/
+    "paddsw %%mm0,%%mm2\n\t"
+    /*#3 Pack to byte.*/
+    "packuswb %%mm2,%%mm1\n\t"
+    /*#4 Bias low  residue.*/
+    "paddsw %%mm0,%%mm3\n\t"
+    /*#4 Bias high residue.*/
+    "paddsw %%mm0,%%mm4\n\t"
+    /*#4 Pack to byte.*/
+    "packuswb %%mm4,%%mm3\n\t"
+    /*#5 Bias low  residue.*/
+    "paddsw %%mm0,%%mm5\n\t"
+    /*#5 Bias high residue.*/
+    "paddsw %%mm0,%%mm6\n\t"
+    /*#5 Pack to byte.*/
+    "packuswb %%mm6,%%mm5\n\t"
+    /*#3 Write row.*/
+    "movq %%mm1,(%[dst],%[dst_ystride3])\n\t"
+    /*#4 Write row.*/
+    "movq %%mm3,(%[dst4])\n\t"
+    /*#5 Write row.*/
+    "movq %%mm5,(%[dst4],%[dst_ystride])\n\t"
+    /*#6 Load low residue.*/
+    "movq 12*8(%[residue]),%%mm1\n\t"
+    /*#6 Load high residue.*/
+    "movq 13*8(%[residue]),%%mm2\n\t"
+    /*#7 Load low residue.*/
+    "movq 14*8(%[residue]),%%mm3\n\t"
+    /*#7 Load high residue.*/
+    "movq 15*8(%[residue]),%%mm4\n\t"
+    /*#6 Bias low  residue.*/
+    "paddsw %%mm0,%%mm1\n\t"
+    /*#6 Bias high residue.*/
+    "paddsw %%mm0,%%mm2\n\t"
+    /*#6 Pack to byte.*/
+    "packuswb %%mm2,%%mm1\n\t"
+    /*#7 Bias low  residue.*/
+    "paddsw %%mm0,%%mm3\n\t"
+    /*#7 Bias high residue.*/
+    "paddsw %%mm0,%%mm4\n\t"
+    /*#7 Pack to byte.*/
+    "packuswb %%mm4,%%mm3\n\t"
+    /*#6 Write row.*/
+    "movq %%mm1,(%[dst4],%[dst_ystride],2)\n\t"
+    /*#7 Write row.*/
+    "movq %%mm3,(%[dst4],%[dst_ystride3])\n\t"
+    :
+    :[residue]"r"(_residue),
+     [dst]"r"(_dst),
+     [dst4]"r"(_dst+(_dst_ystride<<2)),
+     [dst_ystride]"r"((ptrdiff_t)_dst_ystride),
+     [dst_ystride3]"r"((ptrdiff_t)_dst_ystride*3)
+    :"memory"
+  );
 }
 
 void oc_frag_recon_inter_mmx(unsigned char *_dst,int _dst_ystride,
@@ -61,179 +145,146 @@
   int i;
   /*Zero mm0.*/
   __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
-  for(i=8;i-->0;){
+  for(i=4;i-->0;){
     __asm__ __volatile__(
-      /*Load mm2 with _src*/
-      "movq (%[src]),%%mm2\n\t"
-      /*Copy mm2 to mm3.*/
-      "movq %%mm2,%%mm3\n\t"
-      /*Expand high part of _src to 16 bits.*/
+      /*#0 Load source.*/
+      "movq (%[src]),%%mm3\n\t"
+      /*#1 Load source.*/
+      "movq (%[src],%[src_ystride]),%%mm7\n\t"
+      /*#0 Get copy of src.*/
+      "movq %%mm3,%%mm4\n\t"
+      /*#0 Expand high source.*/
+      "punpckhbw %%mm0,%%mm4\n\t"
+      /*#0 Expand low  source.*/
+      "punpcklbw %%mm0,%%mm3\n\t"
+      /*#0 Add residue high.*/
+      "paddsw 8(%[residue]),%%mm4\n\t"
+      /*#1 Get copy of src.*/
+      "movq %%mm7,%%mm2\n\t"
+      /*#0 Add residue low.*/
+      "paddsw (%[residue]), %%mm3\n\t"
+      /*#1 Expand high source.*/
       "punpckhbw %%mm0,%%mm2\n\t"
-      /*Expand low part of _src to 16 bits.*/
-      "punpcklbw %%mm0,%%mm3\n\t"
-      /*Add low part with low part of residue.*/
-      "paddsw (%[residue]),%%mm3\n\t"
-      /*High with high.*/
-      "paddsw 8(%[residue]),%%mm2\n\t"
-      /*Pack and saturate to mm3.*/
-      "packuswb %%mm2,%%mm3\n\t"
-      /*_src+=_src_ystride*/
-      "lea (%[src],%[src_ystride]),%[src]\n\t"
-      /*_residue+=16*/
-      "lea 0x10(%[residue]),%[residue]\n\t"
-      /*Put mm3 to dest.*/
+      /*#0 Pack final row pixels.*/
+      "packuswb %%mm4,%%mm3\n\t"
+      /*#1 Expand low  source.*/
+      "punpcklbw %%mm0,%%mm7\n\t"
+      /*#1 Add residue low.*/
+      "paddsw 16(%[residue]),%%mm7\n\t"
+      /*#1 Add residue high.*/
+      "paddsw 24(%[residue]),%%mm2\n\t"
+      /*Advance residue.*/
+      "lea 32(%[residue]),%[residue]\n\t"
+      /*#1 Pack final row pixels.*/
+      "packuswb %%mm2,%%mm7\n\t"
+      /*Advance src.*/
+      "lea (%[src],%[src_ystride],2),%[src]\n\t"
+      /*#0 Write row.*/
       "movq %%mm3,(%[dst])\n\t"
-      /*_dst+=_dst_ystride*/
-      "lea (%[dst],%[dst_ystride]),%[dst]\n\t"
-      :[dst]"+r"(_dst),[src]"+r"(_src),[residue]"+r"(_residue)
-      :[dst_ystride]"r"((long)_dst_ystride),
-       [src_ystride]"r"((long)_src_ystride)
+      /*#1 Write row.*/
+      "movq %%mm7,(%[dst],%[dst_ystride])\n\t"
+      /*Advance dst.*/
+      "lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
+      :[residue]"+r"(_residue),[dst]"+r"(_dst),[src]"+r"(_src)
+      :[dst_ystride]"r"((ptrdiff_t)_dst_ystride),
+       [src_ystride]"r"((ptrdiff_t)_src_ystride)
       :"memory"
     );
   }
 }
 
-#if defined(__amd64__)||defined(__x86_64__)
-
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
  const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
  int _src2_ystride,const ogg_int16_t *_residue){
   int i;
-  __asm__ __volatile__(
-    /*Zero mm0.*/
-    "pxor %%mm0,%%mm0\n\t"
-    /*Load mm2 with _src1.*/
-    "movq (%[src1]),%%mm2\n\t"
-    :[src1]"+r"(_src1)
-    :
-  );
-  for(i=8;i-->0;){
+  /*NOTE: This assumes that
+     _dst_ystride==_src1_ystride&&_dst_ystride==_src2_ystride.
+    This is currently always the case, but a slower fallback version will need
+     to be written if it ever is not.*/
+  /*Zero mm7.*/
+  __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
+  for(i=4;i-->0;){
     __asm__ __volatile__(
-     /*Packed _src2.*/ 
-     "movq (%[src2]),%%mm4\n\t"
-     /*Copy packed src1 to mm3.*/
-     "movq %%mm2,%%mm3\n\t"
-     /*Copy packed src2 to mm5.*/
-     "movq %%mm4,%%mm5\n\t"
-     /*Expand low part of src1 to mm2.*/
-     "punpcklbw %%mm0,%%mm2\n\t"
-     /*Expand Low part of src2 to mm4.*/
-     "punpcklbw %%mm0,%%mm4\n\t"
-     /*_src1+=_src1_ystride*/
-     "lea (%[src1],%[src1_ystride]),%[src1]\n\t"
-     /*Expand high part of src1 to mm3.*/
-     "punpckhbw %%mm0,%%mm3\n\t"
-     /*Expand high part of src2 to mm5.*/
-     "punpckhbw %%mm0,%%mm5\n\t"
-     /*Add low parts of src1 and src2.*/
-     "paddsw %%mm2,%%mm4\n\t"
-     /*Add high parts of src1 and src2.*/
-     "paddsw %%mm3,%%mm5\n\t"
-     /*_src2+=_src2_ystride.*/
-     "lea (%[src2],%[src2_ystride]),%[src2]\n\t"
-     /*Load mm2 with _src1.*/
-     "movq (%[src1]),%%mm2\n\t"
-     /*Shift logical 1 to right o 2 dolu.*/
-     "psrlw $1,%%mm4\n\t"
-     /*Shift logical 1 to right.*/
-     "psrlw $1,%%mm5\n\t"
-     /*Add low parts wwith low parts.*/
-     "paddsw (%[residue]),%%mm4\n\t"
-     /*Add highparts with high.*/
-     "paddsw 8(%[residue]),%%mm5\n\t"
-     /*Pack saturate high to low.*/
-     "packuswb %%mm5,%%mm4\n\t"
-     /*_residue+=16.*/
-     "lea 0x10(%[residue]),%[residue]\n\t"
-     /*Write to dst.*/
-     "movq %%mm4,(%[dst])\n\t"
-     /*_dst+=_dst_ystride*/
-     "lea (%[dst],%[dst_ystride]),%[dst]\n\t"
+      /*#0 Load src1.*/
+      "movq (%[src1]),%%mm0\n\t"
+      /*#0 Load src2.*/
+      "movq (%[src2]),%%mm2\n\t"
+      /*#0 Copy src1.*/
+      "movq %%mm0,%%mm1\n\t"
+      /*#0 Copy src2.*/
+      "movq %%mm2,%%mm3\n\t"
+      /*#1 Load src1.*/
+      "movq (%[src1],%[ystride]),%%mm4\n\t"
+      /*#0 Unpack lower src1.*/
+      "punpcklbw %%mm7,%%mm0\n\t"
+      /*#1 Load src2.*/
+      "movq (%[src2],%[ystride]),%%mm5\n\t"
+      /*#0 Unpack higher src1.*/
+      "punpckhbw %%mm7,%%mm1\n\t"
+      /*#0 Unpack lower src2.*/
+      "punpcklbw %%mm7,%%mm2\n\t"
+      /*#0 Unpack higher src2.*/
+      "punpckhbw %%mm7,%%mm3\n\t"
+      /*Advance src1 ptr.*/
+      "lea (%[src1],%[ystride],2),%[src1]\n\t"
+      /*Advance src2 ptr.*/
+      "lea (%[src2],%[ystride],2),%[src2]\n\t"
+      /*#0 Lower src1+src2.*/
+      "paddsw %%mm2,%%mm0\n\t"
+      /*#0 Higher src1+src2.*/
+      "paddsw %%mm3,%%mm1\n\t"
+      /*#1 Copy src1.*/
+      "movq %%mm4,%%mm2\n\t"
+      /*#0 Build lo average.*/
+      "psraw $1,%%mm0\n\t"
+      /*#1 Copy src2.*/
+      "movq %%mm5,%%mm3\n\t"
+      /*#1 Unpack lower src1.*/
+      "punpcklbw %%mm7,%%mm4\n\t"
+      /*#0 Build hi average.*/
+      "psraw $1,%%mm1\n\t"
+      /*#1 Unpack higher src1.*/
+      "punpckhbw %%mm7,%%mm2\n\t"
+      /*#0 low+=residue.*/
+      "paddsw (%[residue]),%%mm0\n\t"
+      /*#1 Unpack lower src2.*/
+      "punpcklbw %%mm7,%%mm5\n\t"
+      /*#0 high+=residue.*/
+      "paddsw 8(%[residue]),%%mm1\n\t"
+      /*#1 Unpack higher src2.*/
+      "punpckhbw %%mm7,%%mm3\n\t"
+      /*#1 Lower src1+src2.*/
+      "paddsw %%mm4,%%mm5\n\t"
+      /*#0 Pack and saturate.*/
+      "packuswb %%mm1,%%mm0\n\t"
+      /*#1 Higher src1+src2.*/
+      "paddsw %%mm2,%%mm3\n\t"
+      /*#0 Write row.*/
+      "movq %%mm0,(%[dst])\n\t"
+      /*#1 Build lo average.*/
+      "psraw $1,%%mm5\n\t"
+      /*#1 Build hi average.*/
+      "psraw $1,%%mm3\n\t"
+      /*#1 low+=residue.*/
+      "paddsw 16(%[residue]),%%mm5\n\t"
+      /*#1 high+=residue.*/
+      "paddsw 24(%[residue]),%%mm3\n\t"
+      /*#1 Pack and saturate.*/
+      "packuswb  %%mm3,%%mm5\n\t"
+      /*#1 Write row ptr.*/
+      "movq %%mm5,(%[dst],%[ystride])\n\t"
+      /*Advance residue ptr.*/
+      "add $32,%[residue]\n\t"
+      /*Advance dest ptr.*/
+      "lea (%[dst],%[ystride],2),%[dst]\n\t"
      :[dst]"+r"(_dst),[residue]"+r"(_residue),
       [src1]"+r"(_src1),[src2]"+r"(_src2)
-     :[dst_ystride]"r"((long)_dst_ystride),
-      [src1_ystride]"r"((long)_src1_ystride),
-      [src2_ystride]"r"((long)_src2_ystride)
+     :[ystride]"r"((ptrdiff_t)_dst_ystride)
      :"memory"
     );
   }
 }
 
-#else
-
-void oc_frag_recon_inter2_mmx(unsigned char *_dst,int _dst_ystride,
- const unsigned char *_src1,int _src1_ystride,const unsigned char *_src2,
- int _src2_ystride,const ogg_int16_t *_residue){
-  long a;
-  int  i;
-  __asm__ __volatile__(
-    /*Zero mm0.*/
-    "pxor %%mm0,%%mm0\n\t"
-    /*Load mm2 with _src1.*/
-    "movq (%[src1]),%%mm2\n\t"
-    :[src1]"+r"(_src1)
-    :
-  );
-  for(i=8;i-->0;){
-    __asm__ __volatile__(
-     /*Packed _src2.*/ 
-     "movq (%[src2]),%%mm4\n\t"
-     /*Copy packed src1 to mm3.*/
-     "movq %%mm2,%%mm3\n\t"
-     /*Copy packed src2 to mm5.*/
-     "movq %%mm4,%%mm5\n\t"
-     /*eax=_src1_ystride*/
-     "mov %[src1_ystride],%[a]\n\t"
-     /*Expand low part of src1 to mm2.*/
-     "punpcklbw %%mm0,%%mm2\n\t"
-     /*Expand Low part of src2 to mm4.*/
-     "punpcklbw %%mm0,%%mm4\n\t"
-     /*_src1+=_src1_ystride*/
-     "lea (%[src1],%[a]),%[src1]\n\t"
-     /*Expand high part of src1 to mm3.*/
-     "punpckhbw %%mm0,%%mm3\n\t"
-     /*Expand high part of src2 to mm5.*/
-     "punpckhbw %%mm0,%%mm5\n\t"
-     /*eax=_src2_ystride*/
-     "mov %[src2_ystride],%[a]\n\t"
-     /*Add low parts of src1 and src2.*/
-     "paddsw %%mm2,%%mm4\n\t"
-     /*Add high parts of src1 and src2.*/
-     "paddsw %%mm3,%%mm5\n\t"
-     /*_src2+=_src2_ystride.*/
-     "lea (%[src2],%[a]),%[src2]\n\t"
-     /*Load mm2 with _src1.*/
-     "movq (%[src1]),%%mm2\n\t"
-     /*Shift logical 1 to right o 2 dolu.*/
-     "psrlw $1,%%mm4\n\t"
-     /*Shift logical 1 to right.*/
-     "psrlw $1,%%mm5\n\t"
-     /*Add low parts wwith low parts.*/
-     "paddsw (%[residue]),%%mm4\n\t"
-     /*Add highparts with high.*/
-     "paddsw 8(%[residue]),%%mm5\n\t"
-     /*eax=_dst_ystride.*/
-     "mov %[dst_ystride],%[a]\n\t"
-     /*Pack saturate high to low.*/
-     "packuswb %%mm5,%%mm4\n\t"
-     /*_residue+=16.*/
-     "lea 0x10(%[residue]),%[residue]\n\t"
-     /*Write to dst.*/
-     "movq %%mm4,(%[dst])\n\t"
-     /*_dst+=_dst_ystride*/
-     "lea (%[dst],%[a]),%[dst]\n\t"
-     :[a]"=&a"(a),[dst]"+r"(_dst),[residue]"+r"(_residue),
-      [src1]"+r"(_src1),[src2]"+r"(_src2)
-     :[dst_ystride]"m"((long)_dst_ystride),
-      [src1_ystride]"m"((long)_src1_ystride),
-      [src2_ystride]"m"((long)_src2_ystride)
-     :"memory"
-    );
-  }
-}
-
-#endif
-
 void oc_restore_fpu_mmx(void){
   __asm__ __volatile__("emms\n\t");
 }

Modified: branches/theora-thusnelda/lib/dec/x86/mmxidct.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/mmxidct.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/x86/mmxidct.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -26,22 +26,16 @@
 #if defined(USE_ASM)
 
 /*These are offsets into the table of constants below.*/
-/*4 masks, in order: low word to high.*/
-#define OC_MASK_OFFSET    (0)
 /*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
-#define OC_COSINE_OFFSET (32)
+#define OC_COSINE_OFFSET (0)
 /*A row of 8's.*/
-#define OC_EIGHT_OFFSET  (88)
+#define OC_EIGHT_OFFSET  (56)
 
 
 
 /*A table of constants used by the MMX routines.*/
 static const ogg_uint16_t __attribute__((aligned(8),used))
- OC_IDCT_CONSTS[(4+7+1)*4]={
-  65535,    0,    0,    0,
-      0,65535,    0,    0,
-      0,    0,65535,    0,
-      0,    0,    0,65535,
+ OC_IDCT_CONSTS[(7+1)*4]={
   (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
   (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
   (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
@@ -143,7 +137,7 @@
 
 /*38+8=46 cycles.*/
 #define OC_ROW_IDCT \
-  "#OC\n" \
+  "#OC_ROW_IDCT\n" \
   OC_IDCT_BEGIN \
   /*r3=D'*/ \
   "movq "OC_I(2)",%%mm3\n\t" \
@@ -316,7 +310,6 @@
   "#end OC_COLUMN_IDCT\n\t" \
 
 #define OC_MID(_m,_i) OC_M2STR(_m+(_i)*8)"(%[c])"
-#define OC_M(_i)      OC_MID(OC_MASK_OFFSET,_i)
 #define OC_C(_i)      OC_MID(OC_COSINE_OFFSET,_i-1)
 #define OC_8          OC_MID(OC_EIGHT_OFFSET,0)
 

Modified: branches/theora-thusnelda/lib/dec/x86/mmxstate.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/mmxstate.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/x86/mmxstate.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -19,14 +19,10 @@
   Originally written by Rudolf Marek.*/
 #include "x86int.h"
 #include "../../internal.h"
+#include <stddef.h>
 
 #if defined(USE_ASM)
 
-static const __attribute__((aligned(8),used)) ogg_int64_t OC_V3=
- 0x0003000300030003LL;
-static const __attribute__((aligned(8),used)) ogg_int64_t OC_V4=
- 0x0004000400040004LL;
-
 static const __attribute__((aligned(8),used)) int OC_FZIG_ZAGMMX[64]={
    0, 8, 1, 2, 9,16,24,17,
   10, 3,32,11,18,25, 4,12,
@@ -40,9 +36,9 @@
 
 
 
-void oc_state_frag_recon_mmx(oc_theora_state *_state, oc_fragment *_frag,
-			     int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
-			     ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
+void oc_state_frag_recon_mmx(oc_theora_state *_state,oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
+ ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]){
   ogg_int16_t  __attribute__((aligned(8))) res_buf[64];
   int dst_framei;
   int dst_ystride;
@@ -150,27 +146,26 @@
   }
   /*Fill in the target buffer.*/
   dst_framei=_state->ref_frame_idx[OC_FRAME_SELF];
-  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
+  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].stride;
   /*For now ystride values in all ref frames assumed to be equal.*/
   if(_frag->mbmode==OC_MODE_INTRA){
-    oc_frag_recon_intra(_state,_frag->buffer[dst_framei],dst_ystride,res_buf);
+    oc_frag_recon_intra_mmx(_frag->buffer[dst_framei],dst_ystride,res_buf);
   }
   else{
     int ref_framei;
     int ref_ystride;
-    int mvoffset0;
-    int mvoffset1;
+    int mvoffsets[2];
     ref_framei=_state->ref_frame_idx[OC_FRAME_FOR_MODE[_frag->mbmode]];
-    ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].ystride;
-    if(oc_state_get_mv_offsets(_state,&mvoffset0,&mvoffset1,_frag->mv[0],
-     _frag->mv[1],ref_ystride,_pli)>1){
-      oc_frag_recon_inter2(_state,_frag->buffer[dst_framei],dst_ystride,
-       _frag->buffer[ref_framei]+mvoffset0,ref_ystride,
-       _frag->buffer[ref_framei]+mvoffset1,ref_ystride,res_buf);
+    ref_ystride=_state->ref_frame_bufs[ref_framei][_pli].stride;
+    if(oc_state_get_mv_offsets(_state,mvoffsets,_frag->mv[0],_frag->mv[1],
+     ref_ystride,_pli)>1){
+      oc_frag_recon_inter2_mmx(_frag->buffer[dst_framei],dst_ystride,
+       _frag->buffer[ref_framei]+mvoffsets[0],ref_ystride,
+       _frag->buffer[ref_framei]+mvoffsets[1],ref_ystride,res_buf);
     }
     else{
-      oc_frag_recon_inter(_state,_frag->buffer[dst_framei],dst_ystride,
-       _frag->buffer[ref_framei]+mvoffset0,ref_ystride,res_buf);
+      oc_frag_recon_inter_mmx(_frag->buffer[dst_framei],dst_ystride,
+       _frag->buffer[ref_framei]+mvoffsets[0],ref_ystride,res_buf);
     }
   }
   oc_restore_fpu(_state);
@@ -188,26 +183,26 @@
   const int *fragi;
   const int *fragi_end;
   int        dst_framei;
-  long       dst_ystride;
+  ptrdiff_t  dst_ystride;
   int        src_framei;
-  long       src_ystride;
+  ptrdiff_t  src_ystride;
   dst_framei=_state->ref_frame_idx[_dst_frame];
   src_framei=_state->ref_frame_idx[_src_frame];
-  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].ystride;
-  src_ystride=_state->ref_frame_bufs[src_framei][_pli].ystride;
+  dst_ystride=_state->ref_frame_bufs[dst_framei][_pli].stride;
+  src_ystride=_state->ref_frame_bufs[src_framei][_pli].stride;
   fragi_end=_fragis+_nfragis;
   for(fragi=_fragis;fragi<fragi_end;fragi++){
     oc_fragment   *frag;
     unsigned char *dst;
     unsigned char *src;
-    long           esi;
+    ptrdiff_t      s;
     frag=_state->frags+*fragi;
     dst=frag->buffer[dst_framei];
     src=frag->buffer[src_framei];
     __asm__ __volatile__(
       /*src+0*src_ystride*/
       "movq (%[src]),%%mm0\n\t"
-      /*esi=src_ystride*3*/
+      /*s=src_ystride*3*/
       "lea (%[src_ystride],%[src_ystride],2),%[s]\n\t"
       /*src+1*src_ystride*/
       "movq (%[src],%[src_ystride]),%%mm1\n\t"
@@ -217,7 +212,7 @@
       "movq (%[src],%[s]),%%mm3\n\t"
       /*dst+0*dst_ystride*/
       "movq %%mm0,(%[dst])\n\t"
-      /*esi=dst_ystride*3*/
+      /*s=dst_ystride*3*/
       "lea (%[dst_ystride],%[dst_ystride],2),%[s]\n\t"
       /*dst+1*dst_ystride*/
       "movq %%mm1,(%[dst],%[dst_ystride])\n\t"
@@ -231,7 +226,7 @@
       "lea (%[dst],%[dst_ystride],4),%[dst]\n\t"
       /*src+0*src_ystride*/
       "movq (%[src]),%%mm0\n\t"
-      /*esi=src_ystride*3*/
+      /*s=src_ystride*3*/
       "lea (%[src_ystride],%[src_ystride],2),%[s]\n\t"
       /*src+1*src_ystride*/
       "movq (%[src],%[src_ystride]),%%mm1\n\t"
@@ -241,7 +236,7 @@
       "movq (%[src],%[s]),%%mm3\n\t"
       /*dst+0*dst_ystride*/
       "movq %%mm0,(%[dst])\n\t"
-      /*esi=dst_ystride*3*/
+      /*s=dst_ystride*3*/
       "lea (%[dst_ystride],%[dst_ystride],2),%[s]\n\t"
       /*dst+1*dst_ystride*/
       "movq %%mm1,(%[dst],%[dst_ystride])\n\t"
@@ -249,7 +244,7 @@
       "movq %%mm2,(%[dst],%[dst_ystride],2)\n\t"
       /*dst+3*dst_ystride*/
       "movq %%mm3,(%[dst],%[s])\n\t"
-      :[s]"=&S"(esi)
+      :[s]"=&r"(s)
       :[dst]"r"(dst),[src]"r"(src),[dst_ystride]"r"(dst_ystride),
        [src_ystride]"r"(src_ystride)
       :"memory"
@@ -261,12 +256,12 @@
 
 static void loop_filter_v(unsigned char *_pix,int _ystride,
  const ogg_int16_t *_ll){
-  long esi;
+  ptrdiff_t s;
   _pix-=_ystride*2;
   __asm__ __volatile__(
     /*mm0=0*/
     "pxor %%mm0,%%mm0\n\t"
-    /*esi=_ystride*3*/
+    /*s=_ystride*3*/
     "lea (%[ystride],%[ystride],2),%[s]\n\t"
     /*mm7=_pix[0...8]*/
     "movq (%[pix]),%%mm7\n\t"
@@ -297,19 +292,21 @@
     "punpcklbw %%mm0,%%mm4\n\t"
     "punpckhbw %%mm0,%%mm3\n\t"
     "punpcklbw %%mm0,%%mm2\n\t"
-    /*Preload...*/
-    "movq %[OC_V3],%%mm0\n\t"
-    /*mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
+    /*mm0=3 3 3 3
+      mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/
+    "pcmpeqw %%mm0,%%mm0\n\t"
     "psubw %%mm5,%%mm3\n\t"
+    "psrlw $14,%%mm0\n\t"
     "psubw %%mm4,%%mm2\n\t"
     /*Scale by 3.*/
     "pmullw %%mm0,%%mm3\n\t"
     "pmullw %%mm0,%%mm2\n\t"
-    /*Preload...*/
-    "movq %[OC_V4],%%mm0\n\t"
-    /*f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
+    /*mm0=4 4 4 4
+      f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+
        3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/
+    "psrlw $1,%%mm0\n\t"
     "paddw %%mm7,%%mm3\n\t"
+    "psllw $2,%%mm0\n\t"
     "paddw %%mm6,%%mm2\n\t"
     /*Add 4.*/
     "paddw %%mm0,%%mm3\n\t"
@@ -431,9 +428,8 @@
     /*Write it back out.*/
     "movq %%mm4,(%[pix],%[ystride])\n\t"
     "movq %%mm1,(%[pix],%[ystride],2)\n\t"
-    :[s]"=&S"(esi)
-    :[pix]"r"(_pix),[ystride]"r"((long)_ystride),[ll]"r"(_ll),
-     [OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
+    :[s]"=&r"(s)
+    :[pix]"r"(_pix),[ystride]"r"((ptrdiff_t)_ystride),[ll]"r"(_ll)
     :"memory"
   );
 }
@@ -442,14 +438,16 @@
   Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all
    four p0's to one register we must transpose the values in four mmx regs.
   When half is done we repeat this for the rest.*/
-static void loop_filter_h4(unsigned char *_pix,long _ystride,
+static void loop_filter_h4(unsigned char *_pix,ptrdiff_t _ystride,
  const ogg_int16_t *_ll){
-  long esi;
-  long edi;
+  ptrdiff_t s;
+  /*d doesn't technically need to be 64-bit on x86-64, but making it so will
+     help avoid partial register stalls.*/
+  ptrdiff_t d;
   __asm__ __volatile__(
     /*x x x x 3 2 1 0*/
     "movd (%[pix]),%%mm0\n\t"
-    /*esi=_ystride*3*/
+    /*s=_ystride*3*/
     "lea (%[ystride],%[ystride],2),%[s]\n\t"
     /*x x x x 7 6 5 4*/
     "movd (%[pix],%[ystride]),%%mm1\n\t"
@@ -484,14 +482,20 @@
     "psubw %%mm3,%%mm1\n\t"
     /*Save a copy of pix[2] for later.*/
     "movq %%mm0,%%mm4\n\t"
-    /*mm0=mm0-mm5==pix[2]-pix[1]*/
+    /*mm2=3 3 3 3
+      mm0=mm0-mm5==pix[2]-pix[1]*/
+    "pcmpeqw %%mm2,%%mm2\n\t"
     "psubw %%mm5,%%mm0\n\t"
+    "psrlw $14,%%mm2\n\t"
     /*Scale by 3.*/
-    "pmullw %[OC_V3],%%mm0\n\t"
-    /*f=mm1==_pix[0]-_pix[3]+ 3*(_pix[2]-_pix[1])*/
+    "pmullw %%mm2,%%mm0\n\t"
+    /*mm2=4 4 4 4
+      f=mm1==_pix[0]-_pix[3]+ 3*(_pix[2]-_pix[1])*/
+    "psrlw $1,%%mm2\n\t"
     "paddw %%mm1,%%mm0\n\t"
+    "psllw $2,%%mm2\n\t"
     /*Add 4.*/
-    "paddw %[OC_V4],%%mm0\n\t"
+    "paddw %%mm2,%%mm0\n\t"
     /*"Divide" by 8, producing the residuals R_i.*/
     "psraw $3,%%mm0\n\t"
     /*Now compute lflim of mm0 cf. Section 7.10 of the sepc.*/
@@ -556,21 +560,21 @@
     "packuswb %%mm7,%%mm4\n\t"
     /*mm5=E D A 9 6 5 2 1*/
     "punpcklbw %%mm4,%%mm5\n\t"
-    /*edi=6 5 2 1*/
-    "movd %%mm5,%%edi\n\t"
-    "movw %%di,1(%[pix])\n\t"
+    /*d=6 5 2 1*/
+    "movd %%mm5,%[d]\n\t"
+    "movw %w[d],1(%[pix])\n\t"
     /*Why is there such a big stall here?*/
     "psrlq $32,%%mm5\n\t"
-    "shrl $16,%%edi\n\t"
-    "movw %%di,1(%[pix],%[ystride])\n\t"
-    /*edi=E D A 9*/
-    "movd %%mm5,%%edi\n\t"
-    "movw %%di,1(%[pix],%[ystride],2)\n\t"
-    "shrl $16,%%edi\n\t"
-    "movw %%di,1(%[pix],%[s])\n\t"
-    :[s]"=&S"(esi),[d]"=&D"(edi),
+    "shr $16,%[d]\n\t"
+    "movw %w[d],1(%[pix],%[ystride])\n\t"
+    /*d=E D A 9*/
+    "movd %%mm5,%[d]\n\t"
+    "movw %w[d],1(%[pix],%[ystride],2)\n\t"
+    "shr $16,%[d]\n\t"
+    "movw %w[d],1(%[pix],%[s])\n\t"
+    :[s]"=&r"(s),[d]"=&r"(d),
      [pix]"+r"(_pix),[ystride]"+r"(_ystride),[ll]"+r"(_ll)
-    :[OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4)
+    :
     :"memory"
   );
 }
@@ -584,8 +588,8 @@
 
 /*We copy the whole function because the MMX routines will be inlined 4 times,
    and we can do just a single emms call at the end this way.
-  We also do not utilize the _bv lookup table, instead computing the values
-   that would lie in it on the fly.*/
+  We also do not use the _bv lookup table, instead computing the values that
+   would lie in it on the fly.*/
 
 /*Apply the loop filter to a given set of fragment rows in the given plane.
   The filter may be run on the bottom edge, affecting pixels in the next row of
@@ -625,17 +629,17 @@
     while(frag<frag_end){
       if(frag->coded){
         if(frag>frag0){
-          loop_filter_h(frag->buffer[_refi],iplane->ystride,ll);
+          loop_filter_h(frag->buffer[_refi],iplane->stride,ll);
         }
         if(frag0>frag_top){
-          loop_filter_v(frag->buffer[_refi],iplane->ystride,ll);
+          loop_filter_v(frag->buffer[_refi],iplane->stride,ll);
         }
         if(frag+1<frag_end&&!(frag+1)->coded){
-          loop_filter_h(frag->buffer[_refi]+8,iplane->ystride,ll);
+          loop_filter_h(frag->buffer[_refi]+8,iplane->stride,ll);
         }
         if(frag+fplane->nhfrags<frag_bot&&!(frag+fplane->nhfrags)->coded){
           loop_filter_v((frag+fplane->nhfrags)->buffer[_refi],
-           iplane->ystride,ll);
+           iplane->stride,ll);
         }
       }
       frag++;

Modified: branches/theora-thusnelda/lib/dec/x86/x86int.h
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/x86int.h	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/x86/x86int.h	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -30,13 +30,13 @@
  int _src2_ystride,const ogg_int16_t *_residue);
 void oc_state_frag_copy_mmx(const oc_theora_state *_state,const int *_fragis,
  int _nfragis,int _dst_frame,int _src_frame,int _pli);
-void oc_state_frag_recon_mmx(oc_theora_state *_state,oc_fragment *_frag,                                               
- int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,                                                             
+void oc_state_frag_recon_mmx(oc_theora_state *_state,oc_fragment *_frag,
+ int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
 void oc_restore_fpu_mmx(void);
 void oc_idct8x8_mmx(ogg_int16_t _y[64]);
 void oc_idct8x8_10_mmx(ogg_int16_t _y[64]);
 void oc_fill_idct_constants_mmx(void);
-void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,                                                    
-  int _refi,int _pli,int _fragy0,int _fragy_end);
+void oc_state_loop_filter_frag_rows_mmx(oc_theora_state *_state,int *_bv,
+ int _refi,int _pli,int _fragy0,int _fragy_end);
 #endif

Modified: branches/theora-thusnelda/lib/dec/x86/x86state.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/x86state.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/dec/x86/x86state.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -19,7 +19,7 @@
 
 #if defined(USE_ASM)
 
-#include "../../cpu.h"
+#include "../../cpu.c"
 
 void oc_state_vtable_init_x86(oc_theora_state *_state){
   _state->cpu_flags=oc_cpu_flags_get();

Copied: branches/theora-thusnelda/lib/dec/x86_vc (from rev 15674, trunk/theora/lib/dec/x86_vc)

Modified: branches/theora-thusnelda/lib/enc/dct_decode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/dct_decode.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/enc/dct_decode.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 

Modified: branches/theora-thusnelda/lib/enc/dsp.c
===================================================================
--- branches/theora-thusnelda/lib/enc/dsp.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/enc/dsp.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -18,6 +18,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "codec_internal.h"
+#include "../cpu.c"
 
 #define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
 #define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))

Copied: branches/theora-thusnelda/lib/enc/encapiwrapper.c (from rev 15592, trunk/theora/lib/enc/encapiwrapper.c)
===================================================================
--- branches/theora-thusnelda/lib/enc/encapiwrapper.c	                        (rev 0)
+++ branches/theora-thusnelda/lib/enc/encapiwrapper.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -0,0 +1,1141 @@
+#include <string.h>
+#include "theora/theoraenc.h"
+#include "theora/theora.h"
+#include "codec_internal.h"
+#include "../dec/ocintrin.h"
+
+/*Wrapper to translate the new API into the old API.
+  Eventually we need to convert the old functions to support the new API
+   natively and do the translation the other way.
+  theora-exp already has the necessary code to do so.*/
+
+
+
+/*The default Huffman codes used for VP3.1.
+  It's kind of useless to include this, as TH_ENCCTL_SET_HUFFMAN_CODES is not
+   actually implemented in the old encoder, but it's part of the public API.*/
+const th_huff_code TH_VP31_HUFF_CODES[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]={
+  {
+    {0x002D, 6},{0x0026, 7},{0x0166, 9},{0x004E, 8},
+    {0x02CE,10},{0x059E,11},{0x027D,11},{0x0008, 5},
+    {0x04F9,12},{0x000F, 4},{0x000E, 4},{0x001B, 5},
+    {0x0006, 4},{0x0008, 4},{0x0005, 4},{0x001A, 5},
+    {0x0015, 5},{0x0007, 4},{0x000C, 4},{0x0001, 3},
+    {0x0000, 3},{0x0009, 4},{0x0017, 5},{0x0029, 6},
+    {0x0028, 6},{0x00B2, 8},{0x04F8,12},{0x059F,11},
+    {0x009E, 9},{0x013F,10},{0x0012, 6},{0x0058, 7}
+  },
+  {
+    {0x0010, 5},{0x0047, 7},{0x01FF, 9},{0x008C, 8},
+    {0x03FC,10},{0x046A,11},{0x0469,11},{0x0022, 6},
+    {0x11A1,13},{0x000E, 4},{0x000D, 4},{0x0004, 4},
+    {0x0005, 4},{0x0009, 4},{0x0006, 4},{0x001E, 5},
+    {0x0016, 5},{0x0007, 4},{0x000C, 4},{0x0001, 3},
+    {0x0000, 3},{0x000A, 4},{0x0017, 5},{0x007D, 7},
+    {0x007E, 7},{0x011B, 9},{0x08D1,12},{0x03FD,10},
+    {0x046B,11},{0x11A0,13},{0x007C, 7},{0x00FE, 8}
+  },
+  {
+    {0x0016, 5},{0x0020, 6},{0x0086, 8},{0x0087, 8},
+    {0x0367,10},{0x06CC,11},{0x06CB,11},{0x006E, 7},
+    {0x366D,14},{0x000F, 4},{0x000E, 4},{0x0004, 4},
+    {0x0005, 4},{0x000A, 4},{0x0006, 4},{0x001A, 5},
+    {0x0011, 5},{0x0007, 4},{0x000C, 4},{0x0001, 3},
+    {0x0000, 3},{0x0009, 4},{0x0017, 5},{0x006F, 7},
+    {0x006D, 7},{0x0364,10},{0x0D9A,12},{0x06CA,11},
+    {0x1B37,13},{0x366C,14},{0x0042, 7},{0x00D8, 8}
+  },
+  {
+    {0x0000, 4},{0x002D, 6},{0x00F7, 8},{0x0058, 7},
+    {0x0167, 9},{0x02CB,10},{0x02CA,10},{0x000E, 6},
+    {0x1661,13},{0x0003, 3},{0x0002, 3},{0x0008, 4},
+    {0x0009, 4},{0x000D, 4},{0x0002, 4},{0x001F, 5},
+    {0x0017, 5},{0x0001, 4},{0x000C, 4},{0x000E, 4},
+    {0x000A, 4},{0x0006, 5},{0x0078, 7},{0x000F, 6},
+    {0x007A, 7},{0x0164, 9},{0x0599,11},{0x02CD,10},
+    {0x0B31,12},{0x1660,13},{0x0079, 7},{0x00F6, 8}
+  },
+  {
+    {0x0003, 4},{0x003C, 6},{0x000F, 7},{0x007A, 7},
+    {0x001D, 8},{0x0020, 9},{0x0072,10},{0x0006, 6},
+    {0x0399,13},{0x0004, 3},{0x0005, 3},{0x0005, 4},
+    {0x0006, 4},{0x000E, 4},{0x0004, 4},{0x0000, 4},
+    {0x0019, 5},{0x0002, 4},{0x000D, 4},{0x0007, 4},
+    {0x001F, 5},{0x0030, 6},{0x0011, 8},{0x0031, 6},
+    {0x0005, 6},{0x0021, 9},{0x00E7,11},{0x0038, 9},
+    {0x01CD,12},{0x0398,13},{0x007B, 7},{0x0009, 7}
+  },
+  {
+    {0x0009, 4},{0x0002, 5},{0x0074, 7},{0x0007, 6},
+    {0x00EC, 8},{0x00D1, 9},{0x01A6,10},{0x0006, 6},
+    {0x0D21,13},{0x0005, 3},{0x0006, 3},{0x0008, 4},
+    {0x0007, 4},{0x000F, 4},{0x0004, 4},{0x0000, 4},
+    {0x001C, 5},{0x0002, 4},{0x0005, 4},{0x0003, 4},
+    {0x000C, 5},{0x0035, 7},{0x01A7,10},{0x001B, 6},
+    {0x0077, 7},{0x01A5,10},{0x0349,11},{0x00D0, 9},
+    {0x0691,12},{0x0D20,13},{0x0075, 7},{0x00ED, 8}
+  },
+  {
+    {0x000A, 4},{0x000C, 5},{0x0012, 6},{0x001B, 6},
+    {0x00B7, 8},{0x016C, 9},{0x0099, 9},{0x005A, 7},
+    {0x16D8,13},{0x0007, 3},{0x0006, 3},{0x0009, 4},
+    {0x0008, 4},{0x0000, 3},{0x0005, 4},{0x0017, 5},
+    {0x000E, 5},{0x0002, 4},{0x0003, 4},{0x000F, 5},
+    {0x001A, 6},{0x004D, 8},{0x2DB3,14},{0x002C, 6},
+    {0x0011, 6},{0x02DA,10},{0x05B7,11},{0x0098, 9},
+    {0x0B6D,12},{0x2DB2,14},{0x0010, 6},{0x0027, 7}
+  },
+  {
+    {0x000D, 4},{0x000F, 5},{0x001D, 6},{0x0008, 5},
+    {0x0051, 7},{0x0056, 8},{0x00AF, 9},{0x002A, 7},
+    {0x148A,13},{0x0007, 3},{0x0000, 2},{0x0008, 4},
+    {0x0009, 4},{0x000C, 4},{0x0006, 4},{0x0017, 5},
+    {0x000B, 5},{0x0016, 5},{0x0015, 5},{0x0009, 5},
+    {0x0050, 7},{0x00AE, 9},{0x2917,14},{0x001C, 6},
+    {0x0014, 6},{0x0290,10},{0x0523,11},{0x0149, 9},
+    {0x0A44,12},{0x2916,14},{0x0053, 7},{0x00A5, 8}
+  },
+  {
+    {0x0001, 4},{0x001D, 6},{0x00F5, 8},{0x00F4, 8},
+    {0x024D,10},{0x0499,11},{0x0498,11},{0x0001, 5},
+    {0x0021, 6},{0x0006, 3},{0x0005, 3},{0x0006, 4},
+    {0x0005, 4},{0x0002, 4},{0x0007, 5},{0x0025, 6},
+    {0x007B, 7},{0x001C, 6},{0x0020, 6},{0x000D, 6},
+    {0x0048, 7},{0x0092, 8},{0x0127, 9},{0x000E, 4},
+    {0x0004, 4},{0x0011, 5},{0x000C, 6},{0x003C, 6},
+    {0x000F, 5},{0x0000, 5},{0x001F, 5},{0x0013, 5}
+  },
+  {
+    {0x0005, 4},{0x003C, 6},{0x0040, 7},{0x000D, 7},
+    {0x0031, 9},{0x0061,10},{0x0060,10},{0x0002, 5},
+    {0x00F5, 8},{0x0006, 3},{0x0005, 3},{0x0007, 4},
+    {0x0006, 4},{0x0002, 4},{0x0009, 5},{0x0025, 6},
+    {0x0007, 6},{0x0021, 6},{0x0024, 6},{0x0010, 6},
+    {0x0041, 7},{0x00F4, 8},{0x0019, 8},{0x000E, 4},
+    {0x0003, 4},{0x0011, 5},{0x0011, 6},{0x003F, 6},
+    {0x003E, 6},{0x007B, 7},{0x0000, 4},{0x0013, 5}
+  },
+  {
+    {0x000A, 4},{0x0007, 5},{0x0001, 6},{0x0009, 6},
+    {0x0131, 9},{0x0261,10},{0x0260,10},{0x0015, 6},
+    {0x0001, 7},{0x0007, 3},{0x0006, 3},{0x0008, 4},
+    {0x0007, 4},{0x0006, 4},{0x0012, 5},{0x002F, 6},
+    {0x0014, 6},{0x0027, 6},{0x002D, 6},{0x0016, 6},
+    {0x004D, 7},{0x0099, 8},{0x0000, 7},{0x0004, 4},
+    {0x0001, 4},{0x0005, 5},{0x0017, 6},{0x002E, 6},
+    {0x002C, 6},{0x0008, 6},{0x0006, 5},{0x0001, 5}
+  },
+  {
+    {0x0000, 3},{0x000E, 5},{0x0017, 6},{0x002A, 6},
+    {0x0010, 7},{0x00F9,10},{0x00F8,10},{0x001E, 7},
+    {0x003F, 8},{0x0007, 3},{0x0006, 3},{0x0009, 4},
+    {0x0008, 4},{0x0006, 4},{0x000F, 5},{0x0005, 5},
+    {0x0016, 6},{0x0029, 6},{0x002B, 6},{0x0015, 6},
+    {0x0050, 7},{0x0011, 7},{0x007D, 9},{0x0004, 4},
+    {0x0017, 5},{0x0006, 5},{0x0014, 6},{0x002C, 6},
+    {0x002D, 6},{0x000E, 6},{0x0009, 6},{0x0051, 7}
+  },
+  {
+    {0x0002, 3},{0x0018, 5},{0x002F, 6},{0x000D, 5},
+    {0x0053, 7},{0x0295,10},{0x0294,10},{0x00A4, 8},
+    {0x007C, 8},{0x0000, 2},{0x0007, 3},{0x0009, 4},
+    {0x0008, 4},{0x001B, 5},{0x000C, 5},{0x0028, 6},
+    {0x006A, 7},{0x001E, 6},{0x001D, 6},{0x0069, 7},
+    {0x00D7, 8},{0x007D, 8},{0x014B, 9},{0x0019, 5},
+    {0x0016, 5},{0x002E, 6},{0x001C, 6},{0x002B, 6},
+    {0x002A, 6},{0x0068, 7},{0x003F, 7},{0x00D6, 8}
+  },
+  {
+    {0x0002, 3},{0x001B, 5},{0x000C, 5},{0x0018, 5},
+    {0x0029, 6},{0x007F, 8},{0x02F0,10},{0x0198, 9},
+    {0x0179, 9},{0x0000, 2},{0x0007, 3},{0x0009, 4},
+    {0x0008, 4},{0x001A, 5},{0x000D, 5},{0x002A, 6},
+    {0x0064, 7},{0x001E, 6},{0x0067, 7},{0x005F, 7},
+    {0x00CD, 8},{0x007E, 8},{0x02F1,10},{0x0016, 5},
+    {0x000E, 5},{0x002E, 6},{0x0065, 7},{0x002B, 6},
+    {0x0028, 6},{0x003E, 7},{0x00BD, 8},{0x0199, 9}
+  },
+  {
+    {0x0002, 3},{0x0007, 4},{0x0016, 5},{0x0006, 4},
+    {0x0036, 6},{0x005C, 7},{0x015D, 9},{0x015C, 9},
+    {0x02BF,10},{0x0000, 2},{0x0007, 3},{0x0009, 4},
+    {0x0008, 4},{0x0018, 5},{0x0034, 6},{0x002A, 6},
+    {0x005E, 7},{0x006A, 7},{0x0064, 7},{0x005D, 7},
+    {0x00CB, 8},{0x00AD, 8},{0x02BE,10},{0x0014, 5},
+    {0x0033, 6},{0x006E, 7},{0x005F, 7},{0x006F, 7},
+    {0x006B, 7},{0x00CA, 8},{0x00AC, 8},{0x015E, 9}
+  },
+  {
+    {0x000F, 4},{0x001D, 5},{0x0018, 5},{0x000B, 4},
+    {0x0019, 5},{0x0029, 6},{0x00D6, 8},{0x0551,11},
+    {0x0AA1,12},{0x0001, 2},{0x0000, 2},{0x0009, 4},
+    {0x0008, 4},{0x001B, 5},{0x0038, 6},{0x0028, 6},
+    {0x0057, 7},{0x006A, 7},{0x0068, 7},{0x0056, 7},
+    {0x00E5, 8},{0x0155, 9},{0x0AA0,12},{0x0073, 7},
+    {0x0069, 7},{0x00D7, 8},{0x00AB, 8},{0x00E4, 8},
+    {0x00A9, 8},{0x0151, 9},{0x0150, 9},{0x02A9,10}
+  },
+  {
+    {0x0008, 5},{0x0025, 7},{0x017A, 9},{0x02F7,10},
+    {0x0BDB,12},{0x17B4,13},{0x2F6B,14},{0x001D, 5},
+    {0x2F6A,14},{0x0008, 4},{0x0007, 4},{0x0001, 4},
+    {0x0002, 4},{0x000A, 4},{0x0006, 4},{0x0000, 4},
+    {0x001C, 5},{0x0009, 4},{0x000D, 4},{0x000F, 4},
+    {0x000C, 4},{0x0003, 4},{0x000A, 5},{0x0016, 5},
+    {0x0013, 6},{0x005D, 7},{0x0024, 7},{0x00BC, 8},
+    {0x005C, 7},{0x05EC,11},{0x000B, 5},{0x005F, 7}
+  },
+  {
+    {0x000F, 5},{0x0010, 6},{0x004B, 8},{0x00C6, 8},
+    {0x031D,10},{0x0C71,12},{0x0C70,12},{0x0001, 4},
+    {0x0C73,12},{0x0008, 4},{0x0009, 4},{0x0002, 4},
+    {0x0003, 4},{0x000B, 4},{0x0006, 4},{0x0000, 4},
+    {0x001C, 5},{0x0005, 4},{0x000D, 4},{0x000F, 4},
+    {0x000A, 4},{0x0019, 5},{0x0013, 6},{0x001D, 5},
+    {0x0030, 6},{0x0062, 7},{0x0024, 7},{0x004A, 8},
+    {0x018F, 9},{0x0C72,12},{0x000E, 5},{0x0011, 6}
+  },
+  {
+    {0x001B, 5},{0x0003, 6},{0x008D, 8},{0x0040, 7},
+    {0x0239,10},{0x0471,11},{0x08E0,12},{0x0003, 4},
+    {0x11C3,13},{0x000A, 4},{0x0009, 4},{0x0004, 4},
+    {0x0005, 4},{0x000E, 4},{0x0007, 4},{0x0001, 4},
+    {0x001E, 5},{0x0006, 4},{0x000C, 4},{0x000B, 4},
+    {0x0002, 4},{0x0000, 5},{0x0041, 7},{0x001F, 5},
+    {0x0022, 6},{0x0002, 6},{0x008F, 8},{0x008C, 8},
+    {0x011D, 9},{0x11C2,13},{0x001A, 5},{0x0021, 6}
+  },
+  {
+    {0x001F, 5},{0x0003, 6},{0x0003, 7},{0x0043, 7},
+    {0x000B, 9},{0x0015,10},{0x0051,12},{0x0003, 4},
+    {0x0050,12},{0x000D, 4},{0x000C, 4},{0x0004, 4},
+    {0x0006, 4},{0x000E, 4},{0x000A, 4},{0x0001, 4},
+    {0x001E, 5},{0x0005, 4},{0x0009, 4},{0x0007, 4},
+    {0x0011, 5},{0x0002, 6},{0x0004, 8},{0x0002, 4},
+    {0x002D, 6},{0x0020, 6},{0x0042, 7},{0x0001, 7},
+    {0x0000, 7},{0x0029,11},{0x0017, 5},{0x002C, 6}
+  },
+  {
+    {0x0003, 4},{0x001F, 6},{0x003A, 7},{0x005D, 7},
+    {0x0173, 9},{0x02E4,10},{0x172D,13},{0x0004, 4},
+    {0x172C,13},{0x000F, 4},{0x000E, 4},{0x0009, 4},
+    {0x0008, 4},{0x000C, 4},{0x000A, 4},{0x0001, 4},
+    {0x0016, 5},{0x0002, 4},{0x0005, 4},{0x001A, 5},
+    {0x002F, 6},{0x0038, 7},{0x05CA,11},{0x0006, 4},
+    {0x0037, 6},{0x001E, 6},{0x003B, 7},{0x0039, 7},
+    {0x00B8, 8},{0x0B97,12},{0x0000, 4},{0x0036, 6}
+  },
+  {
+    {0x0006, 4},{0x0037, 6},{0x005D, 7},{0x000C, 6},
+    {0x00B9, 8},{0x02E3,10},{0x05C4,11},{0x0004, 4},
+    {0x1715,13},{0x0000, 3},{0x000F, 4},{0x0008, 4},
+    {0x0007, 4},{0x000C, 4},{0x0009, 4},{0x001D, 5},
+    {0x0016, 5},{0x001C, 5},{0x001A, 5},{0x000B, 5},
+    {0x005E, 7},{0x0170, 9},{0x1714,13},{0x000A, 4},
+    {0x000A, 5},{0x0036, 6},{0x005F, 7},{0x001B, 7},
+    {0x001A, 7},{0x0B8B,12},{0x0002, 4},{0x0007, 5}
+  },
+  {
+    {0x000C, 4},{0x000B, 5},{0x0079, 7},{0x0022, 6},
+    {0x00F0, 8},{0x0119, 9},{0x0230,10},{0x001D, 5},
+    {0x08C4,12},{0x0001, 3},{0x0000, 3},{0x000A, 4},
+    {0x0009, 4},{0x000B, 4},{0x0007, 4},{0x001C, 5},
+    {0x003D, 6},{0x000D, 5},{0x0008, 5},{0x0015, 6},
+    {0x008D, 8},{0x118B,13},{0x118A,13},{0x000D, 4},
+    {0x0010, 5},{0x0009, 5},{0x0014, 6},{0x0047, 7},
+    {0x00F1, 8},{0x0463,11},{0x001F, 5},{0x000C, 5}
+  },
+  {
+    {0x0000, 3},{0x001A, 5},{0x0033, 6},{0x000C, 5},
+    {0x0046, 7},{0x01E3, 9},{0x03C5,10},{0x0017, 5},
+    {0x1E21,13},{0x0002, 3},{0x0001, 3},{0x0009, 4},
+    {0x000A, 4},{0x0007, 4},{0x001B, 5},{0x003D, 6},
+    {0x001B, 6},{0x0022, 6},{0x0079, 7},{0x00F0, 8},
+    {0x1E20,13},{0x1E23,13},{0x1E22,13},{0x000E, 4},
+    {0x0016, 5},{0x0018, 5},{0x0032, 6},{0x001A, 6},
+    {0x0047, 7},{0x0789,11},{0x001F, 5},{0x0010, 5}
+  },
+  {
+    {0x001D, 5},{0x0061, 7},{0x004E, 8},{0x009E, 9},
+    {0x027C,11},{0x09F5,13},{0x09F4,13},{0x0003, 4},
+    {0x0060, 7},{0x0000, 3},{0x000F, 4},{0x000B, 4},
+    {0x000A, 4},{0x0009, 4},{0x0005, 4},{0x000D, 5},
+    {0x0031, 6},{0x0008, 5},{0x0038, 6},{0x0012, 6},
+    {0x0026, 7},{0x013F,10},{0x04FB,12},{0x000D, 4},
+    {0x0002, 4},{0x000C, 5},{0x0039, 6},{0x001C, 6},
+    {0x000F, 5},{0x001D, 6},{0x0008, 4},{0x0019, 5}
+  },
+  {
+    {0x0007, 4},{0x0019, 6},{0x00AB, 8},{0x00AA, 8},
+    {0x0119,10},{0x0461,12},{0x0460,12},{0x001B, 5},
+    {0x0047, 8},{0x0001, 3},{0x0000, 3},{0x000C, 4},
+    {0x000B, 4},{0x0009, 4},{0x0005, 4},{0x000D, 5},
+    {0x0035, 6},{0x003D, 6},{0x003C, 6},{0x0018, 6},
+    {0x0022, 7},{0x008D, 9},{0x0231,11},{0x000E, 4},
+    {0x001F, 5},{0x0009, 5},{0x002B, 6},{0x0010, 6},
+    {0x0034, 6},{0x0054, 7},{0x0008, 4},{0x0014, 5}
+  },
+  {
+    {0x000C, 4},{0x0005, 5},{0x0008, 6},{0x005B, 7},
+    {0x004D, 9},{0x0131,11},{0x0261,12},{0x001A, 5},
+    {0x0012, 7},{0x0000, 3},{0x000F, 4},{0x000A, 4},
+    {0x0009, 4},{0x0006, 4},{0x001B, 5},{0x0006, 5},
+    {0x001C, 6},{0x002C, 6},{0x0015, 6},{0x005A, 7},
+    {0x0027, 8},{0x0099,10},{0x0260,12},{0x000E, 4},
+    {0x0004, 4},{0x000F, 5},{0x0007, 5},{0x001D, 6},
+    {0x000B, 5},{0x0014, 6},{0x0008, 4},{0x0017, 5}
+  },
+  {
+    {0x000F, 4},{0x0013, 5},{0x0075, 7},{0x0024, 6},
+    {0x0095, 8},{0x0251,10},{0x04A0,11},{0x0010, 5},
+    {0x00C8, 8},{0x0002, 3},{0x0001, 3},{0x0001, 4},
+    {0x0000, 4},{0x001A, 5},{0x0011, 5},{0x002C, 6},
+    {0x0065, 7},{0x0074, 7},{0x004B, 7},{0x00C9, 8},
+    {0x0129, 9},{0x0943,12},{0x0942,12},{0x0003, 3},
+    {0x000A, 4},{0x001C, 5},{0x0018, 5},{0x0033, 6},
+    {0x0017, 5},{0x002D, 6},{0x001B, 5},{0x003B, 6}
+  },
+  {
+    {0x0003, 3},{0x001A, 5},{0x002D, 6},{0x0038, 6},
+    {0x0028, 7},{0x0395,10},{0x0E51,12},{0x0037, 6},
+    {0x00E4, 8},{0x0001, 3},{0x0000, 3},{0x001F, 5},
+    {0x001E, 5},{0x0017, 5},{0x003A, 6},{0x0073, 7},
+    {0x002A, 7},{0x002B, 7},{0x0029, 7},{0x01CB, 9},
+    {0x0729,11},{0x1CA1,13},{0x1CA0,13},{0x0004, 3},
+    {0x000A, 4},{0x0004, 4},{0x0018, 5},{0x0036, 6},
+    {0x000B, 5},{0x002C, 6},{0x0019, 5},{0x003B, 6}
+  },
+  {
+    {0x0004, 3},{0x0004, 4},{0x003F, 6},{0x0017, 5},
+    {0x0075, 7},{0x01F5, 9},{0x07D1,11},{0x0017, 6},
+    {0x01F6, 9},{0x0001, 3},{0x0000, 3},{0x001B, 5},
+    {0x001A, 5},{0x000A, 5},{0x0032, 6},{0x0074, 7},
+    {0x00F8, 8},{0x00F9, 8},{0x01F7, 9},{0x03E9,10},
+    {0x0FA0,12},{0x1F43,13},{0x1F42,13},{0x0003, 3},
+    {0x000A, 4},{0x001E, 5},{0x001C, 5},{0x003B, 6},
+    {0x0018, 5},{0x0016, 6},{0x0016, 5},{0x0033, 6}
+  },
+  {
+    {0x0004, 3},{0x0007, 4},{0x0018, 5},{0x001E, 5},
+    {0x0036, 6},{0x0031, 7},{0x0177, 9},{0x0077, 7},
+    {0x0176, 9},{0x0001, 3},{0x0000, 3},{0x001A, 5},
+    {0x0019, 5},{0x003A, 6},{0x0019, 6},{0x005C, 7},
+    {0x00BA, 8},{0x0061, 8},{0x00C1, 9},{0x0180,10},
+    {0x0302,11},{0x0607,12},{0x0606,12},{0x0002, 3},
+    {0x000A, 4},{0x001F, 5},{0x001C, 5},{0x0037, 6},
+    {0x0016, 5},{0x0076, 7},{0x000D, 5},{0x002F, 6}
+  },
+  {
+    {0x0000, 3},{0x000A, 4},{0x001A, 5},{0x000C, 4},
+    {0x001D, 5},{0x0039, 6},{0x0078, 7},{0x005E, 7},
+    {0x0393,11},{0x0002, 3},{0x0001, 3},{0x0016, 5},
+    {0x000F, 5},{0x002E, 6},{0x005F, 7},{0x0073, 8},
+    {0x00E5, 9},{0x01C8,10},{0x0E4A,13},{0x1C97,14},
+    {0x1C96,14},{0x0E49,13},{0x0E48,13},{0x0004, 3},
+    {0x0006, 4},{0x001F, 5},{0x001B, 5},{0x001D, 6},
+    {0x0038, 6},{0x0038, 7},{0x003D, 6},{0x0079, 7}
+  },
+  {
+    {0x000B, 5},{0x002B, 7},{0x0054, 8},{0x01B7, 9},
+    {0x06D9,11},{0x0DB1,12},{0x0DB0,12},{0x0002, 4},
+    {0x00AB, 9},{0x0009, 4},{0x000A, 4},{0x0007, 4},
+    {0x0008, 4},{0x000F, 4},{0x000C, 4},{0x0003, 4},
+    {0x001D, 5},{0x0004, 4},{0x000B, 4},{0x0006, 4},
+    {0x001A, 5},{0x0003, 6},{0x00AA, 9},{0x0001, 4},
+    {0x0000, 5},{0x0014, 6},{0x006C, 7},{0x00DA, 8},
+    {0x0002, 6},{0x036D,10},{0x001C, 5},{0x0037, 6}
+  },
+  {
+    {0x001D, 5},{0x0004, 6},{0x00B6, 8},{0x006A, 8},
+    {0x05B9,11},{0x16E1,13},{0x16E0,13},{0x0007, 4},
+    {0x016F, 9},{0x000C, 4},{0x000D, 4},{0x0009, 4},
+    {0x0008, 4},{0x000F, 4},{0x000A, 4},{0x0003, 4},
+    {0x0017, 5},{0x0002, 4},{0x0004, 4},{0x001C, 5},
+    {0x002C, 6},{0x006B, 8},{0x0B71,12},{0x0005, 4},
+    {0x0003, 5},{0x001B, 6},{0x005A, 7},{0x0034, 7},
+    {0x0005, 6},{0x02DD,10},{0x0000, 4},{0x000C, 5}
+  },
+  {
+    {0x0003, 4},{0x007F, 7},{0x00A1, 8},{0x00A0, 8},
+    {0x020C,10},{0x0834,12},{0x106B,13},{0x0007, 4},
+    {0x0082, 8},{0x000E, 4},{0x000D, 4},{0x000B, 4},
+    {0x000C, 4},{0x0000, 3},{0x0009, 4},{0x0002, 4},
+    {0x0011, 5},{0x001E, 5},{0x0015, 5},{0x003E, 6},
+    {0x0040, 7},{0x041B,11},{0x106A,13},{0x0006, 4},
+    {0x000A, 5},{0x0029, 6},{0x007E, 7},{0x0051, 7},
+    {0x0021, 6},{0x0107, 9},{0x0004, 4},{0x000B, 5}
+  },
+  {
+    {0x0007, 4},{0x001B, 6},{0x00F6, 8},{0x00E9, 8},
+    {0x03A1,10},{0x0740,11},{0x0E82,12},{0x001F, 5},
+    {0x01EF, 9},{0x0001, 3},{0x0002, 3},{0x000B, 4},
+    {0x000C, 4},{0x000D, 4},{0x0008, 4},{0x001C, 5},
+    {0x0003, 5},{0x0012, 5},{0x0002, 5},{0x0075, 7},
+    {0x01D1, 9},{0x1D07,13},{0x1D06,13},{0x000A, 4},
+    {0x0013, 5},{0x003B, 6},{0x001A, 6},{0x007A, 7},
+    {0x003C, 6},{0x01EE, 9},{0x0000, 4},{0x000C, 5}
+  },
+  {
+    {0x000D, 4},{0x003D, 6},{0x0042, 7},{0x0037, 7},
+    {0x00D9, 9},{0x0362,11},{0x06C6,12},{0x001F, 5},
+    {0x0086, 8},{0x0001, 3},{0x0002, 3},{0x000C, 4},
+    {0x000B, 4},{0x000A, 4},{0x0001, 4},{0x000F, 5},
+    {0x0025, 6},{0x003C, 6},{0x001A, 6},{0x0087, 8},
+    {0x01B0,10},{0x0D8F,13},{0x0D8E,13},{0x000E, 4},
+    {0x0013, 5},{0x000C, 5},{0x0024, 6},{0x0020, 6},
+    {0x0011, 5},{0x006D, 8},{0x0000, 4},{0x000E, 5}
+  },
+  {
+    {0x0000, 3},{0x0012, 5},{0x0076, 7},{0x0077, 7},
+    {0x014D, 9},{0x0533,11},{0x14C9,13},{0x0013, 5},
+    {0x00A5, 8},{0x0002, 3},{0x0003, 3},{0x000B, 4},
+    {0x000C, 4},{0x0008, 4},{0x001A, 5},{0x002B, 6},
+    {0x0075, 7},{0x0074, 7},{0x00A7, 8},{0x0298,10},
+    {0x14C8,13},{0x14CB,13},{0x14CA,13},{0x000F, 4},
+    {0x001C, 5},{0x0007, 5},{0x002A, 6},{0x0028, 6},
+    {0x001B, 5},{0x00A4, 8},{0x0002, 4},{0x0006, 5}
+  },
+  {
+    {0x0002, 3},{0x001A, 5},{0x002B, 6},{0x003A, 6},
+    {0x00ED, 8},{0x0283,10},{0x0A0A,12},{0x0004, 5},
+    {0x00A1, 8},{0x0004, 3},{0x0003, 3},{0x000B, 4},
+    {0x000C, 4},{0x001F, 5},{0x0006, 5},{0x0077, 7},
+    {0x00A3, 8},{0x00A2, 8},{0x0140, 9},{0x1417,13},
+    {0x1416,13},{0x0A09,12},{0x0A08,12},{0x0000, 3},
+    {0x001E, 5},{0x0007, 5},{0x002A, 6},{0x0029, 6},
+    {0x001C, 5},{0x00EC, 8},{0x001B, 5},{0x0005, 5}
+  },
+  {
+    {0x0002, 3},{0x0002, 4},{0x0018, 5},{0x001D, 5},
+    {0x0035, 6},{0x00E4, 8},{0x01CF,11},{0x001D, 7},
+    {0x0072, 9},{0x0004, 3},{0x0005, 3},{0x0006, 4},
+    {0x0007, 4},{0x0006, 5},{0x0073, 7},{0x0038, 8},
+    {0x01CE,11},{0x039B,12},{0x0398,12},{0x0733,13},
+    {0x0732,13},{0x0735,13},{0x0734,13},{0x0000, 3},
+    {0x001F, 5},{0x001B, 5},{0x0034, 6},{0x000F, 6},
+    {0x001E, 5},{0x00E5, 8},{0x0019, 5},{0x0038, 6}
+  },
+  {
+    {0x0016, 5},{0x0050, 7},{0x0172, 9},{0x02E7,10},
+    {0x1732,13},{0x2E67,14},{0x2E66,14},{0x0006, 4},
+    {0x0051, 7},{0x0001, 3},{0x0000, 3},{0x000D, 4},
+    {0x000C, 4},{0x0009, 4},{0x001C, 5},{0x0009, 5},
+    {0x001C, 6},{0x001D, 6},{0x005D, 7},{0x00B8, 8},
+    {0x05CD,11},{0x1731,13},{0x1730,13},{0x000F, 4},
+    {0x0005, 4},{0x000F, 5},{0x0008, 5},{0x0029, 6},
+    {0x001D, 5},{0x002F, 6},{0x0008, 4},{0x0015, 5}
+  },
+  {
+    {0x0009, 4},{0x0021, 6},{0x0040, 7},{0x00AD, 8},
+    {0x02B0,10},{0x1589,13},{0x1588,13},{0x001C, 5},
+    {0x005F, 7},{0x0000, 3},{0x000F, 4},{0x000D, 4},
+    {0x000C, 4},{0x0006, 4},{0x0011, 5},{0x002A, 6},
+    {0x0057, 7},{0x005E, 7},{0x0041, 7},{0x0159, 9},
+    {0x0563,11},{0x158B,13},{0x158A,13},{0x0001, 3},
+    {0x0005, 4},{0x0014, 5},{0x003B, 6},{0x002E, 6},
+    {0x0004, 4},{0x003A, 6},{0x0007, 4},{0x0016, 5}
+  },
+  {
+    {0x000E, 4},{0x0007, 5},{0x0046, 7},{0x0045, 7},
+    {0x0064, 9},{0x032A,12},{0x0657,13},{0x0018, 5},
+    {0x000D, 6},{0x0000, 3},{0x000F, 4},{0x000A, 4},
+    {0x000B, 4},{0x001A, 5},{0x0036, 6},{0x0047, 7},
+    {0x0044, 7},{0x0018, 7},{0x0033, 8},{0x00CB,10},
+    {0x0656,13},{0x0329,12},{0x0328,12},{0x0002, 3},
+    {0x0006, 4},{0x0019, 5},{0x000E, 5},{0x0037, 6},
+    {0x0009, 4},{0x000F, 5},{0x0002, 4},{0x0010, 5}
+  },
+  {
+    {0x0003, 3},{0x0018, 5},{0x0023, 6},{0x0077, 7},
+    {0x0194, 9},{0x1956,13},{0x32AF,14},{0x003A, 6},
+    {0x0076, 7},{0x0002, 3},{0x0001, 3},{0x001F, 5},
+    {0x001E, 5},{0x0014, 5},{0x0022, 6},{0x0064, 7},
+    {0x0197, 9},{0x0196, 9},{0x032B,10},{0x0654,11},
+    {0x32AE,14},{0x1955,13},{0x1954,13},{0x0000, 3},
+    {0x0009, 4},{0x001C, 5},{0x0015, 5},{0x0010, 5},
+    {0x000D, 4},{0x0017, 5},{0x0016, 5},{0x0033, 6}
+  },
+  {
+    {0x0005, 3},{0x0006, 4},{0x003E, 6},{0x0010, 5},
+    {0x0048, 7},{0x093F,12},{0x24FA,14},{0x0032, 6},
+    {0x0067, 7},{0x0002, 3},{0x0001, 3},{0x001B, 5},
+    {0x001E, 5},{0x0034, 6},{0x0066, 7},{0x0092, 8},
+    {0x0126, 9},{0x024E,10},{0x049E,11},{0x49F7,15},
+    {0x49F6,15},{0x24F9,14},{0x24F8,14},{0x0000, 3},
+    {0x0007, 4},{0x0018, 5},{0x0011, 5},{0x003F, 6},
+    {0x000E, 4},{0x0013, 5},{0x0035, 6},{0x0025, 6}
+  },
+  {
+    {0x0005, 3},{0x0008, 4},{0x0012, 5},{0x001C, 5},
+    {0x001C, 6},{0x00EA, 9},{0x1D75,14},{0x001E, 6},
+    {0x0066, 7},{0x0001, 3},{0x0002, 3},{0x001B, 5},
+    {0x001A, 5},{0x001F, 6},{0x003B, 7},{0x0074, 8},
+    {0x01D6,10},{0x03AF,11},{0x1D74,14},{0x1D77,14},
+    {0x1D76,14},{0x0EB9,13},{0x0EB8,13},{0x000F, 4},
+    {0x0006, 4},{0x0013, 5},{0x003B, 6},{0x003A, 6},
+    {0x0000, 3},{0x0018, 5},{0x0032, 6},{0x0067, 7}
+  },
+  {
+    {0x0004, 3},{0x000A, 4},{0x001B, 5},{0x000C, 4},
+    {0x000D, 5},{0x00E6, 8},{0x0684,11},{0x0072, 7},
+    {0x00E7, 8},{0x0002, 3},{0x0001, 3},{0x0017, 5},
+    {0x0016, 5},{0x0018, 6},{0x00D1, 8},{0x01A0, 9},
+    {0x0686,11},{0x0D0F,12},{0x0D0A,12},{0x1A17,13},
+    {0x1A16,13},{0x1A1D,13},{0x1A1C,13},{0x000F, 4},
+    {0x001D, 5},{0x000E, 5},{0x0035, 6},{0x0038, 6},
+    {0x0000, 3},{0x000F, 5},{0x0019, 6},{0x0069, 7}
+  },
+  {
+    {0x0003, 3},{0x000C, 4},{0x001B, 5},{0x0000, 3},
+    {0x0003, 4},{0x002E, 6},{0x0051, 9},{0x00BC, 8},
+    {0x0053, 9},{0x0004, 3},{0x0002, 3},{0x0016, 5},
+    {0x0015, 5},{0x0015, 7},{0x0050, 9},{0x00A4,10},
+    {0x0294,12},{0x052B,13},{0x052A,13},{0x052D,13},
+    {0x052C,13},{0x052F,13},{0x052E,13},{0x000E, 4},
+    {0x001A, 5},{0x0004, 5},{0x0028, 6},{0x0029, 6},
+    {0x000F, 4},{0x000B, 6},{0x005F, 7},{0x00BD, 8}
+  },
+  {
+    {0x0003, 4},{0x0009, 6},{0x00D0, 8},{0x01A3, 9},
+    {0x0344,10},{0x0D14,12},{0x1A2B,13},{0x0004, 4},
+    {0x0015, 7},{0x0000, 3},{0x000F, 4},{0x000B, 4},
+    {0x000C, 4},{0x000E, 4},{0x0009, 4},{0x001B, 5},
+    {0x000A, 5},{0x0014, 5},{0x000D, 5},{0x002A, 6},
+    {0x0014, 7},{0x068B,11},{0x1A2A,13},{0x0008, 4},
+    {0x000B, 5},{0x002B, 6},{0x000B, 6},{0x0069, 7},
+    {0x0035, 6},{0x0008, 6},{0x0007, 4},{0x000C, 5}
+  },
+  {
+    {0x000A, 4},{0x003C, 6},{0x0032, 7},{0x0030, 7},
+    {0x00C5, 9},{0x0621,12},{0x0620,12},{0x001F, 5},
+    {0x0033, 7},{0x0001, 3},{0x0000, 3},{0x000E, 4},
+    {0x000D, 4},{0x000C, 4},{0x0004, 4},{0x000D, 5},
+    {0x0026, 6},{0x0027, 6},{0x0014, 6},{0x0063, 8},
+    {0x0189,10},{0x0623,12},{0x0622,12},{0x000B, 4},
+    {0x0012, 5},{0x003D, 6},{0x0022, 6},{0x0015, 6},
+    {0x000B, 5},{0x0023, 6},{0x0007, 4},{0x0010, 5}
+  },
+  {
+    {0x000F, 4},{0x000C, 5},{0x0043, 7},{0x0010, 6},
+    {0x0044, 8},{0x0114,10},{0x0455,12},{0x0018, 5},
+    {0x0023, 7},{0x0001, 3},{0x0000, 3},{0x000E, 4},
+    {0x000D, 4},{0x0009, 4},{0x0019, 5},{0x0009, 5},
+    {0x0017, 6},{0x0016, 6},{0x0042, 7},{0x008B, 9},
+    {0x0454,12},{0x0457,12},{0x0456,12},{0x000B, 4},
+    {0x0015, 5},{0x000A, 5},{0x0029, 6},{0x0020, 6},
+    {0x000D, 5},{0x0028, 6},{0x0007, 4},{0x0011, 5}
+  },
+  {
+    {0x0001, 3},{0x001A, 5},{0x0029, 6},{0x002A, 6},
+    {0x00A0, 8},{0x0285,10},{0x1425,13},{0x0002, 5},
+    {0x0000, 7},{0x0002, 3},{0x0003, 3},{0x000C, 4},
+    {0x000B, 4},{0x0008, 4},{0x0012, 5},{0x0001, 6},
+    {0x0051, 7},{0x0001, 7},{0x0143, 9},{0x0508,11},
+    {0x1424,13},{0x1427,13},{0x1426,13},{0x000F, 4},
+    {0x001C, 5},{0x0003, 5},{0x0037, 6},{0x002B, 6},
+    {0x0013, 5},{0x0036, 6},{0x001D, 5},{0x0001, 5}
+  },
+  {
+    {0x0004, 3},{0x001F, 5},{0x003D, 6},{0x0006, 5},
+    {0x0016, 7},{0x0053, 9},{0x014A,11},{0x0034, 6},
+    {0x002A, 8},{0x0002, 3},{0x0003, 3},{0x000B, 4},
+    {0x000C, 4},{0x001C, 5},{0x0037, 6},{0x0017, 7},
+    {0x002B, 8},{0x0028, 8},{0x00A4,10},{0x052D,13},
+    {0x052C,13},{0x052F,13},{0x052E,13},{0x0000, 3},
+    {0x001D, 5},{0x0007, 5},{0x0004, 5},{0x0035, 6},
+    {0x0014, 5},{0x0036, 6},{0x0015, 5},{0x003C, 6}
+  },
+  {
+    {0x0004, 3},{0x000A, 4},{0x0007, 5},{0x001D, 5},
+    {0x0009, 6},{0x01F3, 9},{0x07C7,11},{0x0008, 6},
+    {0x01F0, 9},{0x0003, 3},{0x0002, 3},{0x000D, 4},
+    {0x000C, 4},{0x0017, 5},{0x007D, 7},{0x01F2, 9},
+    {0x07C6,11},{0x07C5,11},{0x1F12,13},{0x3E27,14},
+    {0x3E26,14},{0x1F11,13},{0x1F10,13},{0x0000, 3},
+    {0x001E, 5},{0x0006, 5},{0x0039, 6},{0x0038, 6},
+    {0x003F, 6},{0x002C, 6},{0x0005, 5},{0x002D, 6}
+  },
+  {
+    {0x0002, 3},{0x0007, 4},{0x0018, 5},{0x0003, 4},
+    {0x0005, 5},{0x0035, 7},{0x004F, 9},{0x0012, 7},
+    {0x04E5,13},{0x0005, 3},{0x0004, 3},{0x000D, 4},
+    {0x000E, 4},{0x0033, 6},{0x0026, 8},{0x009D,10},
+    {0x04E4,13},{0x04E7,13},{0x04E6,13},{0x04E1,13},
+    {0x04E0,13},{0x04E3,13},{0x04E2,13},{0x0000, 3},
+    {0x001F, 5},{0x000C, 5},{0x003D, 6},{0x003C, 6},
+    {0x0032, 6},{0x0034, 7},{0x001B, 6},{0x0008, 6}
+  },
+  {
+    {0x0000, 3},{0x0004, 4},{0x001C, 5},{0x000F, 4},
+    {0x0002, 4},{0x0007, 5},{0x0075, 7},{0x00E8, 8},
+    {0x1D2A,13},{0x0005, 3},{0x0004, 3},{0x000D, 4},
+    {0x000C, 4},{0x0077, 7},{0x0E96,12},{0x3A57,14},
+    {0x3A56,14},{0x3A5D,14},{0x3A5C,14},{0x3A5F,14},
+    {0x3A5E,14},{0x1D29,13},{0x1D28,13},{0x0003, 3},
+    {0x0006, 5},{0x000A, 5},{0x002C, 7},{0x0017, 6},
+    {0x0076, 7},{0x01D3, 9},{0x03A4,10},{0x002D, 7}
+  },
+  {
+    {0x000A, 4},{0x0024, 6},{0x00BF, 8},{0x0085, 8},
+    {0x0211,10},{0x0842,12},{0x1087,13},{0x0018, 5},
+    {0x0020, 6},{0x0001, 3},{0x0002, 3},{0x000E, 4},
+    {0x000D, 4},{0x0007, 4},{0x0013, 5},{0x0025, 6},
+    {0x005E, 7},{0x0043, 7},{0x00BE, 8},{0x0109, 9},
+    {0x1086,13},{0x0841,12},{0x0840,12},{0x000F, 4},
+    {0x0001, 4},{0x0011, 5},{0x0000, 5},{0x002E, 6},
+    {0x0019, 5},{0x0001, 5},{0x0006, 4},{0x0016, 5}
+  },
+  {
+    {0x0002, 3},{0x000F, 5},{0x006F, 7},{0x0061, 7},
+    {0x0374,10},{0x1BA8,13},{0x3753,14},{0x0012, 5},
+    {0x0036, 6},{0x0000, 3},{0x0001, 3},{0x000A, 4},
+    {0x000B, 4},{0x001A, 5},{0x0031, 6},{0x0060, 7},
+    {0x00DC, 8},{0x01BB, 9},{0x06EB,11},{0x1BAB,13},
+    {0x3752,14},{0x3755,14},{0x3754,14},{0x000E, 4},
+    {0x0006, 4},{0x0013, 5},{0x000E, 5},{0x003E, 6},
+    {0x0008, 4},{0x001E, 5},{0x0019, 5},{0x003F, 6}
+  },
+  {
+    {0x0003, 3},{0x001C, 5},{0x0025, 6},{0x0024, 6},
+    {0x01DA, 9},{0x1DBD,13},{0x3B7C,14},{0x003C, 6},
+    {0x003D, 6},{0x0000, 3},{0x0001, 3},{0x000B, 4},
+    {0x000A, 4},{0x000B, 5},{0x0077, 7},{0x00EC, 8},
+    {0x03B6,10},{0x076E,11},{0x1DBF,13},{0x76FB,15},
+    {0x76FA,15},{0x3B79,14},{0x3B78,14},{0x000D, 4},
+    {0x001F, 5},{0x0013, 5},{0x000A, 5},{0x0008, 5},
+    {0x000C, 4},{0x0008, 4},{0x0009, 5},{0x003A, 6}
+  },
+  {
+    {0x0005, 3},{0x0003, 4},{0x0004, 5},{0x0010, 5},
+    {0x008F, 8},{0x0475,11},{0x11D1,13},{0x0079, 7},
+    {0x0027, 6},{0x0002, 3},{0x0003, 3},{0x0001, 4},
+    {0x0000, 4},{0x0026, 6},{0x0046, 7},{0x011C, 9},
+    {0x0477,11},{0x08ED,12},{0x11D0,13},{0x11D3,13},
+    {0x11D2,13},{0x11D9,13},{0x11D8,13},{0x000D, 4},
+    {0x001F, 5},{0x0012, 5},{0x0005, 5},{0x003D, 6},
+    {0x000C, 4},{0x000E, 4},{0x0022, 6},{0x0078, 7}
+  },
+  {
+    {0x0005, 3},{0x000C, 4},{0x001B, 5},{0x0000, 4},
+    {0x0006, 6},{0x03E2,10},{0x3E3D,14},{0x000F, 7},
+    {0x0034, 6},{0x0003, 3},{0x0002, 3},{0x001E, 5},
+    {0x001D, 5},{0x007D, 7},{0x01F0, 9},{0x07C6,11},
+    {0x3E3C,14},{0x3E3F,14},{0x3E3E,14},{0x3E39,14},
+    {0x3E38,14},{0x3E3B,14},{0x3E3A,14},{0x0008, 4},
+    {0x001C, 5},{0x0002, 5},{0x003F, 6},{0x0035, 6},
+    {0x0009, 4},{0x0001, 3},{0x000E, 7},{0x00F9, 8}
+  },
+  {
+    {0x0004, 3},{0x000B, 4},{0x0001, 4},{0x000A, 4},
+    {0x001E, 6},{0x00E0, 9},{0x0E1E,13},{0x0071, 8},
+    {0x0039, 7},{0x0007, 3},{0x0006, 3},{0x000D, 5},
+    {0x000C, 5},{0x0020, 7},{0x01C2,10},{0x1C3F,14},
+    {0x1C3E,14},{0x0E19,13},{0x0E18,13},{0x0E1B,13},
+    {0x0E1A,13},{0x0E1D,13},{0x0E1C,13},{0x0000, 4},
+    {0x0009, 5},{0x001D, 6},{0x001F, 6},{0x0011, 6},
+    {0x0005, 4},{0x0001, 3},{0x0043, 8},{0x0042, 8}
+  },
+  {
+    {0x0004, 3},{0x000D, 4},{0x0007, 4},{0x0002, 3},
+    {0x0014, 5},{0x016C, 9},{0x16D1,13},{0x02DF,10},
+    {0x016E, 9},{0x0000, 2},{0x0007, 3},{0x002C, 6},
+    {0x002B, 6},{0x02DE,10},{0x16D0,13},{0x16D3,13},
+    {0x16D2,13},{0x2DB5,14},{0x2DB4,14},{0x2DB7,14},
+    {0x2DB6,14},{0x16D9,13},{0x16D8,13},{0x000C, 5},
+    {0x002A, 6},{0x005A, 7},{0x001B, 6},{0x001A, 6},
+    {0x0017, 5},{0x000C, 4},{0x05B7,11},{0x05B5,11}
+  },
+  {
+    {0x0002, 2},{0x000F, 4},{0x001C, 5},{0x000C, 4},
+    {0x003B, 6},{0x01AC, 9},{0x1AD8,13},{0x35B3,14},
+    {0x35B2,14},{0x0001, 2},{0x0000, 2},{0x0069, 7},
+    {0x0068, 7},{0x35BD,14},{0x35BC,14},{0x35BF,14},
+    {0x35BE,14},{0x35B9,14},{0x35B8,14},{0x35BB,14},
+    {0x35BA,14},{0x35B5,14},{0x35B4,14},{0x01A9, 9},
+    {0x01A8, 9},{0x035A,10},{0x00D7, 8},{0x00D5, 8},
+    {0x003A, 6},{0x001B, 5},{0x35B7,14},{0x35B6,14}
+  },
+  {
+    {0x0000, 3},{0x0010, 5},{0x0072, 7},{0x0071, 7},
+    {0x0154, 9},{0x0AAB,12},{0x0AA8,12},{0x0014, 5},
+    {0x0070, 7},{0x0002, 3},{0x0003, 3},{0x000C, 4},
+    {0x000B, 4},{0x0003, 4},{0x0011, 5},{0x0073, 7},
+    {0x0054, 7},{0x00AB, 8},{0x02AB,10},{0x1553,13},
+    {0x1552,13},{0x1555,13},{0x1554,13},{0x000D, 4},
+    {0x001E, 5},{0x0012, 5},{0x003E, 6},{0x002B, 6},
+    {0x0002, 4},{0x003F, 6},{0x001D, 5},{0x0013, 5}
+  },
+  {
+    {0x0003, 3},{0x001F, 5},{0x0029, 6},{0x003D, 6},
+    {0x000C, 7},{0x0069,10},{0x0345,13},{0x0002, 5},
+    {0x0028, 6},{0x0002, 3},{0x0001, 3},{0x000E, 4},
+    {0x000C, 4},{0x0015, 5},{0x0007, 6},{0x001B, 8},
+    {0x006B,10},{0x006A,10},{0x0344,13},{0x0347,13},
+    {0x0346,13},{0x01A1,12},{0x01A0,12},{0x000B, 4},
+    {0x001A, 5},{0x0012, 5},{0x0000, 5},{0x003C, 6},
+    {0x0008, 4},{0x001B, 5},{0x0013, 5},{0x0001, 5}
+  },
+  {
+    {0x0004, 3},{0x0004, 4},{0x003F, 6},{0x0014, 5},
+    {0x0056, 7},{0x015C, 9},{0x15D5,13},{0x003C, 6},
+    {0x002A, 6},{0x0000, 3},{0x0001, 3},{0x000E, 4},
+    {0x000D, 4},{0x000C, 5},{0x00AF, 8},{0x02BB,10},
+    {0x15D4,13},{0x15D7,13},{0x15D6,13},{0x15D1,13},
+    {0x15D0,13},{0x15D3,13},{0x15D2,13},{0x000B, 4},
+    {0x0019, 5},{0x000D, 5},{0x003E, 6},{0x0031, 6},
+    {0x0007, 4},{0x0005, 4},{0x003D, 6},{0x0030, 6}
+  },
+  {
+    {0x0005, 3},{0x0008, 4},{0x001A, 5},{0x0000, 4},
+    {0x0036, 6},{0x0011, 8},{0x0106,12},{0x000A, 7},
+    {0x006E, 7},{0x0002, 3},{0x0003, 3},{0x0003, 4},
+    {0x0002, 4},{0x006F, 7},{0x0021, 9},{0x020F,13},
+    {0x020E,13},{0x0101,12},{0x0100,12},{0x0103,12},
+    {0x0102,12},{0x0105,12},{0x0104,12},{0x000C, 4},
+    {0x001E, 5},{0x0003, 5},{0x003E, 6},{0x003F, 6},
+    {0x0009, 4},{0x000E, 4},{0x000B, 7},{0x0009, 7}
+  },
+  {
+    {0x0002, 3},{0x000E, 4},{0x001E, 5},{0x000C, 4},
+    {0x001F, 5},{0x006E, 7},{0x00AD,10},{0x00AF,10},
+    {0x0014, 7},{0x0004, 3},{0x0003, 3},{0x001A, 5},
+    {0x0017, 5},{0x002A, 8},{0x0576,13},{0x0AEF,14},
+    {0x0AEE,14},{0x0571,13},{0x0570,13},{0x0573,13},
+    {0x0572,13},{0x0575,13},{0x0574,13},{0x0003, 4},
+    {0x0016, 5},{0x0004, 5},{0x0036, 6},{0x000B, 6},
+    {0x000A, 4},{0x0000, 3},{0x006F, 7},{0x00AC,10}
+  },
+  {
+    {0x0004, 3},{0x0005, 4},{0x0003, 3},{0x0001, 3},
+    {0x0004, 4},{0x002F, 6},{0x0526,11},{0x1495,13},
+    {0x00A6, 8},{0x0007, 3},{0x0006, 3},{0x002D, 6},
+    {0x002C, 6},{0x1494,13},{0x1497,13},{0x1496,13},
+    {0x1491,13},{0x1490,13},{0x1493,13},{0x1492,13},
+    {0x293D,14},{0x293C,14},{0x293F,14},{0x0000, 3},
+    {0x0028, 6},{0x00A5, 8},{0x0148, 9},{0x00A7, 8},
+    {0x002E, 6},{0x0015, 5},{0x0A4E,12},{0x293E,14}
+  },
+  {
+    {0x0004, 3},{0x0005, 4},{0x0003, 3},{0x0001, 3},
+    {0x0004, 4},{0x002F, 6},{0x0526,11},{0x1495,13},
+    {0x00A6, 8},{0x0007, 3},{0x0006, 3},{0x002D, 6},
+    {0x002C, 6},{0x1494,13},{0x1497,13},{0x1496,13},
+    {0x1491,13},{0x1490,13},{0x1493,13},{0x1492,13},
+    {0x293D,14},{0x293C,14},{0x293F,14},{0x0000, 3},
+    {0x0028, 6},{0x00A5, 8},{0x0148, 9},{0x00A7, 8},
+    {0x002E, 6},{0x0015, 5},{0x0A4E,12},{0x293E,14}
+  },
+  {
+    {0x0004, 3},{0x0005, 4},{0x0003, 3},{0x0001, 3},
+    {0x0004, 4},{0x002F, 6},{0x0526,11},{0x1495,13},
+    {0x00A6, 8},{0x0007, 3},{0x0006, 3},{0x002D, 6},
+    {0x002C, 6},{0x1494,13},{0x1497,13},{0x1496,13},
+    {0x1491,13},{0x1490,13},{0x1493,13},{0x1492,13},
+    {0x293D,14},{0x293C,14},{0x293F,14},{0x0000, 3},
+    {0x0028, 6},{0x00A5, 8},{0x0148, 9},{0x00A7, 8},
+    {0x002E, 6},{0x0015, 5},{0x0A4E,12},{0x293E,14}
+  },
+  {
+    {0x0003, 3},{0x0011, 5},{0x0020, 6},{0x0074, 7},
+    {0x010D, 9},{0x0863,12},{0x0860,12},{0x000A, 5},
+    {0x0075, 7},{0x0001, 3},{0x0000, 3},{0x000B, 4},
+    {0x000A, 4},{0x0018, 5},{0x0038, 6},{0x0042, 7},
+    {0x010F, 9},{0x010E, 9},{0x0219,10},{0x10C3,13},
+    {0x10C2,13},{0x10C5,13},{0x10C4,13},{0x000F, 4},
+    {0x0004, 4},{0x0019, 5},{0x000B, 5},{0x0039, 6},
+    {0x0009, 4},{0x001B, 5},{0x001A, 5},{0x003B, 6}
+  },
+  {
+    {0x0005, 3},{0x0001, 4},{0x003E, 6},{0x0001, 5},
+    {0x00E2, 8},{0x1C6F,13},{0x38D9,14},{0x0039, 6},
+    {0x001F, 6},{0x0002, 3},{0x0001, 3},{0x0009, 4},
+    {0x0008, 4},{0x0000, 5},{0x0070, 7},{0x01C7, 9},
+    {0x038C,10},{0x071A,11},{0x38D8,14},{0x38DB,14},
+    {0x38DA,14},{0x38DD,14},{0x38DC,14},{0x000D, 4},
+    {0x001D, 5},{0x000E, 5},{0x003F, 6},{0x003C, 6},
+    {0x000C, 4},{0x0006, 4},{0x003D, 6},{0x001E, 6}
+  },
+  {
+    {0x0006, 3},{0x000B, 4},{0x0011, 5},{0x001E, 5},
+    {0x0074, 7},{0x03AA,10},{0x1D5C,13},{0x0001, 6},
+    {0x0021, 6},{0x0001, 3},{0x0002, 3},{0x0007, 4},
+    {0x0006, 4},{0x003E, 6},{0x00EB, 8},{0x01D4, 9},
+    {0x0EAF,12},{0x3ABB,14},{0x3ABA,14},{0x1D59,13},
+    {0x1D58,13},{0x1D5B,13},{0x1D5A,13},{0x000A, 4},
+    {0x001C, 5},{0x0001, 5},{0x003F, 6},{0x003B, 6},
+    {0x0001, 4},{0x0009, 4},{0x0020, 6},{0x0000, 6}
+  },
+  {
+    {0x0004, 3},{0x000A, 4},{0x0017, 5},{0x0004, 4},
+    {0x0016, 6},{0x016A, 9},{0x16B1,13},{0x0017, 7},
+    {0x005B, 7},{0x0006, 3},{0x0007, 3},{0x0001, 4},
+    {0x0000, 4},{0x000A, 6},{0x02D7,10},{0x0B5A,12},
+    {0x16B0,13},{0x16B3,13},{0x16B2,13},{0x2D6D,14},
+    {0x2D6C,14},{0x2D6F,14},{0x2D6E,14},{0x0006, 4},
+    {0x000A, 5},{0x0004, 5},{0x002C, 6},{0x0017, 6},
+    {0x0003, 4},{0x0007, 4},{0x0016, 7},{0x00B4, 8}
+  },
+  {
+    {0x0005, 3},{0x000D, 4},{0x0005, 4},{0x0009, 4},
+    {0x0033, 6},{0x0193, 9},{0x192C,13},{0x0061, 8},
+    {0x0031, 7},{0x0000, 2},{0x0007, 3},{0x0010, 5},
+    {0x0011, 5},{0x00C8, 8},{0x192F,13},{0x325B,14},
+    {0x325A,14},{0x1929,13},{0x1928,13},{0x192B,13},
+    {0x192A,13},{0x325D,14},{0x325C,14},{0x0018, 5},
+    {0x001A, 6},{0x001B, 6},{0x0065, 7},{0x0019, 6},
+    {0x0004, 4},{0x0007, 4},{0x0060, 8},{0x0324,10}
+  },
+  {
+    {0x0006, 3},{0x0000, 3},{0x0002, 4},{0x000F, 4},
+    {0x0039, 6},{0x01D9, 9},{0x1D82,13},{0x0761,11},
+    {0x03BE,10},{0x0001, 2},{0x0002, 2},{0x000F, 6},
+    {0x000E, 6},{0x0762,11},{0x3B07,14},{0x3B06,14},
+    {0x3B1D,14},{0x3B1C,14},{0x3B1F,14},{0x3B1E,14},
+    {0x3B19,14},{0x3B18,14},{0x3B1B,14},{0x0038, 6},
+    {0x01DE, 9},{0x00ED, 8},{0x03BF,10},{0x00EE, 8},
+    {0x003A, 6},{0x0006, 5},{0x0EC0,12},{0x3B1A,14}
+  },
+  {
+    {0x0000, 2},{0x0002, 3},{0x000F, 5},{0x0006, 4},
+    {0x001C, 6},{0x01D0,10},{0x0E8C,13},{0x1D1B,14},
+    {0x1D1A,14},{0x0003, 2},{0x0002, 2},{0x00EA, 9},
+    {0x00E9, 9},{0x0E89,13},{0x0E88,13},{0x0E8B,13},
+    {0x0E8A,13},{0x1D65,14},{0x1D64,14},{0x1D67,14},
+    {0x1D66,14},{0x1D61,14},{0x1D60,14},{0x03AD,11},
+    {0x1D63,14},{0x1D62,14},{0x1D1D,14},{0x1D1C,14},
+    {0x003B, 7},{0x01D7,10},{0x1D1F,14},{0x1D1E,14}
+  },
+  {
+    {0x0002, 2},{0x000F, 4},{0x001C, 5},{0x000C, 4},
+    {0x003B, 6},{0x01AC, 9},{0x1AD8,13},{0x35B3,14},
+    {0x35B2,14},{0x0001, 2},{0x0000, 2},{0x0069, 7},
+    {0x0068, 7},{0x35BD,14},{0x35BC,14},{0x35BF,14},
+    {0x35BE,14},{0x35B9,14},{0x35B8,14},{0x35BB,14},
+    {0x35BA,14},{0x35B5,14},{0x35B4,14},{0x01A9, 9},
+    {0x01A8, 9},{0x035A,10},{0x00D7, 8},{0x00D5, 8},
+    {0x003A, 6},{0x001B, 5},{0x35B7,14},{0x35B6,14}
+  }
+};
+
+
+
+/*Translates a new-API th_info into the legacy theora_info that
+   theora_encode_init() consumes.
+  The two APIs name the geometry fields differently: the new API's
+   frame_width/frame_height (the padded frame) become the legacy width/height,
+   and the new API's pic_width/pic_height/pic_x/pic_y (the picture region)
+   become the legacy frame_width/frame_height/offset_x/offset_y.
+  Colorspaces and pixel formats that have no legacy equivalent are mapped to
+   OC_CS_UNSPECIFIED and OC_PF_RSVD, respectively.
+  _ci:   The legacy structure to fill in.
+  _info: The new-API settings to translate.*/
+static void th_info2theora_info(theora_info *_ci,const th_info *_info){
+  _ci->version_major=_info->version_major;
+  _ci->version_minor=_info->version_minor;
+  _ci->version_subminor=_info->version_subminor;
+  _ci->width=_info->frame_width;
+  _ci->height=_info->frame_height;
+  _ci->frame_width=_info->pic_width;
+  _ci->frame_height=_info->pic_height;
+  _ci->offset_x=_info->pic_x;
+  _ci->offset_y=_info->pic_y;
+  _ci->fps_numerator=_info->fps_numerator;
+  _ci->fps_denominator=_info->fps_denominator;
+  _ci->aspect_numerator=_info->aspect_numerator;
+  _ci->aspect_denominator=_info->aspect_denominator;
+  switch(_info->colorspace){
+    case TH_CS_ITU_REC_470M:_ci->colorspace=OC_CS_ITU_REC_470M;break;
+    case TH_CS_ITU_REC_470BG:_ci->colorspace=OC_CS_ITU_REC_470BG;break;
+    default:_ci->colorspace=OC_CS_UNSPECIFIED;break;
+  }
+  switch(_info->pixel_fmt){
+    case TH_PF_420:_ci->pixelformat=OC_PF_420;break;
+    case TH_PF_422:_ci->pixelformat=OC_PF_422;break;
+    case TH_PF_444:_ci->pixelformat=OC_PF_444;break;
+    default:_ci->pixelformat=OC_PF_RSVD;break;
+  }
+  _ci->target_bitrate=_info->target_bitrate;
+  _ci->quality=_info->quality;
+  _ci->codec_setup=NULL;
+  /*Defaults from old encoder_example... eventually most of these should go
+     away when we make the encoder no longer use them.*/
+  _ci->dropframes_p=0;
+  _ci->keyframe_auto_p=1;
+  /*Shift an unsigned 1: with a granule shift of 31, 1<<31 would overflow a
+     signed int, which is undefined behavior.
+    NOTE(review): shift counts outside 0...31 are still invalid; this assumes
+     the caller supplied a value in that range -- confirm.*/
+  _ci->keyframe_frequency=(ogg_uint32_t)1<<_info->keyframe_granule_shift;
+  _ci->keyframe_frequency_force=(ogg_uint32_t)1<<_info->keyframe_granule_shift;
+  /*Key frames are given 1.5 times the overall target bitrate.*/
+  _ci->keyframe_data_target_bitrate=
+   _info->target_bitrate+(_info->target_bitrate>>1);
+  _ci->keyframe_auto_threshold=80;
+  _ci->keyframe_mindistance=8;
+  _ci->noise_sensitivity=1;
+  _ci->sharpness=0;
+  _ci->quick_p=1;
+}
+
+/*Returns the number of bits needed to store _v, i.e., the position of its
+   highest set bit counted from 1.
+  _ilog(0) is 0.*/
+static int _ilog(unsigned _v){
+  int nbits;
+  nbits=0;
+  while(_v!=0){
+    nbits++;
+    _v>>=1;
+  }
+  return nbits;
+}
+
+
+
+/*The new-API encoder context: a thin wrapper around the legacy encoder
+   state.*/
+struct th_enc_ctx{
+  /*This is required at the start of the struct for the common functions to
+     work.*/
+  th_info        info;
+  /*The actual encoder.*/
+  theora_state   state;
+  /*A temporary buffer for input frames.
+    This is needed if the U and V strides differ, or padding is required.
+    th_encode_alloc() creates it up front when the frame is larger than the
+     picture; otherwise it stays NULL until th_encode_ycbcr_in() needs it.
+    It is released by th_encode_free().*/
+  unsigned char *buf;
+};
+
+
+/*Allocates a new-API encoder instance on top of the legacy encoder.
+  _info: The stream parameters to encode with.
+  Return: The new encoder context, or NULL if _info is NULL, the parameters
+   fail validation, theora_encode_init() rejects them, or memory could not be
+   allocated.*/
+th_enc_ctx *th_encode_alloc(const th_info *_info){
+  theora_info  ci;
+  th_enc_ctx  *enc;
+  if(_info==NULL)return NULL;
+  /*Do a bunch of checks the new API does, but the old one didn't.*/
+  if((_info->frame_width&0xF)||(_info->frame_height&0xF)||
+   _info->frame_width>=0x100000||_info->frame_height>=0x100000||
+   _info->pic_x+_info->pic_width>_info->frame_width||
+   _info->pic_y+_info->pic_height>_info->frame_height||
+   _info->pic_x>255||
+   _info->frame_height-_info->pic_height-_info->pic_y>255||
+   _info->colorspace<0||_info->colorspace>=TH_CS_NSPACES||
+   _info->pixel_fmt<0||_info->pixel_fmt>=TH_PF_NFORMATS){
+    return NULL;
+  }
+  th_info2theora_info(&ci,_info);
+  enc=(th_enc_ctx *)_ogg_malloc(sizeof(*enc));
+  if(enc==NULL)return NULL;
+  if(theora_encode_init(&enc->state,&ci)<0){
+    _ogg_free(enc);
+    return NULL;
+  }
+  if(_info->frame_width>_info->pic_width||
+   _info->frame_height>_info->pic_height){
+    size_t luma_sz;
+    size_t chroma_sz;
+    /*Padding will be required for every frame, so reserve room for one luma
+       plane and two chroma planes now.
+      Bit 0 of the pixel format clear means the chroma planes are halved
+       horizontally, and bit 1 clear means they are halved vertically.
+      The sizes are computed as size_t so large frames do not wrap a 32-bit
+       product.*/
+    luma_sz=(size_t)_info->frame_width*_info->frame_height;
+    chroma_sz=(size_t)(_info->frame_width>>!(_info->pixel_fmt&1))*
+     (_info->frame_height>>!(_info->pixel_fmt&2));
+    enc->buf=(unsigned char *)_ogg_malloc(
+     (luma_sz+(chroma_sz<<1))*sizeof(*enc->buf));
+    if(enc->buf==NULL){
+      /*Without the buffer every frame submission would fail; undo the
+         encoder setup and report the failure here instead.*/
+      theora_clear(&enc->state);
+      _ogg_free(enc);
+      return NULL;
+    }
+  }
+  else enc->buf=NULL;
+  memcpy(&enc->info,_info,sizeof(enc->info));
+  /*Overwrite values theora_encode_init() can change; don't trust the user.*/
+  enc->info.version_major=ci.version_major;
+  enc->info.version_minor=ci.version_minor;
+  enc->info.version_subminor=ci.version_subminor;
+  enc->info.quality=ci.quality;
+  enc->info.target_bitrate=ci.target_bitrate;
+  enc->info.fps_numerator=ci.fps_numerator;
+  enc->info.fps_denominator=ci.fps_denominator;
+  enc->info.keyframe_granule_shift=_ilog(ci.keyframe_frequency_force-1);
+  return enc;
+}
+
+/*Forwards an encoder control request to the legacy theora_control().
+  _enc:    The encoder context.
+  _req:    The control code to process.
+  _buf:    The parameters for this control code.
+  _buf_sz: The size of the parameter buffer.
+  Return: OC_FAULT if _enc is NULL (matching the other entry points in this
+   file), otherwise whatever theora_control() returns.*/
+int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz){
+  if(_enc==NULL)return OC_FAULT;
+  return theora_control(&_enc->state,_req,_buf,_buf_sz);
+}
+
+/*Produces the next stream header packet, one packet per call.
+  The encoder's doneflag doubles as a countdown of the headers still to be
+   written: -3 selects the info header, -2 the comment header, and -1 the
+   tables header; each successful call moves it one step towards 0.
+  _enc:      The encoder context.
+  _comments: The comments to write; only required for the comment header.
+  _op:       Receives the header packet.
+  Return: The negated countdown value from before this call (3, 2, or 1) when
+   a packet was produced, 0 once all headers have been written, OC_FAULT if a
+   required pointer is NULL, or OC_EINVAL if the countdown holds any other
+   value.*/
+int th_encode_flushheader(th_enc_ctx *_enc,th_comment *_comments,
+ ogg_packet *_op){
+  theora_state *te;
+  CP_INSTANCE  *cpi;
+  int           nleft;
+  if(_enc==NULL||_op==NULL)return OC_FAULT;
+  te=&_enc->state;
+  cpi=(CP_INSTANCE *)te->internal_encode;
+  switch(cpi->doneflag){
+    case -3:theora_encode_header(te,_op);break;
+    case -2:{
+      if(_comments==NULL)return OC_FAULT;
+      theora_encode_comment((theora_comment *)_comments,_op);
+      /*The old API writes the comment header without a theora_state, so it
+         cannot use the encoder's internal buffer and leaves the application
+         to free the packet.
+        The old documentation is wrong on this subject, and this breaks on
+         Windows when linking against multiple versions of libc (which is
+         almost always done when, e.g., using DLLs built with mingw32).
+        The new API _does_ require a th_enc_ctx, and states that libtheora
+         owns the memory.
+        So copy the packet contents into our internal oggpack_buffer, free
+         the original, and hand back the internal copy.*/
+      oggpackB_reset(cpi->oggbuffer);
+      oggpackB_writecopy(cpi->oggbuffer,_op->packet,_op->bytes*8);
+      _ogg_free(_op->packet);
+      _op->packet=oggpackB_get_buffer(cpi->oggbuffer);
+    }break;
+    case -1:theora_encode_tables(te,_op);break;
+    case 0:return 0;
+    default:return OC_EINVAL;
+  }
+  /*Report how many headers were outstanding before this call, then advance
+     the countdown.*/
+  nleft=-cpi->doneflag;
+  cpi->doneflag++;
+  return nleft;
+}
+
+/*Copies the picture region of the _src image plane into _dst and pads the rest
+   of _dst using a diffusion extension method.
+  We could do much better (e.g., the DCT-based low frequency extension method
+   in theora-exp's fdct.c) if we were to pad after motion compensation, but
+   that would require significant changes to the encoder.
+  _dst:        Receives the description of the padded copy; its stride is set
+                to the plane width (rows are packed) and its data to _buf.
+  _buf:        The storage for the copy; must hold width*height bytes.
+  _src:        The plane to copy from; only the picture region is read.
+  _pic_x:      The left edge of the picture region within the plane.
+  _pic_y:      The top edge of the picture region within the plane.
+  _pic_width:  The width of the picture region.
+  _pic_height: The height of the picture region.
+  Return: A pointer to the first byte of _buf after the copied plane.*/
+static unsigned char *th_encode_copy_pad_plane(th_img_plane *_dst,
+ unsigned char *_buf,th_img_plane *_src,
+ ogg_uint32_t _pic_x,ogg_uint32_t _pic_y,
+ ogg_uint32_t _pic_width,ogg_uint32_t _pic_height){
+  size_t buf_sz;
+  _dst->width=_src->width;
+  _dst->height=_src->height;
+  _dst->stride=_src->width;
+  _dst->data=_buf;
+  /*Multiply as size_t: width and height are ints, and their product can
+     overflow a signed int for very large planes.*/
+  buf_sz=(size_t)_dst->width*_dst->height*sizeof(*_dst->data);
+  /*If we have _no_ data, just encode a dull green.*/
+  if(_pic_width==0||_pic_height==0)memset(_dst->data,0,buf_sz);
+  else{
+    unsigned char *dst;
+    unsigned char *src;
+    ogg_uint32_t   x;
+    ogg_uint32_t   y;
+    int            dstride;
+    int            sstride;
+    /*Step 1: Copy the data we do have.*/
+    dstride=_dst->stride;
+    sstride=_src->stride;
+    /*The row offsets are computed in a signed type.
+      _pic_y is unsigned, so _pic_y*sstride would convert a negative source
+       stride to a large unsigned value, and on platforms with pointers wider
+       than 32 bits that lands far outside the plane instead of before it.*/
+    dst=_dst->data+(long)_pic_y*dstride+_pic_x;
+    src=_src->data+(long)_pic_y*sstride+_pic_x;
+    for(y=0;y<_pic_height;y++){
+      memcpy(dst,src,_pic_width);
+      dst+=dstride;
+      src+=sstride;
+    }
+    /*Step 2: Copy the border into any blocks that are 100% padding.
+      There's probably smarter things we could do than this.
+      Each padding pixel is a rounded (1,2,1)/4 average of the three nearest
+       pixels in the adjacent, already filled column or row.
+      An expression such as dstride&-(y>0) is dstride when the condition holds
+       and 0 otherwise, which makes the first and last taps fall back to the
+       center pixel at the ends instead of reading outside the region.*/
+    /*Left side.*/
+    for(x=_pic_x;x-->0;){
+      dst=_dst->data+_pic_y*dstride+x;
+      for(y=0;y<_pic_height;y++){
+        dst[0]=(dst[1]<<1)+(dst-(dstride&-(y>0)))[1]+
+         (dst+(dstride&-(y+1<_pic_height)))[1]+2>>2;
+        dst+=dstride;
+      }
+    }
+    /*Right side.*/
+    for(x=_pic_x+_pic_width;x<_dst->width;x++){
+      dst=_dst->data+_pic_y*dstride+x-1;
+      for(y=0;y<_pic_height;y++){
+        dst[1]=(dst[0]<<1)+(dst-(dstride&-(y>0)))[0]+
+         (dst+(dstride&-(y+1<_pic_height)))[0]+2>>2;
+        dst+=dstride;
+      }
+    }
+    /*Top.
+      The rows are now filled across the full width, so extend upwards one
+       whole row at a time.*/
+    dst=_dst->data+_pic_y*dstride;
+    for(y=_pic_y;y-->0;){
+      for(x=0;x<_dst->width;x++){
+        (dst-dstride)[x]=(dst[x]<<1)+dst[x-(x>0)]+dst[x+(x+1<_dst->width)]+2>>2;
+      }
+      dst-=dstride;
+    }
+    /*Bottom.*/
+    dst=_dst->data+(_pic_y+_pic_height)*dstride;
+    for(y=_pic_y+_pic_height;y<_dst->height;y++){
+      for(x=0;x<_dst->width;x++){
+        dst[x]=((dst-dstride)[x]<<1)+(dst-dstride)[x-(x>0)]+
+         (dst-dstride)[x+(x+1<_dst->width)]+2>>2;
+      }
+      dst+=dstride;
+    }
+  }
+  _buf+=buf_sz;
+  return _buf;
+}
+
+/*Submits one uncompressed frame to the legacy encoder.
+  The caller's planes are passed through directly when the frame needs no
+   padding and the Cb and Cr strides match; otherwise the affected planes are
+   first copied (and padded) into the context's temporary buffer.
+  _enc:   The encoder context.
+  _ycbcr: The frame to encode; the plane sizes must match the frame size the
+           encoder was configured with.
+  Return: OC_FAULT if _enc or _ycbcr is NULL or the temporary buffer could not
+   be obtained, OC_EINVAL if the plane sizes do not match the encoder's
+   configuration, otherwise whatever theora_encode_YUVin() returns.*/
+int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _ycbcr){
+  CP_INSTANCE     *cpi;
+  theora_state    *te;
+  th_img_plane    *pycbcr;
+  th_ycbcr_buffer  ycbcr;
+  yuv_buffer       yuv;
+  ogg_uint32_t     pic_width;
+  ogg_uint32_t     pic_height;
+  int              hdec;
+  int              vdec;
+  if(_enc==NULL||_ycbcr==NULL)return OC_FAULT;
+  te=&_enc->state;
+  /*theora_encode_YUVin() does not bother to check uv_width and uv_height, and
+     then uses them.
+    This is arguably okay (it will most likely lead to a crash if they're
+     wrong, which will make the developer who passed them fix the problem), but
+     our API promises to return an error code instead.*/
+  cpi=(CP_INSTANCE *)te->internal_encode;
+  /*1 if the chroma planes are halved in that direction, 0 otherwise.*/
+  hdec=!(cpi->info.pixelformat&1);
+  vdec=!(cpi->info.pixelformat&2);
+  if(_ycbcr[0].width!=cpi->info.width||
+   _ycbcr[0].height!=cpi->info.height||
+   _ycbcr[1].width!=_ycbcr[0].width>>hdec||
+   _ycbcr[1].height!=_ycbcr[0].height>>vdec||
+   _ycbcr[2].width!=_ycbcr[1].width||_ycbcr[2].height!=_ycbcr[1].height){
+    return OC_EINVAL;
+  }
+  /*In the legacy structure, frame_width and frame_height hold the picture
+     size.*/
+  pic_width=cpi->info.frame_width;
+  pic_height=cpi->info.frame_height;
+  /*We can only directly use the input buffer if no padding is required (since
+     the new API is documented not to use values outside the picture region)
+     and if the strides for the Cb and Cr planes are the same, since the old
+     API had no way to specify different ones.*/
+  if(_ycbcr[0].width==pic_width&&_ycbcr[0].height==pic_height&&
+   _ycbcr[1].stride==_ycbcr[2].stride){
+    pycbcr=_ycbcr;
+  }
+  else{
+    unsigned char *buf;
+    int            pic_x;
+    int            pic_y;
+    int            pli;
+    pic_x=cpi->info.offset_x;
+    pic_y=cpi->info.offset_y;
+    if(_ycbcr[0].width>pic_width||_ycbcr[0].height>pic_height){
+      /*The full-size buffer for this case is created by th_encode_alloc();
+         refuse to write through it if it is missing.*/
+      if(_enc->buf==NULL)return OC_FAULT;
+      buf=th_encode_copy_pad_plane(ycbcr+0,_enc->buf,_ycbcr+0,
+       pic_x,pic_y,pic_width,pic_height);
+    }
+    else{
+      /*If only the strides differ, we can still avoid copying the luma plane.*/
+      memcpy(ycbcr+0,_ycbcr+0,sizeof(ycbcr[0]));
+      if(_enc->buf==NULL){
+        /*Room for the two chroma planes only, allocated on first use.*/
+        _enc->buf=(unsigned char *)_ogg_malloc(
+         ((size_t)_ycbcr[1].width*_ycbcr[1].height<<1)*sizeof(*_enc->buf));
+        if(_enc->buf==NULL)return OC_FAULT;
+      }
+      buf=_enc->buf;
+    }
+    for(pli=1;pli<3;pli++){
+      int x0;
+      int y0;
+      /*Scale the picture region to chroma resolution, rounding the far edges
+         up so partially covered chroma samples are included.*/
+      x0=pic_x>>hdec;
+      y0=pic_y>>vdec;
+      buf=th_encode_copy_pad_plane(ycbcr+pli,buf,_ycbcr+pli,
+       x0,y0,(pic_x+pic_width+hdec>>hdec)-x0,(pic_y+pic_height+vdec>>vdec)-y0);
+    }
+    pycbcr=ycbcr;
+  }
+  yuv.y_width=pycbcr[0].width;
+  yuv.y_height=pycbcr[0].height;
+  yuv.uv_width=pycbcr[1].width;
+  yuv.uv_height=pycbcr[1].height;
+  yuv.y_stride=pycbcr[0].stride;
+  yuv.y=pycbcr[0].data;
+  /*The old API has a single stride for both chroma planes.*/
+  yuv.uv_stride=pycbcr[1].stride;
+  yuv.u=pycbcr[1].data;
+  yuv.v=pycbcr[2].data;
+  return theora_encode_YUVin(te,&yuv);
+}
+
+/*Retrieves the next encoded packet by delegating to the legacy
+   theora_encode_packetout().
+  _enc:  The encoder context.
+  _last: Passed through to the legacy call unchanged.
+  _op:   Receives the packet.
+  Return: OC_FAULT if _enc is NULL, otherwise the legacy call's result.*/
+int th_encode_packetout(th_enc_ctx *_enc,int _last,ogg_packet *_op){
+  return _enc!=NULL?theora_encode_packetout(&_enc->state,_last,_op):OC_FAULT;
+}
+
+/*Releases an encoder context: the legacy encoder state, the temporary frame
+   buffer, and the context itself.
+  Passing NULL is allowed and does nothing.
+  _enc: The encoder context to free.*/
+void th_encode_free(th_enc_ctx *_enc){
+  if(_enc==NULL)return;
+  theora_clear(&_enc->state);
+  _ogg_free(_enc->buf);
+  _ogg_free(_enc);
+}

Modified: branches/theora-thusnelda/lib/enc/encoder_quant.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encoder_quant.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/enc/encoder_quant.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -21,17 +21,22 @@
 #include "quant_lookup.h"
 
 #define OC_QUANT_MAX        (1024<<2)
-//unsigned OC_DC_QUANT_MIN[2]={4<<2,8<<2};
-//unsigned OC_AC_QUANT_MIN[2]={2<<2,4<<2};
+static const unsigned OC_DC_QUANT_MIN[2]={4<<2,8<<2};
+static const unsigned OC_AC_QUANT_MIN[2]={2<<2,4<<2};
 #define OC_MAXI(_a,_b)      ((_a)<(_b)?(_b):(_a))
 #define OC_MINI(_a,_b)      ((_a)>(_b)?(_b):(_a))
 #define OC_CLAMPI(_a,_b,_c) (OC_MAXI(_a,OC_MINI(_b,_c)))
 
+static int ilog(unsigned _v){ /*Bits needed to store _v; 0 for _v==0.*/
+  int nbits=0;
+  while(_v!=0){nbits++;_v>>=1;}
+  return nbits;
+}
 
 void WriteQTables(CP_INSTANCE *cpi,oggpack_buffer* _opb) {
-  
-  th_quant_info *_qinfo = &cpi->quant_info; 
-  
+
+  th_quant_info *_qinfo = &cpi->quant_info;
+
   const th_quant_ranges *qranges;
   const th_quant_base   *base_mats[2*3*64];
   int                    indices[2][3][64];
@@ -46,22 +51,22 @@
   int                    plj;
   int                    bmi;
   int                    i;
-  
+
   /*Unlike the scale tables, we can't assume the maximum value will be in
      index 0, so search for it here.*/
   i=_qinfo->loop_filter_limits[0];
   for(qi=1;qi<64;qi++)i=OC_MAXI(i,_qinfo->loop_filter_limits[qi]);
-  nbits=oc_ilog(i);
+  nbits=ilog(i);
   oggpackB_write(_opb,nbits,3);
   for(qi=0;qi<64;qi++){
     oggpackB_write(_opb,_qinfo->loop_filter_limits[qi],nbits);
   }
   /* 580 bits for VP3.*/
-  nbits=OC_MAXI(oc_ilog(_qinfo->ac_scale[0]),1);
+  nbits=OC_MAXI(ilog(_qinfo->ac_scale[0]),1);
   oggpackB_write(_opb,nbits-1,4);
   for(qi=0;qi<64;qi++)oggpackB_write(_opb,_qinfo->ac_scale[qi],nbits);
   /* 516 bits for VP3.*/
-  nbits=OC_MAXI(oc_ilog(_qinfo->dc_scale[0]),1);
+  nbits=OC_MAXI(ilog(_qinfo->dc_scale[0]),1);
   oggpackB_write(_opb,nbits-1,4);
   for(qi=0;qi<64;qi++)oggpackB_write(_opb,_qinfo->dc_scale[qi],nbits);
   /*Consolidate any duplicate base matrices.*/
@@ -92,7 +97,7 @@
   /*Now store quant ranges and their associated indices into the base matrix
      list.
      46 bits for VP3 matrices.*/
-  nbits=oc_ilog(nbase_mats-1);
+  nbits=ilog(nbase_mats-1);
   for(i=0;i<6;i++){
     qti=i/3;
     pli=i%3;
@@ -122,7 +127,7 @@
     }
     oggpackB_write(_opb,indices[qti][pli][0],nbits);
     for(qi=qri=0;qi<63;qri++){
-      oggpackB_write(_opb,qranges->sizes[qri]-1,oc_ilog(62-qi));
+      oggpackB_write(_opb,qranges->sizes[qri]-1,ilog(62-qi));
       qi+=qranges->sizes[qri];
       oggpackB_write(_opb,indices[qti][pli][qri+1],nbits);
     }
@@ -140,51 +145,51 @@
     for(pli=0;pli<3;pli++){
       int qi;  /* quality index */
       int qri; /* range iterator */
-      
+
       for(qi=0,qri=0; qri<=qinfo->qi_ranges[qti][pli].nranges; qri++){
-	th_quant_base base;
-	
-	ogg_uint32_t      q;
-	int               qi_start;
-	int               qi_end;
-	int               ci;
-	memcpy(base,qinfo->qi_ranges[qti][pli].base_matrices[qri],
-	       sizeof(base));
-	
-	qi_start=qi;
-	if(qri==qinfo->qi_ranges[qti][pli].nranges)
-	  qi_end=qi+1;
-	else 
-	  qi_end=qi+qinfo->qi_ranges[qti][pli].sizes[qri];
-	
-	/* Iterate over quality indicies in this range */
-	for(;;){
-	  
-	  /*Scale DC the coefficient from the proper table.*/
-	  q=((ogg_uint32_t)qinfo->dc_scale[qi]*base[0]/100)<<2;
-	  q=OC_CLAMPI(OC_DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
-	  cpi->quant_tables[qti][pli][0][qi]=(ogg_uint16_t)q;
-	  cpi->iquant_tables[qti][pli][qi][0]=(ogg_int32_t)(((1<<31))/q+1);
+        th_quant_base base;
 
-	  /*Now scale AC coefficients from the proper table.*/
-	  for(ci=1;ci<64;ci++){
-	    q=((ogg_uint32_t)qinfo->ac_scale[qi]*base[ci]/100)<<2;
-	    q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
-	    cpi->quant_tables[qti][pli][zigzag_index[ci]][qi]=(ogg_uint16_t)q;
-	    cpi->iquant_tables[qti][pli][qi][zigzag_index[ci]]=(ogg_int32_t)(((1<<31))/q+1);
-	  }
-	  
-	  if(++qi>=qi_end)break;
-	  
-	  /*Interpolate the next base matrix.*/
-	  for(ci=0;ci<64;ci++){
-	    base[ci]=(unsigned char)
-	      ((2*((qi_end-qi)*qinfo->qi_ranges[qti][pli].base_matrices[qri][ci]+
-		   (qi-qi_start)*qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci])
-		+qinfo->qi_ranges[qti][pli].sizes[qri])/
-	       (2*qinfo->qi_ranges[qti][pli].sizes[qri]));
-	  }
-	}
+        ogg_uint32_t      q;
+        int               qi_start;
+        int               qi_end;
+        int               ci;
+        memcpy(base,qinfo->qi_ranges[qti][pli].base_matrices[qri],
+               sizeof(base));
+
+        qi_start=qi;
+        if(qri==qinfo->qi_ranges[qti][pli].nranges)
+          qi_end=qi+1;
+        else
+          qi_end=qi+qinfo->qi_ranges[qti][pli].sizes[qri];
+
+        /* Iterate over quality indicies in this range */
+        for(;;){
+
+          /*Scale DC the coefficient from the proper table.*/
+          q=((ogg_uint32_t)qinfo->dc_scale[qi]*base[0]/100)<<2;
+          q=OC_CLAMPI(OC_DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
+          cpi->quant_tables[qti][pli][0][qi]=(ogg_uint16_t)q;
+          cpi->iquant_tables[qti][pli][qi][0]=(ogg_int32_t)(((1<<31))/q+1);
+
+          /*Now scale AC coefficients from the proper table.*/
+          for(ci=1;ci<64;ci++){
+            q=((ogg_uint32_t)qinfo->ac_scale[qi]*base[ci]/100)<<2;
+            q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
+            cpi->quant_tables[qti][pli][zigzag_index[ci]][qi]=(ogg_uint16_t)q;
+            cpi->iquant_tables[qti][pli][qi][zigzag_index[ci]]=(ogg_int32_t)(((1<<31))/q+1);
+          }
+
+          if(++qi>=qi_end)break;
+
+          /*Interpolate the next base matrix.*/
+          for(ci=0;ci<64;ci++){
+            base[ci]=(unsigned char)
+              ((2*((qi_end-qi)*qinfo->qi_ranges[qti][pli].base_matrices[qri][ci]+
+                   (qi-qi_start)*qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci])
+                +qinfo->qi_ranges[qti][pli].sizes[qri])/
+               (2*qinfo->qi_ranges[qti][pli].sizes[qri]));
+          }
+        }
       }
     }
   }

Modified: branches/theora-thusnelda/lib/enc/encoder_toplevel.c
===================================================================
--- branches/theora-thusnelda/lib/enc/encoder_toplevel.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/enc/encoder_toplevel.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -171,6 +171,8 @@
   cpi->readyflag = 1;
   
   cpi->HeadersWritten = 0;
+  /*We overload this flag to track header output.*/
+  cpi->doneflag=-3;
 
   return 0;
 }

Modified: branches/theora-thusnelda/lib/enc/mode.c
===================================================================
--- branches/theora-thusnelda/lib/enc/mode.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/enc/mode.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -641,21 +641,21 @@
   /* collect rho metrics, quantize */
   {
     int i;
-    //quant_tables *qq = ps->qq[mode != CODE_INTRA];
+    quant_tables *qq = ps->qq[mode != CODE_INTRA];
     
     for(i=0;i<64;i++){
       int v = buffer[dezigzag_index[i]];
-      //int pos;
-      //int val = abs(buffer[ii])<<1;
-      //ogg_int16_t *qqq = (*qq)[i];
-      //for(pos=64;pos>0;pos--)
-      //if(val < qqq[pos-1])break;
+      int pos;
+      int val = abs(v)<<1;
+      ogg_int16_t *qqq = (*qq)[i];
+      for(pos=64;pos>0;pos--)
+      if(val < qqq[pos-1])break;
       
       /* rho-domain distribution */
-      //rho_count[pos]++;
+      rho_count[pos]++;
 
-      if((abs(v)<<1)>=dequant[i]){
-	int val = (((iq[i]>>15)*v) + (1<<15) + (((iq[i]&0x7fff)*v)>>15)) >>16;
+      if(val>=dequant[i]){
+	val = (((iq[i]>>15)*v) + (1<<15) + (((iq[i]&0x7fff)*v)>>15)) >>16;
 	data[i] = (val>511?511:(val<-511?-511:val));
 	nonzero=i;
       }else{
@@ -1132,10 +1132,10 @@
     fr_finishsb(cpi,&fr);
   }
 
-  //for(i=1;i<65;i++)
-  //rho_count[i]+=rho_count[i-1];
+  for(i=1;i<65;i++)
+  rho_count[i]+=rho_count[i-1];
 
-  //memcpy(cpi->rho_count,rho_count,sizeof(rho_count));
+  memcpy(cpi->rho_count,rho_count,sizeof(rho_count));
   if(cpi->FrameType != KEY_FRAME){
     
     if(interbits>intrabits) return 1; /* short circuit */

Modified: branches/theora-thusnelda/lib/enc/x86/dct_decode_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/dct_decode_mmx.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/enc/x86/dct_decode_mmx.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -17,7 +17,7 @@
 
 #include <stdlib.h>
 
-#include "codec_internal.h"
+#include "../codec_internal.h"
 
 #if defined(USE_ASM)
 

Modified: branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/enc/x86/dsp_mmx.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -17,8 +17,8 @@
 
 #include <stdlib.h>
 
-#include "codec_internal.h"
-#include "dsp.h"
+#include "../codec_internal.h"
+#include "../dsp.h"
 
 #if defined(USE_ASM)
 
@@ -29,12 +29,12 @@
 #define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
 
 static void sub8x8__mmx (const unsigned char *FiltPtr, const unsigned char *ReconPtr,
-			 ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine) 
+                         ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine)
 {
   __asm__ __volatile__ (
     "  .balign 16                   \n\t"
 
-    "  pxor        %%mm7, %%mm7     \n\t" 
+    "  pxor        %%mm7, %%mm7     \n\t"
 
     ".rept 8                        \n\t"
     "  movq        (%0), %%mm0      \n\t" /* mm0 = FiltPtr */
@@ -67,13 +67,13 @@
 }
 
 static void sub8x8_128__mmx (const unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
-			     ogg_uint32_t PixelsPerLine) 
+                             ogg_uint32_t PixelsPerLine)
 {
 
   __asm__ __volatile__ (
     "  .balign 16                   \n\t"
 
-    "  pxor        %%mm7, %%mm7     \n\t" 
+    "  pxor        %%mm7, %%mm7     \n\t"
     "  movq        %[V128], %%mm1   \n\t"
 
     ".rept 8                        \n\t"

Modified: branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/enc/x86/dsp_mmxext.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -17,33 +17,33 @@
 
 #include <stdlib.h>
 
-#include "codec_internal.h"
-#include "dsp.h"
+#include "../codec_internal.h"
+#include "../dsp.h"
 
 #if defined(USE_ASM)
 
 static ogg_uint32_t sad8x8__mmxext (const unsigned char *ptr1, const unsigned char *ptr2, 
-				    ogg_uint32_t stride)
+                                    ogg_uint32_t stride)
 {
   ogg_uint32_t  DiffVal;
 
   __asm__ __volatile__ (
     "  .balign 16                   \n\t"
-    "  pxor %%mm7, %%mm7            \n\t" 	/* mm7 contains the result */
+    "  pxor %%mm7, %%mm7            \n\t"       /* mm7 contains the result */
 
     ".rept 7                        \n\t"
-    "  movq (%1), %%mm0             \n\t"	/* take 8 bytes */
+    "  movq (%1), %%mm0             \n\t"       /* take 8 bytes */
     "  movq (%2), %%mm1             \n\t"
     "  psadbw %%mm1, %%mm0          \n\t"
-    "  add %3, %1                   \n\t"	/* Inc pointer into the new data */
-    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
-    "  add %3, %2                   \n\t"	/* Inc pointer into ref data */
+    "  add %3, %1                   \n\t"       /* Inc pointer into the new data */
+    "  paddw %%mm0, %%mm7           \n\t"       /* accumulate difference... */
+    "  add %3, %2                   \n\t"       /* Inc pointer into ref data */
     ".endr                          \n\t"
 
-    "  movq (%1), %%mm0             \n\t"	/* take 8 bytes */
+    "  movq (%1), %%mm0             \n\t"       /* take 8 bytes */
     "  movq (%2), %%mm1             \n\t"
     "  psadbw %%mm1, %%mm0          \n\t"
-    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
+    "  paddw %%mm0, %%mm7           \n\t"       /* accumulate difference... */
     "  movd %%mm7, %0               \n\t"
 
      : "=r" (DiffVal),
@@ -57,21 +57,21 @@
 }
 
 static ogg_uint32_t sad8x8_thres__mmxext (const unsigned char *ptr1, const unsigned char *ptr2, 
-					  ogg_uint32_t stride, ogg_uint32_t thres)
+                                          ogg_uint32_t stride, ogg_uint32_t thres)
 {
   ogg_uint32_t  DiffVal;
 
   __asm__ __volatile__ (
     "  .balign 16                   \n\t"
-    "  pxor %%mm7, %%mm7            \n\t" 	/* mm7 contains the result */
+    "  pxor %%mm7, %%mm7            \n\t"       /* mm7 contains the result */
 
     ".rept 8                        \n\t"
-    "  movq (%1), %%mm0             \n\t"	/* take 8 bytes */
+    "  movq (%1), %%mm0             \n\t"       /* take 8 bytes */
     "  movq (%2), %%mm1             \n\t"
     "  psadbw %%mm1, %%mm0          \n\t"
-    "  add %3, %1                   \n\t"	/* Inc pointer into the new data */
-    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
-    "  add %3, %2                   \n\t"	/* Inc pointer into ref data */
+    "  add %3, %1                   \n\t"       /* Inc pointer into the new data */
+    "  paddw %%mm0, %%mm7           \n\t"       /* accumulate difference... */
+    "  add %3, %2                   \n\t"       /* Inc pointer into ref data */
     ".endr                          \n\t"
 
     "  movd %%mm7, %0               \n\t"
@@ -94,32 +94,32 @@
 
   __asm__ __volatile__ (
     "  .balign 16                   \n\t"
-    "  pxor %%mm7, %%mm7            \n\t" 	/* mm7 contains the result */
+    "  pxor %%mm7, %%mm7            \n\t"       /* mm7 contains the result */
     ".rept 8                        \n\t"
-    "  movq (%1), %%mm0             \n\t"	/* take 8 bytes */
+    "  movq (%1), %%mm0             \n\t"       /* take 8 bytes */
     "  movq (%2), %%mm1             \n\t"
     "  movq (%3), %%mm2             \n\t"
     "  pavgb %%mm2, %%mm1           \n\t"
     "  psadbw %%mm1, %%mm0          \n\t"
 
-    "  add %4, %1                   \n\t"	/* Inc pointer into the new data */
-    "  paddw %%mm0, %%mm7           \n\t"	/* accumulate difference... */
-    "  add %4, %2                   \n\t"	/* Inc pointer into ref data */
-    "  add %4, %3                   \n\t"	/* Inc pointer into ref data */
+    "  add %4, %1                   \n\t"       /* Inc pointer into the new data */
+    "  paddw %%mm0, %%mm7           \n\t"       /* accumulate difference... */
+    "  add %4, %2                   \n\t"       /* Inc pointer into ref data */
+    "  add %4, %3                   \n\t"       /* Inc pointer into ref data */
     ".endr                          \n\t"
 
     "  movd %%mm7, %0               \n\t"
      : "=m" (DiffVal),
-       "+r" (SrcData), 
-       "+r" (RefDataPtr1), 
-       "+r" (RefDataPtr2) 
+       "+r" (SrcData),
+       "+r" (RefDataPtr1),
+       "+r" (RefDataPtr2)
      : "r" ((unsigned long)Stride)
      : "memory"
   );
 
   return DiffVal;
 }
-		
+
 void dsp_mmxext_init(DspFunctions *funcs)
 {
   funcs->sad8x8 = sad8x8__mmxext;

Modified: branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/enc/x86/fdct_mmx.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -14,8 +14,8 @@
 /* $Id: fdct_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $ */
 
 #include "theora/theora.h"
-#include "codec_internal.h"
-#include "dsp.h"
+#include "../codec_internal.h"
+#include "../dsp.h"
 
 #if defined(USE_ASM)
 
@@ -55,7 +55,7 @@
   "  psubsw      %%mm3, %%mm1       \n\t" /* mm1 = is12 - is56 */             \
   "  movq        %%mm0," #temp "    \n\t" /* Save is07 - is34 to free mm0; */ \
   "  paddsw      %%mm3, %%mm3       \n\t"                                     \
-  "  paddsw      %%mm1, %%mm3       \n\t" /* mm3 = is12 + 1s56	= is1256 */   \
+  "  paddsw      %%mm1, %%mm3       \n\t" /* mm3 = is12 + 1s56  = is1256 */   \
                                                                               \
   "  psubsw    " #ip6 ", %%mm7      \n\t" /* mm7 = ip5 - ip6 = id56 */        \
   /* ------------------------------------------------------------------- */   \
@@ -81,7 +81,7 @@
   "  pmulhw      %[xC4S4], %%mm3    \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */ \
                                                                               \
   "  psrlw       $15, %%mm2         \n\t"                                     \
-  "  paddw       %%mm0, %%mm3       \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 )	 */ \
+  "  paddw       %%mm0, %%mm3       \n\t" /* mm3 = xC4S4 * ( is0734 +is1256 )    */ \
   "  paddw       %%mm2, %%mm3       \n\t" /* Truncate mm3, now it is op[0] */ \
                                                                               \
   "  movq        %%mm3," #ip0 "     \n\t"                                     \
@@ -132,16 +132,16 @@
   "  movq        %%mm1, %%mm3       \n\t"                                     \
                                                                               \
   "  pmulhw      %%mm0, %%mm1       \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */ \
-  "  psrlw       $15, %%mm2         \n\t"				      \
+  "  psrlw       $15, %%mm2         \n\t"                                     \
                                                                               \
   "  paddw       %%mm3, %%mm1       \n\t" /* mm0 = xC4S4 * ( is12 - is56 ) */ \
   "  paddw       %%mm2, %%mm1       \n\t" /* Truncate mm1, now it is icommon_product1 */ \
                                                                               \
   "  movq        %%mm7, %%mm2       \n\t"                                     \
-  "  movq        %%mm7, %%mm3       \n\t"			              \
+  "  movq        %%mm7, %%mm3       \n\t"                                     \
                                                                               \
   "  pmulhw      %%mm0, %%mm7       \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */ \
-  "  psrlw       $15, %%mm2         \n\t"			              \
+  "  psrlw       $15, %%mm2         \n\t"                                     \
                                                                               \
   "  paddw       %%mm3, %%mm7       \n\t" /* mm7 = xC4S4 * ( id12 + id56 ) */ \
   "  paddw       %%mm2, %%mm7       \n\t" /* Truncate mm7, now it is icommon_product2 */ \
@@ -230,10 +230,10 @@
   "  paddw       %%mm6, %%mm7       \n\t" /* mm7 = xC3S5 * irot_input_y */    \
                                                                               \
   "  paddw       %%mm7, %%mm3       \n\t" /* ip5 */                           \
-  "  movq        %%mm3," #ip5 "     \n\t" 
+  "  movq        %%mm3," #ip5 "     \n\t"
 
 #define Transpose_mmx(ip0,ip1,ip2,ip3,ip4,ip5,ip6,ip7,                  \
-		      op0,op1,op2,op3,op4,op5,op6,op7)                  \
+                      op0,op1,op2,op3,op4,op5,op6,op7)                  \
   "  movq      " #ip0 ", %%mm0      \n\t" /* mm0 = a0 a1 a2 a3 */       \
   "  movq      " #ip4 ", %%mm4      \n\t" /* mm4 = e4 e5 e6 e7 */       \
   "  movq      " #ip1 ", %%mm1      \n\t" /* mm1 = b0 b1 b2 b3 */       \
@@ -247,9 +247,9 @@
   "  movq        %%mm4, %%mm1       \n\t" /* mm1 = e3 e2 e1 e0 */       \
   "  punpcklwd   %%mm5, %%mm4       \n\t" /* mm4 = f1 e1 f0 e0 */       \
   "  movq        %%mm0," #op0 "     \n\t" /* save a3 a2 a1 a0  */       \
-  "  punpckhwd	 %%mm5, %%mm1       \n\t" /* mm1 = f3 e3 f2 e2 */       \
+  "  punpckhwd   %%mm5, %%mm1       \n\t" /* mm1 = f3 e3 f2 e2 */       \
   "  movq        %%mm6, %%mm0       \n\t" /* mm0 = g3 g2 g1 g0 */       \
-  "  punpcklwd	 %%mm7, %%mm6       \n\t" /* mm6 = h1 g1 h0 g0 */       \
+  "  punpcklwd   %%mm7, %%mm6       \n\t" /* mm6 = h1 g1 h0 g0 */       \
   "  movq        %%mm4, %%mm5       \n\t" /* mm5 = f1 e1 f0 e0 */       \
   "  punpckldq   %%mm6, %%mm4       \n\t" /* mm4 = h0 g0 f0 e0 = MM4 */ \
   "  punpckhdq   %%mm6, %%mm5       \n\t" /* mm5 = h1 g1 f1 e1 = MM5 */ \
@@ -295,23 +295,23 @@
      * we will transpose the block of data to two 4x8 blocks???
      */
     Transpose_mmx (  (%0), 16(%0), 32(%0), 48(%0),  8(%0), 24(%0), 40(%0), 56(%0),
-		     (%1), 16(%1), 32(%1), 48(%1),  8(%1), 24(%1), 40(%1), 56(%1))
+                     (%1), 16(%1), 32(%1), 48(%1),  8(%1), 24(%1), 40(%1), 56(%1))
     Fdct_mmx      (  (%1), 16(%1), 32(%1), 48(%1),  8(%1), 24(%1), 40(%1), 56(%1), (%2))
 
     Transpose_mmx (64(%0), 80(%0), 96(%0),112(%0), 72(%0), 88(%0),104(%0),120(%0),
-		   64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
+                   64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1))
     Fdct_mmx      (64(%1), 80(%1), 96(%1),112(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
 
     Transpose_mmx ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1),
-		    0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
+                    0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1))
     Fdct_mmx      ( 0(%1), 16(%1), 32(%1), 48(%1), 64(%1), 80(%1), 96(%1),112(%1), (%2))
 
     Transpose_mmx ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1),
-		    8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
+                    8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1))
     Fdct_mmx      ( 8(%1), 24(%1), 40(%1), 56(%1), 72(%1), 88(%1),104(%1),120(%1), (%2))
 
     "  emms                         \n\t"
-    
+
     : "+r" (InputData),
       "+r" (OutputData)
     : "r" (temp),

Modified: branches/theora-thusnelda/lib/enc/x86/idct_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/idct_mmx.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/enc/x86/idct_mmx.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -15,7 +15,7 @@
 
  ********************************************************************/
 
-#include "codec_internal.h"
+#include "../codec_internal.h"
 
 #if defined(USE_ASM)
 
@@ -56,82 +56,82 @@
 
 #define BeginIDCT "#BeginIDCT\n"    \
                                     \
-    "   movq    "I(3)",%%mm2\n"   \
+    "   movq    "I(3)",%%mm2\n"     \
                                     \
-    "   movq    "C(3)",%%mm6\n"   \
-    "   movq    %%mm2,%%mm4\n"     \
-    "   movq    "J(5)",%%mm7\n"   \
-    "   pmulhw  %%mm6,%%mm4\n"     \
-    "   movq    "C(5)",%%mm1\n"   \
-    "   pmulhw  %%mm7,%%mm6\n"     \
-    "   movq    %%mm1,%%mm5\n"     \
-    "   pmulhw  %%mm2,%%mm1\n"     \
-    "   movq    "I(1)",%%mm3\n"   \
-    "   pmulhw  %%mm7,%%mm5\n"     \
-    "   movq    "C(1)",%%mm0\n"   \
-    "   paddw   %%mm2,%%mm4\n"     \
-    "   paddw   %%mm7,%%mm6\n"     \
-    "   paddw   %%mm1,%%mm2\n"     \
-    "   movq    "J(7)",%%mm1\n"   \
-    "   paddw   %%mm5,%%mm7\n"     \
-    "   movq    %%mm0,%%mm5\n"     \
-    "   pmulhw  %%mm3,%%mm0\n"     \
-    "   paddsw  %%mm7,%%mm4\n"     \
-    "   pmulhw  %%mm1,%%mm5\n"     \
-    "   movq    "C(7)",%%mm7\n"   \
-    "   psubsw  %%mm2,%%mm6\n"     \
-    "   paddw   %%mm3,%%mm0\n"     \
-    "   pmulhw  %%mm7,%%mm3\n"     \
-    "   movq    "I(2)",%%mm2\n"   \
-    "   pmulhw  %%mm1,%%mm7\n"     \
-    "   paddw   %%mm1,%%mm5\n"     \
-    "   movq    %%mm2,%%mm1\n"     \
-    "   pmulhw  "C(2)",%%mm2\n"   \
-    "   psubsw  %%mm5,%%mm3\n"     \
-    "   movq    "J(6)",%%mm5\n"   \
-    "   paddsw  %%mm7,%%mm0\n"     \
-    "   movq    %%mm5,%%mm7\n"     \
-    "   psubsw  %%mm4,%%mm0\n"     \
-    "   pmulhw  "C(2)",%%mm5\n"   \
-    "   paddw   %%mm1,%%mm2\n"     \
-    "   pmulhw  "C(6)",%%mm1\n"   \
-    "   paddsw  %%mm4,%%mm4\n"     \
-    "   paddsw  %%mm0,%%mm4\n"     \
-    "   psubsw  %%mm6,%%mm3\n"     \
-    "   paddw   %%mm7,%%mm5\n"     \
-    "   paddsw  %%mm6,%%mm6\n"     \
-    "   pmulhw  "C(6)",%%mm7\n"   \
-    "   paddsw  %%mm3,%%mm6\n"     \
-    "   movq    %%mm4,"I(1)"\n"   \
-    "   psubsw  %%mm5,%%mm1\n"     \
-    "   movq    "C(4)",%%mm4\n"   \
-    "   movq    %%mm3,%%mm5\n"     \
-    "   pmulhw  %%mm4,%%mm3\n"     \
-    "   paddsw  %%mm2,%%mm7\n"     \
-    "   movq    %%mm6,"I(2)"\n"   \
-    "   movq    %%mm0,%%mm2\n"     \
-    "   movq    "I(0)",%%mm6\n"   \
-    "   pmulhw  %%mm4,%%mm0\n"     \
-    "   paddw   %%mm3,%%mm5\n"     \
+    "   movq    "C(3)",%%mm6\n"     \
+    "   movq    %%mm2,%%mm4\n"      \
+    "   movq    "J(5)",%%mm7\n"     \
+    "   pmulhw  %%mm6,%%mm4\n"      \
+    "   movq    "C(5)",%%mm1\n"     \
+    "   pmulhw  %%mm7,%%mm6\n"      \
+    "   movq    %%mm1,%%mm5\n"      \
+    "   pmulhw  %%mm2,%%mm1\n"      \
+    "   movq    "I(1)",%%mm3\n"     \
+    "   pmulhw  %%mm7,%%mm5\n"      \
+    "   movq    "C(1)",%%mm0\n"     \
+    "   paddw   %%mm2,%%mm4\n"      \
+    "   paddw   %%mm7,%%mm6\n"      \
+    "   paddw   %%mm1,%%mm2\n"      \
+    "   movq    "J(7)",%%mm1\n"     \
+    "   paddw   %%mm5,%%mm7\n"      \
+    "   movq    %%mm0,%%mm5\n"      \
+    "   pmulhw  %%mm3,%%mm0\n"      \
+    "   paddsw  %%mm7,%%mm4\n"      \
+    "   pmulhw  %%mm1,%%mm5\n"      \
+    "   movq    "C(7)",%%mm7\n"     \
+    "   psubsw  %%mm2,%%mm6\n"      \
+    "   paddw   %%mm3,%%mm0\n"      \
+    "   pmulhw  %%mm7,%%mm3\n"      \
+    "   movq    "I(2)",%%mm2\n"     \
+    "   pmulhw  %%mm1,%%mm7\n"      \
+    "   paddw   %%mm1,%%mm5\n"      \
+    "   movq    %%mm2,%%mm1\n"      \
+    "   pmulhw  "C(2)",%%mm2\n"     \
+    "   psubsw  %%mm5,%%mm3\n"      \
+    "   movq    "J(6)",%%mm5\n"     \
+    "   paddsw  %%mm7,%%mm0\n"      \
+    "   movq    %%mm5,%%mm7\n"      \
+    "   psubsw  %%mm4,%%mm0\n"      \
+    "   pmulhw  "C(2)",%%mm5\n"     \
+    "   paddw   %%mm1,%%mm2\n"      \
+    "   pmulhw  "C(6)",%%mm1\n"     \
+    "   paddsw  %%mm4,%%mm4\n"      \
+    "   paddsw  %%mm0,%%mm4\n"      \
+    "   psubsw  %%mm6,%%mm3\n"      \
+    "   paddw   %%mm7,%%mm5\n"      \
+    "   paddsw  %%mm6,%%mm6\n"      \
+    "   pmulhw  "C(6)",%%mm7\n"     \
+    "   paddsw  %%mm3,%%mm6\n"      \
+    "   movq    %%mm4,"I(1)"\n"     \
+    "   psubsw  %%mm5,%%mm1\n"      \
+    "   movq    "C(4)",%%mm4\n"     \
+    "   movq    %%mm3,%%mm5\n"      \
+    "   pmulhw  %%mm4,%%mm3\n"      \
+    "   paddsw  %%mm2,%%mm7\n"      \
+    "   movq    %%mm6,"I(2)"\n"     \
+    "   movq    %%mm0,%%mm2\n"      \
+    "   movq    "I(0)",%%mm6\n"     \
+    "   pmulhw  %%mm4,%%mm0\n"      \
+    "   paddw   %%mm3,%%mm5\n"      \
     "\n"                            \
-    "   movq    "J(4)",%%mm3\n"   \
-    "   psubsw  %%mm1,%%mm5\n"     \
-    "   paddw   %%mm0,%%mm2\n"     \
-    "   psubsw  %%mm3,%%mm6\n"     \
-    "   movq    %%mm6,%%mm0\n"     \
-    "   pmulhw  %%mm4,%%mm6\n"     \
-    "   paddsw  %%mm3,%%mm3\n"     \
-    "   paddsw  %%mm1,%%mm1\n"     \
-    "   paddsw  %%mm0,%%mm3\n"     \
-    "   paddsw  %%mm5,%%mm1\n"     \
-    "   pmulhw  %%mm3,%%mm4\n"     \
-    "   paddsw  %%mm0,%%mm6\n"     \
-    "   psubsw  %%mm2,%%mm6\n"     \
-    "   paddsw  %%mm2,%%mm2\n"     \
-    "   movq    "I(1)",%%mm0\n"   \
-    "   paddsw  %%mm6,%%mm2\n"     \
-    "   paddw   %%mm3,%%mm4\n"     \
-    "   psubsw  %%mm1,%%mm2\n"     \
+    "   movq    "J(4)",%%mm3\n"     \
+    "   psubsw  %%mm1,%%mm5\n"      \
+    "   paddw   %%mm0,%%mm2\n"      \
+    "   psubsw  %%mm3,%%mm6\n"      \
+    "   movq    %%mm6,%%mm0\n"      \
+    "   pmulhw  %%mm4,%%mm6\n"      \
+    "   paddsw  %%mm3,%%mm3\n"      \
+    "   paddsw  %%mm1,%%mm1\n"      \
+    "   paddsw  %%mm0,%%mm3\n"      \
+    "   paddsw  %%mm5,%%mm1\n"      \
+    "   pmulhw  %%mm3,%%mm4\n"      \
+    "   paddsw  %%mm0,%%mm6\n"      \
+    "   psubsw  %%mm2,%%mm6\n"      \
+    "   paddsw  %%mm2,%%mm2\n"      \
+    "   movq    "I(1)",%%mm0\n"     \
+    "   paddsw  %%mm6,%%mm2\n"      \
+    "   paddw   %%mm3,%%mm4\n"      \
+    "   psubsw  %%mm1,%%mm2\n"      \
     "#end BeginIDCT\n"
 // end BeginIDCT macro (38 cycles).
 
@@ -160,10 +160,10 @@
 
 // RowIDCT gets ready to transpose.
 
-#define RowIDCT "#RowIDCT\n"                             \
-    BeginIDCT                                           \
-    "\n"                                                \
-    "   movq    "I(2)",%%mm3\n"  /* r3 = D. */           \
+#define RowIDCT "#RowIDCT\n"                              \
+    BeginIDCT                                             \
+    "\n"                                                  \
+    "   movq    "I(2)",%%mm3\n"   /* r3 = D. */           \
     "   psubsw  %%mm7,%%mm4\n"    /* r4 = E. = E - G */   \
     "   paddsw  %%mm1,%%mm1\n"    /* r1 = H. + H. */      \
     "   paddsw  %%mm7,%%mm7\n"    /* r7 = G + G */        \
@@ -177,9 +177,9 @@
     "   paddsw  %%mm6,%%mm5\n"    /* r5 = R5 = F. + B.. */\
     "   psubsw  %%mm0,%%mm7\n"    /* r7 = R7 = G. - C. */ \
     "   paddsw  %%mm0,%%mm0\n"                            \
-    "   movq    %%mm1,"I(1)"\n"  /* save R1 */           \
+    "   movq    %%mm1,"I(1)"\n"   /* save R1 */           \
     "   paddsw  %%mm7,%%mm0\n"    /* r0 = R0 = G. + C. */ \
-    "#end RowIDCT"									
+    "#end RowIDCT"
 
 // end RowIDCT macro (8 + 38 = 46 cycles)
 
@@ -204,45 +204,45 @@
  */
 // Column IDCT normalizes and stores final results.
 
-#define ColumnIDCT "#ColumnIDCT\n"                          \
-    BeginIDCT                                               \
-    "\n"                                                    \
-    "   paddsw  "Eight",%%mm2\n"                             \
+#define ColumnIDCT "#ColumnIDCT\n"                            \
+    BeginIDCT                                                 \
+    "\n"                                                      \
+    "   paddsw  "Eight",%%mm2\n"                              \
     "   paddsw  %%mm1,%%mm1\n"        /* r1 = H. + H. */      \
     "   paddsw  %%mm2,%%mm1\n"        /* r1 = R1 = A.. + H. */\
-    "   psraw   ""$4"",%%mm2\n"      /* r2 = NR2 */          \
+    "   psraw   ""$4"",%%mm2\n"       /* r2 = NR2 */          \
     "   psubsw  %%mm7,%%mm4\n"        /* r4 = E. = E - G */   \
-    "   psraw   ""$4"",%%mm1\n"      /* r1 = NR1 */          \
-    "   movq    "I(2)",%%mm3\n"  /* r3 = D. */               \
+    "   psraw   ""$4"",%%mm1\n"       /* r1 = NR1 */          \
+    "   movq    "I(2)",%%mm3\n"   /* r3 = D. */               \
     "   paddsw  %%mm7,%%mm7\n"        /* r7 = G + G */        \
-    "   movq    %%mm2,"I(2)"\n"  /* store NR2 at I2 */       \
+    "   movq    %%mm2,"I(2)"\n"   /* store NR2 at I2 */       \
     "   paddsw  %%mm4,%%mm7\n"        /* r7 = G. = E + G */   \
-    "   movq    %%mm1,"I(1)"\n"  /* store NR1 at I1 */       \
+    "   movq    %%mm1,"I(1)"\n"   /* store NR1 at I1 */       \
     "   psubsw  %%mm3,%%mm4\n"        /* r4 = R4 = E. - D. */ \
-    "   paddsw  "Eight",%%mm4\n"                             \
+    "   paddsw  "Eight",%%mm4\n"                              \
     "   paddsw  %%mm3,%%mm3\n"        /* r3 = D. + D. */      \
     "   paddsw  %%mm4,%%mm3\n"        /* r3 = R3 = E. + D. */ \
-    "   psraw   ""$4"",%%mm4\n"      /* r4 = NR4 */          \
+    "   psraw   ""$4"",%%mm4\n"       /* r4 = NR4 */          \
     "   psubsw  %%mm5,%%mm6\n"        /* r6 = R6 = F. - B.. */\
-    "   psraw   ""$4"",%%mm3\n"      /* r3 = NR3 */          \
-    "   paddsw  "Eight",%%mm6\n"                             \
+    "   psraw   ""$4"",%%mm3\n"       /* r3 = NR3 */          \
+    "   paddsw  "Eight",%%mm6\n"                              \
     "   paddsw  %%mm5,%%mm5\n"        /* r5 = B.. + B.. */    \
     "   paddsw  %%mm6,%%mm5\n"        /* r5 = R5 = F. + B.. */\
-    "   psraw   ""$4"",%%mm6\n"      /* r6 = NR6 */          \
-    "   movq    %%mm4,"J(4)"\n"  /* store NR4 at J4 */       \
-    "   psraw   ""$4"",%%mm5\n"      /* r5 = NR5 */          \
-    "   movq    %%mm3,"I(3)"\n"  /* store NR3 at I3 */       \
+    "   psraw   ""$4"",%%mm6\n"       /* r6 = NR6 */          \
+    "   movq    %%mm4,"J(4)"\n"   /* store NR4 at J4 */       \
+    "   psraw   ""$4"",%%mm5\n"       /* r5 = NR5 */          \
+    "   movq    %%mm3,"I(3)"\n"   /* store NR3 at I3 */       \
     "   psubsw  %%mm0,%%mm7\n"        /* r7 = R7 = G. - C. */ \
-    "   paddsw  "Eight",%%mm7\n"                             \
+    "   paddsw  "Eight",%%mm7\n"                              \
     "   paddsw  %%mm0,%%mm0\n"        /* r0 = C. + C. */      \
     "   paddsw  %%mm7,%%mm0\n"        /* r0 = R0 = G. + C. */ \
-    "   psraw   ""$4"",%%mm7\n"      /* r7 = NR7 */          \
-    "   movq    %%mm6,"J(6)"\n"  /* store NR6 at J6 */       \
-    "   psraw   ""$4"",%%mm0\n"      /* r0 = NR0 */          \
-    "   movq    %%mm5,"J(5)"\n"  /* store NR5 at J5 */       \
-    "   movq    %%mm7,"J(7)"\n"  /* store NR7 at J7 */       \
-    "   movq    %%mm0,"I(0)"\n"  /* store NR0 at I0 */       \
-    "#end ColumnIDCT\n"					   
+    "   psraw   ""$4"",%%mm7\n"       /* r7 = NR7 */          \
+    "   movq    %%mm6,"J(6)"\n"   /* store NR6 at J6 */       \
+    "   psraw   ""$4"",%%mm0\n"       /* r0 = NR0 */          \
+    "   movq    %%mm5,"J(5)"\n"   /* store NR5 at J5 */       \
+    "   movq    %%mm7,"J(7)"\n"   /* store NR7 at J7 */       \
+    "   movq    %%mm0,"I(0)"\n"   /* store NR0 at I0 */       \
+    "#end ColumnIDCT\n"
 
 // end ColumnIDCT macro (38 + 19 = 57 cycles)
 
@@ -297,10 +297,10 @@
    Since r1 is free at entry, we calculate the Js first. */
 
 
-#define Transpose "#Transpose\n"           \
+#define Transpose "#Transpose\n"              \
     "   movq        %%mm4,%%mm1\n"            \
     "   punpcklwd   %%mm5,%%mm4\n"            \
-    "   movq        %%mm0,"I(0)"\n"          \
+    "   movq        %%mm0,"I(0)"\n"           \
     "   punpckhwd   %%mm5,%%mm1\n"            \
     "   movq        %%mm6,%%mm0\n"            \
     "   punpcklwd   %%mm7,%%mm6\n"            \
@@ -308,17 +308,17 @@
     "   punpckldq   %%mm6,%%mm4\n"            \
     "   punpckhdq   %%mm6,%%mm5\n"            \
     "   movq        %%mm1,%%mm6\n"            \
-    "   movq        %%mm4,"J(4)"\n"          \
+    "   movq        %%mm4,"J(4)"\n"           \
     "   punpckhwd   %%mm7,%%mm0\n"            \
-    "   movq        %%mm5,"J(5)"\n"          \
+    "   movq        %%mm5,"J(5)"\n"           \
     "   punpckhdq   %%mm0,%%mm6\n"            \
-    "   movq        "I(0)",%%mm4\n"          \
+    "   movq        "I(0)",%%mm4\n"           \
     "   punpckldq   %%mm0,%%mm1\n"            \
-    "   movq        "I(1)",%%mm5\n"          \
+    "   movq        "I(1)",%%mm5\n"           \
     "   movq        %%mm4,%%mm0\n"            \
-    "   movq        %%mm6,"J(7)"\n"          \
+    "   movq        %%mm6,"J(7)"\n"           \
     "   punpcklwd   %%mm5,%%mm0\n"            \
-    "   movq        %%mm1,"J(6)"\n"          \
+    "   movq        %%mm1,"J(6)"\n"           \
     "   punpckhwd   %%mm5,%%mm4\n"            \
     "   movq        %%mm2,%%mm5\n"            \
     "   punpcklwd   %%mm3,%%mm2\n"            \
@@ -326,16 +326,16 @@
     "   punpckldq   %%mm2,%%mm0\n"            \
     "   punpckhdq   %%mm2,%%mm1\n"            \
     "   movq        %%mm4,%%mm2\n"            \
-    "   movq        %%mm0,"I(0)"\n"          \
+    "   movq        %%mm0,"I(0)"\n"           \
     "   punpckhwd   %%mm3,%%mm5\n"            \
-    "   movq        %%mm1,"I(1)"\n"          \
+    "   movq        %%mm1,"I(1)"\n"           \
     "   punpckhdq   %%mm5,%%mm4\n"            \
     "   punpckldq   %%mm5,%%mm2\n"            \
-                                            \
-    "   movq        %%mm4,"I(3)"\n"          \
-                                            \
-    "   movq        %%mm2,"I(2)"\n"          \
-    "#end Transpose\n"			    
+                                              \
+    "   movq        %%mm4,"I(3)"\n"           \
+                                              \
+    "   movq        %%mm2,"I(2)"\n"           \
+    "#end Transpose\n"
 // end Transpose macro (19 cycles).
 
 /**************************************************************************************
@@ -357,8 +357,8 @@
  ***************************************************************************************
  */
 void IDctSlow__mmx(const ogg_int16_t *in,
-		   const ogg_int16_t *q,
-		   ogg_int16_t *out ) {
+                   const ogg_int16_t *q,
+                   ogg_int16_t *out ) {
 
 #   define MID(M,I)     MtoSTR(M+(I)*8)"(%[c])"
 #   define M(I)         MID( MaskOffset , I )
@@ -372,233 +372,232 @@
 
 
     __asm__ __volatile__ (
-    "# dequantize, de-zigzag\n"			  
+    "# dequantize, de-zigzag\n"
     "movq   (%[i]), %%mm0\n"
     "pmullw (%[q]), %%mm0\n"     /* r0 = 03 02 01 00 */
     "movq   16(%[i]), %%mm1\n"
     "pmullw 16(%[q]), %%mm1\n"   /* r1 = 13 12 11 10 */
     "movq   "M(0)", %%mm2\n"     /* r2 = __ __ __ FF */
-    "movq   %%mm0, %%mm3\n"       /* r3 = 03 02 01 00 */
+    "movq   %%mm0, %%mm3\n"      /* r3 = 03 02 01 00 */
     "movq   8(%[i]), %%mm4\n"
     "psrlq  $16, %%mm0\n"        /* r0 = __ 03 02 01 */
     "pmullw 8(%[q]), %%mm4\n"    /* r4 = 07 06 05 04 */
-    "pand   %%mm2, %%mm3\n"       /* r3 = __ __ __ 00 */
-    "movq   %%mm0, %%mm5\n"       /* r5 = __ 03 02 01 */
-    "movq   %%mm1, %%mm6\n"       /* r6 = 13 12 11 10 */
-    "pand   %%mm2, %%mm5\n"       /* r5 = __ __ __ 01 */
+    "pand   %%mm2, %%mm3\n"      /* r3 = __ __ __ 00 */
+    "movq   %%mm0, %%mm5\n"      /* r5 = __ 03 02 01 */
+    "movq   %%mm1, %%mm6\n"      /* r6 = 13 12 11 10 */
+    "pand   %%mm2, %%mm5\n"      /* r5 = __ __ __ 01 */
     "psllq  $32, %%mm6\n"        /* r6 = 11 10 __ __ */
     "movq   "M(3)", %%mm7\n"     /* r7 = FF __ __ __ */
-    "pxor   %%mm5, %%mm0\n"       /* r0 = __ 03 02 __ */
-    "pand   %%mm6, %%mm7\n"       /* r7 = 11 __ __ __ */
-    "por    %%mm3, %%mm0\n"       /* r0 = __ 03 02 00 */
-    "pxor   %%mm7, %%mm6\n"       /* r6 = __ 10 __ __ */
-    "por    %%mm7, %%mm0\n"       /* r0 = 11 03 02 00 = R0 */
+    "pxor   %%mm5, %%mm0\n"      /* r0 = __ 03 02 __ */
+    "pand   %%mm6, %%mm7\n"      /* r7 = 11 __ __ __ */
+    "por    %%mm3, %%mm0\n"      /* r0 = __ 03 02 00 */
+    "pxor   %%mm7, %%mm6\n"      /* r6 = __ 10 __ __ */
+    "por    %%mm7, %%mm0\n"      /* r0 = 11 03 02 00 = R0 */
     "movq   "M(3)", %%mm7\n"     /* r7 = FF __ __ __ */
-    "movq   %%mm4, %%mm3\n"       /* r3 = 07 06 05 04 */
+    "movq   %%mm4, %%mm3\n"      /* r3 = 07 06 05 04 */
     "movq   %%mm0, (%[o])\n"     /* write R0 = r0 */
-    "pand   %%mm2, %%mm3\n"       /* r3 = __ __ __ 04 */
+    "pand   %%mm2, %%mm3\n"      /* r3 = __ __ __ 04 */
     "movq   32(%[i]), %%mm0\n"
     "psllq  $16, %%mm3\n"        /* r3 = __ __ 04 __ */
     "pmullw 32(%[q]), %%mm0\n"   /* r0 = 23 22 21 20 */
-    "pand   %%mm1, %%mm7\n"       /* r7 = 13 __ __ __ */
-    "por    %%mm3, %%mm5\n"       /* r5 = __ __ 04 01 */
-    "por    %%mm6, %%mm7\n"       /* r7 = 13 10 __ __ */
+    "pand   %%mm1, %%mm7\n"      /* r7 = 13 __ __ __ */
+    "por    %%mm3, %%mm5\n"      /* r5 = __ __ 04 01 */
+    "por    %%mm6, %%mm7\n"      /* r7 = 13 10 __ __ */
     "movq   24(%[i]), %%mm3\n"
-    "por    %%mm5, %%mm7\n"       /* r7 = 13 10 04 01 = R1 */
+    "por    %%mm5, %%mm7\n"      /* r7 = 13 10 04 01 = R1 */
     "pmullw 24(%[q]), %%mm3\n"   /* r3 = 17 16 15 14 */
     "psrlq  $16, %%mm4\n"        /* r4 = __ 07 06 05 */
     "movq   %%mm7, 16(%[o])\n"   /* write R1 = r7 */
-    "movq   %%mm4, %%mm5\n"       /* r5 = __ 07 06 05 */
-    "movq   %%mm0, %%mm7\n"       /* r7 = 23 22 21 20 */
+    "movq   %%mm4, %%mm5\n"      /* r5 = __ 07 06 05 */
+    "movq   %%mm0, %%mm7\n"      /* r7 = 23 22 21 20 */
     "psrlq  $16, %%mm4\n"        /* r4 = __ __ 07 06 */
     "psrlq  $48, %%mm7\n"        /* r7 = __ __ __ 23 */
-    "movq   %%mm2, %%mm6\n"       /* r6 = __ __ __ FF */
-    "pand   %%mm2, %%mm5\n"       /* r5 = __ __ __ 05 */
-    "pand   %%mm4, %%mm6\n"       /* r6 = __ __ __ 06 */
+    "movq   %%mm2, %%mm6\n"      /* r6 = __ __ __ FF */
+    "pand   %%mm2, %%mm5\n"      /* r5 = __ __ __ 05 */
+    "pand   %%mm4, %%mm6\n"      /* r6 = __ __ __ 06 */
     "movq   %%mm7, 80(%[o])\n"   /* partial R9 = __ __ __ 23 */
-    "pxor   %%mm6, %%mm4\n"       /* r4 = __ __ 07 __ */
+    "pxor   %%mm6, %%mm4\n"      /* r4 = __ __ 07 __ */
     "psrlq  $32, %%mm1\n"        /* r1 = __ __ 13 12 */
-    "por    %%mm5, %%mm4\n"       /* r4 = __ __ 07 05 */
+    "por    %%mm5, %%mm4\n"      /* r4 = __ __ 07 05 */
     "movq   "M(3)", %%mm7\n"     /* r7 = FF __ __ __ */
-    "pand   %%mm2, %%mm1\n"       /* r1 = __ __ __ 12 */
+    "pand   %%mm2, %%mm1\n"      /* r1 = __ __ __ 12 */
     "movq   48(%[i]), %%mm5\n"
     "psllq  $16, %%mm0\n"        /* r0 = 22 21 20 __ */
     "pmullw 48(%[q]), %%mm5\n"   /* r5 = 33 32 31 30 */
-    "pand   %%mm0, %%mm7\n"       /* r7 = 22 __ __ __ */
+    "pand   %%mm0, %%mm7\n"      /* r7 = 22 __ __ __ */
     "movq   %%mm1, 64(%[o])\n"   /* partial R8 = __ __ __ 12 */
-    "por    %%mm4, %%mm7\n"       /* r7 = 22 __ 07 05 */
-    "movq   %%mm3, %%mm4\n"       /* r4 = 17 16 15 14 */
-    "pand   %%mm2, %%mm3\n"       /* r3 = __ __ __ 14 */
+    "por    %%mm4, %%mm7\n"      /* r7 = 22 __ 07 05 */
+    "movq   %%mm3, %%mm4\n"      /* r4 = 17 16 15 14 */
+    "pand   %%mm2, %%mm3\n"      /* r3 = __ __ __ 14 */
     "movq   "M(2)", %%mm1\n"     /* r1 = __ FF __ __ */
     "psllq  $32, %%mm3\n"        /* r3 = __ 14 __ __ */
-    "por    %%mm3, %%mm7\n"       /* r7 = 22 14 07 05 = R2 */
-    "movq   %%mm5, %%mm3\n"       /* r3 = 33 32 31 30 */
+    "por    %%mm3, %%mm7\n"      /* r7 = 22 14 07 05 = R2 */
+    "movq   %%mm5, %%mm3\n"      /* r3 = 33 32 31 30 */
     "psllq  $48, %%mm3\n"        /* r3 = 30 __ __ __ */
-    "pand   %%mm0, %%mm1\n"       /* r1 = __ 21 __ __ */
+    "pand   %%mm0, %%mm1\n"      /* r1 = __ 21 __ __ */
     "movq   %%mm7, 32(%[o])\n"   /* write R2 = r7 */
-    "por    %%mm3, %%mm6\n"       /* r6 = 30 __ __ 06 */
+    "por    %%mm3, %%mm6\n"      /* r6 = 30 __ __ 06 */
     "movq   "M(1)", %%mm7\n"     /* r7 = __ __ FF __ */
-    "por    %%mm1, %%mm6\n"       /* r6 = 30 21 __ 06 */
+    "por    %%mm1, %%mm6\n"      /* r6 = 30 21 __ 06 */
     "movq   56(%[i]), %%mm1\n"
-    "pand   %%mm4, %%mm7\n"       /* r7 = __ __ 15 __ */
+    "pand   %%mm4, %%mm7\n"      /* r7 = __ __ 15 __ */
     "pmullw 56(%[q]), %%mm1\n"   /* r1 = 37 36 35 34 */
-    "por    %%mm6, %%mm7\n"       /* r7 = 30 21 15 06 = R3 */
+    "por    %%mm6, %%mm7\n"      /* r7 = 30 21 15 06 = R3 */
     "pand   "M(1)", %%mm0\n"     /* r0 = __ __ 20 __ */
     "psrlq  $32, %%mm4\n"        /* r4 = __ __ 17 16 */
     "movq   %%mm7, 48(%[o])\n"   /* write R3 = r7 */
-    "movq   %%mm4, %%mm6\n"       /* r6 = __ __ 17 16 */
+    "movq   %%mm4, %%mm6\n"      /* r6 = __ __ 17 16 */
     "movq   "M(3)", %%mm7\n"     /* r7 = FF __ __ __ */
-    "pand   %%mm2, %%mm4\n"       /* r4 = __ __ __ 16 */
+    "pand   %%mm2, %%mm4\n"      /* r4 = __ __ __ 16 */
     "movq   "M(1)", %%mm3\n"     /* r3 = __ __ FF __ */
-    "pand   %%mm1, %%mm7\n"       /* r7 = 37 __ __ __ */
-    "pand   %%mm5, %%mm3\n"       /* r3 = __ __ 31 __ */
-    "por    %%mm4, %%mm0\n"       /* r0 = __ __ 20 16 */
+    "pand   %%mm1, %%mm7\n"      /* r7 = 37 __ __ __ */
+    "pand   %%mm5, %%mm3\n"      /* r3 = __ __ 31 __ */
+    "por    %%mm4, %%mm0\n"      /* r0 = __ __ 20 16 */
     "psllq  $16, %%mm3\n"        /* r3 = __ 31 __ __ */
-    "por    %%mm0, %%mm7\n"       /* r7 = 37 __ 20 16 */
+    "por    %%mm0, %%mm7\n"      /* r7 = 37 __ 20 16 */
     "movq   "M(2)", %%mm4\n"     /* r4 = __ FF __ __ */
-    "por    %%mm3, %%mm7\n"       /* r7 = 37 31 20 16 = R4 */
+    "por    %%mm3, %%mm7\n"      /* r7 = 37 31 20 16 = R4 */
     "movq   80(%[i]), %%mm0\n"
-    "movq   %%mm4, %%mm3\n"       /* r3 = __ FF __ __ */
+    "movq   %%mm4, %%mm3\n"      /* r3 = __ FF __ __ */
     "pmullw 80(%[q]), %%mm0\n"   /* r0 = 53 52 51 50 */
-    "pand   %%mm5, %%mm4\n"       /* r4 = __ 32 __ __ */
+    "pand   %%mm5, %%mm4\n"      /* r4 = __ 32 __ __ */
     "movq   %%mm7, 8(%[o])\n"    /* write R4 = r7 */
-    "por    %%mm4, %%mm6\n"       /* r6 = __ 32 17 16 */
-    "movq   %%mm3, %%mm4\n"       /* r4 = __ FF __ __ */
+    "por    %%mm4, %%mm6\n"      /* r6 = __ 32 17 16 */
+    "movq   %%mm3, %%mm4\n"      /* r4 = __ FF __ __ */
     "psrlq  $16, %%mm6\n"        /* r6 = __ __ 32 17 */
-    "movq   %%mm0, %%mm7\n"       /* r7 = 53 52 51 50 */
-    "pand   %%mm1, %%mm4\n"       /* r4 = __ 36 __ __ */
+    "movq   %%mm0, %%mm7\n"      /* r7 = 53 52 51 50 */
+    "pand   %%mm1, %%mm4\n"      /* r4 = __ 36 __ __ */
     "psllq  $48, %%mm7\n"        /* r7 = 50 __ __ __ */
-    "por    %%mm4, %%mm6\n"       /* r6 = __ 36 32 17 */
+    "por    %%mm4, %%mm6\n"      /* r6 = __ 36 32 17 */
     "movq   88(%[i]), %%mm4\n"
-    "por    %%mm6, %%mm7\n"       /* r7 = 50 36 32 17 = R5 */
+    "por    %%mm6, %%mm7\n"      /* r7 = 50 36 32 17 = R5 */
     "pmullw 88(%[q]), %%mm4\n"   /* r4 = 57 56 55 54 */
     "psrlq  $16, %%mm3\n"        /* r3 = __ __ FF __ */
     "movq   %%mm7, 24(%[o])\n"   /* write R5 = r7 */
-    "pand   %%mm1, %%mm3\n"       /* r3 = __ __ 35 __ */
+    "pand   %%mm1, %%mm3\n"      /* r3 = __ __ 35 __ */
     "psrlq  $48, %%mm5\n"        /* r5 = __ __ __ 33 */
-    "pand   %%mm2, %%mm1\n"       /* r1 = __ __ __ 34 */
+    "pand   %%mm2, %%mm1\n"      /* r1 = __ __ __ 34 */
     "movq   104(%[i]), %%mm6\n"
-    "por    %%mm3, %%mm5\n"       /* r5 = __ __ 35 33 */
+    "por    %%mm3, %%mm5\n"      /* r5 = __ __ 35 33 */
     "pmullw 104(%[q]), %%mm6\n"  /* r6 = 67 66 65 64 */
     "psrlq  $16, %%mm0\n"        /* r0 = __ 53 52 51 */
-    "movq   %%mm4, %%mm7\n"       /* r7 = 57 56 55 54 */
-    "movq   %%mm2, %%mm3\n"       /* r3 = __ __ __ FF */
+    "movq   %%mm4, %%mm7\n"      /* r7 = 57 56 55 54 */
+    "movq   %%mm2, %%mm3\n"      /* r3 = __ __ __ FF */
     "psllq  $48, %%mm7\n"        /* r7 = 54 __ __ __ */
-    "pand   %%mm0, %%mm3\n"       /* r3 = __ __ __ 51 */
-    "pxor   %%mm3, %%mm0\n"       /* r0 = __ 53 52 __ */
+    "pand   %%mm0, %%mm3\n"      /* r3 = __ __ __ 51 */
+    "pxor   %%mm3, %%mm0\n"      /* r0 = __ 53 52 __ */
     "psllq  $32, %%mm3\n"        /* r3 = __ 51 __ __ */
-    "por    %%mm5, %%mm7\n"       /* r7 = 54 __ 35 33 */
-    "movq   %%mm6, %%mm5\n"       /* r5 = 67 66 65 64 */
+    "por    %%mm5, %%mm7\n"      /* r7 = 54 __ 35 33 */
+    "movq   %%mm6, %%mm5\n"      /* r5 = 67 66 65 64 */
     "pand   "M(1)", %%mm6\n"     /* r6 = __ __ 65 __ */
-    "por    %%mm3, %%mm7\n"       /* r7 = 54 51 35 33 = R6 */
+    "por    %%mm3, %%mm7\n"      /* r7 = 54 51 35 33 = R6 */
     "psllq  $32, %%mm6\n"        /* r6 = 65 __ __ __ */
-    "por    %%mm1, %%mm0\n"       /* r0 = __ 53 52 34 */
+    "por    %%mm1, %%mm0\n"      /* r0 = __ 53 52 34 */
     "movq   %%mm7, 40(%[o])\n"   /* write R6 = r7 */
-    "por    %%mm6, %%mm0\n"       /* r0 = 65 53 52 34 = R7 */
+    "por    %%mm6, %%mm0\n"      /* r0 = 65 53 52 34 = R7 */
     "movq   120(%[i]), %%mm7\n"
-    "movq   %%mm5, %%mm6\n"       /* r6 = 67 66 65 64 */
+    "movq   %%mm5, %%mm6\n"      /* r6 = 67 66 65 64 */
     "pmullw 120(%[q]), %%mm7\n"  /* r7 = 77 76 75 74 */
     "psrlq  $32, %%mm5\n"        /* r5 = __ __ 67 66 */
-    "pand   %%mm2, %%mm6\n"       /* r6 = __ __ __ 64 */
-    "movq   %%mm5, %%mm1\n"       /* r1 = __ __ 67 66 */
+    "pand   %%mm2, %%mm6\n"      /* r6 = __ __ __ 64 */
+    "movq   %%mm5, %%mm1\n"      /* r1 = __ __ 67 66 */
     "movq   %%mm0, 56(%[o])\n"   /* write R7 = r0 */
-    "pand   %%mm2, %%mm1\n"       /* r1 = __ __ __ 66 */
+    "pand   %%mm2, %%mm1\n"      /* r1 = __ __ __ 66 */
     "movq   112(%[i]), %%mm0\n"
-    "movq   %%mm7, %%mm3\n"       /* r3 = 77 76 75 74 */
+    "movq   %%mm7, %%mm3\n"      /* r3 = 77 76 75 74 */
     "pmullw 112(%[q]), %%mm0\n"  /* r0 = 73 72 71 70 */
     "psllq  $16, %%mm3\n"        /* r3 = 76 75 74 __ */
     "pand   "M(3)", %%mm7\n"     /* r7 = 77 __ __ __ */
-    "pxor   %%mm1, %%mm5\n"       /* r5 = __ __ 67 __ */
-    "por    %%mm5, %%mm6\n"       /* r6 = __ __ 67 64 */
-    "movq   %%mm3, %%mm5\n"       /* r5 = 76 75 74 __ */
+    "pxor   %%mm1, %%mm5\n"      /* r5 = __ __ 67 __ */
+    "por    %%mm5, %%mm6\n"      /* r6 = __ __ 67 64 */
+    "movq   %%mm3, %%mm5\n"      /* r5 = 76 75 74 __ */
     "pand   "M(3)", %%mm5\n"     /* r5 = 76 __ __ __ */
-    "por    %%mm1, %%mm7\n"       /* r7 = 77 __ __ 66 */
+    "por    %%mm1, %%mm7\n"      /* r7 = 77 __ __ 66 */
     "movq   96(%[i]), %%mm1\n"
-    "pxor   %%mm5, %%mm3\n"       /* r3 = __ 75 74 __ */
+    "pxor   %%mm5, %%mm3\n"      /* r3 = __ 75 74 __ */
     "pmullw 96(%[q]), %%mm1\n"   /* r1 = 63 62 61 60 */
-    "por    %%mm3, %%mm7\n"       /* r7 = 77 75 74 66 = R15 */
-    "por    %%mm5, %%mm6\n"       /* r6 = 76 __ 67 64 */
-    "movq   %%mm0, %%mm5\n"       /* r5 = 73 72 71 70 */
+    "por    %%mm3, %%mm7\n"      /* r7 = 77 75 74 66 = R15 */
+    "por    %%mm5, %%mm6\n"      /* r6 = 76 __ 67 64 */
+    "movq   %%mm0, %%mm5\n"      /* r5 = 73 72 71 70 */
     "movq   %%mm7, 120(%[o])\n"  /* store R15 = r7 */
     "psrlq  $16, %%mm5\n"        /* r5 = __ 73 72 71 */
     "pand   "M(2)", %%mm5\n"     /* r5 = __ 73 __ __ */
-    "movq   %%mm0, %%mm7\n"       /* r7 = 73 72 71 70 */
-    "por    %%mm5, %%mm6\n"       /* r6 = 76 73 67 64 = R14 */
-    "pand   %%mm2, %%mm0\n"       /* r0 = __ __ __ 70 */
-    "pxor   %%mm0, %%mm7\n"       /* r7 = 73 72 71 __ */
+    "movq   %%mm0, %%mm7\n"      /* r7 = 73 72 71 70 */
+    "por    %%mm5, %%mm6\n"      /* r6 = 76 73 67 64 = R14 */
+    "pand   %%mm2, %%mm0\n"      /* r0 = __ __ __ 70 */
+    "pxor   %%mm0, %%mm7\n"      /* r7 = 73 72 71 __ */
     "psllq  $32, %%mm0\n"        /* r0 = __ 70 __ __ */
     "movq   %%mm6, 104(%[o])\n"  /* write R14 = r6 */
     "psrlq  $16, %%mm4\n"        /* r4 = __ 57 56 55 */
     "movq   72(%[i]), %%mm5\n"
     "psllq  $16, %%mm7\n"        /* r7 = 72 71 __ __ */
     "pmullw 72(%[q]), %%mm5\n"   /* r5 = 47 46 45 44 */
-    "movq   %%mm7, %%mm6\n"       /* r6 = 72 71 __ __ */
+    "movq   %%mm7, %%mm6\n"      /* r6 = 72 71 __ __ */
     "movq   "M(2)", %%mm3\n"     /* r3 = __ FF __ __ */
     "psllq  $16, %%mm6\n"        /* r6 = 71 __ __ __ */
     "pand   "M(3)", %%mm7\n"     /* r7 = 72 __ __ __ */
-    "pand   %%mm1, %%mm3\n"       /* r3 = __ 62 __ __ */
-    "por    %%mm0, %%mm7\n"       /* r7 = 72 70 __ __ */
-    "movq   %%mm1, %%mm0\n"       /* r0 = 63 62 61 60 */
+    "pand   %%mm1, %%mm3\n"      /* r3 = __ 62 __ __ */
+    "por    %%mm0, %%mm7\n"      /* r7 = 72 70 __ __ */
+    "movq   %%mm1, %%mm0\n"      /* r0 = 63 62 61 60 */
     "pand   "M(3)", %%mm1\n"     /* r1 = 63 __ __ __ */
-    "por    %%mm3, %%mm6\n"       /* r6 = 71 62 __ __ */
-    "movq   %%mm4, %%mm3\n"       /* r3 = __ 57 56 55 */
+    "por    %%mm3, %%mm6\n"      /* r6 = 71 62 __ __ */
+    "movq   %%mm4, %%mm3\n"      /* r3 = __ 57 56 55 */
     "psrlq  $32, %%mm1\n"        /* r1 = __ __ 63 __ */
-    "pand   %%mm2, %%mm3\n"       /* r3 = __ __ __ 55 */
-    "por    %%mm1, %%mm7\n"       /* r7 = 72 70 63 __ */
-    "por    %%mm3, %%mm7\n"       /* r7 = 72 70 63 55 = R13 */
-    "movq   %%mm4, %%mm3\n"       /* r3 = __ 57 56 55 */
+    "pand   %%mm2, %%mm3\n"      /* r3 = __ __ __ 55 */
+    "por    %%mm1, %%mm7\n"      /* r7 = 72 70 63 __ */
+    "por    %%mm3, %%mm7\n"      /* r7 = 72 70 63 55 = R13 */
+    "movq   %%mm4, %%mm3\n"      /* r3 = __ 57 56 55 */
     "pand   "M(1)", %%mm3\n"     /* r3 = __ __ 56 __ */
-    "movq   %%mm5, %%mm1\n"       /* r1 = 47 46 45 44 */
+    "movq   %%mm5, %%mm1\n"      /* r1 = 47 46 45 44 */
     "movq   %%mm7, 88(%[o])\n"   /* write R13 = r7 */
     "psrlq  $48, %%mm5\n"        /* r5 = __ __ __ 47 */
     "movq   64(%[i]), %%mm7\n"
-    "por    %%mm3, %%mm6\n"       /* r6 = 71 62 56 __ */
+    "por    %%mm3, %%mm6\n"      /* r6 = 71 62 56 __ */
     "pmullw 64(%[q]), %%mm7\n"   /* r7 = 43 42 41 40 */
-    "por    %%mm5, %%mm6\n"       /* r6 = 71 62 56 47 = R12 */
+    "por    %%mm5, %%mm6\n"      /* r6 = 71 62 56 47 = R12 */
     "pand   "M(2)", %%mm4\n"     /* r4 = __ 57 __ __ */
     "psllq  $32, %%mm0\n"        /* r0 = 61 60 __ __ */
     "movq   %%mm6, 72(%[o])\n"   /* write R12 = r6 */
-    "movq   %%mm0, %%mm6\n"       /* r6 = 61 60 __ __ */
+    "movq   %%mm0, %%mm6\n"      /* r6 = 61 60 __ __ */
     "pand   "M(3)", %%mm0\n"     /* r0 = 61 __ __ __ */
     "psllq  $16, %%mm6\n"        /* r6 = 60 __ __ __ */
     "movq   40(%[i]), %%mm5\n"
-    "movq   %%mm1, %%mm3\n"       /* r3 = 47 46 45 44 */
+    "movq   %%mm1, %%mm3\n"      /* r3 = 47 46 45 44 */
     "pmullw 40(%[q]), %%mm5\n"   /* r5 = 27 26 25 24 */
     "psrlq  $16, %%mm1\n"        /* r1 = __ 47 46 45 */
     "pand   "M(1)", %%mm1\n"     /* r1 = __ __ 46 __ */
-    "por    %%mm4, %%mm0\n"       /* r0 = 61 57 __ __ */
-    "pand   %%mm7, %%mm2\n"       /* r2 = __ __ __ 40 */
-    "por    %%mm1, %%mm0\n"       /* r0 = 61 57 46 __ */
-    "por    %%mm2, %%mm0\n"       /* r0 = 61 57 46 40 = R11 */
+    "por    %%mm4, %%mm0\n"      /* r0 = 61 57 __ __ */
+    "pand   %%mm7, %%mm2\n"      /* r2 = __ __ __ 40 */
+    "por    %%mm1, %%mm0\n"      /* r0 = 61 57 46 __ */
+    "por    %%mm2, %%mm0\n"      /* r0 = 61 57 46 40 = R11 */
     "psllq  $16, %%mm3\n"        /* r3 = 46 45 44 __ */
-    "movq   %%mm3, %%mm4\n"       /* r4 = 46 45 44 __ */
-    "movq   %%mm5, %%mm2\n"       /* r2 = 27 26 25 24 */
+    "movq   %%mm3, %%mm4\n"      /* r4 = 46 45 44 __ */
+    "movq   %%mm5, %%mm2\n"      /* r2 = 27 26 25 24 */
     "movq   %%mm0, 112(%[o])\n"  /* write R11 = r0 */
     "psrlq  $48, %%mm2\n"        /* r2 = __ __ __ 27 */
     "pand   "M(2)", %%mm4\n"     /* r4 = __ 45 __ __ */
-    "por    %%mm2, %%mm6\n"       /* r6 = 60 __ __ 27 */
+    "por    %%mm2, %%mm6\n"      /* r6 = 60 __ __ 27 */
     "movq   "M(1)", %%mm2\n"     /* r2 = __ __ FF __ */
-    "por    %%mm4, %%mm6\n"       /* r6 = 60 45 __ 27 */
-    "pand   %%mm7, %%mm2\n"       /* r2 = __ __ 41 __ */
+    "por    %%mm4, %%mm6\n"      /* r6 = 60 45 __ 27 */
+    "pand   %%mm7, %%mm2\n"      /* r2 = __ __ 41 __ */
     "psllq  $32, %%mm3\n"        /* r3 = 44 __ __ __ */
     "por    80(%[o]), %%mm3\n"   /* r3 = 44 __ __ 23 */
-    "por    %%mm2, %%mm6\n"       /* r6 = 60 45 41 27 = R10 */
+    "por    %%mm2, %%mm6\n"      /* r6 = 60 45 41 27 = R10 */
     "movq   "M(3)", %%mm2\n"     /* r2 = FF __ __ __ */
     "psllq  $16, %%mm5\n"        /* r5 = 26 25 24 __ */
     "movq   %%mm6, 96(%[o])\n"   /* store R10 = r6 */
-    "pand   %%mm5, %%mm2\n"       /* r2 = 26 __ __ __ */
+    "pand   %%mm5, %%mm2\n"      /* r2 = 26 __ __ __ */
     "movq   "M(2)", %%mm6\n"     /* r6 = __ FF __ __ */
-    "pxor   %%mm2, %%mm5\n"       /* r5 = __ 25 24 __ */
-    "pand   %%mm7, %%mm6\n"       /* r6 = __ 42 __ __ */
+    "pxor   %%mm2, %%mm5\n"      /* r5 = __ 25 24 __ */
+    "pand   %%mm7, %%mm6\n"      /* r6 = __ 42 __ __ */
     "psrlq  $32, %%mm2\n"        /* r2 = __ __ 26 __ */
     "pand   "M(3)", %%mm7\n"     /* r7 = 43 __ __ __ */
-    "por    %%mm2, %%mm3\n"       /* r3 = 44 __ 26 23 */
+    "por    %%mm2, %%mm3\n"      /* r3 = 44 __ 26 23 */
     "por    64(%[o]), %%mm7\n"   /* r7 = 43 __ __ 12 */
-    "por    %%mm3, %%mm6\n"       /* r6 = 44 42 26 23 = R9 */
-    "por    %%mm5, %%mm7\n"       /* r7 = 43 25 24 12 = R8 */
+    "por    %%mm3, %%mm6\n"      /* r6 = 44 42 26 23 = R9 */
+    "por    %%mm5, %%mm7\n"      /* r7 = 43 25 24 12 = R8 */
     "movq   %%mm6, 80(%[o])\n"   /* store R9 = r6 */
     "movq   %%mm7, 64(%[o])\n"   /* store R8 = r7 */
-    
     /* 123c  ( / 64 coeffs  < 2c / coeff) */
 
 /* Done w/dequant + descramble + partial transpose; now do the idct itself. */
@@ -662,86 +661,86 @@
 /* --------------------------------------------------------------- */
 // This macro does four 4-sample one-dimensional idcts in parallel.  Inputs
 // 4 thru 7 are assumed to be zero.
-#define BeginIDCT_10 "#BeginIDCT_10\n"  \
-    "   movq    "I(3)",%%mm2\n"          \
-                                        \
-    "   movq    "C(3)",%%mm6\n"          \
+#define BeginIDCT_10 "#BeginIDCT_10\n"    \
+    "   movq    "I(3)",%%mm2\n"           \
+                                          \
+    "   movq    "C(3)",%%mm6\n"           \
     "   movq    %%mm2,%%mm4\n"            \
-                                        \
-    "   movq    "C(5)",%%mm1\n"          \
+                                          \
+    "   movq    "C(5)",%%mm1\n"           \
     "   pmulhw  %%mm6,%%mm4\n"            \
-                                        \
-    "   movq    "I(1)",%%mm3\n"          \
+                                          \
+    "   movq    "I(1)",%%mm3\n"           \
     "   pmulhw  %%mm2,%%mm1\n"            \
-                                        \
-    "   movq    "C(1)",%%mm0\n"          \
+                                          \
+    "   movq    "C(1)",%%mm0\n"           \
     "   paddw   %%mm2,%%mm4\n"            \
-                                        \
+                                          \
     "   pxor    %%mm6,%%mm6\n"            \
     "   paddw   %%mm1,%%mm2\n"            \
-                                        \
-    "   movq    "I(2)",%%mm5\n"          \
+                                          \
+    "   movq    "I(2)",%%mm5\n"           \
     "   pmulhw  %%mm3,%%mm0\n"            \
-                                        \
+                                          \
     "   movq    %%mm5,%%mm1\n"            \
     "   paddw   %%mm3,%%mm0\n"            \
-                                        \
-    "   pmulhw  "C(7)",%%mm3\n"          \
+                                          \
+    "   pmulhw  "C(7)",%%mm3\n"           \
     "   psubsw  %%mm2,%%mm6\n"            \
-                                        \
-    "   pmulhw  "C(2)",%%mm5\n"          \
+                                          \
+    "   pmulhw  "C(2)",%%mm5\n"           \
     "   psubsw  %%mm4,%%mm0\n"            \
-                                        \
-    "   movq    "I(2)",%%mm7\n"          \
+                                          \
+    "   movq    "I(2)",%%mm7\n"           \
     "   paddsw  %%mm4,%%mm4\n"            \
-                                        \
+                                          \
     "   paddw   %%mm5,%%mm7\n"            \
     "   paddsw  %%mm0,%%mm4\n"            \
-                                        \
-    "   pmulhw  "C(6)",%%mm1\n"          \
+                                          \
+    "   pmulhw  "C(6)",%%mm1\n"           \
     "   psubsw  %%mm6,%%mm3\n"            \
-                                        \
-    "   movq    %%mm4,"I(1)"\n"          \
+                                          \
+    "   movq    %%mm4,"I(1)"\n"           \
     "   paddsw  %%mm6,%%mm6\n"            \
-                                        \
-    "   movq    "C(4)",%%mm4\n"          \
+                                          \
+    "   movq    "C(4)",%%mm4\n"           \
     "   paddsw  %%mm3,%%mm6\n"            \
-                                        \
+                                          \
     "   movq    %%mm3,%%mm5\n"            \
     "   pmulhw  %%mm4,%%mm3\n"            \
-                                        \
-    "   movq    %%mm6,"I(2)"\n"          \
+                                          \
+    "   movq    %%mm6,"I(2)"\n"           \
     "   movq    %%mm0,%%mm2\n"            \
-                                        \
-    "   movq    "I(0)",%%mm6\n"          \
+                                          \
+    "   movq    "I(0)",%%mm6\n"           \
     "   pmulhw  %%mm4,%%mm0\n"            \
-                                        \
+                                          \
     "   paddw   %%mm3,%%mm5\n"            \
     "   paddw   %%mm0,%%mm2\n"            \
-                                        \
+                                          \
     "   psubsw  %%mm1,%%mm5\n"            \
     "   pmulhw  %%mm4,%%mm6\n"            \
-                                        \
-    "   paddw   "I(0)",%%mm6\n"          \
+                                          \
+    "   paddw   "I(0)",%%mm6\n"           \
     "   paddsw  %%mm1,%%mm1\n"            \
-                                        \
+                                          \
     "   movq    %%mm6,%%mm4\n"            \
     "   paddsw  %%mm5,%%mm1\n"            \
-                                        \
+                                          \
     "   psubsw  %%mm2,%%mm6\n"            \
     "   paddsw  %%mm2,%%mm2\n"            \
-                                        \
-    "   movq    "I(1)",%%mm0\n"          \
+                                          \
+    "   movq    "I(1)",%%mm0\n"           \
     "   paddsw  %%mm6,%%mm2\n"            \
-                                        \
+                                          \
     "   psubsw  %%mm1,%%mm2\n"            \
     "#end BeginIDCT_10\n"
 // end BeginIDCT_10 macro (25 cycles).
 
-#define RowIDCT_10 "#RowIDCT_10\n"                           \
-    BeginIDCT_10                                            \
-    "\n"                                                    \
-    "   movq    "I(2)",%%mm3\n"  /* r3 = D. */               \
+#define RowIDCT_10 "#RowIDCT_10\n"                            \
+    BeginIDCT_10                                              \
+    "\n"                                                      \
+    "   movq    "I(2)",%%mm3\n"   /* r3 = D. */               \
     "   psubsw  %%mm7,%%mm4\n"        /* r4 = E. = E - G */   \
     "   paddsw  %%mm1,%%mm1\n"        /* r1 = H. + H. */      \
     "   paddsw  %%mm7,%%mm7\n"        /* r7 = G + G */        \
@@ -755,55 +754,54 @@
     "   paddsw  %%mm6,%%mm5\n"        /* r5 = R5 = F. + B.. */\
     "   psubsw  %%mm0,%%mm7\n"        /* r7 = R7 = G. - C. */ \
     "   paddsw  %%mm0,%%mm0\n"                                \
-    "   movq    %%mm1,"I(1)"\n"  /* save R1 */               \
+    "   movq    %%mm1,"I(1)"\n"   /* save R1 */               \
     "   paddsw  %%mm7,%%mm0\n"        /* r0 = R0 = G. + C. */ \
-    "#end RowIDCT_10\n"									     
+    "#end RowIDCT_10\n"
 // end RowIDCT macro (8 + 38 = 46 cycles)
 
 // Column IDCT normalizes and stores final results.
 
-#define ColumnIDCT_10 "#ColumnIDCT_10\n"               \
-    BeginIDCT_10                                        \
-    "\n"                                                \
-    "   paddsw  "Eight",%%mm2\n"                         \
+#define ColumnIDCT_10 "#ColumnIDCT_10\n"                  \
+    BeginIDCT_10                                          \
+    "\n"                                                  \
+    "   paddsw  "Eight",%%mm2\n"                          \
     "   paddsw  %%mm1,%%mm1\n"    /* r1 = H. + H. */      \
     "   paddsw  %%mm2,%%mm1\n"    /* r1 = R1 = A.. + H. */\
-    "   psraw   ""$4"",%%mm2\n"      /* r2 = NR2 */      \
+    "   psraw   ""$4"",%%mm2\n"       /* r2 = NR2 */      \
     "   psubsw  %%mm7,%%mm4\n"    /* r4 = E. = E - G */   \
-    "   psraw   ""$4"",%%mm1\n"      /* r1 = NR1 */      \
-    "   movq    "I(2)",%%mm3\n"  /* r3 = D. */           \
+    "   psraw   ""$4"",%%mm1\n"       /* r1 = NR1 */      \
+    "   movq    "I(2)",%%mm3\n"   /* r3 = D. */           \
     "   paddsw  %%mm7,%%mm7\n"    /* r7 = G + G */        \
-    "   movq    %%mm2,"I(2)"\n"  /* store NR2 at I2 */   \
+    "   movq    %%mm2,"I(2)"\n"   /* store NR2 at I2 */   \
     "   paddsw  %%mm4,%%mm7\n"    /* r7 = G. = E + G */   \
-    "   movq    %%mm1,"I(1)"\n"  /* store NR1 at I1 */   \
+    "   movq    %%mm1,"I(1)"\n"   /* store NR1 at I1 */   \
     "   psubsw  %%mm3,%%mm4\n"    /* r4 = R4 = E. - D. */ \
-    "   paddsw  "Eight",%%mm4\n"                         \
+    "   paddsw  "Eight",%%mm4\n"                          \
     "   paddsw  %%mm3,%%mm3\n"    /* r3 = D. + D. */      \
     "   paddsw  %%mm4,%%mm3\n"    /* r3 = R3 = E. + D. */ \
-    "   psraw   ""$4"",%%mm4\n"      /* r4 = NR4 */      \
+    "   psraw   ""$4"",%%mm4\n"       /* r4 = NR4 */      \
     "   psubsw  %%mm5,%%mm6\n"    /* r6 = R6 = F. - B.. */\
-    "   psraw   ""$4"",%%mm3\n"      /* r3 = NR3 */      \
-    "   paddsw  "Eight",%%mm6\n"                         \
+    "   psraw   ""$4"",%%mm3\n"       /* r3 = NR3 */      \
+    "   paddsw  "Eight",%%mm6\n"                          \
     "   paddsw  %%mm5,%%mm5\n"    /* r5 = B.. + B.. */    \
     "   paddsw  %%mm6,%%mm5\n"    /* r5 = R5 = F. + B.. */\
-    "   psraw   ""$4"",%%mm6\n"      /* r6 = NR6 */      \
-    "   movq    %%mm4,"J(4)"\n"  /* store NR4 at J4 */   \
-    "   psraw   ""$4"",%%mm5\n"      /* r5 = NR5 */      \
-    "   movq    %%mm3,"I(3)"\n"  /* store NR3 at I3 */   \
+    "   psraw   ""$4"",%%mm6\n"       /* r6 = NR6 */      \
+    "   movq    %%mm4,"J(4)"\n"   /* store NR4 at J4 */   \
+    "   psraw   ""$4"",%%mm5\n"       /* r5 = NR5 */      \
+    "   movq    %%mm3,"I(3)"\n"   /* store NR3 at I3 */   \
     "   psubsw  %%mm0,%%mm7\n"    /* r7 = R7 = G. - C. */ \
-    "   paddsw  "Eight",%%mm7\n"                         \
+    "   paddsw  "Eight",%%mm7\n"                          \
     "   paddsw  %%mm0,%%mm0\n"    /* r0 = C. + C. */      \
     "   paddsw  %%mm7,%%mm0\n"    /* r0 = R0 = G. + C. */ \
-    "   psraw   ""$4"",%%mm7\n"      /* r7 = NR7 */      \
-    "   movq    %%mm6,"J(6)"\n"  /* store NR6 at J6 */   \
-    "   psraw   ""$4"",%%mm0\n"      /* r0 = NR0 */      \
-    "   movq    %%mm5,"J(5)"\n"  /* store NR5 at J5 */   \
-                                                        \
-    "   movq    %%mm7,"J(7)"\n"  /* store NR7 at J7 */   \
-                                                        \
-    "   movq    %%mm0,"I(0)"\n"  /* store NR0 at I0 */   \
-    "#end ColumnIDCT_10\n"								
-
+    "   psraw   ""$4"",%%mm7\n"       /* r7 = NR7 */      \
+    "   movq    %%mm6,"J(6)"\n"   /* store NR6 at J6 */   \
+    "   psraw   ""$4"",%%mm0\n"       /* r0 = NR0 */      \
+    "   movq    %%mm5,"J(5)"\n"   /* store NR5 at J5 */   \
+                                                          \
+    "   movq    %%mm7,"J(7)"\n"   /* store NR7 at J7 */   \
+                                                          \
+    "   movq    %%mm0,"I(0)"\n"   /* store NR0 at I0 */   \
+    "#end ColumnIDCT_10\n"
 // end ColumnIDCT macro (38 + 19 = 57 cycles)
 /* --------------------------------------------------------------- */
 
@@ -811,8 +809,8 @@
 /* --------------------------------------------------------------- */
 /* IDCT 10 */
 void IDct10__mmx( const ogg_int16_t *in,
-		  const ogg_int16_t *q,
-		  ogg_int16_t *out ) {
+                  const ogg_int16_t *q,
+                  ogg_int16_t *out ) {
 
     __asm__ __volatile__ (
 
@@ -821,35 +819,35 @@
     "movq   16(%[i]), %%mm1\n"
     "pmullw 16(%[q]), %%mm1\n"   /* r1 = 13 12 11 10 */
     "movq   "M(0)", %%mm2\n"     /* r2 = __ __ __ FF */
-    "movq   %%mm0, %%mm3\n"       /* r3 = 03 02 01 00 */
+    "movq   %%mm0, %%mm3\n"      /* r3 = 03 02 01 00 */
     "movq   8(%[i]), %%mm4\n"
     "psrlq  $16, %%mm0\n"        /* r0 = __ 03 02 01 */
     "pmullw 8(%[q]), %%mm4\n"    /* r4 = 07 06 05 04 */
-    "pand   %%mm2, %%mm3\n"       /* r3 = __ __ __ 00 */
-    "movq   %%mm0, %%mm5\n"       /* r5 = __ 03 02 01 */
-    "pand   %%mm2, %%mm5\n"       /* r5 = __ __ __ 01 */
+    "pand   %%mm2, %%mm3\n"      /* r3 = __ __ __ 00 */
+    "movq   %%mm0, %%mm5\n"      /* r5 = __ 03 02 01 */
+    "pand   %%mm2, %%mm5\n"      /* r5 = __ __ __ 01 */
     "psllq  $32, %%mm1\n"        /* r1 = 11 10 __ __ */
     "movq   "M(3)", %%mm7\n"     /* r7 = FF __ __ __ */
-    "pxor   %%mm5, %%mm0\n"       /* r0 = __ 03 02 __ */
-    "pand   %%mm1, %%mm7\n"       /* r7 = 11 __ __ __ */
-    "por    %%mm3, %%mm0\n"       /* r0 = __ 03 02 00 */
-    "pxor   %%mm7, %%mm1\n"       /* r1 = __ 10 __ __ */
-    "por    %%mm7, %%mm0\n"       /* r0 = 11 03 02 00 = R0 */
-    "movq   %%mm4, %%mm3\n"       /* r3 = 07 06 05 04 */
+    "pxor   %%mm5, %%mm0\n"      /* r0 = __ 03 02 __ */
+    "pand   %%mm1, %%mm7\n"      /* r7 = 11 __ __ __ */
+    "por    %%mm3, %%mm0\n"      /* r0 = __ 03 02 00 */
+    "pxor   %%mm7, %%mm1\n"      /* r1 = __ 10 __ __ */
+    "por    %%mm7, %%mm0\n"      /* r0 = 11 03 02 00 = R0 */
+    "movq   %%mm4, %%mm3\n"      /* r3 = 07 06 05 04 */
     "movq   %%mm0, (%[o])\n"     /* write R0 = r0 */
-    "pand   %%mm2, %%mm3\n"       /* r3 = __ __ __ 04 */
+    "pand   %%mm2, %%mm3\n"      /* r3 = __ __ __ 04 */
     "psllq  $16, %%mm3\n"        /* r3 = __ __ 04 __ */
-    "por    %%mm3, %%mm5\n"       /* r5 = __ __ 04 01 */
-    "por    %%mm5, %%mm1\n"       /* r1 = __ 10 04 01 = R1 */
+    "por    %%mm3, %%mm5\n"      /* r5 = __ __ 04 01 */
+    "por    %%mm5, %%mm1\n"      /* r1 = __ 10 04 01 = R1 */
     "psrlq  $16, %%mm4\n"        /* r4 = __ 07 06 05 */
     "movq   %%mm1, 16(%[o])\n"   /* write R1 = r1 */
-    "movq   %%mm4, %%mm5\n"       /* r5 = __ 07 06 05 */
+    "movq   %%mm4, %%mm5\n"      /* r5 = __ 07 06 05 */
     "psrlq  $16, %%mm4\n"        /* r4 = __ __ 07 06 */
-    "movq   %%mm2, %%mm6\n"       /* r6 = __ __ __ FF */
-    "pand   %%mm2, %%mm5\n"       /* r5 = __ __ __ 05 */
-    "pand   %%mm4, %%mm6\n"       /* r6 = __ __ __ 06 */
-    "pxor   %%mm6, %%mm4\n"       /* r4 = __ __ 07 __ */
-    "por    %%mm5, %%mm4\n"       /* r4 = __ __ 07 05 */
+    "movq   %%mm2, %%mm6\n"      /* r6 = __ __ __ FF */
+    "pand   %%mm2, %%mm5\n"      /* r5 = __ __ __ 05 */
+    "pand   %%mm4, %%mm6\n"      /* r6 = __ __ __ 06 */
+    "pxor   %%mm6, %%mm4\n"      /* r4 = __ __ 07 __ */
+    "por    %%mm5, %%mm4\n"      /* r4 = __ __ 07 05 */
     "movq   %%mm4, 32(%[o])\n"   /* write R2 = r4 */
     "movq   %%mm6, 48(%[o])\n"   /* write R3 = r6 */
 
@@ -950,13 +948,13 @@
 ******************************************************************************************/
 
 #define RowIDCT_3 "#RowIDCT_3\n"\
-    "   movq        "I(1)",%%mm7\n"  /* r7 = I1                      */  \
-    "   movq        "C(1)",%%mm0\n"  /* r0 = C1                      */  \
-    "   movq        "C(7)",%%mm3\n"  /* r3 = C7                      */  \
+    "   movq        "I(1)",%%mm7\n"   /* r7 = I1                      */  \
+    "   movq        "C(1)",%%mm0\n"   /* r0 = C1                      */  \
+    "   movq        "C(7)",%%mm3\n"   /* r3 = C7                      */  \
     "   pmulhw      %%mm7,%%mm0\n"    /* r0 = C1 * I1 - I1            */  \
     "   pmulhw      %%mm7,%%mm3\n"    /* r3 = C7 * I1 = B, D.         */  \
-    "   movq        "I(0)",%%mm6\n"  /* r6 = I0                      */  \
-    "   movq        "C(4)",%%mm4\n"  /* r4 = C4                      */  \
+    "   movq        "I(0)",%%mm6\n"   /* r6 = I0                      */  \
+    "   movq        "C(4)",%%mm4\n"   /* r4 = C4                      */  \
     "   paddw       %%mm7,%%mm0\n"    /* r0 = C1 * I1 = A, C.         */  \
     "   movq        %%mm6,%%mm1\n"    /* make a copy of I0            */  \
     "   pmulhw      %%mm4,%%mm6\n"    /* r2 = C4 * I0 - I0            */  \
@@ -992,19 +990,19 @@
     "   paddw       %%mm5,%%mm5\n"    /* r5 = B. + B.                 */  \
     "   paddw       %%mm7,%%mm0\n"    /* r0 = E + A ----R0            */  \
     "   paddw       %%mm2,%%mm1\n"    /* r1 = E + A. + B. -----R1     */  \
-    "   movq        %%mm1,"I(1)"\n"  /* save r1                      */  \
+    "   movq        %%mm1,"I(1)"\n"   /* save r1                      */  \
     "   paddw       %%mm6,%%mm5\n"    /* r5 = E - A. + B. -----R5     */  \
     "#end RowIDCT_3\n"
 //End of RowIDCT_3
 
 #define ColumnIDCT_3 "#ColumnIDCT_3\n"\
-    "   movq        "I(1)",%%mm7\n"  /* r7 = I1                      */  \
-    "   movq        "C(1)",%%mm0\n"  /* r0 = C1                      */  \
-    "   movq        "C(7)",%%mm3\n"  /* r3 = C7                      */  \
+    "   movq        "I(1)",%%mm7\n"   /* r7 = I1                      */  \
+    "   movq        "C(1)",%%mm0\n"   /* r0 = C1                      */  \
+    "   movq        "C(7)",%%mm3\n"   /* r3 = C7                      */  \
     "   pmulhw      %%mm7,%%mm0\n"    /* r0 = C1 * I1 - I1            */  \
     "   pmulhw      %%mm7,%%mm3\n"    /* r3 = C7 * I1 = B, D.         */  \
-    "   movq        "I(0)",%%mm6\n"  /* r6 = I0                      */  \
-    "   movq        "C(4)",%%mm4\n"  /* r4 = C4                      */  \
+    "   movq        "I(0)",%%mm6\n"   /* r6 = I0                      */  \
+    "   movq        "C(4)",%%mm4\n"   /* r4 = C4                      */  \
     "   paddw       %%mm7,%%mm0\n"    /* r0 = C1 * I1 = A, C.         */  \
     "   movq        %%mm6,%%mm1\n"    /* make a copy of I0            */  \
     "   pmulhw      %%mm4,%%mm6\n"    /* r2 = C4 * I0 - I0            */  \
@@ -1014,8 +1012,8 @@
     "   pmulhw      %%mm4,%%mm5\n"    /* r5 = C4 * B - B              */  \
     "   paddw       %%mm1,%%mm6\n"    /* r2 = C4 * I0 = E, F          */  \
     "   movq        %%mm6,%%mm4\n"    /* r4 = E                       */  \
-    "   paddw       "Eight",%%mm6\n" /* +8 for shift                 */  \
-    "   paddw       "Eight",%%mm4\n" /* +8 for shift                 */  \
+    "   paddw       "Eight",%%mm6\n"  /* +8 for shift                 */  \
+    "   paddw       "Eight",%%mm4\n"  /* +8 for shift                 */  \
     "   paddw       %%mm0,%%mm2\n"    /* r2 = A.                      */  \
     "   paddw       %%mm3,%%mm5\n"    /* r5 = B.                      */  \
     "   movq        %%mm6,%%mm7\n"    /* r7 = E                       */  \
@@ -1036,49 +1034,49 @@
     "   paddw       %%mm0,%%mm0\n"    /* r0 = A + A                   */  \
     "   paddw       %%mm6,%%mm2\n"    /* r2 = E + A.                  */  \
     "   paddw       %%mm4,%%mm3\n"    /* r3 = E + B ----R3            */  \
-    "   psraw        $4,%%mm4\n"     /* shift                        */  \
-    "   movq        %%mm4,"J(4)"\n"  /* store R4 at J4               */  \
-    "   psraw       $4,%%mm3\n"      /* shift                        */  \
-    "   movq        %%mm3,"I(3)"\n"  /* store R3 at I3               */  \
+    "   psraw        $4,%%mm4\n"      /* shift                        */  \
+    "   movq        %%mm4,"J(4)"\n"   /* store R4 at J4               */  \
+    "   psraw       $4,%%mm3\n"       /* shift                        */  \
+    "   movq        %%mm3,"I(3)"\n"   /* store R3 at I3               */  \
     "   psubw       %%mm1,%%mm2\n"    /* r2 = E + A. - B. ----R2      */  \
     "   psubw       %%mm5,%%mm6\n"    /* r6 = E - A. - B. ----R6      */  \
     "   paddw       %%mm1,%%mm1\n"    /* r1 = B. + B.                 */  \
     "   paddw       %%mm5,%%mm5\n"    /* r5 = B. + B.                 */  \
     "   paddw       %%mm7,%%mm0\n"    /* r0 = E + A ----R0            */  \
     "   paddw       %%mm2,%%mm1\n"    /* r1 = E + A. + B. -----R1     */  \
-    "   psraw       $4,%%mm7\n"      /* shift                        */  \
-    "   psraw       $4,%%mm2\n"      /* shift                        */  \
-    "   psraw       $4,%%mm0\n"      /* shift                        */  \
-    "   psraw       $4,%%mm1\n"      /* shift                        */  \
-    "   movq        %%mm7,"J(7)"\n"  /* store R7 to J7               */  \
-    "   movq        %%mm0,"I(0)"\n"  /* store R0 to I0               */  \
-    "   movq        %%mm1,"I(1)"\n"  /* store R1 to I1               */  \
-    "   movq        %%mm2,"I(2)"\n"  /* store R2 to I2               */  \
-    "   movq        %%mm1,"I(1)"\n"  /* save r1                      */  \
+    "   psraw       $4,%%mm7\n"       /* shift                        */  \
+    "   psraw       $4,%%mm2\n"       /* shift                        */  \
+    "   psraw       $4,%%mm0\n"       /* shift                        */  \
+    "   psraw       $4,%%mm1\n"       /* shift                        */  \
+    "   movq        %%mm7,"J(7)"\n"   /* store R7 to J7               */  \
+    "   movq        %%mm0,"I(0)"\n"   /* store R0 to I0               */  \
+    "   movq        %%mm1,"I(1)"\n"   /* store R1 to I1               */  \
+    "   movq        %%mm2,"I(2)"\n"   /* store R2 to I2               */  \
+    "   movq        %%mm1,"I(1)"\n"   /* save r1                      */  \
     "   paddw       %%mm6,%%mm5\n"    /* r5 = E - A. + B. -----R5     */  \
-    "   psraw       $4,%%mm5\n"      /* shift                        */  \
-    "   movq        %%mm5,"J(5)"\n"  /* store R5 at J5               */  \
-    "   psraw       $4,%%mm6\n"      /* shift                        */  \
-    "   movq        %%mm6,"J(6)"\n"  /* store R6 at J6               */  \
+    "   psraw       $4,%%mm5\n"       /* shift                        */  \
+    "   movq        %%mm5,"J(5)"\n"   /* store R5 at J5               */  \
+    "   psraw       $4,%%mm6\n"       /* shift                        */  \
+    "   movq        %%mm6,"J(6)"\n"   /* store R6 at J6               */  \
     "#end ColumnIDCT_3\n"
 //End of ColumnIDCT_3
 
 void IDct3__mmx( const ogg_int16_t *in,
-		 const ogg_int16_t *q,
-		 ogg_int16_t *out ) {
+                 const ogg_int16_t *q,
+                 ogg_int16_t *out ) {
 
     __asm__ __volatile__ (
 
-    "movq   (%[i]), %%mm0\n"     
+    "movq   (%[i]), %%mm0\n"
     "pmullw (%[q]), %%mm0\n"     /* r0 = 03 02 01 00 */
     "movq   "M(0)", %%mm2\n"     /* r2 = __ __ __ FF */
-    "movq   %%mm0, %%mm3\n"       /* r3 = 03 02 01 00 */
+    "movq   %%mm0, %%mm3\n"      /* r3 = 03 02 01 00 */
     "psrlq  $16, %%mm0\n"        /* r0 = __ 03 02 01 */
-    "pand   %%mm2, %%mm3\n"       /* r3 = __ __ __ 00 */
-    "movq   %%mm0, %%mm5\n"       /* r5 = __ 03 02 01 */
-    "pand   %%mm2, %%mm5\n"       /* r5 = __ __ __ 01 */
-    "pxor   %%mm5, %%mm0\n"       /* r0 = __ 03 02 __ */
-    "por    %%mm3, %%mm0\n"       /* r0 = __ 03 02 00 */
+    "pand   %%mm2, %%mm3\n"      /* r3 = __ __ __ 00 */
+    "movq   %%mm0, %%mm5\n"      /* r5 = __ 03 02 01 */
+    "pand   %%mm2, %%mm5\n"      /* r5 = __ __ __ 01 */
+    "pxor   %%mm5, %%mm0\n"      /* r0 = __ 03 02 __ */
+    "por    %%mm3, %%mm0\n"      /* r0 = __ 03 02 00 */
     "movq   %%mm0, (%[o])\n"     /* write R0 = r0 */
     "movq   %%mm5, 16(%[o])\n"   /* write R1 = r5 */
 
@@ -1102,17 +1100,17 @@
 #   undef J
 #   define I( K)    MtoSTR((K*16)+8)"(%[o])"
 #   define J( K)    I( K)
-    
+
     ColumnIDCT_3    /* 44 c */
-    
+
 #   undef I
 #   undef J
-    
+
     "emms\n"
     :
     :[i]"r"(in),[q]"r"(q),[o]"r"(out),[c]"r"(idctconstants)
     );
-    
+
 }
 
 /* install our implementation in the function table */
@@ -1124,5 +1122,3 @@
 }
 
 #endif /* USE_ASM */
-
-

Modified: branches/theora-thusnelda/lib/enc/x86/recon_mmx.c
===================================================================
--- branches/theora-thusnelda/lib/enc/x86/recon_mmx.c	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/enc/x86/recon_mmx.c	2009-02-06 09:43:27 UTC (rev 15675)
@@ -15,7 +15,7 @@
 
  ********************************************************************/
 
-#include "codec_internal.h"
+#include "../codec_internal.h"
 #include <stddef.h>
 
 #if defined(USE_ASM)

Modified: branches/theora-thusnelda/lib/internal.h
===================================================================
--- branches/theora-thusnelda/lib/internal.h	2009-02-06 00:18:42 UTC (rev 15674)
+++ branches/theora-thusnelda/lib/internal.h	2009-02-06 09:43:27 UTC (rev 15675)
@@ -6,7 +6,7 @@
  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
  *                                                                  *
  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2007                *
- * by the Xiph.Org Foundation http://www.xiph.org/                  *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  *                                                                  *
  ********************************************************************
 
@@ -27,9 +27,11 @@
 # include "dec/huffman.h"
 # include "dec/quant.h"
 
-/*Thank you Microsoft, I know the order of operations.*/
 # if defined(_MSC_VER)
+/*Thank you Microsoft, I know the order of operations.*/
 #  pragma warning(disable:4554)
+/*Disable missing EMMS warnings.*/
+#  pragma warning(disable:4799)
 # endif
 
 /*This library's version.*/
@@ -39,6 +41,10 @@
 # define TH_VERSION_MAJOR (3)
 # define TH_VERSION_MINOR (2)
 # define TH_VERSION_SUB   (1)
+# define TH_VERSION_CHECK(_info,_maj,_min,_sub) \
+ ((_info)->version_major>(_maj)||(_info)->version_major==(_maj)&& \
+ ((_info)->version_minor>(_min)||(_info)->version_minor==(_min)&& \
+ (_info)->version_subminor>=(_sub)))
 
 /*A keyframe.*/
 #define OC_INTRA_FRAME (0)
@@ -206,10 +212,14 @@
   unsigned        invalid:1;
   /*The quality index used for this fragment's AC coefficients.*/
   unsigned        qi:6;
-  /*The mode of the macroblock this fragment belongs to.*/
-  int             mbmode:8;
-  /*The prediction-corrected DC component.*/
-  int             dc:16;
+  /*The mode of the macroblock this fragment belongs to.
+    Note that the C standard requires an explicit signed keyword for bitfield
+     types, since some compilers may treat them as unsigned without it.*/
+  signed int      mbmode:8;
+  /*The prediction-corrected DC component.
+    Note that the C standard requires an explicit signed keyword for bitfield
+     types, since some compilers may treat them as unsigned without it.*/
+  signed int      dc:16;
   /*A pointer to the portion of an image covered by this fragment in several
      images.
     The first three are reconstructed frame buffers, while the last is the
@@ -223,7 +233,6 @@
   oc_border_info *border;
   /*The motion vector used for this fragment.*/
   oc_mv           mv;
-
 }oc_fragment;
 
 
@@ -261,7 +270,7 @@
    int _src2_ystride,const ogg_int16_t *_residue);
   void (*state_frag_copy)(const oc_theora_state *_state,
    const int *_fragis,int _nfragis,int _dst_frame,int _src_frame,int _pli);
-  void (*state_frag_recon)(oc_theora_state *_state, oc_fragment *_frag,
+  void (*state_frag_recon)(oc_theora_state *_state,oc_fragment *_frag,
    int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
    ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
   void (*restore_fpu)(void);
@@ -274,77 +283,77 @@
 /*Common state information between the encoder and decoder.*/
 struct oc_theora_state{
   /*The stream information.*/
-  th_info           info;
+  th_info             info;
   /*Table for shared accelerated functions.*/
-  oc_base_opt_vtable    opt_vtable;
+  oc_base_opt_vtable  opt_vtable;
   /*CPU flags to detect the presence of extended instruction sets.*/
-  ogg_uint32_t          cpu_flags;
+  ogg_uint32_t        cpu_flags;
   /*The fragment plane descriptions.*/
-  oc_fragment_plane     fplanes[3];
+  oc_fragment_plane   fplanes[3];
   /*The total number of fragments in a single frame.*/
-  int                   nfrags;
+  int                 nfrags;
   /*The list of fragments, indexed in image order.*/
-  oc_fragment          *frags;
+  oc_fragment        *frags;
   /*The total number of super blocks in a single frame.*/
-  int                   nsbs;
+  int                 nsbs;
   /*The list of super blocks, indexed in image order.*/
-  oc_sb                *sbs;
+  oc_sb              *sbs;
   /*The number of macro blocks in the X direction.*/
-  int                   nhmbs;
+  int                 nhmbs;
   /*The number of macro blocks in the Y direction.*/
-  int                   nvmbs;
+  int                 nvmbs;
   /*The total number of macro blocks.*/
-  int                   nmbs;
+  int                 nmbs;
   /*The list of macro blocks, indexed in super block order.
     That is, the macro block corresponding to the macro block mbi in (luma
      plane) super block sbi is (sbi<<2|mbi).*/
-  oc_mb                *mbs;
+  oc_mb              *mbs;
   /*The list of coded fragments, in coded order.*/
-  int                  *coded_fragis;
+  int                *coded_fragis;
   /*The number of coded fragments in each plane.*/
-  int                   ncoded_fragis[3];
+  int                 ncoded_fragis[3];
   /*The list of uncoded fragments.
     This just past the end of the list, which is in reverse order, and
      uses the same block of allocated storage as the coded_fragis list.*/
-  int                  *uncoded_fragis;
+  int                *uncoded_fragis;
   /*The number of uncoded fragments in each plane.*/
-  int                   nuncoded_fragis[3];
+  int                 nuncoded_fragis[3];
   /*The list of coded macro blocks in the Y plane, in coded order.*/
-  int                  *coded_mbis;
+  int                *coded_mbis;
   /*The number of coded macro blocks in the Y plane.*/
-  int                   ncoded_mbis;
+  int                 ncoded_mbis;
   /*A copy of the image data used to fill the input pointers in each fragment.
     If the data pointers or strides change, these input pointers must be
      re-populated.*/
-  th_ycbcr_buffer   input;
+  th_ycbcr_buffer     input;
   /*The number of unique border patterns.*/
-  int                   nborders;
+  int                 nborders;
   /*The storage for the border info for all border fragments.
     This data is pointed to from the appropriate fragments.*/
-  oc_border_info        borders[16];
+  oc_border_info      borders[16];
   /*The index of the buffers being used for each OC_FRAME_* reference frame.*/
-  int                   ref_frame_idx[3];
+  int                 ref_frame_idx[3];
   /*The actual buffers used for the previously decoded frames.*/
-  th_ycbcr_buffer   ref_frame_bufs[3];
+  th_ycbcr_buffer     ref_frame_bufs[3];
   /*The storage for the reference frame buffers.*/
-  unsigned char        *ref_frame_data;
+  unsigned char      *ref_frame_data;
   /*The frame number of the last keyframe.*/
-  ogg_int64_t           keyframe_num;
+  ogg_int64_t         keyframe_num;
   /*The frame number of the current frame.*/
-  ogg_int64_t           curframe_num;
+  ogg_int64_t         curframe_num;
   /*The granpos of the current frame.*/
-  ogg_int64_t           granpos;
+  ogg_int64_t         granpos;
   /*The type of the current frame.*/
-  int                   frame_type;
+  int                 frame_type;
   /*The quality indices of the current frame.*/
-  int                   qis[3];
+  int                 qis[3];
   /*The number of quality indices used in the current frame.*/
-  int                   nqis;
+  int                 nqis;
   /*The dequantization tables.*/
-  oc_quant_table       *dequant_tables[2][3];
-  oc_quant_tables       dequant_table_data[2][3];
+  oc_quant_table     *dequant_tables[2][3];
+  oc_quant_tables     dequant_table_data[2][3];
   /*Loop filter strength parameters.*/
-  unsigned char         loop_filter_limits[64];
+  unsigned char       loop_filter_limits[64];
 };
 
 
@@ -410,8 +419,8 @@
 void oc_state_fill_buffer_ptrs(oc_theora_state *_state,int _buf_idx,
  th_ycbcr_buffer _img);
 int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby);
-int oc_state_get_mv_offsets(oc_theora_state *_state,int *_offset0,
- int *_offset1,int _dx,int _dy,int _ystride,int _pli);
+int oc_state_get_mv_offsets(oc_theora_state *_state,int *_offsets,
+ int _dx,int _dy,int _ystride,int _pli);
 
 int oc_state_loop_filter_init(oc_theora_state *_state,int *_bv);
 void oc_state_loop_filter(oc_theora_state *_state,int _frame);
@@ -432,7 +441,7 @@
  int _src2_ystride,const ogg_int16_t *_residue);
 void oc_state_frag_copy(const oc_theora_state *_state,const int *_fragis,
  int _nfragis,int _dst_frame,int _src_frame,int _pli);
-void oc_state_frag_recon(oc_theora_state *_state, oc_fragment *_frag,
+void oc_state_frag_recon(oc_theora_state *_state,oc_fragment *_frag,
  int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
 void oc_state_loop_filter_frag_rows(oc_theora_state *_state,int *_bv,
@@ -449,14 +458,14 @@
  int _src2_ystride,const ogg_int16_t *_residue);
 void oc_state_frag_copy_c(const oc_theora_state *_state,const int *_fragis,
  int _nfragis,int _dst_frame,int _src_frame,int _pli);
-void oc_state_frag_recon_c(oc_theora_state *_state, oc_fragment *_frag,
+void oc_state_frag_recon_c(oc_theora_state *_state,oc_fragment *_frag,
  int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,int _ncoefs,
  ogg_uint16_t _dc_iquant,const ogg_uint16_t _ac_iquant[64]);
 void oc_state_loop_filter_frag_rows_c(oc_theora_state *_state,int *_bv,
  int _refi,int _pli,int _fragy0,int _fragy_end);
 void oc_restore_fpu_c(void);
 
-/*We need a way to call a few enocder functions without introducing a link-time
+/*We need a way to call a few encoder functions without introducing a link-time
    dependency into the decoder, while still allowing the old alpha API which
    does not distinguish between encoder and decoder objects to be used.
   We do this by placing a function table at the start of the encoder object
@@ -480,15 +489,4 @@
   oc_state_granule_time_func  granule_time;
 };
 
-#if defined(_MSC_VER) && !defined(TH_REALLY_NO_ASSEMBLY)
-# error You are compiling theora without inline assembly.\
- This is probably not what you want.  Instead, please either\
-  (1) download the assembly .lib binaries or\
-  (2) compile them yourself using MinGW, and make Visual Studio\
- link against them.\
-  Please seriously consider this before defining TH_REALLY_NO_ASSEMBLY\
-  to disable this message and compile without inline assembly.\
-  Thank you!
 #endif
-
-#endif