[xiph-commits] r16403 - in branches/theora-thusnelda/lib: . dec dec/x86 dec/x86_vc enc

Sun Aug 2 21:50:28 PDT 2009

Author: tterribe
Date: 2009-08-02 21:50:27 -0700 (Sun, 02 Aug 2009)
New Revision: 16403

Modified:
   branches/theora-thusnelda/lib/dec/decint.h
   branches/theora-thusnelda/lib/dec/decode.c
   branches/theora-thusnelda/lib/dec/huffdec.c
   branches/theora-thusnelda/lib/dec/idct.c
   branches/theora-thusnelda/lib/dec/internal.c
   branches/theora-thusnelda/lib/dec/state.c
   branches/theora-thusnelda/lib/dec/x86/mmxidct.c
   branches/theora-thusnelda/lib/dec/x86/mmxstate.c
   branches/theora-thusnelda/lib/dec/x86/x86int.h
   branches/theora-thusnelda/lib/dec/x86_vc/mmxidct.c
   branches/theora-thusnelda/lib/dec/x86_vc/mmxloop.h
   branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c
   branches/theora-thusnelda/lib/dec/x86_vc/x86int.h
   branches/theora-thusnelda/lib/enc/analyze.c
   branches/theora-thusnelda/lib/enc/tokenize.c
   branches/theora-thusnelda/lib/internal.h
Log:
Merge changes from theora-gumboot branch through r16361.
This also includes some additional clean-ups and minor
 improvements/optimizations.
The net improvement is 1.2% (for high-resolution, low-bitrate) to 30.5% (for
 insane bitrates) on x86-32, and 3.7% to 33.5% for the same content on x86-64.
Typical content will fall somewhat closer to the former than the latter (e.g.,
 8-10%), but it still represents a significant speed-up.


Modified: branches/theora-thusnelda/lib/dec/decint.h
===================================================================

--- branches/theora-thusnelda/lib/dec/decint.h	2009-08-02 19:30:11 UTC (rev 16402)
+++ branches/theora-thusnelda/lib/dec/decint.h	2009-08-03 04:50:27 UTC (rev 16403)
@@ -58,20 +58,15 @@
   oc_huff_node        *huff_tables[TH_NHUFFMAN_TABLES];
   /*The index of the first token in each plane for each coefficient.*/
   ptrdiff_t            ti0[3][64];
-  /*The index of the first extra bits entry in each plane for each
-     coefficient.*/
-  ptrdiff_t            ebi0[3][64];
   /*The number of outstanding EOB runs at the start of each coefficient in each
      plane.*/
   ptrdiff_t            eob_runs[3][64];
   /*The DCT token lists.*/
   unsigned char       *dct_tokens;
   /*The extra bits associated with DCT tokens.*/
-  ogg_uint16_t        *extra_bits;
+  unsigned char       *extra_bits;
   /*The number of dct tokens unpacked so far.*/
   int                  dct_tokens_count;
-  /*The number of extra bits entries unpacked so far.*/
-  int                  extra_bits_count;
   /*The out-of-loop post-processing level.*/
   int                  pp_level;
   /*The DC scale used for out-of-loop deblocking.*/
@@ -99,14 +94,12 @@
   int                  telemetry_mv;
   int                  telemetry_qi;
   int                  telemetry_bits;
-
   int                  telemetry_frame_bytes;
   int                  telemetry_coding_bytes;
   int                  telemetry_mode_bytes;
   int                  telemetry_mv_bytes;
   int                  telemetry_qi_bytes;
   int                  telemetry_dc_bytes;
-
   unsigned char       *telemetry_frame_data;
 # endif
 };

Modified: branches/theora-thusnelda/lib/dec/decode.c
===================================================================
--- branches/theora-thusnelda/lib/dec/decode.c	2009-08-02 19:30:11 UTC (rev 16402)
+++ branches/theora-thusnelda/lib/dec/decode.c	2009-08-03 04:50:27 UTC (rev 16403)
@@ -93,6 +93,182 @@
 };
 
 
+/*The original DCT tokens are extended and reordered during the construction of
+   the Huffman tables.
+  This revised ordering reveals essential information in the token value
+   itself; specifically, whether or not there are extra bits to read and the
+   parameter to which those extra bits are applied.
+  The token is used to fetch a code word from the following table.
+  The extra bits are added into the code word at the bit position inferred from
+   the token value and then optionally negated, according to the 'flip' bit,
+   giving the final code word from which all required parameters are derived.*/
+
+/*The number of extra bits that are decoded with each of the internal DCT
+   tokens.*/
+static const unsigned char OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[16]={
+  12,4,3,3,6,0,3,3,4,4,5,5,8,8,8,8
+};
+
+/*Whether or not an internal token needs any additional extra bits.*/
+#define OC_DCT_TOKEN_NEEDS_MORE(token) \
+ (token<(sizeof(OC_INTERNAL_DCT_TOKEN_EXTRA_BITS)/ \
+  sizeof(*OC_INTERNAL_DCT_TOKEN_EXTRA_BITS)))
+
+/*This token (OC_DCT_REPEAT_RUN3_TOKEN) requires more than 8 extra bits.*/
+#define OC_DCT_TOKEN_FAT_EOB (0)
+
+/*The location of the token magnitude bits in the code word.*/
+#define OC_DCT_CW_MAG_SHIFT  (21)
+/*The location of the flip bit in the code word.*/
+#define OC_DCT_CW_FLIP_BIT   (20)
+/*The location of the run length bits in the code word.*/
+#define OC_DCT_CW_RLEN_SHIFT (12)
+
+#define OC_DCT_CW_PACK(_eobs,_rlen,_mag,_flip) \
+ ((_eobs)| \
+ (_rlen)<<OC_DCT_CW_RLEN_SHIFT| \
+ (_flip)<<OC_DCT_CW_FLIP_BIT| \
+ (_mag)-(_flip)<<OC_DCT_CW_MAG_SHIFT)
+
+/*A special codeword value that signals the end of the frame (a long EOB run of
+   zero).*/
+#define OC_DCT_CW_FINISH (0)
+
+/*The position at which to insert the extra bits in the code word.*/
+#define OC_DCT_TOKEN_EB_POS(_token) \
+ ((_token)>=6?OC_DCT_CW_MAG_SHIFT:(_token)>=2?OC_DCT_CW_RLEN_SHIFT:0)
+
+/*The code words for each internal token.
+  See the notes at OC_DCT_TOKEN_MAP for the reasons why things are slightly out
+   of order, and why there are a few gaps.*/
+static const ogg_int32_t OC_DCT_CODE_WORD[96]={
+  /*OC_DCT_REPEAT_RUN3_TOKEN (12 extra bits)*/
+  OC_DCT_CW_FINISH,
+  /*OC_DCT_REPEAT_RUN2_TOKEN (4 extra bits)*/
+  OC_DCT_CW_PACK(16, 0,  0,0),
+  /*OC_DCT_RUN_CAT1C (4 extra bits-1 already read)*/
+  OC_DCT_CW_PACK( 0,10, +1,0),
+  OC_DCT_CW_PACK( 0,10, -1,0),
+  /*OC_DCT_ZRL_TOKEN (6 extra bits)
+    Flip is set to distinguish this from OC_DCT_CW_FINISH.*/
+  OC_DCT_CW_PACK( 0, 0,  0,1),
+  /*Unused.*/
+  0,
+  /*OC_DCT_VAL_CAT5 (4 extra bits-1 already read)*/
+  OC_DCT_CW_PACK( 0, 0, 13,0),
+  OC_DCT_CW_PACK( 0, 0, 13,1),
+  /*OC_DCT_VAL_CAT6 (5 extra bits-1 already read)*/
+  OC_DCT_CW_PACK( 0, 0, 21,0),
+  OC_DCT_CW_PACK( 0, 0, 21,1),
+  /*OC_DCT_VAL_CAT7 (6 extra bits-1 already read)*/
+  OC_DCT_CW_PACK( 0, 0, 37,0),
+  OC_DCT_CW_PACK( 0, 0, 37,1),
+  /*OC_DCT_VAL_CAT8 (10 extra bits-2 already read)*/
+  OC_DCT_CW_PACK( 0, 0, 69,0),
+  OC_DCT_CW_PACK( 0, 0,325,0),
+  OC_DCT_CW_PACK( 0, 0, 69,1),
+  OC_DCT_CW_PACK( 0, 0,325,1),
+  /*Unused.*/
+  0,
+  /*OC_DCT_EOB1_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 1, 0,  0,0),
+  /*OC_DCT_EOB2_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 2, 0,  0,0),
+  /*OC_DCT_EOB3_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 3, 0,  0,0),
+  /*OC_DCT_REPEAT_RUN0_TOKEN (2 extra bits-2 already read)*/
+  OC_DCT_CW_PACK( 4, 0,  0,0),
+  OC_DCT_CW_PACK( 5, 0,  0,0),
+  OC_DCT_CW_PACK( 6, 0,  0,0),
+  OC_DCT_CW_PACK( 7, 0,  0,0),
+  /*OC_DCT_REPEAT_RUN1_TOKEN (3 extra bits-3 already read)*/
+  OC_DCT_CW_PACK( 8, 0,  0,0),
+  OC_DCT_CW_PACK( 9, 0,  0,0),
+  OC_DCT_CW_PACK(10, 0,  0,0),
+  OC_DCT_CW_PACK(11, 0,  0,0),
+  OC_DCT_CW_PACK(12, 0,  0,0),
+  OC_DCT_CW_PACK(13, 0,  0,0),
+  OC_DCT_CW_PACK(14, 0,  0,0),
+  OC_DCT_CW_PACK(15, 0,  0,0),
+  /*OC_DCT_SHORT_ZRL_TOKEN (3 extra bits-3 already read)
+    Flip is set on the first one to distinguish it from OC_DCT_CW_FINISH.*/
+  OC_DCT_CW_PACK( 0, 0,  0,1),
+  OC_DCT_CW_PACK( 0, 1,  0,0),
+  OC_DCT_CW_PACK( 0, 2,  0,0),
+  OC_DCT_CW_PACK( 0, 3,  0,0),
+  OC_DCT_CW_PACK( 0, 4,  0,0),
+  OC_DCT_CW_PACK( 0, 5,  0,0),
+  OC_DCT_CW_PACK( 0, 6,  0,0),
+  OC_DCT_CW_PACK( 0, 7,  0,0),
+  /*OC_ONE_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 0, 0, +1,0),
+  /*OC_MINUS_ONE_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 0, 0, -1,0),
+  /*OC_TWO_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 0, 0, +2,0),
+  /*OC_MINUS_TWO_TOKEN (0 extra bits)*/
+  OC_DCT_CW_PACK( 0, 0, -2,0),
+  /*OC_DCT_VAL_CAT2 (1 extra bit-1 already read)*/
+  OC_DCT_CW_PACK( 0, 0, +3,0),
+  OC_DCT_CW_PACK( 0, 0, -3,0),
+  OC_DCT_CW_PACK( 0, 0, +4,0),
+  OC_DCT_CW_PACK( 0, 0, -4,0),
+  OC_DCT_CW_PACK( 0, 0, +5,0),
+  OC_DCT_CW_PACK( 0, 0, -5,0),
+  OC_DCT_CW_PACK( 0, 0, +6,0),
+  OC_DCT_CW_PACK( 0, 0, -6,0),
+  /*OC_DCT_VAL_CAT3 (2 extra bits-2 already read)*/
+  OC_DCT_CW_PACK( 0, 0, +7,0),
+  OC_DCT_CW_PACK( 0, 0, +8,0),
+  OC_DCT_CW_PACK( 0, 0, -7,0),
+  OC_DCT_CW_PACK( 0, 0, -8,0),
+  /*OC_DCT_VAL_CAT4 (3 extra bits-3 already read)*/
+  OC_DCT_CW_PACK( 0, 0, +9,0),
+  OC_DCT_CW_PACK( 0, 0,+10,0),
+  OC_DCT_CW_PACK( 0, 0,+11,0),
+  OC_DCT_CW_PACK( 0, 0,+12,0),
+  OC_DCT_CW_PACK( 0, 0, -9,0),
+  OC_DCT_CW_PACK( 0, 0,-10,0),
+  OC_DCT_CW_PACK( 0, 0,-11,0),
+  OC_DCT_CW_PACK( 0, 0,-12,0),
+  /*OC_DCT_RUN_CAT1B (3 extra bits-3 already read)*/
+  OC_DCT_CW_PACK( 0, 6, +1,0),
+  OC_DCT_CW_PACK( 0, 7, +1,0),
+  OC_DCT_CW_PACK( 0, 8, +1,0),
+  OC_DCT_CW_PACK( 0, 9, +1,0),
+  OC_DCT_CW_PACK( 0, 6, -1,0),
+  OC_DCT_CW_PACK( 0, 7, -1,0),
+  OC_DCT_CW_PACK( 0, 8, -1,0),
+  OC_DCT_CW_PACK( 0, 9, -1,0),
+  /*OC_DCT_RUN_CAT2B (3 extra bits-3 already read)*/
+  OC_DCT_CW_PACK( 0, 2, +2,0),
+  OC_DCT_CW_PACK( 0, 3, +2,0),
+  OC_DCT_CW_PACK( 0, 2, +3,0),
+  OC_DCT_CW_PACK( 0, 3, +3,0),
+  OC_DCT_CW_PACK( 0, 2, -2,0),
+  OC_DCT_CW_PACK( 0, 3, -2,0),
+  OC_DCT_CW_PACK( 0, 2, -3,0),
+  OC_DCT_CW_PACK( 0, 3, -3,0),
+  /*OC_DCT_RUN_CAT2A (2 extra bits-2 already read)*/
+  OC_DCT_CW_PACK( 0, 1, +2,0),
+  OC_DCT_CW_PACK( 0, 1, +3,0),
+  OC_DCT_CW_PACK( 0, 1, -2,0),
+  OC_DCT_CW_PACK( 0, 1, -3,0),
+  /*OC_DCT_RUN_CAT1A (1 extra bit-1 already read)*/
+  OC_DCT_CW_PACK( 0, 1, +1,0),
+  OC_DCT_CW_PACK( 0, 1, -1,0),
+  OC_DCT_CW_PACK( 0, 2, +1,0),
+  OC_DCT_CW_PACK( 0, 2, -1,0),
+  OC_DCT_CW_PACK( 0, 3, +1,0),
+  OC_DCT_CW_PACK( 0, 3, -1,0),
+  OC_DCT_CW_PACK( 0, 4, +1,0),
+  OC_DCT_CW_PACK( 0, 4, -1,0),
+  OC_DCT_CW_PACK( 0, 5, +1,0),
+  OC_DCT_CW_PACK( 0, 5, -1,0),
+};
+
+
+
 static int oc_sb_run_unpack(oc_pack_buf *_opb){
   long bits;
   int ret;
@@ -186,10 +362,12 @@
     }
     _dec->pp_sharp_mod[qi]=-(qsum>>11);
   }
-  _dec->dct_tokens=(unsigned char *)_ogg_malloc(64*
+  /*For each fragment, allocate one byte for every DCT coefficient token, plus
+     one byte for extra-bits for each token, plus one more byte for the long
+     EOB run, just in case it's the very last token and has a run length of
+     one.*/
+  _dec->dct_tokens=(unsigned char *)_ogg_malloc((64+64+1)*
    _dec->state.nfrags*sizeof(_dec->dct_tokens[0]));
-  _dec->extra_bits=(ogg_uint16_t *)_ogg_malloc(64*
-   _dec->state.nfrags*sizeof(_dec->extra_bits[0]));
   memcpy(_dec->state.loop_filter_limits,_setup->qinfo.loop_filter_limits,
    sizeof(_dec->state.loop_filter_limits));
   _dec->pp_level=OC_PP_LEVEL_DISABLED;
@@ -216,7 +394,6 @@
   _ogg_free(_dec->pp_frame_data);
   _ogg_free(_dec->variances);
   _ogg_free(_dec->dc_qis);
-  oc_free_2d(_dec->extra_bits);
   oc_free_2d(_dec->dct_tokens);
   oc_huff_trees_clear(_dec->huff_tables);
   oc_state_clear(&_dec->state);
@@ -748,107 +925,6 @@
 
 
 
-/*Returns the decoded value of the first coefficient produced by the given
-   token.
-  It CANNOT be called for any of the EOB tokens.
-  _token:      The token value to skip.
-  _extra_bits: The extra bits attached to this token.
-  Return: The decoded coefficient value.*/
-typedef int (*oc_token_dec1val_func)(int _token,int _extra_bits);
-
-/*We want to avoid accessing arrays of constants in these functions, because
-   we take the address of them, which means that when compiling with -fPIC,
-   an expensive prolog is added to set up the PIC register in any functions
-   which access a global symbol (even if it has file scope or smaller).
-  Thus a lot of what would be tables are packed into 32-bit constants.*/
-
-/*Handles zero run tokens.*/
-static int oc_token_dec1val_zrl(void){
-  return 0;
-}
-
-/*Handles 1, -1, 2 and -2 tokens.*/
-static int oc_token_dec1val_const(int _token){
-  return OC_BYTE_TABLE32(1,-1,2,-2,_token-OC_NDCT_ZRL_TOKEN_MAX);
-}
-
-/*Handles DCT value tokens category 2.*/
-static int oc_token_dec1val_cat2(int _token,int _extra_bits){
-  int mask;
-  mask=-_extra_bits;
-  return _token-OC_DCT_VAL_CAT2+3+mask^mask;
-}
-
-/*Handles DCT value tokens categories 3 through 6.*/
-static int oc_token_dec1val_cat3_6(int _token,int _extra_bits){
-  int cati;
-  int mask;
-  int val_cat_offs;
-  int val_cat_shift;
-  cati=_token-OC_DCT_VAL_CAT3;
-  val_cat_shift=cati+1;
-  mask=-(_extra_bits>>val_cat_shift);
-  _extra_bits&=(1<<val_cat_shift)-1;
-  val_cat_offs=OC_BYTE_TABLE32(7,9,13,21,cati);
-  return val_cat_offs+_extra_bits+mask^mask;
-}
-
-/*Handles DCT value tokens categories 7 through 8.*/
-static int oc_token_dec1val_cat7_8(int _token,int _extra_bits){
-  int cati;
-  int mask;
-  int val_cat_offs;
-  int val_cat_shift;
-  cati=_token-OC_DCT_VAL_CAT7;
-  val_cat_shift=5+(cati<<2);
-  mask=-(_extra_bits>>val_cat_shift);
-  _extra_bits&=(1<<val_cat_shift)-1;
-  val_cat_offs=37+(cati<<5);
-  return val_cat_offs+_extra_bits+mask^mask;
-}
-
-/*A jump table for computing the first coefficient value the given token value
-   represents.*/
-static const oc_token_dec1val_func OC_TOKEN_DEC1VAL_TABLE[TH_NDCT_TOKENS-
- OC_NDCT_EOB_TOKEN_MAX]={
-  (oc_token_dec1val_func)oc_token_dec1val_zrl,
-  (oc_token_dec1val_func)oc_token_dec1val_zrl,
-  (oc_token_dec1val_func)oc_token_dec1val_const,
-  (oc_token_dec1val_func)oc_token_dec1val_const,
-  (oc_token_dec1val_func)oc_token_dec1val_const,
-  (oc_token_dec1val_func)oc_token_dec1val_const,
-  oc_token_dec1val_cat2,
-  oc_token_dec1val_cat2,
-  oc_token_dec1val_cat2,
-  oc_token_dec1val_cat2,
-  oc_token_dec1val_cat3_6,
-  oc_token_dec1val_cat3_6,
-  oc_token_dec1val_cat3_6,
-  oc_token_dec1val_cat3_6,
-  oc_token_dec1val_cat7_8,
-  oc_token_dec1val_cat7_8,
-  (oc_token_dec1val_func)oc_token_dec1val_zrl,
-  (oc_token_dec1val_func)oc_token_dec1val_zrl,
-  (oc_token_dec1val_func)oc_token_dec1val_zrl,
-  (oc_token_dec1val_func)oc_token_dec1val_zrl,
-  (oc_token_dec1val_func)oc_token_dec1val_zrl,
-  (oc_token_dec1val_func)oc_token_dec1val_zrl,
-  (oc_token_dec1val_func)oc_token_dec1val_zrl,
-  (oc_token_dec1val_func)oc_token_dec1val_zrl,
-  (oc_token_dec1val_func)oc_token_dec1val_zrl
-};
-
-/*Returns the decoded value of the first coefficient produced by the given
-   token.
-  It CANNOT be called for any of the EOB tokens.
-  _token:      The token value to skip.
-  _extra_bits: The extra bits attached to this token.
-  Return: The decoded coefficient value.*/
-static int oc_dct_token_dec1val(int _token,int _extra_bits){
-  return (*OC_TOKEN_DEC1VAL_TABLE[_token-OC_NDCT_EOB_TOKEN_MAX])(_token,
-   _extra_bits);
-}
-
 /*Unpacks the DC coefficient tokens.
   Unlike when unpacking the AC coefficient tokens, we actually need to decode
    the DC coefficient values now so that we can do DC prediction.
@@ -860,20 +936,17 @@
 static ptrdiff_t oc_dec_dc_coeff_unpack(oc_dec_ctx *_dec,int _huff_idxs[2],
  ptrdiff_t _ntoks_left[3][64]){
   unsigned char   *dct_tokens;
-  ogg_uint16_t    *extra_bits;
   oc_fragment     *frags;
   const ptrdiff_t *coded_fragis;
   ptrdiff_t        ncoded_fragis;
   ptrdiff_t        fragii;
   ptrdiff_t        eobs;
   ptrdiff_t        ti;
-  ptrdiff_t        ebi;
   int              pli;
   dct_tokens=_dec->dct_tokens;
-  extra_bits=_dec->extra_bits;
   frags=_dec->state.frags;
   coded_fragis=_dec->state.coded_fragis;
-  ncoded_fragis=fragii=eobs=ti=ebi=0;
+  ncoded_fragis=fragii=eobs=ti=0;
   for(pli=0;pli<3;pli++){
     ptrdiff_t run_counts[64];
     ptrdiff_t eob_count;
@@ -883,7 +956,6 @@
     memset(run_counts,0,sizeof(run_counts));
     _dec->eob_runs[pli][0]=eobs;
     _dec->ti0[pli][0]=ti;
-    _dec->ebi0[pli][0]=ebi;
     /*Continue any previous EOB run, if there was one.*/
     eobi=eobs;
     if(ncoded_fragis-fragii<eobi)eobi=ncoded_fragis-fragii;
@@ -892,32 +964,39 @@
     while(eobi-->0)frags[coded_fragis[fragii++]].dc=0;
     while(fragii<ncoded_fragis){
       int token;
-      int neb;
+      int cw;
       int eb;
       int skip;
       token=oc_huff_token_decode(&_dec->opb,
        _dec->huff_tables[_huff_idxs[pli+1>>1]]);
       dct_tokens[ti++]=(unsigned char)token;
-      neb=OC_DCT_TOKEN_EXTRA_BITS[token];
-      if(neb){
-        long val;
-        val=oc_pack_read(&_dec->opb,neb);
-        eb=(int)val;
-        extra_bits[ebi++]=(ogg_uint16_t)eb;
+      if(OC_DCT_TOKEN_NEEDS_MORE(token)){
+        eb=(int)oc_pack_read(&_dec->opb,
+         OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[token]);
+        dct_tokens[ti++]=(unsigned char)eb;
+        if(token==OC_DCT_TOKEN_FAT_EOB)dct_tokens[ti++]=(unsigned char)(eb>>8);
+        eb<<=OC_DCT_TOKEN_EB_POS(token);
       }
       else eb=0;
-      skip=oc_dct_token_skip(token,eb);
-      if(skip<0){
-        eobs=eobi=-skip;
-        if(ncoded_fragis-fragii<eobi)eobi=ncoded_fragis-fragii;
+      cw=OC_DCT_CODE_WORD[token]+eb;
+      eobs=cw&0xFFF;
+      /*Note: We want to set eobs to PTRDIFF_MAX here, but that requires C99,
+         which is not yet available everywhere; this should be equivalent.*/
+      if(cw==OC_DCT_CW_FINISH)eobs=~(size_t)0>>1;
+      if(eobs){
+        eobi=OC_MINI(eobs,ncoded_fragis-fragii);
         eob_count+=eobi;
         eobs-=eobi;
         while(eobi-->0)frags[coded_fragis[fragii++]].dc=0;
       }
       else{
-        run_counts[skip-1]++;
-        eobs=0;
-        frags[coded_fragis[fragii++]].dc=oc_dct_token_dec1val(token,eb);
+        int coeff;
+        skip=cw>>OC_DCT_CW_RLEN_SHIFT&0x3F;
+        cw^=-(cw&1<<OC_DCT_CW_FLIP_BIT);
+        coeff=cw>>OC_DCT_CW_MAG_SHIFT;
+        if(skip)coeff=0;
+        run_counts[skip]++;
+        frags[coded_fragis[fragii++]].dc=coeff;
       }
     }
     /*Add the total EOB count to the longest run length.*/
@@ -929,7 +1008,6 @@
     for(rli=64;rli-->0;)_ntoks_left[pli][rli]-=run_counts[rli];
   }
   _dec->dct_tokens_count=ti;
-  _dec->extra_bits_count=ebi;
   return eobs;
 }
 
@@ -946,30 +1024,25 @@
 static int oc_dec_ac_coeff_unpack(oc_dec_ctx *_dec,int _zzi,int _huff_idxs[2],
  ptrdiff_t _ntoks_left[3][64],ptrdiff_t _eobs){
   unsigned char *dct_tokens;
-  ogg_uint16_t  *extra_bits;
   ptrdiff_t      ti;
-  ptrdiff_t      ebi;
   int            pli;
   dct_tokens=_dec->dct_tokens;
-  extra_bits=_dec->extra_bits;
   ti=_dec->dct_tokens_count;
-  ebi=_dec->extra_bits_count;
   for(pli=0;pli<3;pli++){
     ptrdiff_t run_counts[64];
-    ptrdiff_t ntoks_left;
     ptrdiff_t eob_count;
-    ptrdiff_t ntoks;
+    size_t    ntoks_left;
+    size_t    ntoks;
     int       rli;
     _dec->eob_runs[pli][_zzi]=_eobs;
     _dec->ti0[pli][_zzi]=ti;
-    _dec->ebi0[pli][_zzi]=ebi;
     ntoks_left=_ntoks_left[pli][_zzi];
     memset(run_counts,0,sizeof(run_counts));
     eob_count=0;
     ntoks=0;
     while(ntoks+_eobs<ntoks_left){
       int token;
-      int neb;
+      int cw;
       int eb;
       int skip;
       ntoks+=_eobs;
@@ -977,20 +1050,23 @@
       token=oc_huff_token_decode(&_dec->opb,
        _dec->huff_tables[_huff_idxs[pli+1>>1]]);
       dct_tokens[ti++]=(unsigned char)token;
-      neb=OC_DCT_TOKEN_EXTRA_BITS[token];
-      if(neb){
-        long val;
-        val=oc_pack_read(&_dec->opb,neb);
-        eb=(int)val;
-        extra_bits[ebi++]=(ogg_uint16_t)eb;
+      if(OC_DCT_TOKEN_NEEDS_MORE(token)){
+        eb=(int)oc_pack_read(&_dec->opb,
+         OC_INTERNAL_DCT_TOKEN_EXTRA_BITS[token]);
+        dct_tokens[ti++]=(unsigned char)eb;
+        if(token==OC_DCT_TOKEN_FAT_EOB)dct_tokens[ti++]=(unsigned char)(eb>>8);
+        eb<<=OC_DCT_TOKEN_EB_POS(token);
       }
       else eb=0;
-      skip=oc_dct_token_skip(token,eb);
-      if(skip<0)_eobs=-skip;
-      else{
-        run_counts[skip-1]++;
+      cw=OC_DCT_CODE_WORD[token]+eb;
+      skip=cw>>OC_DCT_CW_RLEN_SHIFT&0x3F;
+      _eobs=cw&0xFFF;
+      /*Note: We want to set eobs to PTRDIFF_MAX here, but that requires C99,
+         which is not yet available everywhere; this should be equivalent.*/
+      if(cw==OC_DCT_CW_FINISH)_eobs=~(size_t)0>>1;
+      if(_eobs==0){
+        run_counts[skip]++;
         ntoks++;
-        _eobs=0;
       }
     }
     /*Add the portion of the last EOB run actually used by this coefficient.*/
@@ -1006,7 +1082,6 @@
     for(rli=64-_zzi;rli-->0;)_ntoks_left[pli][_zzi+rli]-=run_counts[rli];
   }
   _dec->dct_tokens_count=ti;
-  _dec->extra_bits_count=ebi;
   return _eobs;
 }
 
@@ -1022,7 +1097,7 @@
   The former was VP3's choice, and it meant 2*w*h extra storage for all the
    decoded coefficient values.
 
-  We take the second option, which lets us store just one or three bytes per
+  We take the second option, which lets us store just one to three bytes per
    token (generally far fewer than the number of coefficients, due to EOB
    tokens and zero runs), and which requires us to only maintain a counter for
    each of the 64 coefficients, instead of a counter for every fragment to
@@ -1073,141 +1148,6 @@
 }
 
 
-
-/*Expands a single token into the given coefficient list.
-  This fills in the zeros for zero runs as well as coefficient values, and
-   updates the index of the current coefficient.
-  It CANNOT be called for any of the EOB tokens.
-  _token:      The token value to expand.
-  _extra_bits: The extra bits associated with the token.
-  _dct_coeffs: The current list of coefficients, in zig-zag order.
-  _zzi:        The zig-zag index of the next coefficient to write to.
-  Return: The updated index of the next coefficient to write to.*/
-typedef int (*oc_token_expand_func)(int _token,int _extra_bits,
- ogg_int16_t _dct_coeffs[128],int _zzi);
-
-/*Expands a zero run token.*/
-static int oc_token_expand_zrl(int _token,int _extra_bits,
- ogg_int16_t _dct_coeffs[128],int _zzi){
-  do _dct_coeffs[_zzi++]=0;
-  while(_extra_bits-->0);
-  return _zzi;
-}
-
-/*Expands a constant, single-value token.*/
-static int oc_token_expand_const(int _token,int _extra_bits,
- ogg_int16_t _dct_coeffs[128],int _zzi){
-  _dct_coeffs[_zzi++]=(ogg_int16_t)oc_token_dec1val_const(_token);
-  return _zzi;
-}
-
-/*Expands category 2 single-valued tokens.*/
-static int oc_token_expand_cat2(int _token,int _extra_bits,
- ogg_int16_t _dct_coeffs[128],int _zzi){
-  _dct_coeffs[_zzi++]=(ogg_int16_t)oc_token_dec1val_cat2(_token,_extra_bits);
-  return _zzi;
-}
-
-/*Expands category 3 through 6 single-valued tokens.*/
-static int oc_token_expand_cat3_6(int _token,int _extra_bits,
- ogg_int16_t _dct_coeffs[128],int _zzi){
-  _dct_coeffs[_zzi++]=(ogg_int16_t)oc_token_dec1val_cat3_6(_token,_extra_bits);
-  return _zzi;
-}
-
-/*Expands category 7 through 8 single-valued tokens.*/
-static int oc_token_expand_cat7_8(int _token,int _extra_bits,
- ogg_int16_t _dct_coeffs[128],int _zzi){
-  _dct_coeffs[_zzi++]=(ogg_int16_t)oc_token_dec1val_cat7_8(_token,_extra_bits);
-  return _zzi;
-}
-
-/*Expands a category 1a zero run/value combo token.*/
-static int oc_token_expand_run_cat1a(int _token,int _extra_bits,
- ogg_int16_t _dct_coeffs[128],int _zzi){
-  int rl;
-  /*LOOP VECTORIZES.*/
-  for(rl=_token-OC_DCT_RUN_CAT1A+1;rl-->0;)_dct_coeffs[_zzi++]=0;
-  _dct_coeffs[_zzi++]=(ogg_int16_t)(1-(_extra_bits<<1));
-  return _zzi;
-}
-
-/*Expands all other zero run/value combo tokens.*/
-static int oc_token_expand_run(int _token,int _extra_bits,
- ogg_int16_t _dct_coeffs[128],int _zzi){
-  int nzeros_mask;
-  int nzeros_adjust;
-  int sign_shift;
-  int value_shift;
-  int value_mask;
-  int value_adjust;
-  int mask;
-  int rl;
-  _token-=OC_DCT_RUN_CAT1B;
-  nzeros_mask=OC_BYTE_TABLE32(3,7,0,1,_token);
-  nzeros_adjust=OC_BYTE_TABLE32(6,10,1,2,_token);
-  rl=(_extra_bits&nzeros_mask)+nzeros_adjust;
-  /*LOOP VECTORIZES.*/
-  while(rl-->0)_dct_coeffs[_zzi++]=0;
-  sign_shift=OC_BYTE_TABLE32(2,3,1,2,_token);
-  mask=-(_extra_bits>>sign_shift);
-  value_shift=_token+1>>2;
-  value_mask=_token>>1;
-  value_adjust=value_mask+1;
-  _dct_coeffs[_zzi++]=
-   (ogg_int16_t)(value_adjust+(_extra_bits>>value_shift&value_mask)+mask^mask);
-  return _zzi;
-}
-
-/*A jump table for expanding token values into coefficient values.
-  This reduces all the conditional branches, etc., needed to parse these token
-   values down to one indirect jump.*/
-static const oc_token_expand_func OC_TOKEN_EXPAND_TABLE[TH_NDCT_TOKENS-
- OC_NDCT_EOB_TOKEN_MAX]={
-  oc_token_expand_zrl,
-  oc_token_expand_zrl,
-  oc_token_expand_const,
-  oc_token_expand_const,
-  oc_token_expand_const,
-  oc_token_expand_const,
-  oc_token_expand_cat2,
-  oc_token_expand_cat2,
-  oc_token_expand_cat2,
-  oc_token_expand_cat2,
-  oc_token_expand_cat3_6,
-  oc_token_expand_cat3_6,
-  oc_token_expand_cat3_6,
-  oc_token_expand_cat3_6,
-  oc_token_expand_cat7_8,
-  oc_token_expand_cat7_8,
-  oc_token_expand_run_cat1a,
-  oc_token_expand_run_cat1a,
-  oc_token_expand_run_cat1a,
-  oc_token_expand_run_cat1a,
-  oc_token_expand_run_cat1a,
-  oc_token_expand_run,
-  oc_token_expand_run,
-  oc_token_expand_run,
-  oc_token_expand_run
-};
-
-/*Expands a single token into the given coefficient list.
-  This fills in the zeros for zero runs as well as coefficient values, and
-   updates the index of the current coefficient.
-  It CANNOT be called for any of the EOB tokens.
-  _token:      The token value to expand.
-  _extra_bits: The extra bits associated with the token.
-  _dct_coeffs: The current list of coefficients, in zig-zag order.
-  _zzi:        The zig-zag index of the next coefficient to write to.
-  Return: The updated index of the next coefficient to write to.*/
-static int oc_dct_token_expand(int _token,int _extra_bits,
- ogg_int16_t _dct_coeffs[128],int _zzi){
-  return (*OC_TOKEN_EXPAND_TABLE[_token-OC_NDCT_EOB_TOKEN_MAX])(_token,
-   _extra_bits,_dct_coeffs,_zzi);
-}
-
-
-
 static int oc_dec_postprocess_init(oc_dec_ctx *_dec){
   /*pp_level 0: disabled; free any memory used and return*/
   if(_dec->pp_level<=OC_PP_LEVEL_DISABLED){
@@ -1314,7 +1254,6 @@
 typedef struct{
   int                 bounding_values[256];
   ptrdiff_t           ti[3][64];
-  ptrdiff_t           ebi[3][64];
   ptrdiff_t           eob_runs[3][64];
   const ptrdiff_t    *coded_fragis[3];
   const ptrdiff_t    *uncoded_fragis[3];
@@ -1345,7 +1284,6 @@
   /*Initialize the token and extra bits indices for each plane and
      coefficient.*/
   memcpy(_pipe->ti,_dec->ti0,sizeof(_pipe->ti));
-  memcpy(_pipe->ebi,_dec->ebi0,sizeof(_pipe->ebi));
   /*Also copy over the initial the EOB run counts.*/
   memcpy(_pipe->eob_runs,_dec->eob_runs,sizeof(_pipe->eob_runs));
   /*Set up per-plane pointers to the coded and uncoded fragments lists.*/
@@ -1514,64 +1452,82 @@
    counts.*/
 static void oc_dec_frags_recon_mcu_plane(oc_dec_ctx *_dec,
  oc_dec_pipeline_state *_pipe,int _pli){
-  unsigned char     *dct_tokens;
-  ogg_uint16_t      *extra_bits;
-  ogg_uint16_t       dc_quant[2];
-  const oc_fragment *frags;
-  const ptrdiff_t   *coded_fragis;
-  ptrdiff_t          ncoded_fragis;
-  ptrdiff_t          fragii;
-  ptrdiff_t         *ti;
-  ptrdiff_t         *ebi;
-  ptrdiff_t         *eob_runs;
-  int                qti;
+  unsigned char       *dct_tokens;
+  const unsigned char *dct_fzig_zag;
+  ogg_uint16_t         dc_quant[2];
+  const oc_fragment   *frags;
+  const ptrdiff_t     *coded_fragis;
+  ptrdiff_t            ncoded_fragis;
+  ptrdiff_t            fragii;
+  ptrdiff_t           *ti;
+  ptrdiff_t           *eob_runs;
+  int                  qti;
   dct_tokens=_dec->dct_tokens;
-  extra_bits=_dec->extra_bits;
+  dct_fzig_zag=_dec->state.opt_data.dct_fzig_zag;
   frags=_dec->state.frags;
   coded_fragis=_pipe->coded_fragis[_pli];
   ncoded_fragis=_pipe->ncoded_fragis[_pli];
   ti=_pipe->ti[_pli];
-  ebi=_pipe->ebi[_pli];
   eob_runs=_pipe->eob_runs[_pli];
   for(qti=0;qti<2;qti++)dc_quant[qti]=_pipe->dequant[_pli][0][qti][0];
   for(fragii=0;fragii<ncoded_fragis;fragii++){
-    /*This array is made twice as large as necessary so that an invalid zero
-       run cannot cause a buffer overflow.*/
-    ogg_int16_t dct_coeffs[128];
-    ptrdiff_t   fragi;
-    int         last_zzi;
-    int         zzi;
+    /*This array is made one element larger because the zig-zag index array
+       uses the final element as a dumping ground for out-of-range indices
+       to protect us from buffer overflow.*/
+    OC_ALIGN8(ogg_int16_t dct_coeffs[65]);
+    const ogg_uint16_t *ac_quant;
+    ptrdiff_t           fragi;
+    int                 last_zzi;
+    int                 zzi;
     fragi=coded_fragis[fragii];
+    for(zzi=0;zzi<64;zzi++)dct_coeffs[zzi]=0;
+    qti=frags[fragi].mb_mode!=OC_MODE_INTRA;
+    ac_quant=_pipe->dequant[_pli][frags[fragi].qii][qti];
     /*Decode the AC coefficients.*/
     for(zzi=0;zzi<64;){
       int token;
-      int eb;
       last_zzi=zzi;
       if(eob_runs[zzi]){
         eob_runs[zzi]--;
         break;
       }
       else{
-        int ebflag;
-        token=dct_tokens[ti[zzi]++];
-        ebflag=OC_DCT_TOKEN_EXTRA_BITS[token]!=0;
-        eb=extra_bits[ebi[zzi]]&-ebflag;
-        ebi[zzi]+=ebflag;
-        if(token<OC_NDCT_EOB_TOKEN_MAX){
-          eob_runs[zzi]=-oc_dct_token_skip(token,eb);
+        ptrdiff_t eob;
+        int       cw;
+        int       rlen;
+        int       coeff;
+        int       lti;
+        lti=ti[zzi];
+        token=dct_tokens[lti++];
+        cw=OC_DCT_CODE_WORD[token];
+        if(OC_DCT_TOKEN_NEEDS_MORE(token)){
+          int eb;
+          eb=dct_tokens[lti++];
+          cw+=eb<<OC_DCT_TOKEN_EB_POS(token);
         }
-        else zzi=oc_dct_token_expand(token,eb,dct_coeffs,zzi);
+        eob=cw&0xFFF;
+        if(token==OC_DCT_TOKEN_FAT_EOB){
+          eob+=dct_tokens[lti++]<<8;
+          if(eob==0)eob=~(size_t)0>>1;
+        }
+        rlen=cw>>OC_DCT_CW_RLEN_SHIFT&0x3F;
+        cw^=-(cw&1<<OC_DCT_CW_FLIP_BIT);
+        coeff=cw>>OC_DCT_CW_MAG_SHIFT;
+        eob_runs[zzi]=eob;
+        ti[zzi]=lti;
+        zzi+=rlen;
+        dct_coeffs[dct_fzig_zag[zzi]]=(ogg_int16_t)(coeff*(int)ac_quant[zzi]);
+        zzi+=!eob;
       }
     }
     /*TODO: zzi should be exactly 64 here.
       If it's not, we should report some kind of warning.*/
     zzi=OC_MINI(zzi,64);
     dct_coeffs[0]=(ogg_int16_t)frags[fragi].dc;
-    qti=frags[fragi].mb_mode!=OC_MODE_INTRA;
     /*last_zzi is always initialized.
       If your compiler thinks otherwise, it is dumb.*/
-    oc_state_frag_recon(&_dec->state,fragi,_pli,dct_coeffs,last_zzi,zzi,
-     dc_quant[qti],_pipe->dequant[_pli][frags[fragi].qii][qti]);
+    oc_state_frag_recon(&_dec->state,fragi,_pli,
+     dct_coeffs,last_zzi,dc_quant[qti]);
   }
   _pipe->coded_fragis[_pli]+=ncoded_fragis;
   /*Right now the reconstructed MCU has only the coded blocks in it.*/
@@ -2702,26 +2658,31 @@
         mult=(_dec->telemetry_bits>=0xFF?1:_dec->telemetry_bits);
         fullw=250*h*fpsd*mult/fpsn;
         padw=w-24;
-        /* header and coded block bits */
-        if(_dec->telemetry_frame_bytes<0 ||
-           _dec->telemetry_frame_bytes==OC_LOTS_OF_BITS)
+        /*Header and coded block bits.*/
+        if(_dec->telemetry_frame_bytes<0||
+         _dec->telemetry_frame_bytes==OC_LOTS_OF_BITS){
           _dec->telemetry_frame_bytes=0;
-        if(_dec->telemetry_coding_bytes<0 ||
-           _dec->telemetry_coding_bytes>_dec->telemetry_frame_bytes)
+        }
+        if(_dec->telemetry_coding_bytes<0||
+         _dec->telemetry_coding_bytes>_dec->telemetry_frame_bytes){
           _dec->telemetry_coding_bytes=0;
-        if(_dec->telemetry_mode_bytes<0 ||
-           _dec->telemetry_mode_bytes>_dec->telemetry_frame_bytes)
+        }
+        if(_dec->telemetry_mode_bytes<0||
+         _dec->telemetry_mode_bytes>_dec->telemetry_frame_bytes){
           _dec->telemetry_mode_bytes=0;
-        if(_dec->telemetry_mv_bytes<0 ||
-           _dec->telemetry_mv_bytes>_dec->telemetry_frame_bytes)
+        }
+        if(_dec->telemetry_mv_bytes<0||
+         _dec->telemetry_mv_bytes>_dec->telemetry_frame_bytes){
           _dec->telemetry_mv_bytes=0;
-        if(_dec->telemetry_qi_bytes<0 ||
-           _dec->telemetry_qi_bytes>_dec->telemetry_frame_bytes)
+        }
+        if(_dec->telemetry_qi_bytes<0||
+         _dec->telemetry_qi_bytes>_dec->telemetry_frame_bytes){
           _dec->telemetry_qi_bytes=0;
-        if(_dec->telemetry_dc_bytes<0 ||
-           _dec->telemetry_dc_bytes>_dec->telemetry_frame_bytes)
+        }
+        if(_dec->telemetry_dc_bytes<0||
+         _dec->telemetry_dc_bytes>_dec->telemetry_frame_bytes){
           _dec->telemetry_dc_bytes=0;
-
+        }
         widths[0]=padw*(_dec->telemetry_frame_bytes-_dec->telemetry_coding_bytes)/fullw;
         widths[1]=padw*(_dec->telemetry_coding_bytes-_dec->telemetry_mode_bytes)/fullw;
         widths[2]=padw*(_dec->telemetry_mode_bytes-_dec->telemetry_mv_bytes)/fullw;

Modified: branches/theora-thusnelda/lib/dec/huffdec.c
===================================================================
--- branches/theora-thusnelda/lib/dec/huffdec.c	2009-08-02 19:30:11 UTC (rev 16402)
+++ branches/theora-thusnelda/lib/dec/huffdec.c	2009-08-03 04:50:27 UTC (rev 16403)
@@ -25,6 +25,77 @@
 #define _ogg_offsetof(_type,_field)\
  ((size_t)((char *)&((_type *)0)->_field-(char *)0))
 
+/*The number of internal tokens associated with each of the spec tokens.*/
+static const unsigned char OC_DCT_TOKEN_MAP_ENTRIES[TH_NDCT_TOKENS]={
+  1,1,1,4,8,1,1,8,1,1,1,1,1,2,2,2,2,4,8,2,2,2,4,2,2,2,2,2,8,2,4,8
+};
+
+/*The map from external spec-defined tokens to internal tokens.
+  This is constructed so that any extra bits read with the original token value
+   can be masked off the least significant bits of its internal token index.
+  In addition, all of the tokens which require additional extra bits are placed
+   at the start of the list, and grouped by type.
+  These requirements leave things slightly out of order, and leave a few gaps.*/
+static const unsigned char OC_DCT_TOKEN_MAP[TH_NDCT_TOKENS][8]={
+  /*OC_DCT_EOB1_TOKEN (0 extra bits)*/
+  {17},
+  /*OC_DCT_EOB2_TOKEN (0 extra bits)*/
+  {18},
+  /*OC_DCT_EOB3_TOKEN (0 extra bits)*/
+  {19},
+  /*OC_DCT_REPEAT_RUN0_TOKEN (2 extra bits)*/
+  {20,21,22,23},
+  /*OC_DCT_REPEAT_RUN1_TOKEN (3 extra bits)*/
+  {24,25,26,27,28,29,30,31},
+  /*OC_DCT_REPEAT_RUN2_TOKEN (4 extra bits)*/
+  {1},
+  /*OC_DCT_REPEAT_RUN3_TOKEN (12 extra bits)*/
+  {0},
+  /*OC_DCT_SHORT_ZRL_TOKEN (3 extra bits)*/
+  {32,33,34,35,36,37,38,39},
+  /*OC_DCT_ZRL_TOKEN (6 extra bits)*/
+  {4},
+  /*OC_ONE_TOKEN (0 extra bits)*/
+  {40},
+  /*OC_MINUS_ONE_TOKEN (0 extra bits)*/
+  {41},
+  /*OC_TWO_TOKEN (0 extra bits)*/
+  {42},
+  /*OC_MINUS_TWO_TOKEN (0 extra bits)*/
+  {43},
+  /*OC_DCT_VAL_CAT2 (1 extra bit)*/
+  {44,45},
+  {46,47},
+  {48,49},
+  {50,51},
+  /*OC_DCT_VAL_CAT3 (2 extra bits)*/
+  {52,53,54,55},
+  /*OC_DCT_VAL_CAT4 (3 extra bits)*/
+  {56,57,58,59,60,61,62,63},
+  /*OC_DCT_VAL_CAT5 (4 extra bits)*/
+  {6,7},
+  /*OC_DCT_VAL_CAT6 (5 extra bits)*/
+  {8,9},
+  /*OC_DCT_VAL_CAT7 (6 extra bits)*/
+  {10,11},
+  /*OC_DCT_VAL_CAT8 (10 extra bits)*/
+  {12,13,14,15},
+  /*OC_DCT_RUN_CAT1A (1 extra bit)*/
+  {84,85},
+  {86,87},
+  {88,89},
+  {90,91},
+  {92,93},
+  /*OC_DCT_RUN_CAT1B (3 extra bits)*/
+  {64,65,66,67,68,69,70,71},
+  /*OC_DCT_RUN_CAT1C (4 extra bits)*/
+  {2,3},
+  /*OC_DCT_RUN_CAT2A (2 extra bits)*/
+  {80,81,82,83},
+  /*OC_DCT_RUN_CAT2B (3 extra bits)*/
+  {72,73,74,75,76,77,78,79}
+};
+
 /*These three functions are really part of the bitpack.c module, but
    they are only used here.
   Declaring local static versions so they can be inlined saves considerable
@@ -151,12 +222,13 @@
   int           nused;
   if(_nbinodes<1)return TH_EBADHEADER;
   binode=_binodes;
-  nused=1;
+  nused=0;
   bits=oc_pack_read1(_opb);
   if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
   /*Read an internal node:*/
   if(!bits){
     int ret;
+    nused++;
     binode->nbits=1;
     binode->depth=1;
     binode->nodes[0]=_binodes+nused;
@@ -171,11 +243,32 @@
   }
   /*Read a leaf node:*/
   else{
+    int ntokens;
+    int i;
     bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS);
     if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
-    binode->nbits=0;
-    binode->depth=1;
-    binode->token=(unsigned char)bits;
+    /*Find out how many internal tokens we translate this external token into.*/
+    ntokens=OC_DCT_TOKEN_MAP_ENTRIES[bits];
+    if(_nbinodes<2*ntokens-1)return TH_EBADHEADER;
+    /*Fill in a complete binary tree pointing to the internal tokens.*/
+    for(i=1;i<ntokens;i<<=1){
+      int j;
+      binode=_binodes+nused;
+      nused+=i;
+      for(j=0;j<i;j++){
+        binode[j].nbits=1;
+        binode[j].depth=1;
+        binode[j].nodes[0]=_binodes+nused+2*j;
+        binode[j].nodes[1]=_binodes+nused+2*j+1;
+      }
+    }
+    /*And now the leaf nodes with those tokens.*/
+    for(i=0;i<ntokens;i++){
+      binode=_binodes+nused++;
+      binode->nbits=0;
+      binode->depth=1;
+      binode->token=OC_DCT_TOKEN_MAP[bits][i];
+    }
   }
   return nused;
 }
@@ -298,10 +391,10 @@
  oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]){
   int i;
   for(i=0;i<TH_NHUFFMAN_TABLES;i++){
-    oc_huff_node nodes[63];
+    oc_huff_node nodes[511];
     int          ret;
     /*Unpack the full tree into a temporary buffer.*/
-    ret=oc_huff_tree_unpack(_opb,nodes,63);
+    ret=oc_huff_tree_unpack(_opb,nodes,sizeof(nodes)/sizeof(*nodes));
     if(ret<0)return ret;
     _nodes[i]=oc_huff_tree_collapse(nodes);
   }

Modified: branches/theora-thusnelda/lib/dec/idct.c
===================================================================
--- branches/theora-thusnelda/lib/dec/idct.c	2009-08-02 19:30:11 UTC (rev 16402)
+++ branches/theora-thusnelda/lib/dec/idct.c	2009-08-03 04:50:27 UTC (rev 16403)
@@ -296,14 +296,14 @@
 }
 
 void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
- int _last_zzi,int _ncoefs){
-  (*_state->opt_vtable.idct8x8)(_y,_last_zzi,_ncoefs);
+ int _last_zzi){
+  (*_state->opt_vtable.idct8x8)(_y,_last_zzi);
 }
 
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
    version of the transform.*/
-void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi,int _ncoefs){
+void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi){
   /*_last_zzi is subtly different from an actual count of the number of
      coefficients we decoded for this block.
     It contains the value of zzi BEFORE the final token in the block was

Modified: branches/theora-thusnelda/lib/dec/internal.c
===================================================================
--- branches/theora-thusnelda/lib/dec/internal.c	2009-08-02 19:30:11 UTC (rev 16402)
+++ branches/theora-thusnelda/lib/dec/internal.c	2009-08-03 04:50:27 UTC (rev 16403)
@@ -97,112 +97,6 @@
 
 
 
-/*Determines the number of blocks or coefficients to be skipped for a given
-   token value.
-  _token:      The token value to skip.
-  _extra_bits: The extra bits attached to this token.
-  Return: A positive value indicates that number of coefficients are to be
-           skipped in the current block.
-          Otherwise, the negative of the return value indicates that number of
-           blocks are to be ended.*/
-typedef ptrdiff_t (*oc_token_skip_func)(int _token,int _extra_bits);
-
-/*Handles the simple end of block tokens.*/
-static ptrdiff_t oc_token_skip_eob(int _token,int _extra_bits){
-  int nblocks_adjust;
-  nblocks_adjust=OC_UNIBBLE_TABLE32(0,1,2,3,7,15,0,0,_token)+1;
-  return -_extra_bits-nblocks_adjust;
-}
-
-/*The last EOB token has a special case, where an EOB run of size zero ends all
-   the remaining blocks in the frame.*/
-static ptrdiff_t oc_token_skip_eob6(int _token,int _extra_bits){
-  /*Note: We want to return -PTRDIFF_MAX, but that requires C99, which is not
-     yet available everywhere; this should be equivalent.*/
-  if(!_extra_bits)return -(~(size_t)0>>1);
-  return -_extra_bits;
-}
-
-/*Handles the pure zero run tokens.*/
-static ptrdiff_t oc_token_skip_zrl(int _token,int _extra_bits){
-  return _extra_bits+1;
-}
-
-/*Handles a normal coefficient value token.*/
-static ptrdiff_t oc_token_skip_val(void){
-  return 1;
-}
-
-/*Handles a category 1A zero run/coefficient value combo token.*/
-static ptrdiff_t oc_token_skip_run_cat1a(int _token){
-  return _token-OC_DCT_RUN_CAT1A+2;
-}
-
-/*Handles category 1b, 1c, 2a, and 2b zero run/coefficient value combo tokens.*/
-static ptrdiff_t oc_token_skip_run(int _token,int _extra_bits){
-  int run_cati;
-  int ncoeffs_mask;
-  int ncoeffs_adjust;
-  run_cati=_token-OC_DCT_RUN_CAT1B;
-  ncoeffs_mask=OC_BYTE_TABLE32(3,7,0,1,run_cati);
-  ncoeffs_adjust=OC_BYTE_TABLE32(7,11,2,3,run_cati);
-  return (_extra_bits&ncoeffs_mask)+ncoeffs_adjust;
-}
-
-/*A jump table for computing the number of coefficients or blocks to skip for
-   a given token value.
-  This reduces all the conditional branches, etc., needed to parse these token
-   values down to one indirect jump.*/
-static const oc_token_skip_func OC_TOKEN_SKIP_TABLE[TH_NDCT_TOKENS]={
-  oc_token_skip_eob,
-  oc_token_skip_eob,
-  oc_token_skip_eob,
-  oc_token_skip_eob,
-  oc_token_skip_eob,
-  oc_token_skip_eob,
-  oc_token_skip_eob6,
-  oc_token_skip_zrl,
-  oc_token_skip_zrl,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_val,
-  (oc_token_skip_func)oc_token_skip_run_cat1a,
-  (oc_token_skip_func)oc_token_skip_run_cat1a,
-  (oc_token_skip_func)oc_token_skip_run_cat1a,
-  (oc_token_skip_func)oc_token_skip_run_cat1a,
-  (oc_token_skip_func)oc_token_skip_run_cat1a,
-  oc_token_skip_run,
-  oc_token_skip_run,
-  oc_token_skip_run,
-  oc_token_skip_run
-};
-
-/*Determines the number of blocks or coefficients to be skipped for a given
-   token value.
-  _token:      The token value to skip.
-  _extra_bits: The extra bits attached to this token.
-  Return: A positive value indicates that number of coefficients are to be
-           skipped in the current block.
-          Otherwise, the negative of the return value indicates that number of
-           blocks are to be ended.
-          0 will never be returned, so that at least one coefficient in one
-           block will always be decoded for every token.*/
-ptrdiff_t oc_dct_token_skip(int _token,int _extra_bits){
-  return (*OC_TOKEN_SKIP_TABLE[_token])(_token,_extra_bits);
-}
-
-
 /*The function used to fill in the chroma plane motion vectors for a macro
    block when 4 different motion vectors are specified in the luma plane.
   This version is for use with chroma decimated in the X and Y directions

Modified: branches/theora-thusnelda/lib/dec/state.c
===================================================================
--- branches/theora-thusnelda/lib/dec/state.c	2009-08-02 19:30:11 UTC (rev 16402)
+++ branches/theora-thusnelda/lib/dec/state.c	2009-08-03 04:50:27 UTC (rev 16403)
@@ -855,60 +855,39 @@
 }
 
 void oc_state_frag_recon(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]){
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
   _state->opt_vtable.state_frag_recon(_state,_fragi,_pli,_dct_coeffs,
-   _last_zzi,_ncoefs,_dc_quant,_ac_quant);
+   _last_zzi,_dc_quant);
 }
 
 void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant, const ogg_uint16_t _ac_quant[64]){
-  ogg_int16_t    res_buf[64];
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
   unsigned char *dst;
   ptrdiff_t      frag_buf_off;
   int            ystride;
   int            mb_mode;
-  /*Dequantize and apply the inverse transform.*/
+  /*Apply the inverse transform.*/
   /*Special case only having a DC component.*/
   if(_last_zzi<2){
     ogg_int16_t p;
-    int ci;
+    int         ci;
     /*We round this dequant product (and not any of the others) because there's
        no iDCT rounding.*/
     p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
     /*LOOP VECTORIZES.*/
-    for(ci=0;ci<64;ci++)res_buf[ci]=p;
+    for(ci=0;ci<64;ci++)_dct_coeffs[ci]=p;
   }
   else{
-    const unsigned char *dct_fzig_zag;
-    int                  zzi;
-    /*First, dequantize the coefficients.*/
-    dct_fzig_zag=_state->opt_data.dct_fzig_zag;
-    res_buf[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
-    for(zzi=1;zzi<_ncoefs;zzi++){
-      res_buf[dct_fzig_zag[zzi]]=
-       (ogg_int16_t)(_dct_coeffs[zzi]*(int)_ac_quant[zzi]);
-    }
-    /*Then, fill in the remainder of the coefficients with 0's, and perform
-       the iDCT.*/
-    if(_last_zzi<3){
-      for(;zzi<3;zzi++)res_buf[dct_fzig_zag[zzi]]=0;
-    }
-    else if(_last_zzi<10){
-      for(;zzi<10;zzi++)res_buf[dct_fzig_zag[zzi]]=0;
-    }
-    else{
-      for(;zzi<64;zzi++)res_buf[dct_fzig_zag[zzi]]=0;
-    }
-    oc_idct8x8(_state,res_buf,_last_zzi,_ncoefs);
+    /*First, dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8(_state,_dct_coeffs,_last_zzi);
   }
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
   mb_mode=_state->frags[_fragi].mb_mode;
   ystride=_state->ref_ystride[_pli];
   dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
-  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra(_state,dst,ystride,res_buf);
+  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra(_state,dst,ystride,_dct_coeffs);
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
@@ -918,9 +897,9 @@
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
       oc_frag_recon_inter2(_state,
-       dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,res_buf);
+       dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,_dct_coeffs);
     }
-    else oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,res_buf);
+    else oc_frag_recon_inter(_state,dst,ref+mvoffsets[0],ystride,_dct_coeffs);
   }
 }
 

Modified: branches/theora-thusnelda/lib/dec/x86/mmxidct.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/mmxidct.c	2009-08-02 19:30:11 UTC (rev 16402)
+++ branches/theora-thusnelda/lib/dec/x86/mmxidct.c	2009-08-03 04:50:27 UTC (rev 16403)
@@ -531,7 +531,7 @@
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
    version of the transform.*/
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi,int _ncoefs){
+void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
   /*_last_zzi is subtly different from an actual count of the number of
      coefficients we decoded for this block.
     It contains the value of zzi BEFORE the final token in the block was

Modified: branches/theora-thusnelda/lib/dec/x86/mmxstate.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/mmxstate.c	2009-08-02 19:30:11 UTC (rev 16402)
+++ branches/theora-thusnelda/lib/dec/x86/mmxstate.c	2009-08-03 04:50:27 UTC (rev 16403)
@@ -24,28 +24,13 @@
 
 #if defined(OC_X86_ASM)
 
-/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
-   each quadrant of the destination.*/
-static const unsigned char OC_FZIG_ZAG_MMX[64]={
-   0, 8, 1, 2, 9,16,24,17,
-  10, 3,32,11,18,25, 4,12,
-   5,26,19,40,33,34,41,48,
-  27, 6,13,20,28,21,14, 7,
-  56,49,42,35,43,50,57,36,
-  15,22,29,30,23,44,37,58,
-  51,59,38,45,52,31,60,53,
-  46,39,47,54,61,62,55,63
-};
-
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]){
-  OC_ALIGN8(ogg_int16_t    res_buf[64]);
-  unsigned char          *dst;
-  ptrdiff_t               frag_buf_off;
-  int                     ystride;
-  int                     mb_mode;
-  /*Dequantize and apply the inverse transform.*/
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            mb_mode;
+  /*Apply the inverse transform.*/
   /*Special case only having a DC component.*/
   if(_last_zzi<2){
     /*Note that this value must be unsigned, to keep the __asm__ block from
@@ -79,49 +64,21 @@
       "movq %%mm0,112(%[y])\n\t"
       "movq %%mm0,120(%[y])\n\t"
       :
-      :[y]"r"(res_buf),[p]"r"((unsigned)p)
+      :[y]"r"(_dct_coeffs),[p]"r"((unsigned)p)
       :"memory"
     );
   }
   else{
-    int zzi;
-    /*First zero the buffer.*/
-    /*On K7, etc., this could be replaced with movntq and sfence.*/
-    __asm__ __volatile__(
-      "pxor %%mm0,%%mm0\n\t"
-      "movq %%mm0,(%[y])\n\t"
-      "movq %%mm0,8(%[y])\n\t"
-      "movq %%mm0,16(%[y])\n\t"
-      "movq %%mm0,24(%[y])\n\t"
-      "movq %%mm0,32(%[y])\n\t"
-      "movq %%mm0,40(%[y])\n\t"
-      "movq %%mm0,48(%[y])\n\t"
-      "movq %%mm0,56(%[y])\n\t"
-      "movq %%mm0,64(%[y])\n\t"
-      "movq %%mm0,72(%[y])\n\t"
-      "movq %%mm0,80(%[y])\n\t"
-      "movq %%mm0,88(%[y])\n\t"
-      "movq %%mm0,96(%[y])\n\t"
-      "movq %%mm0,104(%[y])\n\t"
-      "movq %%mm0,112(%[y])\n\t"
-      "movq %%mm0,120(%[y])\n\t"
-      :
-      :[y]"r"(res_buf)
-      :"memory"
-    );
-    /*Dequantize the coefficients.*/
-    res_buf[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
-    for(zzi=1;zzi<_ncoefs;zzi++){
-      res_buf[OC_FZIG_ZAG_MMX[zzi]]=(ogg_int16_t)(_dct_coeffs[zzi]*(int)_ac_quant[zzi]);
-    }
-    oc_idct8x8_mmx(res_buf,_last_zzi,_ncoefs);
+    /*Dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
   }
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
   mb_mode=_state->frags[_fragi].mb_mode;
   ystride=_state->ref_ystride[_pli];
   dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
-  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,res_buf);
+  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
@@ -131,9 +88,9 @@
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
       oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
-       res_buf);
+       _dct_coeffs);
     }
-    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,res_buf);
+    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
   }
 }
 

Modified: branches/theora-thusnelda/lib/dec/x86/x86int.h
===================================================================
--- branches/theora-thusnelda/lib/dec/x86/x86int.h	2009-08-02 19:30:11 UTC (rev 16402)
+++ branches/theora-thusnelda/lib/dec/x86/x86int.h	2009-08-03 04:50:27 UTC (rev 16403)
@@ -29,10 +29,9 @@
  const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
  const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi,int _ncoefs);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
 void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
  const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
  int _dst_frame,int _src_frame,int _pli);

Modified: branches/theora-thusnelda/lib/dec/x86_vc/mmxidct.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86_vc/mmxidct.c	2009-08-02 19:30:11 UTC (rev 16402)
+++ branches/theora-thusnelda/lib/dec/x86_vc/mmxidct.c	2009-08-03 04:50:27 UTC (rev 16403)
@@ -529,7 +529,7 @@
 /*Performs an inverse 8x8 Type-II DCT transform.
   The input is assumed to be scaled by a factor of 4 relative to orthonormal
    version of the transform.*/
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi,int _ncoefs){
+void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi){
   /*_last_zzi is subtly different from an actual count of the number of
      coefficients we decoded for this block.
     It contains the value of zzi BEFORE the final token in the block was

Modified: branches/theora-thusnelda/lib/dec/x86_vc/mmxloop.h
===================================================================
--- branches/theora-thusnelda/lib/dec/x86_vc/mmxloop.h	2009-08-02 19:30:11 UTC (rev 16402)
+++ branches/theora-thusnelda/lib/dec/x86_vc/mmxloop.h	2009-08-03 04:50:27 UTC (rev 16403)
@@ -92,9 +92,8 @@
 
 #define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \
   do{ \
-    /*Used local variable pix__ in order to fix compilation errors like:
-     "error C2425: 'SHL' : non-constant expression in 'second operand'"
-     */ \
+    /*Used local variable pix__ in order to fix compilation errors like: \
+     "error C2425: 'SHL' : non-constant expression in 'second operand'".*/ \
     unsigned char *pix__; \
     unsigned char *ll__; \
     ll__=(_ll); \
@@ -123,9 +122,8 @@
 
 #define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \
   do{ \
-    /*Used local variable ll__ in order to fix compilation errors like: 
-     "error C2443: operand size conflict"
-     */ \
+    /*Used local variable ll__ in order to fix compilation errors like: \
+     "error C2443: operand size conflict".*/ \
     unsigned char *ll__; \
     unsigned char *pix__; \
     ll__=(_ll); \

Modified: branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c
===================================================================
--- branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c	2009-08-02 19:30:11 UTC (rev 16402)
+++ branches/theora-thusnelda/lib/dec/x86_vc/mmxstate.c	2009-08-03 04:50:27 UTC (rev 16403)
@@ -24,28 +24,13 @@
 
 #if defined(OC_X86_ASM)
 
-/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
-   each quadrant of the destination.*/
-static const unsigned char OC_FZIG_ZAG_MMX[64]={
-   0, 8, 1, 2, 9,16,24,17,
-  10, 3,32,11,18,25, 4,12,
-   5,26,19,40,33,34,41,48,
-  27, 6,13,20,28,21,14, 7,
-  56,49,42,35,43,50,57,36,
-  15,22,29,30,23,44,37,58,
-  51,59,38,45,52,31,60,53,
-  46,39,47,54,61,62,55,63
-};
-
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]){
-  OC_ALIGN8(ogg_int16_t   res_buf[64]);
-  unsigned char          *dst;
-  ptrdiff_t               frag_buf_off;
-  int                     ystride;
-  int                     mb_mode;
-  /*Dequantize and apply the inverse transform.*/
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant){
+  unsigned char *dst;
+  ptrdiff_t      frag_buf_off;
+  int            ystride;
+  int            mb_mode;
+  /*Apply the inverse transform.*/
   /*Special case only having a DC component.*/
   if(_last_zzi<2){
     /*Note that this value must be unsigned, to keep the __asm__ block from
@@ -54,11 +39,11 @@
     /*We round this dequant product (and not any of the others) because there's
        no iDCT rounding.*/
     p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
-    /*Fill res_buf with p.*/
+    /*Fill _dct_coeffs with p.*/
     __asm{
 #define Y eax
 #define P ecx
-      mov Y,res_buf
+      mov Y,_dct_coeffs
       movd P,p
       /*mm0=0000 0000 0000 AAAA*/
       movd mm0,P
@@ -87,44 +72,16 @@
     }
   }
   else{
-    int zzi;
-    /*First zero the buffer.*/
-    /*On K7, etc., this could be replaced with movntq and sfence.*/
-    __asm{
-#define Y eax
-      mov Y,res_buf
-      pxor mm0,mm0
-      movq [Y],mm0
-      movq [8+Y],mm0
-      movq [16+Y],mm0
-      movq [24+Y],mm0
-      movq [32+Y],mm0
-      movq [40+Y],mm0
-      movq [48+Y],mm0
-      movq [56+Y],mm0
-      movq [64+Y],mm0
-      movq [72+Y],mm0
-      movq [80+Y],mm0
-      movq [88+Y],mm0
-      movq [96+Y],mm0
-      movq [104+Y],mm0
-      movq [112+Y],mm0
-      movq [120+Y],mm0
-#undef Y
-    }
-    /*Dequantize the coefficients.*/
-    res_buf[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
-    for(zzi=1;zzi<_ncoefs;zzi++){
-      res_buf[OC_FZIG_ZAG_MMX[zzi]]=(ogg_int16_t)(_dct_coeffs[zzi]*(int)_ac_quant[zzi]);
-    }
-    oc_idct8x8_mmx(res_buf,_last_zzi,_ncoefs);
+    /*Dequantize the DC coefficient.*/
+    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
+    oc_idct8x8_mmx(_dct_coeffs,_last_zzi);
   }
   /*Fill in the target buffer.*/
   frag_buf_off=_state->frag_buf_offs[_fragi];
   mb_mode=_state->frags[_fragi].mb_mode;
   ystride=_state->ref_ystride[_pli];
   dst=_state->ref_frame_data[_state->ref_frame_idx[OC_FRAME_SELF]]+frag_buf_off;
-  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,res_buf);
+  if(mb_mode==OC_MODE_INTRA)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs);
   else{
     const unsigned char *ref;
     int                  mvoffsets[2];
@@ -134,9 +91,9 @@
     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
      _state->frag_mvs[_fragi][0],_state->frag_mvs[_fragi][1])>1){
       oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
-       res_buf);
+       _dct_coeffs);
     }
-    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,res_buf);
+    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs);
   }
 }
 

Modified: branches/theora-thusnelda/lib/dec/x86_vc/x86int.h
===================================================================
--- branches/theora-thusnelda/lib/dec/x86_vc/x86int.h	2009-08-02 19:30:11 UTC (rev 16402)
+++ branches/theora-thusnelda/lib/dec/x86_vc/x86int.h	2009-08-03 04:50:27 UTC (rev 16403)
@@ -29,10 +29,9 @@
  const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
 void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
  const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
-void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi,int _ncoefs);
+void oc_idct8x8_mmx(ogg_int16_t _y[64],int _last_zzi);
 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
 void oc_state_frag_copy_list_mmx(const oc_theora_state *_state,
  const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
  int _dst_frame,int _src_frame,int _pli);

Modified: branches/theora-thusnelda/lib/enc/analyze.c
===================================================================
--- branches/theora-thusnelda/lib/enc/analyze.c	2009-08-02 19:30:11 UTC (rev 16402)
+++ branches/theora-thusnelda/lib/enc/analyze.c	2009-08-03 04:50:27 UTC (rev 16403)
@@ -781,7 +781,7 @@
   }
   else{
     data[0]=dc*dc_dequant;
-    oc_idct8x8(&_enc->state,data,nonzero+1,nonzero+1);
+    oc_idct8x8(&_enc->state,data,nonzero+1);
   }
   if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,data);
   else{
@@ -1332,7 +1332,8 @@
 /*The estimated number of bits used by a coded chroma block to specify the AC
    quantizer.
   TODO: Currently this is just 0.5*log2(3) (estimating about 50% compression);
-   we should measure it.*/
+   measurements suggest this is in the right ballpark, but it varies somewhat
+   with lambda.*/
 #define OC_CHROMA_QII_RATE ((0xCAE00D1DU>>31-OC_BIT_SCALE)+1>>1)
 
 static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc,
@@ -1344,7 +1345,6 @@
   unsigned     rate;
   int          overhead;
   unsigned     satd;
-  unsigned     best_cost;
   unsigned     best_ssd;
   unsigned     best_rate;
   int          best_overhead;
@@ -1373,6 +1373,7 @@
   for(bi=0;bi<4;bi++){
     oc_fr_state  ft[2];
     oc_qii_state qt[3];
+    unsigned     best_cost;
     satd=_frag_satd[bi];
     *(ft+0)=*&fr;
     oc_fr_code_block(ft+0);
@@ -1402,7 +1403,6 @@
       cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
       cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_overhead,lambda);
       if(cur_cost<=best_cost){
-        best_cost=cur_cost;
         best_ssd=cur_ssd;
         best_rate=0;
         best_overhead=cur_overhead;
@@ -1429,7 +1429,6 @@
   unsigned ssd;
   unsigned rate;
   unsigned satd;
-  unsigned best_cost;
   unsigned best_ssd;
   unsigned best_rate;
   int      best_qii;
@@ -1453,6 +1452,7 @@
   bi=4;
   for(pli=1;pli<3;pli++){
     for(;bi<nblocks;bi++){
+      unsigned best_cost;
       satd=_frag_satd[bi];
       best_rate=oc_dct_cost2(&best_ssd,_enc->state.qis[0],pli,_qti,satd)
        +OC_CHROMA_QII_RATE;
@@ -1473,7 +1473,6 @@
         cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
         cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate,lambda);
         if(cur_cost<=best_cost){
-          best_cost=cur_cost;
           best_ssd=cur_ssd;
           best_rate=0;
           best_qii+=4;
@@ -2331,6 +2330,120 @@
   }
 }
 
+
+
+/*The following token skipping code used to also be used in the decoder (and
+   even at one point other places in the encoder).
+  However, it was obsoleted by other optimizations, and is now only used here.
+  It has been moved here to avoid generating the code when it's not needed.*/
+
+/*Determines the number of blocks or coefficients to be skipped for a given
+   token value.
+  _token:      The token value to skip.
+  _extra_bits: The extra bits attached to this token.
+  Return: A positive value indicates that number of coefficients are to be
+           skipped in the current block.
+          Otherwise, the negative of the return value indicates that number of
+           blocks are to be ended.*/
+typedef ptrdiff_t (*oc_token_skip_func)(int _token,int _extra_bits);
+
+/*Handles the simple end of block tokens.*/
+static ptrdiff_t oc_token_skip_eob(int _token,int _extra_bits){
+  int nblocks_adjust;
+  nblocks_adjust=OC_UNIBBLE_TABLE32(0,1,2,3,7,15,0,0,_token)+1;
+  return -_extra_bits-nblocks_adjust;
+}
+
+/*The last EOB token has a special case, where an EOB run of size zero ends all
+   the remaining blocks in the frame.*/
+static ptrdiff_t oc_token_skip_eob6(int _token,int _extra_bits){
+  /*Note: We want to return -PTRDIFF_MAX, but that requires C99, which is not
+     yet available everywhere; this should be equivalent.*/
+  if(!_extra_bits)return -(~(size_t)0>>1);
+  return -_extra_bits;
+}
+
+/*Handles the pure zero run tokens.*/
+static ptrdiff_t oc_token_skip_zrl(int _token,int _extra_bits){
+  return _extra_bits+1;
+}
+
+/*Handles a normal coefficient value token.*/
+static ptrdiff_t oc_token_skip_val(void){
+  return 1;
+}
+
+/*Handles a category 1A zero run/coefficient value combo token.*/
+static ptrdiff_t oc_token_skip_run_cat1a(int _token){
+  return _token-OC_DCT_RUN_CAT1A+2;
+}
+
+/*Handles category 1b, 1c, 2a, and 2b zero run/coefficient value combo tokens.*/
+static ptrdiff_t oc_token_skip_run(int _token,int _extra_bits){
+  int run_cati;
+  int ncoeffs_mask;
+  int ncoeffs_adjust;
+  run_cati=_token-OC_DCT_RUN_CAT1B;
+  ncoeffs_mask=OC_BYTE_TABLE32(3,7,0,1,run_cati);
+  ncoeffs_adjust=OC_BYTE_TABLE32(7,11,2,3,run_cati);
+  return (_extra_bits&ncoeffs_mask)+ncoeffs_adjust;
+}
+
+/*A jump table for computing the number of coefficients or blocks to skip for
+   a given token value.
+  This reduces all the conditional branches, etc., needed to parse these token
+   values down to one indirect jump.*/
+static const oc_token_skip_func OC_TOKEN_SKIP_TABLE[TH_NDCT_TOKENS]={
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob,
+  oc_token_skip_eob6,
+  oc_token_skip_zrl,
+  oc_token_skip_zrl,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_val,
+  (oc_token_skip_func)oc_token_skip_run_cat1a,
+  (oc_token_skip_func)oc_token_skip_run_cat1a,
+  (oc_token_skip_func)oc_token_skip_run_cat1a,
+  (oc_token_skip_func)oc_token_skip_run_cat1a,
+  (oc_token_skip_func)oc_token_skip_run_cat1a,
+  oc_token_skip_run,
+  oc_token_skip_run,
+  oc_token_skip_run,
+  oc_token_skip_run
+};
+
+/*Determines the number of blocks or coefficients to be skipped for a given
+   token value.
+  _token:      The token value to skip.
+  _extra_bits: The extra bits attached to this token.
+  Return: A positive value indicates that number of coefficients are to be
+           skipped in the current block.
+          Otherwise, the negative of the return value indicates that number of
+           blocks are to be ended.
+          0 will never be returned, so that at least one coefficient in one
+           block will always be decoded for every token.*/
+static ptrdiff_t oc_dct_token_skip(int _token,int _extra_bits){
+  return (*OC_TOKEN_SKIP_TABLE[_token])(_token,_extra_bits);
+}
+
+
+
 void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc){
   static const unsigned char OC_ZZI_HUFF_OFFSET[64]={
      0,16,16,16,16,16,32,32,

Modified: branches/theora-thusnelda/lib/enc/tokenize.c
===================================================================
--- branches/theora-thusnelda/lib/enc/tokenize.c	2009-08-02 19:30:11 UTC (rev 16402)
+++ branches/theora-thusnelda/lib/enc/tokenize.c	2009-08-03 04:50:27 UTC (rev 16403)
@@ -44,8 +44,9 @@
   }
 }
 
+/*Returns the number of blocks ended by EOB token _token with extra bits _eb.*/
 static int oc_decode_eob_token(int _token,int _eb){
-  return -oc_dct_token_skip(_token,_eb);
+  return (0x20820C41U>>_token*5&0x1F)+_eb; /*5-bit bases: 1,2,3,4,8,16,0.*/
 }
 
 /*TODO: This is now only used during DCT tokenization, and never for runs; it

Modified: branches/theora-thusnelda/lib/internal.h
===================================================================
--- branches/theora-thusnelda/lib/internal.h	2009-08-02 19:30:11 UTC (rev 16402)
+++ branches/theora-thusnelda/lib/internal.h	2009-08-03 04:50:27 UTC (rev 16403)
@@ -278,10 +278,9 @@
    const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
   void (*frag_recon_inter2)(unsigned char *_dst,const unsigned char *_src1,
    const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
-  void (*idct8x8)(ogg_int16_t _y[64],int _last_zzi,int _ncoefs);
+  void (*idct8x8)(ogg_int16_t _y[64],int _last_zzi);
   void (*state_frag_recon)(const oc_theora_state *_state,ptrdiff_t _fragi,
-   int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
-   ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
+   int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
   void (*state_frag_copy_list)(const oc_theora_state *_state,
    const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
    int _dst_frame,int _src_frame,int _pli);
@@ -425,8 +424,6 @@
 void oc_ycbcr_buffer_flip(th_ycbcr_buffer _dst,
  const th_ycbcr_buffer _src);
 
-ptrdiff_t oc_dct_token_skip(int _token,int _extra_bits);
-
 int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs);
 void oc_state_clear(oc_theora_state *_state);
 void oc_state_vtable_init_c(oc_theora_state *_state);
@@ -457,11 +454,9 @@
 void oc_frag_recon_inter2(const oc_theora_state *_state,
  unsigned char *_dst,const unsigned char *_src1,const unsigned char *_src2,
  int _ystride,const ogg_int16_t _residue[64]);
-void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
- int _last_zzi,int _ncoefs);
+void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],int _last_zzi);
 void oc_state_frag_recon(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
 void oc_state_frag_copy_list(const oc_theora_state *_state,
  const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
  int _dst_frame,int _src_frame,int _pli);
@@ -478,10 +473,9 @@
  const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
 void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
  const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
-void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi,int _ncoefs);
+void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi);
 void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
- int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,int _ncoefs,
- ogg_uint16_t _dc_quant,const ogg_uint16_t _ac_quant[64]);
+ int _pli,ogg_int16_t _dct_coeffs[64],int _last_zzi,ogg_uint16_t _dc_quant);
 void oc_state_frag_copy_list_c(const oc_theora_state *_state,
  const ptrdiff_t *_fragis,ptrdiff_t _nfragis,
  int _dst_frame,int _src_frame,int _pli);