[xiph-commits] r17221 - in branches/theora-gumboot: . lib
gumboot at svn.xiph.org
gumboot at svn.xiph.org
Sun May 16 05:46:19 PDT 2010
Author: gumboot
Date: 2010-05-16 05:46:19 -0700 (Sun, 16 May 2010)
New Revision: 17221
Modified:
branches/theora-gumboot/
branches/theora-gumboot/CHANGES
branches/theora-gumboot/lib/analyze.c
branches/theora-gumboot/lib/bitpack.c
branches/theora-gumboot/lib/bitpack.h
branches/theora-gumboot/lib/decint.h
branches/theora-gumboot/lib/decode.c
branches/theora-gumboot/lib/huffdec.c
branches/theora-gumboot/lib/huffdec.h
Log:
Integrate latest changes from ptalarbvorm, thereby exaggerating the damage I've done while messing around.
Property changes on: branches/theora-gumboot
___________________________________________________________________
Modified: svn:mergeinfo
- /trunk/theora:14138-15320,15741
+ /experimental/derf/theora-ptalarbvorm:17164-17220
/trunk/theora:14138-15320,15741
Modified: branches/theora-gumboot/CHANGES
===================================================================
--- branches/theora-gumboot/CHANGES 2010-05-16 11:13:00 UTC (rev 17220)
+++ branches/theora-gumboot/CHANGES 2010-05-16 12:46:19 UTC (rev 17221)
@@ -136,7 +136,7 @@
- Granulepos scheme modified to match other codecs. This bumps
the bitstream revision to 3.2.1. Bitstreams marked 3.2.0 are
handled correctly by this decoder. Older decoders will show
- a one frame sync error in the less noticable direction.
+ a one frame sync error in the less noticeable direction.
libtheora 1.0alpha8 (2007 September 18)
Modified: branches/theora-gumboot/lib/analyze.c
===================================================================
--- branches/theora-gumboot/lib/analyze.c 2010-05-16 11:13:00 UTC (rev 17220)
+++ branches/theora-gumboot/lib/analyze.c 2010-05-16 12:46:19 UTC (rev 17221)
@@ -758,7 +758,6 @@
int nonzero;
unsigned uncoded_ssd;
unsigned coded_ssd;
- int coded_dc;
oc_token_checkpoint *checkpoint;
oc_fragment *frags;
int mb_mode;
@@ -918,11 +917,10 @@
{
/*In retrospect, should we have skipped this block?*/
oc_enc_frag_sub(_enc,data,src,dst,ystride);
- coded_ssd=coded_dc=0;
+ coded_ssd=0;
if(borderi<0){
for(pi=0;pi<64;pi++){
coded_ssd+=data[pi]*data[pi];
- coded_dc+=data[pi];
}
}
else{
@@ -930,13 +928,10 @@
mask=_enc->state.borders[borderi].mask;
for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){
coded_ssd+=data[pi]*data[pi];
- coded_dc+=data[pi];
}
}
/*Scale to match DCT domain.*/
coded_ssd<<=4;
- /*We actually only want the AC contribution to the SSD.*/
- coded_ssd-=coded_dc*coded_dc>>2;
#if defined(OC_COLLECT_METRICS)
_enc->frag_ssd[_fragi]=coded_ssd;
}
@@ -1924,7 +1919,6 @@
unsigned uncoded_ssd;
int uncoded_dc;
unsigned dc_dequant;
- int dc_flag;
int mapii;
int mapi;
int pli;
@@ -1949,7 +1943,6 @@
if(borderi<0){
for(pi=0;pi<64;pi++){
uncoded_ssd+=buffer[pi]*buffer[pi];
- uncoded_dc+=buffer[pi];
}
}
else{
@@ -1957,18 +1950,16 @@
mask=_enc->state.borders[borderi].mask;
for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){
uncoded_ssd+=buffer[pi]*buffer[pi];
- uncoded_dc+=buffer[pi];
}
}
/*Scale to match DCT domain.*/
uncoded_ssd<<=4;
- /*We actually only want the AC contribution to the SSD.*/
- uncoded_ssd-=uncoded_dc*uncoded_dc>>2;
uncoded_ssd=OC_RD_SCALE(uncoded_ssd,_rd_scale[bi]);
- /*DC is a special case; if there's more than a full-quantizer improvement
- in the effective DC component, always force-code the block.*/
- dc_flag=abs(uncoded_dc)>dc_dequant<<1;
- uncoded_ssd|=-dc_flag;
+ /*Motion is a special case; if there is more than a full-pixel motion
+ against the prior frame, penalize skipping. TODO: The factor of
+ two here is a kludge, but it tested out better than a hard limit*/
+ if(_enc->mb_info[_mbi].block_mv[bi][0]!=0||
+ _enc->mb_info[_mbi].block_mv[bi][1]!=0)uncoded_ssd*=2;
_pipe->skip_ssd[0][fragi-_pipe->froffset[0]]=_ssd[bi]=uncoded_ssd;
}
mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
@@ -1990,25 +1981,22 @@
if(borderi<0){
for(pi=0;pi<64;pi++){
uncoded_ssd+=buffer[pi]*buffer[pi];
- uncoded_dc+=buffer[pi];
}
}
else{
mask=_enc->state.borders[borderi].mask;
for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){
uncoded_ssd+=buffer[pi]*buffer[pi];
- uncoded_dc+=buffer[pi];
}
}
/*Scale to match DCT domain.*/
uncoded_ssd<<=4;
- /*We actually only want the AC contribution to the SSD.*/
- uncoded_ssd-=uncoded_dc*uncoded_dc>>2;
uncoded_ssd=OC_RD_SCALE(uncoded_ssd,_rd_scale[4]);
- /*DC is a special case; if there's more than a full-quantizer improvement
- in the effective DC component, always force-code the block.*/
- dc_flag=abs(uncoded_dc)>dc_dequant<<1;
- uncoded_ssd|=-dc_flag;
+ /*Motion is a special case; if there is more than a full-pixel motion
+ against the prior frame, penalize skipping. TODO: The factor of
+ two here is a kludge, but it tested out better than a hard limit*/
+ if(_enc->mb_info[_mbi].unref_mv[OC_FRAME_PREV][0]!=0||
+ _enc->mb_info[_mbi].unref_mv[OC_FRAME_PREV][1]!=0)uncoded_ssd*=2;
_pipe->skip_ssd[pli][fragi-_pipe->froffset[pli]]=_ssd[mapii]=uncoded_ssd;
}
map_nidxs=(map_nidxs-4<<1)+4;
Modified: branches/theora-gumboot/lib/bitpack.c
===================================================================
--- branches/theora-gumboot/lib/bitpack.c 2010-05-16 11:13:00 UTC (rev 17220)
+++ branches/theora-gumboot/lib/bitpack.c 2010-05-16 12:46:19 UTC (rev 17221)
@@ -32,10 +32,10 @@
const unsigned char *stop;
oc_pb_window window;
int available;
+ stop=_b->stop;
+ ptr=_b->ptr;
window=_b->window;
available=_b->bits;
- ptr=_b->ptr;
- stop=_b->stop;
while(available<=OC_PB_WINDOW_SIZE-8&&ptr<stop){
available+=8;
window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
@@ -82,8 +82,8 @@
available-=_bits;
window<<=1;
window<<=_bits-1;
+ _b->window=window;
_b->bits=available;
- _b->window=window;
return result;
}
@@ -100,8 +100,8 @@
result=window>>OC_PB_WINDOW_SIZE-1;
available--;
window<<=1;
+ _b->window=window;
_b->bits=available;
- _b->window=window;
return result;
}
Modified: branches/theora-gumboot/lib/bitpack.h
===================================================================
--- branches/theora-gumboot/lib/bitpack.h 2010-05-16 11:13:00 UTC (rev 17220)
+++ branches/theora-gumboot/lib/bitpack.h 2010-05-16 12:46:19 UTC (rev 17221)
@@ -35,9 +35,9 @@
struct oc_pack_buf{
+ const unsigned char *stop;
+ const unsigned char *ptr;
oc_pb_window window;
- const unsigned char *ptr;
- const unsigned char *stop;
int bits;
int eof;
};
Modified: branches/theora-gumboot/lib/decint.h
===================================================================
--- branches/theora-gumboot/lib/decint.h 2010-05-16 11:13:00 UTC (rev 17220)
+++ branches/theora-gumboot/lib/decint.h 2010-05-16 12:46:19 UTC (rev 17221)
@@ -37,7 +37,7 @@
struct th_setup_info{
/*The Huffman codes.*/
- oc_huff_node *huff_tables[TH_NHUFFMAN_TABLES];
+ ogg_int16_t *huff_tables[TH_NHUFFMAN_TABLES];
/*The quantization parameters.*/
th_quant_info qinfo;
};
@@ -55,7 +55,7 @@
/*Buffer in which to assemble packets.*/
oc_pack_buf opb;
/*Huffman decode trees.*/
- oc_huff_node *huff_tables[TH_NHUFFMAN_TABLES];
+ ogg_int16_t *huff_tables[TH_NHUFFMAN_TABLES];
/*The index of the first token in each plane for each coefficient.*/
ptrdiff_t ti0[3][64];
/*The number of outstanding EOB runs at the start of each coefficient in each
Modified: branches/theora-gumboot/lib/decode.c
===================================================================
--- branches/theora-gumboot/lib/decode.c 2010-05-16 11:13:00 UTC (rev 17220)
+++ branches/theora-gumboot/lib/decode.c 2010-05-16 12:46:19 UTC (rev 17221)
@@ -371,7 +371,7 @@
ret=oc_state_init(&_dec->state,_info,3);
if(ret<0)return ret;
ret=oc_huff_trees_copy(_dec->huff_tables,
- (const oc_huff_node *const *)_setup->huff_tables);
+ (const ogg_int16_t *const *)_setup->huff_tables);
if(ret<0){
oc_state_clear(&_dec->state);
return ret;
Modified: branches/theora-gumboot/lib/huffdec.c
===================================================================
--- branches/theora-gumboot/lib/huffdec.c 2010-05-16 11:13:00 UTC (rev 17220)
+++ branches/theora-gumboot/lib/huffdec.c 2010-05-16 12:46:19 UTC (rev 17221)
@@ -22,10 +22,60 @@
#include "decint.h"
-/*The ANSI offsetof macro is broken on some platforms (e.g., older DECs).*/
-#define _ogg_offsetof(_type,_field)\
- ((size_t)((char *)&((_type *)0)->_field-(char *)0))
+/*Instead of storing every branching in the tree, subtrees can be collapsed
+ into one node, with a table of size 1<<nbits pointing directly to its
+   descendants nbits levels down.
+ This allows more than one bit to be read at a time, and avoids following all
+ the intermediate branches with next to no increased code complexity once
+ the collapsed tree has been built.
+ We do _not_ require that a subtree be complete to be collapsed, but instead
+ store duplicate pointers in the table, and record the actual depth of the
+ node below its parent.
+ This tells us the number of bits to advance the stream after reaching it.
+
+ This turns out to be equivalent to the method described in \cite{Hash95},
+ without the requirement that codewords be sorted by length.
+ If the codewords were sorted by length (so-called ``canonical-codes''), they
+ could be decoded much faster via either Lindell and Moffat's approach or
+ Hashemian's Condensed Huffman Code approach, the latter of which has an
+ extremely small memory footprint.
+ We can't use Choueka et al.'s finite state machine approach, which is
+ extremely fast, because we can't allow multiple symbols to be output at a
+ time; the codebook can and does change between symbols.
+ It also has very large memory requirements, which impairs cache coherency.
+
+ We store the tree packed in an array of 16-bit integers (words).
+  Each node of the tree consists of a single word, followed consecutively by
+ zero or more indexes of its children.
+ Let n be the value of this first word.
+
+ If n is negative or zero, then this is a leaf node.
+ The token in the leaf node is (-n&255).
+ The number of bits that should have been read to reach this child, starting
+ from the current node, is (-n>>8).
+
+ If n is positive, then it is the number of bits that need to be read to
+ traverse the node.
+ 1<<n entries follow in the array, each an index to a child node.
+ As an optimization to save space, leaf nodes are stored directly, instead of
+ storing an index, since they only require a single word.
+  If a leaf node would have been reached before reading n bits, then it is duplicated
+ the necessary number of times in this table.
+
+ @ARTICLE{Hash95,
+ author="Reza Hashemian",
+ title="Memory Efficient and High-Speed Search {Huffman} Coding",
+ journal="{IEEE} Transactions on Communications",
+ volume=43,
+ number=10,
+ pages="2576--2581",
+ month=Oct,
+ year=1995
+ }*/
+
+
+
/*The number of internal tokens associated with each of the spec tokens.*/
static const unsigned char OC_DCT_TOKEN_MAP_ENTRIES[TH_NDCT_TOKENS]={
1,1,1,4,8,1,1,8,1,1,1,1,1,2,2,2,2,4,8,2,2,2,4,2,2,2,2,2,8,2,4,8
@@ -99,61 +149,7 @@
40
};
-/*These three functions are really part of the bitpack.c module, but
- they are only used here.
- Declaring local static versions so they can be inlined saves considerable
- function call overhead.*/
-static oc_pb_window oc_pack_refill(oc_pack_buf *_b,int _bits){
- const unsigned char *ptr;
- const unsigned char *stop;
- oc_pb_window window;
- int available;
- window=_b->window;
- available=_b->bits;
- ptr=_b->ptr;
- stop=_b->stop;
- /*This version of _refill() doesn't bother setting eof because we won't
- check for it after we've started decoding DCT tokens.*/
- if(ptr>=stop)available=OC_LOTS_OF_BITS;
- while(available<=OC_PB_WINDOW_SIZE-8){
- available+=8;
- window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
- if(ptr>=stop)available=OC_LOTS_OF_BITS;
- }
- _b->ptr=ptr;
- if(_bits>available)window|=*ptr>>(available&7);
- _b->bits=available;
- return window;
-}
-
-
-/*Read in bits without advancing the bit pointer.
- Here we assume 0<=_bits&&_bits<=32.*/
-static long oc_pack_look(oc_pack_buf *_b,int _bits){
- oc_pb_window window;
- int available;
- long result;
- window=_b->window;
- available=_b->bits;
- if(_bits==0)return 0;
- if(_bits>available)_b->window=window=oc_pack_refill(_b,_bits);
- result=window>>OC_PB_WINDOW_SIZE-_bits;
- return result;
-}
-
-/*Advance the bit pointer.*/
-static void oc_pack_adv(oc_pack_buf *_b,int _bits){
- /*We ignore the special cases for _bits==0 and _bits==32 here, since they are
- never used actually used.
- OC_HUFF_SLUSH (defined below) would have to be at least 27 to actually read
- 32 bits in a single go, and would require a 32 GB lookup table (assuming
- 8 byte pointers, since 4 byte pointers couldn't fit such a table).*/
- _b->window<<=_bits;
- _b->bits-=_bits;
-}
-
-
/*The log_2 of the size of a lookup table is allowed to grow to relative to
the number of unique nodes it contains.
E.g., if OC_HUFF_SLUSH is 2, then at most 75% of the space in the tree is
@@ -169,47 +165,53 @@
10538563 read calls are required when OC_HUFF_SLUSH is 2 (32.7%).
10192578 read calls are required when OC_HUFF_SLUSH is 3 (31.6%).
Since a value of 1 gets us the vast majority of the speed-up with only a
- small amount of wasted memory, this is what we use.*/
+ small amount of wasted memory, this is what we use.
+ This value must be less than 7, or you could create a tree with more than
+ 32767 entries, which would overflow the 16-bit words used to index it.*/
#define OC_HUFF_SLUSH (1)
+/*The root of the tree is the fast path, and a larger value here is more
+ beneficial than elsewhere in the tree.
+ 3 appears to give the best performance, trading off between increased use of
+ the single-read fast path and cache footprint for the tables.
+   The VP3 tables are about 2.6x larger using 3 here than using 1.*/
+#define OC_ROOT_HUFF_SLUSH (3)
-/*Determines the size in bytes of a Huffman tree node that represents a
+/*Determines the size in words of a Huffman tree node that represents a
subtree of depth _nbits.
_nbits: The depth of the subtree.
- If this is 0, the node is a leaf node.
+ If this is <=0, the node is a leaf node.
Otherwise 1<<_nbits pointers are allocated for children.
- Return: The number of bytes required to store the node.*/
+ Return: The number of words required to store the node.*/
static size_t oc_huff_node_size(int _nbits){
- size_t size;
- size=_ogg_offsetof(oc_huff_node,nodes);
- if(_nbits>0)size+=sizeof(oc_huff_node *)*(1<<_nbits);
- return size;
+ return _nbits>0?1+(1<<_nbits):1;
}
-static oc_huff_node *oc_huff_node_init(char **_storage,size_t _size,int _nbits){
- oc_huff_node *ret;
- ret=(oc_huff_node *)*_storage;
- ret->nbits=(unsigned char)_nbits;
- (*_storage)+=_size;
- return ret;
-}
-
-/*Determines the size in bytes of a Huffman tree.
- _nbits: The depth of the subtree.
- If this is 0, the node is a leaf node.
- Otherwise storage for 1<<_nbits pointers are added for children.
- Return: The number of bytes required to store the tree.*/
-static size_t oc_huff_tree_size(const oc_huff_node *_node){
+/*Determines the size in words of a Huffman subtree.
+ _tree: The complete Huffman tree.
+ _node: The index of the root of the desired subtree.
+ Return: The number of words required to store the tree.*/
+static size_t oc_huff_tree_size(const ogg_int16_t *_tree,int _node){
size_t size;
- size=oc_huff_node_size(_node->nbits);
- if(_node->nbits){
+ int n;
+ n=_tree[_node];
+ size=oc_huff_node_size(n);
+ if(n>0){
int nchildren;
int i;
- nchildren=1<<_node->nbits;
- for(i=0;i<nchildren;i+=1<<_node->nbits-_node->nodes[i]->depth){
- size+=oc_huff_tree_size(_node->nodes[i]);
+ nchildren=1<<n;
+ i=0;
+ do{
+ int child;
+ child=_tree[_node+i+1];
+ if(child<=0)i+=1<<n-(-child>>8);
+ else{
+ size+=oc_huff_tree_size(_tree,child);
+ i++;
+ }
}
+ while(i<nchildren);
}
return size;
}
@@ -217,208 +219,283 @@
/*Unpacks a sub-tree from the given buffer.
_opb: The buffer to unpack from.
- _binodes: The nodes to store the sub-tree in.
- _nbinodes: The number of nodes available for the sub-tree.
- Return: 0 on success, or a negative value on error.*/
+ _bitree: The tree to store the sub-tree in.
+ _nbinodes: The number of nodes available for the tree.
+ _binode: The index of the tree to store it in.
+ This is updated to point to the first available slot past the
+ sub-tree on return.
+ If an error occurs, this is set larger than _nbinodes.
+ Return: The node index of the sub-tree, or leaf node value if this was a leaf
+ node and this wasn't the root of the tree.*/
static int oc_huff_tree_unpack(oc_pack_buf *_opb,
- oc_huff_node *_binodes,int _nbinodes){
- oc_huff_node *binode;
- long bits;
- int nused;
- if(_nbinodes<1)return TH_EBADHEADER;
- binode=_binodes;
- nused=0;
+ ogg_int16_t *_bitree,int _nbinodes,int *_binode){
+ long bits;
+ int binode;
+ binode=*_binode;
+ if(binode>_nbinodes)return 0;
bits=oc_pack_read1(_opb);
- if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
+ if(oc_pack_bytes_left(_opb)<0){
+ *_binode=_nbinodes+1;
+ return 0;
+ }
/*Read an internal node:*/
if(!bits){
- int ret;
- nused++;
- binode->nbits=1;
- binode->depth=1;
- binode->nodes[0]=_binodes+nused;
- ret=oc_huff_tree_unpack(_opb,_binodes+nused,_nbinodes-nused);
- if(ret>=0){
- nused+=ret;
- binode->nodes[1]=_binodes+nused;
- ret=oc_huff_tree_unpack(_opb,_binodes+nused,_nbinodes-nused);
- }
- if(ret<0)return ret;
- nused+=ret;
+ *_binode=binode+3;
+ if(binode+3>_nbinodes)return 0;
+ _bitree[binode+0]=1;
+ _bitree[binode+1]=
+ (ogg_int16_t)oc_huff_tree_unpack(_opb,_bitree,_nbinodes,_binode);
+ _bitree[binode+2]=
+ (ogg_int16_t)oc_huff_tree_unpack(_opb,_bitree,_nbinodes,_binode);
}
/*Read a leaf node:*/
else{
int ntokens;
int token;
- int i;
bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS);
- if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
- /*Find out how many internal tokens we translate this external token into.*/
+ if(oc_pack_bytes_left(_opb)<0){
+ *_binode=_nbinodes+1;
+ return 0;
+ }
+ /*Find out how many internal tokens we translate this external token into.
+ This must be a power of two.*/
ntokens=OC_DCT_TOKEN_MAP_ENTRIES[bits];
- if(_nbinodes<2*ntokens-1)return TH_EBADHEADER;
- /*Fill in a complete binary tree pointing to the internal tokens.*/
- for(i=1;i<ntokens;i<<=1){
+ token=OC_DCT_TOKEN_MAP[bits];
+ if(ntokens<=1){
+ /*If we only have the one leaf, return it directly.*/
+ if(binode>0)return -(1<<8|token);
+ /*Unless we are the root of the tree; then we have to store it as a
+ node explicitly.*/
+ *_binode=binode+1;
+ if(binode+1>_nbinodes)return 0;
+ _bitree[binode]=-token;
+ }
+ else{
+ int i;
int j;
- binode=_binodes+nused;
- nused+=i;
- for(j=0;j<i;j++){
- binode[j].nbits=1;
- binode[j].depth=1;
- binode[j].nodes[0]=_binodes+nused+2*j;
- binode[j].nodes[1]=_binodes+nused+2*j+1;
+ /*Fill in a complete binary tree pointing to the internal nodes.*/
+ *_binode=binode+3*(ntokens-1);
+ if(binode+3*(ntokens-1)>_nbinodes)return 0;
+ for(i=1;i<ntokens>>1;i<<=1){
+ int child;
+ child=binode+3*(2*i-1);
+ for(j=0;j<i;j++){
+ _bitree[binode+3*(i-1+j)+0]=1;
+ _bitree[binode+3*(i-1+j)+1]=(ogg_int16_t)child;
+ child+=3;
+ _bitree[binode+3*(i-1+j)+2]=(ogg_int16_t)child;
+ child+=3;
+ }
}
+ /*And now the last row pointing to the leaf nodes.*/
+ for(i=j=0;j<ntokens>>1;j++){
+ _bitree[binode+3*((ntokens>>1)-1+j)+0]=1;
+ _bitree[binode+3*((ntokens>>1)-1+j)+1]=(ogg_int16_t)-(1<<8|token+i++);
+ _bitree[binode+3*((ntokens>>1)-1+j)+2]=(ogg_int16_t)-(1<<8|token+i++);
+ }
}
- /*And now the leaf nodes with those tokens.*/
- token=OC_DCT_TOKEN_MAP[bits];
- for(i=0;i<ntokens;i++){
- binode=_binodes+nused++;
- binode->nbits=0;
- binode->depth=1;
- binode->token=token+i;
- }
}
- return nused;
+ return binode;
}
/*Finds the depth of shortest branch of the given sub-tree.
The tree must be binary.
+ _bitree: A packed binary Huffman tree.
_binode: The root of the given sub-tree.
- _binode->nbits must be 0 or 1.
+ _bitree[_binode] must be <=1.
Return: The smallest depth of a leaf node in this sub-tree.
0 indicates this sub-tree is a leaf node.*/
-static int oc_huff_tree_mindepth(oc_huff_node *_binode){
+static int oc_huff_tree_mindepth(const ogg_int16_t *_bitree,int _binode){
+ int child;
int depth0;
int depth1;
- if(_binode->nbits==0)return 0;
- depth0=oc_huff_tree_mindepth(_binode->nodes[0]);
- depth1=oc_huff_tree_mindepth(_binode->nodes[1]);
+ if(_bitree[_binode]<=0)return 0;
+ child=_bitree[_binode+1];
+ depth0=child<=0?0:oc_huff_tree_mindepth(_bitree,child);
+ child=_bitree[_binode+2];
+ depth1=child<=0?0:oc_huff_tree_mindepth(_bitree,child);
return OC_MINI(depth0,depth1)+1;
}
/*Finds the number of internal nodes at a given depth, plus the number of
leaves at that depth or shallower.
The tree must be binary.
+ _bitree: A packed binary Huffman tree.
_binode: The root of the given sub-tree.
- _binode->nbits must be 0 or 1.
- Return: The number of entries that would be contained in a jump table of the
- given depth.*/
-static int oc_huff_tree_occupancy(oc_huff_node *_binode,int _depth){
- if(_binode->nbits==0||_depth<=0)return 1;
+ _bitree[_binode] must be <=1.
+ Return: The number of unique entries that would be contained in a jump table
+ of the given depth.*/
+static int oc_huff_tree_occupancy(const ogg_int16_t *_bitree,int _binode,
+ int _depth){
+ if(_depth<=0||_bitree[_binode]<=0)return 1;
else{
- return oc_huff_tree_occupancy(_binode->nodes[0],_depth-1)+
- oc_huff_tree_occupancy(_binode->nodes[1],_depth-1);
+ int child1;
+ int child2;
+ child1=_bitree[_binode+1];
+ child2=_bitree[_binode+2];
+ return (child1<=0?1:oc_huff_tree_occupancy(_bitree,child1,_depth-1))+
+ (child2<=0?1:oc_huff_tree_occupancy(_bitree,child2,_depth-1));
}
}
-/*Makes a copy of the given Huffman tree.
- _node: The Huffman tree to copy.
- Return: The copy of the Huffman tree.*/
-static oc_huff_node *oc_huff_tree_copy(const oc_huff_node *_node,
- char **_storage){
- oc_huff_node *ret;
- ret=oc_huff_node_init(_storage,oc_huff_node_size(_node->nbits),_node->nbits);
- ret->depth=_node->depth;
- if(_node->nbits){
+/*Makes a copy of the given Huffman sub-tree.
+ _dst: The destination tree.
+  _src:  The source tree.
+  _node: The index of the Huffman sub-tree to copy.
+ Return: The index of the first available slot after the sub-tree copy.*/
+static int oc_huff_tree_copy(ogg_int16_t *_dst,
+ const ogg_int16_t *_src,int _node){
+ int n;
+ n=_src[_node];
+ _dst[_node++]=(ogg_int16_t)n;
+ if(n<=0)return _node;
+ else{
int nchildren;
+ int ret;
int i;
- int inext;
- nchildren=1<<_node->nbits;
- for(i=0;i<nchildren;){
- ret->nodes[i]=oc_huff_tree_copy(_node->nodes[i],_storage);
- inext=i+(1<<_node->nbits-ret->nodes[i]->depth);
- while(++i<inext)ret->nodes[i]=ret->nodes[i-1];
+ nchildren=1<<n;
+ ret=_node+nchildren;
+ for(i=0;i<nchildren;i++){
+ int child;
+ child=_src[_node+i];
+ _dst[_node+i]=(ogg_int16_t)child;
+ if(child>0){
+ /*Note: This assumes that storage for children is in order, and
+ consecutive.*/
+ ret=oc_huff_tree_copy(_dst,_src,child);
+ }
}
+ return ret;
}
- else ret->token=_node->token;
- return ret;
}
-static size_t oc_huff_tree_collapse_size(oc_huff_node *_binode,int _depth){
+static size_t oc_huff_tree_collapse_size(ogg_int16_t *_bitree,int _binode,
+ int _depth){
size_t size;
int mindepth;
int depth;
+ int threshold;
int loccupancy;
int occupancy;
- if(_binode->nbits!=0&&_depth>0){
- return oc_huff_tree_collapse_size(_binode->nodes[0],_depth-1)+
- oc_huff_tree_collapse_size(_binode->nodes[1],_depth-1);
+ int child1;
+ int child2;
+ int n;
+ n=_bitree[_binode];
+ if(n>0&&_depth>0){
+ child1=_bitree[_binode+1];
+ child2=_bitree[_binode+2];
+ return (child1<=0?0:oc_huff_tree_collapse_size(_bitree,child1,_depth-1))+
+ (child2<=0?0:oc_huff_tree_collapse_size(_bitree,child2,_depth-1));
}
- depth=mindepth=oc_huff_tree_mindepth(_binode);
+ depth=mindepth=oc_huff_tree_mindepth(_bitree,_binode);
occupancy=1<<mindepth;
do{
loccupancy=occupancy;
- occupancy=oc_huff_tree_occupancy(_binode,++depth);
+ occupancy=oc_huff_tree_occupancy(_bitree,_binode,++depth);
+ threshold=1<<OC_MAXI(depth-(_binode>0?OC_HUFF_SLUSH:OC_ROOT_HUFF_SLUSH),0);
}
- while(occupancy>loccupancy&&occupancy>=1<<OC_MAXI(depth-OC_HUFF_SLUSH,0));
+ while(occupancy>loccupancy&&occupancy>=threshold);
depth--;
size=oc_huff_node_size(depth);
if(depth>0){
- size+=oc_huff_tree_collapse_size(_binode->nodes[0],depth-1);
- size+=oc_huff_tree_collapse_size(_binode->nodes[1],depth-1);
+ child1=_bitree[_binode+1];
+ child2=_bitree[_binode+2];
+ size+=child1<=0?0:oc_huff_tree_collapse_size(_bitree,child1,depth-1);
+ size+=child2<=0?0:oc_huff_tree_collapse_size(_bitree,child2,depth-1);
}
return size;
}
-static oc_huff_node *oc_huff_tree_collapse(oc_huff_node *_binode,
- char **_storage);
+static int oc_huff_tree_collapse(ogg_int16_t *_tree,int *_node,
+ const ogg_int16_t *_bitree,int _binode);
/*Fills the given nodes table with all the children in the sub-tree at the
given depth.
The nodes in the sub-tree with a depth less than that stored in the table
are freed.
The sub-tree must be binary and complete up until the given depth.
- _nodes: The nodes table to fill.
- _binode: The root of the sub-tree to fill it with.
- _binode->nbits must be 0 or 1.
+ _tree: The storage for the collapsed Huffman tree.
+ _node: The index of the next available storage in the tree.
+ _offs: The index of the nodes to fill.
+ _bitree: The binary tree to collapse.
+ _binode: The index of the root of the sub-tree to fill it with.
+ _bitree[_binode] must be <=1 .
_level: The current level in the table.
0 indicates that the current node should be stored, regardless of
whether it is a leaf node or an internal node.
_depth: The depth of the nodes to fill the table with, relative to their
parent.*/
-static void oc_huff_node_fill(oc_huff_node **_nodes,
- oc_huff_node *_binode,int _level,int _depth,char **_storage){
- if(_level<=0||_binode->nbits==0){
- int i;
- _binode->depth=(unsigned char)(_depth-_level);
- _nodes[0]=oc_huff_tree_collapse(_binode,_storage);
- for(i=1;i<1<<_level;i++)_nodes[i]=_nodes[0];
+static void oc_huff_node_fill(ogg_int16_t *_tree,int *_node,int _offs,
+ const ogg_int16_t *_bitree,int _binode,int _level,int _depth){
+ int leaf;
+ int n;
+ int i;
+ n=_bitree[_binode];
+ if(n<=0){
+ leaf=-((_depth-_level<<8)|(-n&255));
+ for(i=0;i<1<<_level;i++)_tree[_offs+i]=(ogg_int16_t)leaf;
}
+ else if(_level<=0){
+ _tree[_offs]=oc_huff_tree_collapse(_tree,_node,_bitree,_binode);
+ }
else{
+ int child1;
+ int child2;
_level--;
- oc_huff_node_fill(_nodes,_binode->nodes[0],_level,_depth,_storage);
- _nodes+=1<<_level;
- oc_huff_node_fill(_nodes,_binode->nodes[1],_level,_depth,_storage);
+ child1=_bitree[_binode+1];
+ if(child1<=0){
+ leaf=-((_depth-_level<<8)|(-child1&255));
+ for(i=0;i<1<<_level;i++)_tree[_offs+i]=(ogg_int16_t)leaf;
+ }
+ else oc_huff_node_fill(_tree,_node,_offs,_bitree,child1,_level,_depth);
+ _offs+=1<<_level;
+ child2=_bitree[_binode+2];
+ if(child2<=0){
+ leaf=-((_depth-_level<<8)|(-child2&255));
+ for(i=0;i<1<<_level;i++)_tree[_offs+i]=(ogg_int16_t)leaf;
+ }
+ else oc_huff_node_fill(_tree,_node,_offs,_bitree,child2,_level,_depth);
}
}
/*Finds the largest complete sub-tree rooted at the current node and collapses
it into a single node.
This procedure is then applied recursively to all the children of that node.
- _binode: The root of the sub-tree to collapse.
- _binode->nbits must be 0 or 1.
+ _tree: The storage for the collapsed Huffman tree.
+ _node: The index of the next available storage in the tree.
+ _offs: The index of the nodes to fill.
+ _bitree: The binary tree to collapse.
+ _binode: The index of the root of the sub-tree to fill it with.
+ _bitree[_binode] must be <=1 .
Return: The new root of the collapsed sub-tree.*/
-static oc_huff_node *oc_huff_tree_collapse(oc_huff_node *_binode,
- char **_storage){
- oc_huff_node *root;
- size_t size;
- int mindepth;
- int depth;
- int loccupancy;
- int occupancy;
- depth=mindepth=oc_huff_tree_mindepth(_binode);
+static int oc_huff_tree_collapse(ogg_int16_t *_tree,int *_node,
+ const ogg_int16_t *_bitree,int _binode){
+ int mindepth;
+ int depth;
+ int loccupancy;
+ int occupancy;
+ int threshold;
+ int node;
+ depth=mindepth=oc_huff_tree_mindepth(_bitree,_binode);
occupancy=1<<mindepth;
do{
loccupancy=occupancy;
- occupancy=oc_huff_tree_occupancy(_binode,++depth);
+ occupancy=oc_huff_tree_occupancy(_bitree,_binode,++depth);
+ threshold=1<<OC_MAXI(depth-(_binode>0?OC_HUFF_SLUSH:OC_ROOT_HUFF_SLUSH),0);
}
- while(occupancy>loccupancy&&occupancy>=1<<OC_MAXI(depth-OC_HUFF_SLUSH,0));
+ while(occupancy>loccupancy&&occupancy>=threshold);
depth--;
- if(depth<=0)return oc_huff_tree_copy(_binode,_storage);
- size=oc_huff_node_size(depth);
- root=oc_huff_node_init(_storage,size,depth);
- root->depth=_binode->depth;
- oc_huff_node_fill(root->nodes,_binode,depth,depth,_storage);
- return root;
+ node=*_node;
+ if(depth<=0){
+ _tree[node]=_bitree[_binode];
+ *_node=node+1;
+ }
+ else{
+ _tree[node]=depth;
+ *_node=node+1+(1<<depth);
+ oc_huff_node_fill(_tree,_node,node+1,_bitree,_binode,depth,depth);
+ }
+ return node;
}
/*Unpacks a set of Huffman trees, and reduces them to a collapsed
@@ -427,22 +504,25 @@
_nodes: The table to fill with the Huffman trees.
Return: 0 on success, or a negative value on error.*/
int oc_huff_trees_unpack(oc_pack_buf *_opb,
- oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]){
+ ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){
int i;
for(i=0;i<TH_NHUFFMAN_TABLES;i++){
- oc_huff_node nodes[511];
- char *storage;
+ ogg_int16_t nodes[255*3];
+ ogg_int16_t *tree;
size_t size;
- int ret;
+ int node;
/*Unpack the full tree into a temporary buffer.*/
- ret=oc_huff_tree_unpack(_opb,nodes,sizeof(nodes)/sizeof(*nodes));
- if(ret<0)return ret;
+ node=0;
+ oc_huff_tree_unpack(_opb,nodes,sizeof(nodes)/sizeof(*nodes),&node);
+ if(node>sizeof(nodes)/sizeof(*nodes))return TH_EBADHEADER;
/*Figure out how big the collapsed tree will be.*/
- size=oc_huff_tree_collapse_size(nodes,0);
- storage=(char *)_ogg_calloc(1,size);
- if(storage==NULL)return TH_EFAULT;
+ size=oc_huff_tree_collapse_size(nodes,0,0);
+ tree=(ogg_int16_t *)_ogg_malloc(size*sizeof(*tree));
+ if(tree==NULL)return TH_EFAULT;
/*And collapse it.*/
- _nodes[i]=oc_huff_tree_collapse(nodes,&storage);
+ node=0;
+ oc_huff_tree_collapse(tree,&node,nodes,0);
+ _nodes[i]=tree;
}
return 0;
}
@@ -450,40 +530,100 @@
/*Makes a copy of the given set of Huffman trees.
_dst: The array to store the copy in.
_src: The array of trees to copy.*/
-int oc_huff_trees_copy(oc_huff_node *_dst[TH_NHUFFMAN_TABLES],
- const oc_huff_node *const _src[TH_NHUFFMAN_TABLES]){
+int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES],
+ const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]){
+ int total;
int i;
+ total=0;
for(i=0;i<TH_NHUFFMAN_TABLES;i++){
- size_t size;
- char *storage;
- size=oc_huff_tree_size(_src[i]);
- storage=(char *)_ogg_calloc(1,size);
- if(storage==NULL){
+ size_t size;
+ ogg_int16_t *tree;
+ size=oc_huff_tree_size(_src[i],0);
+ total+=size;
+ tree=(ogg_int16_t *)_ogg_malloc(size*sizeof(*tree));
+ if(tree==NULL){
while(i-->0)_ogg_free(_dst[i]);
return TH_EFAULT;
}
- _dst[i]=oc_huff_tree_copy(_src[i],&storage);
+ oc_huff_tree_copy(tree,_src[i],0);
+ _dst[i]=tree;
}
return 0;
}
/*Frees the memory used by a set of Huffman trees.
_nodes: The array of trees to free.*/
-void oc_huff_trees_clear(oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]){
+void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){
int i;
for(i=0;i<TH_NHUFFMAN_TABLES;i++)_ogg_free(_nodes[i]);
}
+
/*Unpacks a single token using the given Huffman tree.
_opb: The buffer to unpack the token from.
_node: The tree to unpack the token with.
Return: The token value.*/
-int oc_huff_token_decode(oc_pack_buf *_opb,const oc_huff_node *_node){
- long bits;
- while(_node->nbits!=0){
- bits=oc_pack_look(_opb,_node->nbits);
- _node=_node->nodes[bits];
- oc_pack_adv(_opb,_node->depth);
+int oc_huff_token_decode(oc_pack_buf *_opb,const ogg_int16_t *_tree){
+ const unsigned char *ptr;
+ const unsigned char *stop;
+ oc_pb_window window;
+ int available;
+ long bits;
+ int node;
+ int n;
+ n=_tree[0];
+ ptr=_opb->ptr;
+ window=_opb->window;
+ stop=_opb->stop;
+ available=_opb->bits;
+ /*Handle the case of a codebook with a single value.
+ Branch prediction should make this esoteric case free.*/
+ if(n<=0)return -n&255;
+ if(n>available){
+ for(;;){
+ /*We don't bother setting eof because we won't check for it after we've
+ started decoding DCT tokens.*/
+ if(ptr>=stop){
+ available=OC_LOTS_OF_BITS;
+ break;
+ }
+ if(available>OC_PB_WINDOW_SIZE-8)break;
+ available+=8;
+ window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
+ }
+ /*Note: We never request more than 24 bits, so there's no need to fill in
+ the last partial byte here.*/
}
- return _node->token;
+ bits=window>>OC_PB_WINDOW_SIZE-n;
+ node=_tree[1+bits];
+ while(node>0){
+ window<<=n;
+ available-=n;
+ n=_tree[node];
+ if(n>available){
+ for(;;){
+ /*We don't bother setting eof because we won't check for it after we've
+ started decoding DCT tokens.*/
+ if(ptr>=stop){
+ available=OC_LOTS_OF_BITS;
+ break;
+ }
+ if(available>OC_PB_WINDOW_SIZE-8)break;
+ available+=8;
+ window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
+ }
+ /*Note: We never request more than 24 bits, so there's no need to fill in
+ the last partial byte here.*/
+ }
+ bits=window>>OC_PB_WINDOW_SIZE-n;
+ node=_tree[node+1+bits];
+ }
+ node=-node;
+ n=node>>8;
+ window<<=n;
+ available-=n;
+ _opb->ptr=ptr;
+ _opb->window=window;
+ _opb->bits=available;
+ return node&255;
}
Modified: branches/theora-gumboot/lib/huffdec.h
===================================================================
--- branches/theora-gumboot/lib/huffdec.h 2010-05-16 11:13:00 UTC (rev 17220)
+++ branches/theora-gumboot/lib/huffdec.h 2010-05-16 12:46:19 UTC (rev 17221)
@@ -22,71 +22,11 @@
-typedef struct oc_huff_node oc_huff_node;
-
-/*A node in the Huffman tree.
- Instead of storing every branching in the tree, subtrees can be collapsed
- into one node, with a table of size 1<<nbits pointing directly to its
- descedents nbits levels down.
- This allows more than one bit to be read at a time, and avoids following all
- the intermediate branches with next to no increased code complexity once
- the collapsed tree has been built.
- We do _not_ require that a subtree be complete to be collapsed, but instead
- store duplicate pointers in the table, and record the actual depth of the
- node below its parent.
- This tells us the number of bits to advance the stream after reaching it.
-
- This turns out to be equivalent to the method described in \cite{Hash95},
- without the requirement that codewords be sorted by length.
- If the codewords were sorted by length (so-called ``canonical-codes''), they
- could be decoded much faster via either Lindell and Moffat's approach or
- Hashemian's Condensed Huffman Code approach, the latter of which has an
- extremely small memory footprint.
- We can't use Choueka et al.'s finite state machine approach, which is
- extremely fast, because we can't allow multiple symbols to be output at a
- time; the codebook can and does change between symbols.
- It also has very large memory requirements, which impairs cache coherency.
-
- @ARTICLE{Hash95,
- author="Reza Hashemian",
- title="Memory Efficient and High-Speed Search {Huffman} Coding",
- journal="{IEEE} Transactions on Communications",
- volume=43,
- number=10,
- pages="2576--2581",
- month=Oct,
- year=1995
- }*/
-struct oc_huff_node{
- /*The number of bits of the code needed to descend through this node.
- 0 indicates a leaf node.
- Otherwise there are 1<<nbits nodes in the nodes table, which can be
- indexed by reading nbits bits from the stream.*/
- unsigned char nbits;
- /*The value of a token stored in a leaf node.
- The value in non-leaf nodes is undefined.*/
- unsigned char token;
- /*The depth of the current node, relative to its parent in the collapsed
- tree.
- This can be less than its parent's nbits value, in which case there are
- 1<<nbits-depth copies of this node in the table, and the bitstream should
- only be advanced depth bits after reaching this node.*/
- unsigned char depth;
- /*The table of child nodes.
- The ACTUAL size of this array is 1<<nbits, despite what the declaration
- below claims.
- The exception is that for leaf nodes the size is 0.*/
- oc_huff_node *nodes[2];
-};
-
-
-
int oc_huff_trees_unpack(oc_pack_buf *_opb,
- oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]);
-int oc_huff_trees_copy(oc_huff_node *_dst[TH_NHUFFMAN_TABLES],
- const oc_huff_node *const _src[TH_NHUFFMAN_TABLES]);
-void oc_huff_trees_clear(oc_huff_node *_nodes[TH_NHUFFMAN_TABLES]);
-int oc_huff_token_decode(oc_pack_buf *_opb,const oc_huff_node *_node);
+ ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]);
+int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES],
+ const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]);
+void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]);
+int oc_huff_token_decode(oc_pack_buf *_opb,const ogg_int16_t *_node);
-
#endif
More information about the commits
mailing list