[xiph-commits] r17206 - experimental/derf/theora-ptalarbvorm/lib
tterribe at svn.xiph.org
tterribe at svn.xiph.org
Thu May 13 10:48:52 PDT 2010
Author: tterribe
Date: 2010-05-13 10:48:52 -0700 (Thu, 13 May 2010)
New Revision: 17206
Modified:
experimental/derf/theora-ptalarbvorm/lib/huffdec.c
Log:
More Huffman decoding optimizations.
Eliminates some useless checks in oc_huff_token_decode() and reorganizes the
refill loop to reduce icache footprint.
This gains about 1% on both x86 and x86-64 for very high bitrate files.
Also allows a separate HUFF_SLUSH factor for the root of the tree vs. the rest
of it.
This eliminates much of the space savings of r17204, but in return gives an
additional 7.5% speed-up on both x86 and x86-64 (for very high bitrate files).
Modified: experimental/derf/theora-ptalarbvorm/lib/huffdec.c
===================================================================
--- experimental/derf/theora-ptalarbvorm/lib/huffdec.c 2010-05-13 12:53:41 UTC (rev 17205)
+++ experimental/derf/theora-ptalarbvorm/lib/huffdec.c 2010-05-13 17:48:52 UTC (rev 17206)
@@ -165,8 +165,16 @@
10538563 read calls are required when OC_HUFF_SLUSH is 2 (32.7%).
10192578 read calls are required when OC_HUFF_SLUSH is 3 (31.6%).
Since a value of 1 gets us the vast majority of the speed-up with only a
- small amount of wasted memory, this is what we use.*/
+ small amount of wasted memory, this is what we use.
+ This value must be less than 7, or you could create a tree with more than
+ 32767 entries, which would overflow the 16-bit words used to index it.*/
#define OC_HUFF_SLUSH (1)
+/*The root of the tree is the fast path, and a larger value here is more
+ beneficial than elsewhere in the tree.
+ 3 appears to give the best performance, trading off between increased use of
+ the single-read fast path and cache footprint for the tables.
+ using 3 here The VP3 tables are about 2.6x larger using 3 here than using 1.*/
+#define OC_ROOT_HUFF_SLUSH (3)
/*Determines the size in words of a Huffman tree node that represents a
@@ -366,6 +374,7 @@
size_t size;
int mindepth;
int depth;
+ int threshold;
int loccupancy;
int occupancy;
int child1;
@@ -383,8 +392,9 @@
do{
loccupancy=occupancy;
occupancy=oc_huff_tree_occupancy(_bitree,_binode,++depth);
+ threshold=1<<OC_MAXI(depth-(_binode>0?OC_HUFF_SLUSH:OC_ROOT_HUFF_SLUSH),0);
}
- while(occupancy>loccupancy&&occupancy>=1<<OC_MAXI(depth-OC_HUFF_SLUSH,0));
+ while(occupancy>loccupancy&&occupancy>=threshold);
depth--;
size=oc_huff_node_size(depth);
if(depth>0){
@@ -464,14 +474,16 @@
int depth;
int loccupancy;
int occupancy;
+ int threshold;
int node;
depth=mindepth=oc_huff_tree_mindepth(_bitree,_binode);
occupancy=1<<mindepth;
do{
loccupancy=occupancy;
occupancy=oc_huff_tree_occupancy(_bitree,_binode,++depth);
+ threshold=1<<OC_MAXI(depth-(_binode>0?OC_HUFF_SLUSH:OC_ROOT_HUFF_SLUSH),0);
}
- while(occupancy>loccupancy&&occupancy>=1<<OC_MAXI(depth-OC_HUFF_SLUSH,0));
+ while(occupancy>loccupancy&&occupancy>=threshold);
depth--;
node=*_node;
if(depth<=0){
@@ -520,11 +532,14 @@
_src: The array of trees to copy.*/
int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES],
const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]){
+ int total;
int i;
+ total=0;
for(i=0;i<TH_NHUFFMAN_TABLES;i++){
size_t size;
ogg_int16_t *tree;
size=oc_huff_tree_size(_src[i],0);
+ total+=size;
tree=(ogg_int16_t *)_ogg_malloc(size*sizeof(*tree));
if(tree==NULL){
while(i-->0)_ogg_free(_dst[i]);
@@ -565,15 +580,19 @@
Branch prediction should make this esoteric case free.*/
if(n<=0)return -n&255;
if(n>available){
- /*We don't bother setting eof because we won't check for it after we've
- started decoding DCT tokens.*/
- if(ptr>=stop)available=OC_LOTS_OF_BITS;
- while(available<=OC_PB_WINDOW_SIZE-8){
+ for(;;){
+ /*We don't bother setting eof because we won't check for it after we've
+ started decoding DCT tokens.*/
+ if(ptr>=stop){
+ available=OC_LOTS_OF_BITS;
+ break;
+ }
+ if(available>OC_PB_WINDOW_SIZE-8)break;
available+=8;
window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
- if(ptr>=stop)available=OC_LOTS_OF_BITS;
}
- if(n>available)window|=*ptr>>(available&7);
+ /*Note: We never request more than 24 bits, so there's no need to fill in
+ the last partial byte here.*/
}
bits=window>>OC_PB_WINDOW_SIZE-n;
node=_tree[1+bits];
@@ -582,15 +601,19 @@
available-=n;
n=_tree[node];
if(n>available){
- /*We don't bother setting eof because we won't check for it after we've
- started decoding DCT tokens.*/
- if(ptr>=stop)available=OC_LOTS_OF_BITS;
- while(available<=OC_PB_WINDOW_SIZE-8){
+ for(;;){
+ /*We don't bother setting eof because we won't check for it after we've
+ started decoding DCT tokens.*/
+ if(ptr>=stop){
+ available=OC_LOTS_OF_BITS;
+ break;
+ }
+ if(available>OC_PB_WINDOW_SIZE-8)break;
available+=8;
window|=(oc_pb_window)*ptr++<<OC_PB_WINDOW_SIZE-available;
- if(ptr>=stop)available=OC_LOTS_OF_BITS;
}
- if(n>available)window|=*ptr>>(available&7);
+ /*Note: We never request more than 24 bits, so there's no need to fill in
+ the last partial byte here.*/
}
bits=window>>OC_PB_WINDOW_SIZE-n;
node=_tree[node+1+bits];
More information about the commits
mailing list