[xiph-commits] r8006 - in experimental/derf/theora-exp: lib tools

Thu Oct 14 23:58:04 PDT 2004

Author: tterribe
Date: 2004-10-14 23:58:04 -0700 (Thu, 14 Oct 2004)
New Revision: 8006

Added:
   experimental/derf/theora-exp/tools/extgen.c
Modified:
   experimental/derf/theora-exp/lib/encint.h
   experimental/derf/theora-exp/lib/encode.c
   experimental/derf/theora-exp/lib/fdct.c
   experimental/derf/theora-exp/lib/fdct.h
Log:
Commit a much faster implementation of boundary padding.
The total cost is now no more than twice that of a normal fDCT, and needs only
 around 1.5K of global side information, instead of around 64K per encoder
 instance.
Performance should be indistinguishable.

Also add another part of the zero-length packet for dropped frames.
Not sure why this wasn't committed before.


Modified: experimental/derf/theora-exp/lib/encint.h
===================================================================

--- experimental/derf/theora-exp/lib/encint.h	2004-10-15 06:55:05 UTC (rev 8005)
+++ experimental/derf/theora-exp/lib/encint.h	2004-10-15 06:58:04 UTC (rev 8006)
@@ -6,7 +6,6 @@
 
 typedef struct oc_fragment_enc_info   oc_fragment_enc_info;
 typedef struct oc_mb_enc_info         oc_mb_enc_info;
-typedef struct oc_border_enc_info     oc_border_enc_info;
 typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser;
 typedef struct oc_impmap_ctx          oc_impmap_ctx;
 typedef struct oc_mcenc_ctx           oc_mcenc_ctx;
@@ -76,19 +75,6 @@
   int           aerror4mv;
 };
 
-/*Information used to extend fragments which straddle the cropping border in
-   an optimal manner.*/
-struct oc_border_enc_info{
-  /*The indices of pixels with valid values first and then padding pixels.*/
-  unsigned char   pi[64];
-  /*The amount to scale pixel values by before multiplying by the extension
-     matrix.*/
-  int             shift;
-  /*The matrix used to extend the fragment into the padding region, in signed
-     1.15 fixed-point format.*/
-  ogg_int16_t   **ext_matrix;
-};
-
 /*A structure used to track the optimal mode coding scheme, for use in
    estimating the cost of coding each mode label during the mode selection
    process.*/
@@ -143,8 +129,6 @@
   oc_fragment_enc_info    *frinfo;
   /*Encoder-specific macro block information.*/
   oc_mb_enc_info          *mbinfo;
-  /*Encoder-specific border information.*/
-  oc_border_enc_info       border_info[16];
   /*Minimum psychovisual tolerance for the DC coefficients in each plane.*/
   unsigned                 dc_tol_mins[3];
   /*The qi value lists selected for each potential frame type.*/

Modified: experimental/derf/theora-exp/lib/encode.c
===================================================================
--- experimental/derf/theora-exp/lib/encode.c	2004-10-15 06:55:05 UTC (rev 8005)
+++ experimental/derf/theora-exp/lib/encode.c	2004-10-15 06:58:04 UTC (rev 8006)
@@ -850,7 +850,6 @@
      pixels.*/
   if(_frag->border!=NULL){
     ogg_int64_t mask;
-    int         bii;
     mask=_frag->border->mask;
     for(pixi=y=0;y<8;y++){
       for(x=0;x<8;x++,pixi++){
@@ -868,8 +867,7 @@
       }
       pixels+=_ystride;
     }
-    bii=_frag->border-_enc->state.borders;
-    oc_fdct8x8_border(_frag->border,_enc->border_info+bii,_dct_vals,pix_buf);
+    oc_fdct8x8_border(_frag->border,_dct_vals,pix_buf);
   }
   /*Otherwise, copy all the pixels in the fragment and do a normal DCT.*/
   else{
@@ -897,7 +895,6 @@
   int            ref_framei;
   int            mvoffset0;
   int            mvoffset1;
-  int            bii;
   int            y;
   int            x;
   src_ystride=_enc->state.input[_pli].ystride;
@@ -921,9 +918,7 @@
         ref0+=ref_ystride;
         ref1+=ref_ystride;
       }
-      bii=_frag->border-_enc->state.borders;
-      oc_fdct8x8_border(_frag->border,_enc->border_info+bii,_dct_vals,
-       pix_buf);
+      oc_fdct8x8_border(_frag->border,_dct_vals,pix_buf);
     }
     else{
       for(pixi=y=0;y<8;y++){
@@ -950,9 +945,7 @@
         src+=src_ystride;
         ref0+=ref_ystride;
       }
-      bii=_frag->border-_enc->state.borders;
-      oc_fdct8x8_border(_frag->border,_enc->border_info+bii,_dct_vals,
-       pix_buf);
+      oc_fdct8x8_border(_frag->border,_dct_vals,pix_buf);
     }
     else{
       for(pixi=y=0;y<8;y++){
@@ -2437,24 +2430,30 @@
 }
 
 static int oc_enc_deltaframe(oc_enc_ctx *_enc){
-  oc_enc_mark_coded(_enc);
-  oc_enc_quant_sel_quality(_enc,0);
-  _enc->state.frame_type=oc_enc_choose_mbmodes(_enc);
-  if(_enc->state.frame_type==OC_INTER_FRAME)oc_enc_do_inter_dcts(_enc);
-  oc_enc_quant_dc(_enc);
-  oc_enc_residual_tokenize(_enc);
   oggpackB_reset(&_enc->opb);
-  oc_enc_frame_header_pack(_enc);
-  if(_enc->state.frame_type==OC_INTER_FRAME){
-    oggpackB_writecopy(&_enc->opb,
-     oggpackB_get_buffer(&_enc->opb_coded_flags),
-     oggpackB_bits(&_enc->opb_coded_flags));
-    oc_enc_mb_modes_pack(_enc);
-    oc_enc_mvs_pack(_enc);
+  oc_enc_mark_coded(_enc);
+  /*Only proceed if we have some coded blocks.
+    No coded blocks -> dropped frame -> 0 byte packet.*/
+  if(_enc->state.ncoded_fragis[0]!=0||
+   _enc->state.ncoded_fragis[1]!=0||
+   _enc->state.ncoded_fragis[2]!=0){
+    oc_enc_quant_sel_quality(_enc,0);
+    _enc->state.frame_type=oc_enc_choose_mbmodes(_enc);
+    if(_enc->state.frame_type==OC_INTER_FRAME)oc_enc_do_inter_dcts(_enc);
+    oc_enc_quant_dc(_enc);
+    oc_enc_residual_tokenize(_enc);
+    oc_enc_frame_header_pack(_enc);
+    if(_enc->state.frame_type==OC_INTER_FRAME){
+      oggpackB_writecopy(&_enc->opb,
+       oggpackB_get_buffer(&_enc->opb_coded_flags),
+       oggpackB_bits(&_enc->opb_coded_flags));
+      oc_enc_mb_modes_pack(_enc);
+      oc_enc_mvs_pack(_enc);
+    }
+    oc_enc_block_qis_pack(_enc);
+    /*Pack the quantized DCT coefficients.*/
+    oc_enc_residual_tokens_pack(_enc);
   }
-  oc_enc_block_qis_pack(_enc);
-  /*Pack the quantized DCT coefficients.*/
-  oc_enc_residual_tokens_pack(_enc);
   /*Success: Mark the packet as ready to be flushed.*/
   _enc->packet_state=OC_PACKET_READY;
   return 0;
@@ -2462,17 +2461,12 @@
 
 
 static int oc_enc_init(oc_enc_ctx *_enc,const theora_info *_info){
-  int bii;
   int ret;
   /*Initialize the shared encoder/decoder state.*/
   ret=oc_state_init(&_enc->state,_info);
   if(ret<0)return ret;
   _enc->block_coded_flags=_ogg_calloc(_enc->state.nfrags,
    sizeof(_enc->block_coded_flags[0]));
-  /*Compute the matrices needed to extend the frame into the border regions.*/
-  for(bii=0;bii<_enc->state.nborders;bii++){
-    oc_border_enc_info_init(_enc->border_info+bii,_enc->state.borders+bii);
-  }
   /*Initialize our packet buffers.*/
   oggpackB_writeinit(&_enc->opb);
   oggpackB_writeinit(&_enc->opb_coded_flags);

Modified: experimental/derf/theora-exp/lib/fdct.c
===================================================================
--- experimental/derf/theora-exp/lib/fdct.c	2004-10-15 06:55:05 UTC (rev 8005)
+++ experimental/derf/theora-exp/lib/fdct.c	2004-10-15 06:58:04 UTC (rev 8006)
@@ -2,66 +2,9 @@
 #include <string.h>
 #include "dct.h"
 #include "fdct.h"
-#include "idct.h"
 
-/*The matrix representation of the Theora's actual iDCT.
-  This is computed using the 16-bit approximations of the sines and cosines
-   that Theora uses, but using floating point values and assuming that no
-   truncation occurs after division (i.e., pretending that the transform is
-   linear like the real DCT).*/
-static const float OC_IDCT_MATRIX[8][8]={
-  {
-    +0.7071075439453125F,+0.9807891845703125F,
-    +0.9238739013671875F,+0.8314666748046875F,
-    +0.7071075439453125F,+0.555572509765625F,
-    +0.3826904296875F,   +0.1950836181640625F
-  },
-  {
-    +0.7071075439453125F,+0.8314685295335948F,
-    +0.3826904296875F,   -0.1950868454296142F,
-    -0.7071075439453125F,-0.9807858711574227F,
-    -0.9238739013671875F,-0.5555783333256841F
-  },
-  {
-    +0.7071075439453125F,+0.5555783333256841F,
-    -0.3826904296875F,   -0.9807858711574227F,
-    -0.7071075439453125F,+0.1950868454296142F,
-     0.9238739013671875F,+0.8314685295335948F
-  },
-  {
-    +0.7071075439453125F,+0.1950836181640625F,
-    -0.9238739013671875F,-0.555572509765625F,
-    +0.7071075439453125F,+0.8314666748046875F,
-    -0.3826904296875F,   -0.9807891845703125F
-  },
-  {
-    +0.7071075439453125F,-0.1950836181640625F,
-    -0.9238739013671875F,+0.555572509765625F,
-    +0.7071075439453125F,-0.8314666748046875F,
-    -0.3826904296875F,   +0.9807891845703125F
-  },
-  {
-    +0.7071075439453125F,-0.5555783333256841F,
-    -0.3826904296875F,    0.9807858711574227F,
-    -0.7071075439453125F,-0.1950868454296142F,
-    +0.9238739013671875F,-0.8314685295335948F
-  },
-  {
-    +0.7071075439453125F,-0.8314685295335948F,
-    +0.3826904296875,    +0.1950868454296142F,
-    -0.7071075439453125F,+0.9807858711574227F,
-    -0.9238739013671875F,+0.5555783333256841F
-  },
-  {
-    +0.7071075439453125F,-0.9807891845703125F,
-    +0.9238739013671875F,-0.8314666748046875F,
-    +0.7071075439453125F,-0.555572509765625F,
-    +0.3826904296875F,   -0.1950836181640625F
-  }
-};
 
 
-
 /*Performs a forward 8 point Type-II DCT transform.
   The output is scaled by a factor of 2 from the orthonormal version of the
    transform.
@@ -126,7 +69,6 @@
 /*Performs a forward 8x8 Type-II DCT transform.
   The output is scaled by a factor of 4 relative to the orthonormal version
    of the transform.
-  TODO: What is the maximum dynamic range of the input? output?
   _y: The buffer to store the result in.
       This may be the same as _x.
   _x: The input coefficients. */
@@ -141,376 +83,238 @@
   for(in=w,out=_y,end=out+8;out<end;in+=8,out++)fdct8(out,in);
 }
 
+/*Information needed to pad boundary blocks.
+  We multiply each row/column by an extension matrix that fills in the padding
+   values as a linear combination of the active values, so that an equivalent
+   number of coefficients are forced to zero.
+  This costs at most 16 multiplies, the same as a 1-D fDCT itself, and as
+   little as 7 multiplies.
+  We compute the extension matrices for every possible shape in advance, as
+   there are only 35.
+  The coefficients for all matrices are stored in a single array to take
+   advantage of the overlap and repetitiveness of many of the shapes.
+  A similar technique is applied to the offsets into this array.
+  This reduces the required table storage by about 48%.
+  See tools/extgen.c for details.
+  We could conceivably do the same for all 256 possible shapes.*/
+typedef struct oc_extension_info{
+  /*The mask of the active pixels in the shape.*/
+  short                     mask;
+  /*The number of active pixels in the shape.*/
+  short                     na;
+  /*The extension matrix.
+    This is (8-na)xna*/
+  const ogg_int32_t *const *ext;
+  /*The pixel indices: na active pixels followed by 8-na padding pixels.*/
+  unsigned char             pi[8];
+  /*The coefficient indices: na unconstrained coefficients followed by 8-na
+     coefficients to be forced to zero.*/
+  unsigned char             ci[8];
+}oc_extension_info;
 
-/*A general low-pass extension technique:
 
-  The idea behind low-pass signal extension is to extend a signal beyond its
-   boundary in such a way that the signal energy is concentrated in the
-   low-frequency coefficients (i.e., is easy to compress).
 
-  There are several different ways to go about this.
-  The obvious one is to select the coefficients at the end of the zig-zag run,
-   and select padding values that force these coefficients to zero.
-  This can be done in an iterative manner, by filling in the padding values
-   with some initial guess (e.g., the average non-padding pixel value),
-   performing a DCT, zeroing the desired coefficients, inverting the DCT, and
-   updating the padding values with the result of the inverse (keeping the
-   non-padding values fixed).
-  I remember reading about this method somewhere, but I can no longer find the
-   reference for it.
-  In practice, however, this method does not actually work when implemented as
-   just described.
+/*The precision at which the matrix coefficients are stored.*/
+#define OC_EXT_SHIFT (19)
 
-  The reason for this becomes clear when examining the non-iterative
-   formulation of the problem as a system of linear equations.
-  One wants to solve for the padding values that zero out the desired set of
-   DCT coefficients.
-  This can be done by extracting the appropriate submatrix from the general
-   2D DCT transformation matrix and inverting it, then multiplying this
-   inverse by the actual values of these coefficients computed with some
-   arbitrary initial padding value (e.g., 0).
+/*The number of shapes we need.*/
+#define OC_NSHAPES   (35)
 
-  Unfortunately, the linear system derived in this manner is ill-conditioned.
-  Due to the symmetry of the basis functions, the submatrices are often
-   singular.
-  One could instead use the pseudoinverse to find a least-squares solution
-   for the coefficient values, however the system is also fairly stiff,
-   containing large order of magnitude differences in the singular values.
-  This causes very large values to appear in the pseudoinverse, producing
-   interpolated pixel values that are well outside the normal [-255,255]
-   range and creating generally large coefficient values in the non-zero
-   coefficients.
+static const ogg_int32_t OC_EXT_COEFFS[231]={
+   0x80000,-0x1E085,-0x96FC9,-0x55871, 0x55871, 0x96FC9, 0x1E085, 0x96FC9,
+   0x55871,-0x55871,-0x96FC9,-0x1E085, 0x80000, 0x00000, 0x00000, 0x00000,
+   0x80000, 0x00000, 0x00000, 0x80000,-0x80000, 0x80000, 0x00000, 0x00000,
+   0x80000,-0x1E086, 0x1E086,-0x4F591,-0x55E37, 0x337C7, 0x8C148, 0x43448,
+   0x2266F, 0x43448, 0x8C148, 0x337C7,-0x55E37,-0x4F591,-0x75743, 0x4F591,
+   0x03B45,-0x1D29D, 0x929E0, 0x2CF2A, 0x929E0,-0x1D29D, 0x03B45, 0x4F591,
+  -0x75743, 0x11033, 0x7AEF5, 0x5224C,-0x209FA,-0x3D77A,-0x209FA, 0x5224C,
+   0x7AEF5, 0x11033, 0x668A4,-0x29124, 0x3A15B, 0x0E6BD,-0x05F98, 0x0E6BD,
+   0x3A15B,-0x29124, 0x668A4, 0x2A78F, 0x24019,-0x67F0F, 0x50F49, 0x4881E,
+   0x50F49,-0x67F0F, 0x24019, 0x2A78F,-0x06898, 0x27678, 0x5F220, 0x27678,
+  -0x06898, 0x1F910, 0x76C13,-0x16523, 0x76C13, 0x1F910, 0x9EB2C,-0x2E7B1,
+   0x0FC86,-0x2E7B1, 0x9EB2C, 0x4F594, 0x43452,-0x129E6, 0x43452, 0x4F594,
+  -0x0A8C0, 0x5D997, 0x2CF29, 0x5D997,-0x0A8C0, 0x30FBD,-0x0B7E5,-0x6AC38,
+  -0x153C8, 0x8B7E5, 0x4F043, 0x8B7E5,-0x153C8,-0x6AC38,-0x0B7E5, 0x30FBD,
+  -0x5A827, 0x153C8, 0x6AC38, 0x3C7A2, 0x1E085, 0x3C7A2, 0x6AC38, 0x153C8,
+  -0x5A827, 0x5A827, 0x6AC38, 0x153C8,-0x3C7A2,-0x1E085,-0x3C7A2, 0x153C8,
+   0x6AC38, 0x5A827, 0x30FBB, 0x4F045, 0x273D7,-0x273D7, 0x273D7, 0x1E08B,
+   0x61F75, 0x1E08B, 0x273D7,-0x273D7, 0x273D7, 0x4F045, 0x30FBB,-0x273D7,
+   0x273D7, 0x9E08B,-0x1E08B, 0x9E08B, 0x273D7,-0x273D7, 0x4F045, 0x30FBB,
+  -0x273D7, 0x273D7,-0x273D7, 0x30FBB, 0x4F045, 0x1FC86, 0x67AC8, 0x18538,
+  -0x1FC86, 0x18538, 0x67AC8, 0x1FC86, 0x45460,-0x1FC87, 0x1FC87, 0x3ABA0,
+   0x1FC87,-0x1FC87, 0x45460, 0x35052, 0x5586D,-0x0A8BF, 0x5586D, 0x35052,
+  -0x43EF2, 0x78F43, 0x4AFAE,-0x070BD, 0x3C10E, 0x3C10E,-0x070BD, 0x4AFAE,
+   0x78F43,-0x43EF2, 0x3C10E, 0x78F43,-0x35052, 0x78F43, 0x3C10E, 0x070BD,
+   0x236D6, 0x5586D, 0x236D6, 0x070BD,-0x07755, 0x3C79F, 0x4AFB6,-0x190D2,
+   0x4E11C, 0x9FC86, 0x153C4,-0x3504A, 0xAC38E, 0x08CBB, 0x1E086,-0x1E086,
+   0x80000, 0x08CBB, 0xAC38E,-0x3504A, 0x153C4, 0x9FC86, 0x4E11C,-0x190D2,
+   0x4AFB6, 0x3C79F,-0x07755,-0x5A818, 0xDA818,-0x5A818,-0x101C31, 0x181C31,
+  -0x101C31,-0xD0C67, 0x150C67,-0xD0C67,-0x7643F, 0xF643F,-0x7643F
+};
 
-  Another obvious choices of coefficients to zero is the m highest frequency
-   values in each column, where m is the number of rows that have at least as
-   many as 8-ci padding values, where ci is the column index.
-  This selects the highest frequency coefficients with some non-zero support
-   in the padding region, and ensures that every pixel in the padding region
-   is covered by at least one such coefficient's basis function.
-  However, these conditions alone are not enough to guarantee that the
-   corresponding submatrix of the DCT transform is non-singular, nor does it
-   even help reduce the stiffness of the least squares problem.
+static const ogg_int32_t *const OC_EXT_ROWS[96]={
+  OC_EXT_COEFFS+   0,OC_EXT_COEFFS+   0,OC_EXT_COEFFS+   0,OC_EXT_COEFFS+   0,
+  OC_EXT_COEFFS+   0,OC_EXT_COEFFS+   0,OC_EXT_COEFFS+   0,OC_EXT_COEFFS+   6,
+  OC_EXT_COEFFS+  27,OC_EXT_COEFFS+  38,OC_EXT_COEFFS+  43,OC_EXT_COEFFS+  32,
+  OC_EXT_COEFFS+  49,OC_EXT_COEFFS+  58,OC_EXT_COEFFS+  67,OC_EXT_COEFFS+  71,
+  OC_EXT_COEFFS+  62,OC_EXT_COEFFS+  53,OC_EXT_COEFFS+  12,OC_EXT_COEFFS+  15,
+  OC_EXT_COEFFS+  14,OC_EXT_COEFFS+  13,OC_EXT_COEFFS+  76,OC_EXT_COEFFS+  81,
+  OC_EXT_COEFFS+  86,OC_EXT_COEFFS+  91,OC_EXT_COEFFS+  96,OC_EXT_COEFFS+  98,
+  OC_EXT_COEFFS+  93,OC_EXT_COEFFS+  88,OC_EXT_COEFFS+  83,OC_EXT_COEFFS+  78,
+  OC_EXT_COEFFS+  12,OC_EXT_COEFFS+  15,OC_EXT_COEFFS+  15,OC_EXT_COEFFS+  12,
+  OC_EXT_COEFFS+  12,OC_EXT_COEFFS+  15,OC_EXT_COEFFS+  12,OC_EXT_COEFFS+  15,
+  OC_EXT_COEFFS+  15,OC_EXT_COEFFS+  12,OC_EXT_COEFFS+ 101,OC_EXT_COEFFS+ 106,
+  OC_EXT_COEFFS+ 112,OC_EXT_COEFFS+  16,OC_EXT_COEFFS+ 121,OC_EXT_COEFFS+ 125,
+  OC_EXT_COEFFS+  20,OC_EXT_COEFFS+ 116,OC_EXT_COEFFS+ 130,OC_EXT_COEFFS+ 133,
+  OC_EXT_COEFFS+ 143,OC_EXT_COEFFS+ 150,OC_EXT_COEFFS+ 157,OC_EXT_COEFFS+ 164,
+  OC_EXT_COEFFS+ 167,OC_EXT_COEFFS+ 160,OC_EXT_COEFFS+ 153,OC_EXT_COEFFS+ 146,
+  OC_EXT_COEFFS+ 136,OC_EXT_COEFFS+ 139,OC_EXT_COEFFS+ 171,OC_EXT_COEFFS+ 176,
+  OC_EXT_COEFFS+ 181,OC_EXT_COEFFS+ 186,OC_EXT_COEFFS+ 191,OC_EXT_COEFFS+ 196,
+  OC_EXT_COEFFS+ 201,OC_EXT_COEFFS+ 206,OC_EXT_COEFFS+ 209,OC_EXT_COEFFS+ 214,
+  OC_EXT_COEFFS+ 198,OC_EXT_COEFFS+ 203,OC_EXT_COEFFS+  24,OC_EXT_COEFFS+ 211,
+  OC_EXT_COEFFS+ 216,OC_EXT_COEFFS+ 193,OC_EXT_COEFFS+ 188,OC_EXT_COEFFS+ 178,
+  OC_EXT_COEFFS+ 183,OC_EXT_COEFFS+ 173,OC_EXT_COEFFS+ 219,OC_EXT_COEFFS+ 220,
+  OC_EXT_COEFFS+ 220,OC_EXT_COEFFS+  12,OC_EXT_COEFFS+  15,OC_EXT_COEFFS+ 219,
+  OC_EXT_COEFFS+ 219,OC_EXT_COEFFS+ 220,OC_EXT_COEFFS+ 222,OC_EXT_COEFFS+ 225,
+  OC_EXT_COEFFS+ 228,OC_EXT_COEFFS+ 229,OC_EXT_COEFFS+ 226,OC_EXT_COEFFS+ 223
+};
 
-  A better strategy is to select a linearly independent set of DCT basis
-   functions to represent the pixels inside the boundary, and then extrapolate
-   the remaining pixel values using the coefficients derived from these basis
-   functions.
-  This is the approach taken by Kaup and Aach  \cite{KA94}.
-  The coefficients corresponding to the basis functions that are not selected
-   are implicitly zeroed out by this process.
-  One can attempt to select the linearly independent set using Gaussian
-   elimination.
-  However, experiments show straightforward methods of determining when a
-   pivot value is small enough to consider zero do not work reliably, and this
-   still doesn't solve the stiffness problem.
+static const oc_extension_info OC_EXTENSION_INFO[OC_NSHAPES]={
+  {0x7F,7,OC_EXT_ROWS+  0,{0,1,2,3,4,5,6,7},{0,1,2,4,5,6,7,3}},
+  {0xFE,7,OC_EXT_ROWS+  7,{1,2,3,4,5,6,7,0},{0,1,2,4,5,6,7,3}},
+  {0x3F,6,OC_EXT_ROWS+  8,{0,1,2,3,4,5,7,6},{0,1,3,4,6,7,5,2}},
+  {0xFC,6,OC_EXT_ROWS+ 10,{2,3,4,5,6,7,1,0},{0,1,3,4,6,7,5,2}},
+  {0x1F,5,OC_EXT_ROWS+ 12,{0,1,2,3,4,7,6,5},{0,2,3,5,7,6,4,1}},
+  {0xF8,5,OC_EXT_ROWS+ 15,{3,4,5,6,7,2,1,0},{0,2,3,5,7,6,4,1}},
+  {0x0F,4,OC_EXT_ROWS+ 18,{0,1,2,3,7,6,5,4},{0,2,4,6,7,5,3,1}},
+  {0xF0,4,OC_EXT_ROWS+ 18,{4,5,6,7,3,2,1,0},{0,2,4,6,7,5,3,1}},
+  {0x07,3,OC_EXT_ROWS+ 22,{0,1,2,7,6,5,4,3},{0,3,6,7,5,4,2,1}},
+  {0xE0,3,OC_EXT_ROWS+ 27,{5,6,7,4,3,2,1,0},{0,3,6,7,5,4,2,1}},
+  {0x03,2,OC_EXT_ROWS+ 32,{0,1,7,6,5,4,3,2},{0,4,7,6,5,3,2,1}},
+  {0xC0,2,OC_EXT_ROWS+ 32,{6,7,5,4,3,2,1,0},{0,4,7,6,5,3,2,1}},
+  {0x01,1,OC_EXT_ROWS+  0,{0,7,6,5,4,3,2,1},{0,7,6,5,4,3,2,1}},
+  {0x80,1,OC_EXT_ROWS+  0,{7,6,5,4,3,2,1,0},{0,7,6,5,4,3,2,1}},
+  {0x7E,6,OC_EXT_ROWS+ 42,{1,2,3,4,5,6,7,0},{0,1,2,5,6,7,4,3}},
+  {0x7C,5,OC_EXT_ROWS+ 44,{2,3,4,5,6,7,1,0},{0,1,4,5,7,6,3,2}},
+  {0x3E,5,OC_EXT_ROWS+ 47,{1,2,3,4,5,7,6,0},{0,1,4,5,7,6,3,2}},
+  {0x78,4,OC_EXT_ROWS+ 50,{3,4,5,6,7,2,1,0},{0,4,5,7,6,3,2,1}},
+  {0x3C,4,OC_EXT_ROWS+ 54,{2,3,4,5,7,6,1,0},{0,3,4,7,6,5,2,1}},
+  {0x1E,4,OC_EXT_ROWS+ 58,{1,2,3,4,7,6,5,0},{0,4,5,7,6,3,2,1}},
+  {0x70,3,OC_EXT_ROWS+ 62,{4,5,6,7,3,2,1,0},{0,5,7,6,4,3,2,1}},
+  {0x38,3,OC_EXT_ROWS+ 67,{3,4,5,7,6,2,1,0},{0,5,6,7,4,3,2,1}},
+  {0x1C,3,OC_EXT_ROWS+ 72,{2,3,4,7,6,5,1,0},{0,5,6,7,4,3,2,1}},
+  {0x0E,3,OC_EXT_ROWS+ 77,{1,2,3,7,6,5,4,0},{0,5,7,6,4,3,2,1}},
+  {0x60,2,OC_EXT_ROWS+ 82,{5,6,7,4,3,2,1,0},{0,2,7,6,5,4,3,1}},
+  {0x30,2,OC_EXT_ROWS+ 36,{4,5,7,6,3,2,1,0},{0,4,7,6,5,3,2,1}},
+  {0x18,2,OC_EXT_ROWS+ 90,{3,4,7,6,5,2,1,0},{0,1,7,6,5,4,3,2}},
+  {0x0C,2,OC_EXT_ROWS+ 34,{2,3,7,6,5,4,1,0},{0,4,7,6,5,3,2,1}},
+  {0x06,2,OC_EXT_ROWS+ 84,{1,2,7,6,5,4,3,0},{0,2,7,6,5,4,3,1}},
+  {0x40,1,OC_EXT_ROWS+  0,{6,7,5,4,3,2,1,0},{0,7,6,5,4,3,2,1}},
+  {0x20,1,OC_EXT_ROWS+  0,{5,7,6,4,3,2,1,0},{0,7,6,5,4,3,2,1}},
+  {0x10,1,OC_EXT_ROWS+  0,{4,7,6,5,3,2,1,0},{0,7,6,5,4,3,2,1}},
+  {0x08,1,OC_EXT_ROWS+  0,{3,7,6,5,4,2,1,0},{0,7,6,5,4,3,2,1}},
+  {0x04,1,OC_EXT_ROWS+  0,{2,7,6,5,4,3,1,0},{0,7,6,5,4,3,2,1}},
+  {0x02,1,OC_EXT_ROWS+  0,{1,7,6,5,4,3,2,0},{0,7,6,5,4,3,2,1}}
+};
 
-  Kaup and Aach take the approach of constructing the set in an iterative
-   manner by starting with only the DC coefficient and adding the basis
-   function which minimizes the approximation error at each step.
-  Let G(i,j) be the coefficient of the ith pixel in the jth basis function in
-   the 2D iDCT, c(j) the DCT coefficients in the current approximation, A the
-   set of non-padding pixels, f(i) the pixel values, and
-   r(i)=f(i)-G(i,:) c(:) the reconstruction error in pixel i.
-  Then the metric to minimize is
 
-    E(j) = \frac{\[\sum_{i\in A} r(i)*G(i,j)\]^2}{\sum_{i\in A} G(i,j)^2}
 
-  This formulation guarantees a linearly independent set of vectors is chosen.
-  It avoids ill-conditioning by stopping the iteration when the residual
-   energy or the change in residual energy falls below a threshold.
-  The coefficients which minimize the residual at each stage can be computed
-   by formulating the least squares solution G^T G c(:) = G^T f(:).
-  Because G^T G is Hermitian, this can be solved by finding the Cholesky
-   decomposition L such that L L^T = G^T G and L is lower-triangular.
-  Then c(:) = L^-T L^-1 G^T f(:), where the multiplication by L^-T and L^-1 can
-   be done by forward and backward substitution.
+#include <stdio.h>
 
-  Unfortunately, this method now depends on the actual pixel values.
-  While it has the advantage of actually working, the other methods above
-   can all be reduced to finding a fixed matrix for a given shape to multiply
-   the existing pixel values by.
-  Thus, so long as the shape does not change, the processing can all be done
-   in advance and run-time computation is reduced to a simple matrix
-   multiplication.
-  This dependency can be avoided by choosing some sample pixel values f(n)
-   to drive approximation instead of using the actual pixel values.
-  While that approach works very well in some cases, it can still
-   occaisionally select a set of basis vectors with small singular values.
-  There is a more robust method, however, that does not require choosing
-   sample pixel values.
-
-  One thing to note is that as each basis vector is added to G, a single row
-   is added to the Cholesky decompositing L L^T of G^T G.
-  This row only depends on the previous rows of G, and does not change as more
-   rows are added to G.
-  When computing c(:) from f(:), division by a diagonal element of L is
-   performed twice, once during forward substitution and once during backward
-   substitution.
-  These are the only divisions in this computation.
-  Thus it is small values of these diagonal elements which lead to the
-   numerical instability.
-  So, instead of trying to minimize the residual energy at each stage, we can
-   instead try to maximize the next diagonal element of L.
-  This is a measure of how orthogonal the next basis function is to the
-   subspace spanned by the existing basis functions.
-  Because the divisions do not accumulate in the formulation of the Cholesky
-   decomposition, the values on the diagonal are well-conditioned, unlike the
-   pivot values in Gaussian elimination.
-
-  This method turns out to work very well in practice, and is implemented
-   below.
-  The greedy optimization approach appears sufficient to avoid any of the
-   numerical instabilities present in the other methods discussed.
-  A threshold is no longer required for a stopping criteria, since a
-   sufficiently independent set of basis functions can always be found in the
-   DCT.
-
-  Once the set of basis functions has been selected, the linear system
-   f'(:) = G' (L^-T L^-1) G^T f(:) is solved, where f'(:) is the vector of
-   extrapolated pixel values, and G' is the portion of the 2D iDCT basis
-   functions used in G that corresponds to these pixels.
-  This results in a simple m by n matrix, where m is the number of padding
-   pixels and n is the number of non-padding pixels.
-  The entries of this matrix are scaled down by a power of two to ensure their
-   magnitude is less than 1, and then they are stored as signed 1.15
-   fixed-point values.
-  During the encode, the non-padding pixel values are scaled by this power of
-   two before multiplication.
-  The total additonal cost of computing the padding is at most 16 multiplies
-   per pixel.
-  By comparison, Theora's fDCT implementation requires 4 multiplies per pixel.
-
-  The complexity can be further reduced by noting that if the selected basis
-   vectors are all orthogonal, then (L^-T L^-1) G^T =
-   (G^-1 G^-T) G^T = G^-1 = D^-2 G^T, where D is a diagonal matrix consisting
-   of the magnitude of the rows of G.
-  Thus computation of padding pixel values can be avoided completely.
-  The entire operation reduces to padding with 0's, performing an fDCT, and
-   scaling the output.
-  To reiterate, this only works when the rows of G are orthogonal, and so this
-   is not done here.
-
-  There is no longer any bias in this method towards including low-frequency
-   coefficients in the set.
-  However, such a bias is neither always possible, nor strictly necessary.
-  If the signal statistics already tend to send high frequency coefficients to
-   small values, this will not change when some of those high frequencies are
-   forced to zero.
-  Thus, most of the high frequency coefficients will still be quantized to
-   zero, even if they do happen to be included in the basis.
-  The important part is that all of the coefficients not in the basis will be
-   guaranteed to be zero.
-  Zeroing out low-frequency coefficients can be as beneficial to compression
-   as high frequency components.
-  Although they do not often form parts of long zero runs, they tend to have
-   larger magnitudes that require more bits to store.
-
-  @INPROCEEDINGS{KA94,
-    author="Andr\'{e} Kaup and Til Aach",
-    title="Segment-Oriented Coding of Textured Images Based on Successive
-     Approximation",
-    booktitle="Proceedings of the International Symposium on Speech, Image
-     Processing, and Neural Networks",
-    pages="197--200",
-    address="Hong Kong",
-    month="Apr.",
-    year=1994,
-    URL="http://www.lnt.de/~kaup/paper/issipnn-94.pdf"
-  }*/
-
-/*Adds the last row/column to the symmetric matrix G^T G, and then updates the
-   Cholesky factorization of it, L.
-  The diagonal element is left squared, since sometimes we only care about the
-   magnitude of this element relative to other choices.
-  Thus callers must manually take the square root when the actual Cholesky
-   factorization is needed.
-  _l:   The L matrix to update.
-  _ac:  The set of basis functions used for each row.
-  _nac: The number of rows.
-  _ap:  The set of pixels that are not padding.
-  _nap: The number of pixels.*/
-static void oc_update_l(float _l[64][64],int _ac[64],int _nac,
- int _ap[64],int _nap){
-  int aci;
-  int acj;
-  int ack;
-  int api;
-  int cri;
-  int cci;
-  int crj;
-  int ccj;
-  int pri;
-  int pci;
-  /*Step 1: Add the next row/column of G^T G.*/
-  aci=_nac-1;
-  cri=_ac[aci]>>3;
-  cci=_ac[aci]&7;
-  for(acj=0;acj<=aci;acj++){
-    crj=_ac[acj]>>3;
-    ccj=_ac[acj]&7;
-    _l[aci][acj]=0;
-    for(api=0;api<_nap;api++){
-      pri=_ap[api]>>3;
-      pci=_ap[api]&7;
-      _l[aci][acj]+=OC_IDCT_MATRIX[pci][ccj]*OC_IDCT_MATRIX[pri][crj]*
-       OC_IDCT_MATRIX[pci][cci]*OC_IDCT_MATRIX[pri][cri];
-    }
+/*Pads a single row of a partial block and then performs a forward Type-II DCT
+   on the result.
+  The output is scaled by a factor of 2 from the orthonormal version of the
+   transform.
+  _y: The buffer to store the result in.
+      Data will be placed in every 8th entry (e.g., in a column of an 8x8
+       block).
+  _x: The input coefficients.
+      The first 8 entries are used (e.g., from a row of an 8x8 block).
+  _e: The extension information for the shape.*/
+static void fdct8_ext(ogg_int16_t *_y,ogg_int16_t _x[8],
+ const oc_extension_info *_e){
+  if(_e->na==1){
+    int ci;
+    /*While the branch below is still correct for shapes with na==1, we can
+       perform the entire transform with just 1 multiply in this case instead
+       of 23.*/
+    _y[0]=(ogg_int16_t)(OC_DIV2_16(OC_C4S4*(_x[_e->pi[0]]<<3)));
+    for(ci=8;ci<64;ci+=8)_y[ci]=0;
   }
-  /*Step 2: Convert the newly added row to the corresponding row of the
-     Cholesky factorization.*/
-  for(acj=0;acj<aci;acj++){
-    for(ack=0;ack<acj;ack++)_l[aci][acj]-=_l[aci][ack]*_l[acj][ack];
-    _l[aci][acj]/=_l[acj][acj];
-  }
-  for(ack=0;ack<aci;ack++)_l[aci][aci]-=_l[aci][ack]*_l[aci][ack];
-}
-
-/*Computes the set of basis functions to use to synthesize a partial block.
-  These basis functions are then used to derive a matrix that extends the
-   signal into the missing region from the pixel values that are present.
-  _border_info: The border info structure to initialize with details about the
-                 selected basis set.
-  _border:      A description of the partial block.*/
-void oc_border_enc_info_init(oc_border_enc_info *_border_info,
- const oc_border_info *_border){
-  float ext_matrix[64][64];
-  float b[64];
-  float l[64][64];
-  float best_l_nn2;
-  float scale;
-  float w;
-  int   ap[64];
-  int   zp[64];
-  int   ac[64];
-  int   best_aci;
-  int   shift;
-  int   nap;
-  int   nzp;
-  int   nac;
-  int   api;
-  int   zpi;
-  int   aci;
-  int   acj;
-  int   cri;
-  int   cci;
-  int   pri;
-  int   pci;
-  int   pi;
-  int   ci;
-  for(zpi=api=pi=0;pi<64;pi++){
-    if((int)(_border->mask>>pi)&1)ap[api++]=pi;
-    else zp[zpi++]=pi;
-  }
-  nap=api;
-  nzp=zpi;
-  for(api=0;api<nap;api++)_border_info->pi[api]=(unsigned char)ap[api];
-  for(zpi=0;zpi<nzp;zpi++)_border_info->pi[nap+zpi]=(unsigned char)zp[zpi];
-  /*Step 1: Select a linearly independent set of basis functions.*/
-  for(ci=0;ci<64;ci++)ac[OC_IZIG_ZAG[ci]]=ci;
-  /*Always include the DC coefficient.*/
-  for(nac=1;;nac++){
-    /*Update L with the most recently selected basis function.*/
-    oc_update_l(l,ac,nac,ap,nap);
-    l[nac-1][nac-1]=OC_SQRTF(l[nac-1][nac-1]);
-    if(nac>=nap)break;
-    /*Choose the next basis function to add based on its potential entry on
-       the diagonal of the Cholesky factorization L.*/
-    best_aci=nac;
-    best_l_nn2=0;
-    for(aci=nac;aci<64;aci++){
-      ci=ac[nac];
-      ac[nac]=ac[aci];
-      ac[aci]=ci;
-      oc_update_l(l,ac,nac+1,ap,nap);
-      if(l[nac][nac]>best_l_nn2){
-        best_l_nn2=l[nac][nac];
-        best_aci=aci;
-      }
-      ci=ac[nac];
-      ac[nac]=ac[aci];
-      ac[aci]=ci;
+  else{
+    int zpi;
+    int api;
+    int nz;
+    /*First multiply by the extension matrix to compute the padding values.*/
+    nz=8-_e->na;
+    for(zpi=0;zpi<nz;zpi++){
+      ogg_int32_t v;
+      v=0;
+      for(api=0;api<_e->na;api++)v+=_e->ext[zpi][api]*_x[_e->pi[api]];
+      _x[_e->pi[zpi+_e->na]]=
+       (ogg_int16_t)OC_DIV_ROUND_POW2(v,OC_EXT_SHIFT,1<<OC_EXT_SHIFT-1);
     }
-    ci=ac[nac];
-    ac[nac]=ac[best_aci];
-    ac[best_aci]=ci;
+    fdct8(_y,_x);
   }
-  /*Step 2: Compute the extrapolation matrix.
-    We also compute a shift value to apply to all the non-zero pixel values.
-    We scale the input pixels by this amount and then divide the matrix
-     entries by it, so that we're guaranteed that the entries are always less
-     than one and can be approximated by a signed 1.15 fixed point value.*/
-  shift=0;
-  for(api=0;api<nap;api++){
-    pri=ap[api]>>3;
-    pci=ap[api]&7;
-    for(aci=0;aci<nac;aci++){
-      cri=ac[aci]>>3;
-      cci=ac[aci]&7;
-      b[aci]=OC_IDCT_MATRIX[pci][cci]*OC_IDCT_MATRIX[pri][cri];
-    }
-    for(aci=0;aci<nac;aci++){
-      for(acj=0;acj<aci;acj++)b[aci]-=l[aci][acj]*b[acj];
-      b[aci]/=l[aci][aci];
-    }
-    for(aci=nac;aci-->0;){
-      for(acj=aci+1;acj<nac;acj++)b[aci]-=l[acj][aci]*b[acj];
-      b[aci]/=l[aci][aci];
-    }
-    for(zpi=0;zpi<nzp;zpi++){
-      pri=zp[zpi]>>3;
-      pci=zp[zpi]&7;
-      w=0;
-      for(aci=0;aci<nac;aci++){
-        cri=ac[aci]>>3;
-        cci=ac[aci]&7;
-        w+=OC_IDCT_MATRIX[pci][cci]*OC_IDCT_MATRIX[pri][cri]*b[aci];
-      }
-      ext_matrix[zpi][api]=w;
-      w=OC_FABSF(w);
-      while(w>=(1<<shift))shift++;
-    }
-  }
-  _border_info->shift=shift;
-  scale=1.0F/(1<<shift);
-  _border_info->ext_matrix=(ogg_int16_t **)oc_malloc_2d(nzp,nap,
-   sizeof(_border_info->ext_matrix[0][0]));
-  for(zpi=0;zpi<nzp;zpi++){
-    for(api=0;api<nap;api++){
-      w=ext_matrix[zpi][api]*scale;
-      _border_info->ext_matrix[zpi][api]=(ogg_int16_t)
-       OC_CLAMPI(-32768,(int)(32768*w+(w<0?-0.5F:0.5F)),32767);
-    }
-  }
 }
 
 /*Performs a forward 8x8 Type-II DCT transform on blocks which overlap the
-   border of the displayable region.
+   border of the picture region.
+  This method ONLY works with rectangular regions.
   _border:      A description of which pixels are inside the border.
-  _border_info: Information needed to extrapolate pixel values outside the
-                 border.
   _y:           The buffer to store the result in.
                 This may be the same as _x.
   _x:           The input coefficients.
                 Pixel values outside the border will be modified.*/
-void oc_fdct8x8_border(const oc_border_info *_border,
- const oc_border_enc_info *_border_info,
- ogg_int16_t _y[64],ogg_int16_t _x[64]){
-  ogg_int32_t ap[64];
-  int         nap;
-  int         nzp;
-  int         api;
-  int         zpi;
-  nap=_border->npixels;
-  nzp=64-nap;
-  for(api=0;api<nap;api++){
-    ap[api]=_x[_border_info->pi[api]]<<_border_info->shift;
+void oc_fdct8x8_border(const oc_border_info *_border,ogg_int16_t _y[64],
+ ogg_int16_t _x[64]){
+  ogg_int16_t             *in;
+  ogg_int16_t             *end;
+  ogg_int16_t             *out;
+  ogg_int16_t              w[64];
+  ogg_int64_t              mask;
+  const oc_extension_info *rext;
+  const oc_extension_info *cext;
+  int                      rmask;
+  int                      cmask;
+  int                      ri;
+  /*Identify the shapes of the non-zero rows and columns.*/
+  rmask=cmask=0;
+  mask=_border->mask;
+  for(ri=0;ri<8;ri++){
+    rmask|=mask&0xFF;
+    cmask|=((mask&0xFF)!=0)<<ri;
+    mask>>=8;
   }
-  for(zpi=0;zpi<nzp;zpi++){
-    ogg_int32_t v;
-    for(v=api=0;api<nap;api++){
-      v+=_border_info->ext_matrix[zpi][api]*ap[api];
+  /*Find the associated extension info for these shapes.*/
+  if(rmask==0xFF)rext=NULL;
+  else for(rext=OC_EXTENSION_INFO;rext->mask!=rmask;){
+    /*If we somehow can't find the shape, then just do an unpadded fDCT.
+      It won't be efficient, but it should still be correct.*/
+    if(++rext>=OC_EXTENSION_INFO+OC_NSHAPES){
+      oc_fdct8x8(_y,_x);
+      return;
     }
-    _x[_border_info->pi[nap+zpi]]=(ogg_int16_t)OC_DIV_ROUND_POW2(v,15,16384);
   }
-  oc_fdct8x8(_y,_x);
+  if(cmask==0xFF)cext=NULL;
+  else for(cext=OC_EXTENSION_INFO;cext->mask!=cmask;){
+    /*If we somehow can't find the shape, then just do an unpadded fDCT.
+      It won't be efficient, but it should still be correct.*/
+    if(++cext>=OC_EXTENSION_INFO+OC_NSHAPES){
+      oc_fdct8x8(_y,_x);
+      return;
+    }
+  }
+  /*Transform the rows.
+    We can ignore zero rows without a problem.*/
+  if(rext==NULL)for(in=_x,out=w,end=out+8;out<end;in+=8,out++)fdct8(out,in);
+  else for(in=_x,out=w,end=out+8,ri=cmask;out<end;in+=8,out++,ri>>=1){
+    if(ri&1)fdct8_ext(out,in,rext);
+  }
+  /*Transform the columns.
+    We transform even columns that are supposedly zero, because rounding errors
+     may make them slightly non-zero, and this will give a more precise
+     reconstruction with very small quantizers.*/
+  if(cext==NULL)for(in=w,out=_y,end=out+8;out<end;in+=8,out++)fdct8(out,in);
+  else for(in=w,out=_y,end=out+8;out<end;in+=8,out++)fdct8_ext(out,in,cext);
 }

Modified: experimental/derf/theora-exp/lib/fdct.h
===================================================================
--- experimental/derf/theora-exp/lib/fdct.h	2004-10-15 06:55:05 UTC (rev 8005)
+++ experimental/derf/theora-exp/lib/fdct.h	2004-10-15 06:58:04 UTC (rev 8006)
@@ -4,10 +4,7 @@
 # define _fdct_H (1)
 
 void oc_fdct8x8(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
-void oc_border_enc_info_init(oc_border_enc_info *_border_info,
- const oc_border_info *_border);
-void oc_fdct8x8_border(const oc_border_info *_border,
- const oc_border_enc_info *_border_info,ogg_int16_t _y[64],
+void oc_fdct8x8_border(const oc_border_info *_border,ogg_int16_t _y[64],
  ogg_int16_t _x[64]);
 
 #endif

Added: experimental/derf/theora-exp/tools/extgen.c
===================================================================
--- experimental/derf/theora-exp/tools/extgen.c	2004-10-15 06:55:05 UTC (rev 8005)
+++ experimental/derf/theora-exp/tools/extgen.c	2004-10-15 06:58:04 UTC (rev 8006)
@@ -0,0 +1,406 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <float.h>
+#include <string.h>
+
+
+
+/*This utility generates the extrapolation matrices used to pad incomplete
+   blocks when given frame sizes that are not a multiple of two.
+  Although the technique used here is generalizable to non-rectangular shapes,
+   the storage needed for the matrices becomes larger (exactly 13.75K, if we
+   discount the superfluous mask, na, pi, and ci members of the
+   oc_coeffcient_info structure and use a 256-byte mask-padding lookup table
+   instead).*/
+
+
+
+/*The matrix representation of the Theora's actual iDCT.
+  This is computed using the 16-bit approximations of the sines and cosines
+   that Theora uses, but using floating point values and assuming that no
+   truncation occurs after division (i.e., pretending that the transform is
+   linear like the real DCT).*/
+static const double OC_IDCT_MATRIX[8][8]={
+  {
+    +0.7071075439453125,+0.9807891845703125,
+    +0.9238739013671875,+0.8314666748046875,
+    +0.7071075439453125,+0.555572509765625,
+    +0.3826904296875,   +0.1950836181640625
+  },
+  {
+    +0.7071075439453125,+0.8314685295335948,
+    +0.3826904296875,   -0.1950868454296142,
+    -0.7071075439453125,-0.9807858711574227,
+    -0.9238739013671875,-0.5555783333256841
+  },
+  {
+    +0.7071075439453125,+0.5555783333256841,
+    -0.3826904296875,   -0.9807858711574227,
+    -0.7071075439453125,+0.1950868454296142,
+     0.9238739013671875,+0.8314685295335948
+  },
+  {
+    +0.7071075439453125,+0.1950836181640625,
+    -0.9238739013671875,-0.555572509765625,
+    +0.7071075439453125,+0.8314666748046875,
+    -0.3826904296875,   -0.9807891845703125
+  },
+  {
+    +0.7071075439453125,-0.1950836181640625,
+    -0.9238739013671875,+0.555572509765625,
+    +0.7071075439453125,-0.8314666748046875,
+    -0.3826904296875,   +0.9807891845703125
+  },
+  {
+    +0.7071075439453125,-0.5555783333256841,
+    -0.3826904296875,    0.9807858711574227,
+    -0.7071075439453125,-0.1950868454296142,
+    +0.9238739013671875,-0.8314685295335948
+  },
+  {
+    +0.7071075439453125,-0.8314685295335948,
+    +0.3826904296875,    +0.1950868454296142,
+    -0.7071075439453125,+0.9807858711574227,
+    -0.9238739013671875,+0.5555783333256841
+  },
+  {
+    +0.7071075439453125,-0.9807891845703125,
+    +0.9238739013671875,-0.8314666748046875,
+    +0.7071075439453125,-0.555572509765625,
+    +0.3826904296875,   -0.1950836181640625
+  }
+};
+
+/*A table of the non-zero coefficients to keep for each possible shape.
+  Our basis selection is chosen to optimize the coding gain.
+  This gives us marginally better performance than other optimization criteria
+   for the border extension case (and significantly better performance in the
+   general case).*/
+static const unsigned char OC_PAD_MASK_GAIN[256]={
+  0x00,0x01,0x01,0x11,0x01,0x11,0x05,0x49,
+  0x01,0x05,0x11,0x15,0x11,0x15,0xA1,0x55,
+  0x01,0x05,0x11,0x15,0x11,0x29,0x51,0x55,
+  0x03,0x85,0x19,0xA5,0x61,0x35,0xB1,0xAD,
+  0x01,0x11,0x05,0x23,0x03,0x85,0x0D,0x2B,
+  0x11,0x15,0x61,0x55,0x13,0x1B,0xA3,0x5B,
+  0x11,0x83,0x83,0x55,0x13,0x93,0xC3,0xCB,
+  0x61,0xC3,0x33,0x3B,0x99,0xCB,0xB3,0xDB,
+  0x01,0x11,0x03,0x0B,0x05,0x0B,0x07,0x4B,
+  0x11,0x07,0x07,0x1B,0x83,0x8B,0x87,0x57,
+  0x11,0x23,0x07,0x33,0x61,0x2B,0x0F,0x57,
+  0x19,0x87,0x1B,0x2F,0x33,0x8F,0x8F,0xB7,
+  0x05,0x83,0x07,0x63,0x0D,0x87,0x87,0x6B,
+  0x51,0x47,0x0F,0x67,0xC3,0xCB,0xE3,0xD7,
+  0xA1,0xA3,0x87,0x73,0xA3,0x6B,0xE3,0xCF,
+  0xB1,0x8F,0x8F,0xE7,0xB3,0xCF,0xE7,0xF7,
+  0x01,0x03,0x11,0x43,0x11,0x31,0x83,0x93,
+  0x05,0x07,0x23,0x27,0x83,0x87,0xA3,0x57,
+  0x05,0x07,0x07,0xC3,0x15,0x0F,0x47,0x4F,
+  0x85,0x87,0x87,0xA7,0xC3,0x8F,0x8F,0xAF,
+  0x11,0x31,0x0B,0x1B,0x85,0x1B,0x87,0x2F,
+  0x29,0x0F,0x2B,0x1F,0x93,0x1F,0x6B,0x5F,
+  0x15,0x87,0x8B,0x57,0x1B,0x1F,0xCB,0x5F,
+  0x35,0x8F,0x8F,0x3F,0xCB,0x9F,0xCF,0xDF,
+  0x11,0x43,0x0B,0x33,0x23,0x1B,0x63,0x9B,
+  0x15,0xC3,0x33,0x37,0x55,0x57,0x73,0x77,
+  0x15,0x27,0x1B,0x37,0x55,0x1F,0x67,0x3F,
+  0xA5,0xA7,0x2F,0xB7,0x3B,0x3F,0xE7,0xBF,
+  0x49,0x93,0x4B,0x9B,0x2B,0x2F,0x6B,0x7B,
+  0x55,0x4F,0x57,0x3F,0xCB,0x5F,0xCF,0x7F,
+  0x55,0x57,0x57,0x77,0x5B,0x5F,0xD7,0x7F,
+  0xAD,0xAF,0xB7,0xBF,0xDB,0xDF,0xF7,0xFF
+};
+
+
+
+/*Computes the Cholesky factorization L L^T of the matrix G^T G, where G is the
+   currently selected bass functions restricted to the region of spatial
+   support.
+  The reciprocal of the diagonal element is stored instead of the diagonal
+   itself, so that the division only needs to be done once.
+  _l:   The L matrix to compute.
+  _ac:  The set of basis functions used for each row.
+  _ap:  The set of pixels that are not padding.
+  _na:  The number of active pixels/coefficients.*/
+static void oc_cholesky8(double _l[8][8],int _ac[8],int _ap[8],int _na){
+  int aci;
+  int acj;
+  int ack;
+  int api;
+  int ci;
+  int cj;
+  int pi;
+  for(aci=0;aci<_na;aci++){
+    /*Step 1: Add the next row/column of G^T G.*/
+    ci=_ac[aci];
+    for(acj=0;acj<=aci;acj++){
+      cj=_ac[acj];
+      _l[aci][acj]=0;
+      for(api=0;api<_na;api++){
+        pi=_ap[api];
+        _l[aci][acj]+=OC_IDCT_MATRIX[pi][cj]*OC_IDCT_MATRIX[pi][ci];
+      }
+    }
+    /*Step 2: Convert the newly added row to the corresponding row of the
+       Cholesky factorization.*/
+    for(acj=0;acj<aci;acj++){
+      for(ack=0;ack<acj;ack++)_l[aci][acj]-=_l[aci][ack]*_l[acj][ack];
+      _l[aci][acj]*=_l[acj][acj];
+    }
+    for(ack=0;ack<aci;ack++)_l[aci][aci]-=_l[aci][ack]*_l[aci][ack];
+    _l[aci][aci]=1/sqrt(_l[aci][aci]);
+  }
+}
+
+/*Computes the padding extrapolation matrix for a single 1-D 8-point DCT.*/
+int oc_calc_ext8(double _e[8][8],int _ap[8],int _ac[8],int _mask,int _pad){
+  double b[8];
+  double l[8][8];
+  double w;
+  int   api;
+  int   zpi;
+  int   aci;
+  int   acj;
+  int   ci;
+  int   pi;
+  int   na;
+  int   nz;
+  na=nz=0;
+  for(pi=0;pi<8;pi++){
+    if(_mask&1<<pi)_ap[na++]=pi;
+    else _ap[8-++nz]=pi;
+  }
+  na=nz=0;
+  for(ci=0;ci<8;ci++){
+    if(_pad&1<<ci)_ac[na++]=ci;
+    else _ac[8-++nz]=ci;
+  }
+  oc_cholesky8(l,_ac,_ap,na);
+  for(api=0;api<na;api++){
+    pi=_ap[api];
+    for(aci=0;aci<na;aci++){
+      b[aci]=OC_IDCT_MATRIX[pi][_ac[aci]];
+      for(acj=0;acj<aci;acj++)b[aci]-=l[aci][acj]*b[acj];
+      b[aci]*=l[aci][aci];
+    }
+    for(aci=na;aci-->0;){
+      for(acj=aci+1;acj<na;acj++)b[aci]-=l[acj][aci]*b[acj];
+      b[aci]*=l[aci][aci];
+    }
+    for(zpi=na;zpi<8;zpi++){
+      pi=_ap[zpi];
+      w=0;
+      for(aci=0;aci<na;aci++)w+=OC_IDCT_MATRIX[pi][_ac[aci]]*b[aci];
+      _e[zpi][api]=w;
+    }
+  }
+  return na;
+}
+
+/*The precision at which the matrix coefficients are stored.*/
+#define OC_EXT_SHIFT (19)
+
+/*The number of shapes we need.*/
+#define OC_NSHAPES   (35)
+
+/*These are all the possible shapes that could be encountered in border
+   extension.
+  The first 14 correspond to the left and right borders (or top and bottom when
+   considered vertically), and the next 21 arise from the combination of the
+   two, to handle the case of a video less than 8 pixels wide (or tall).
+  Order matters here, because we will search this array linearly for the entry
+   to use during the actual transform.*/
+static int masks[OC_NSHAPES]={
+  0x7F,0xFE,0x3F,0xFC,0x1F,0xF8,0x0F,0xF0,0x07,0xE0,0x03,0xC0,0x01,0x80,
+  0x7E,
+  0x7C,0x3E,
+  0x78,0x3C,0x1E,
+  0x70,0x38,0x1C,0x0E,
+  0x60,0x30,0x18,0x0C,0x06,
+  0x40,0x20,0x10,0x08,0x04,0x02
+};
+
+int main(void){
+  int   coeffs[8192];
+  int   ncoeffs;
+  int   ci;
+  int   crstart[OC_NSHAPES*8];
+  int   crend[OC_NSHAPES*8];
+  int   ncr;
+  int   cri;
+  int   offsets[1024];
+  int   noffsets;
+  int   oi;
+  int   orstart[OC_NSHAPES*8];
+  int   orend[OC_NSHAPES*8];
+  int   nor;
+  int   ori;
+  int   ap[OC_NSHAPES][8];
+  int   ac[OC_NSHAPES][8];
+  int   na[OC_NSHAPES];
+  int   zpi;
+  int   mi;
+  ncoeffs=ncr=0;
+  /*We store all the coefficients from the extension matrices in one large
+     table.
+    This lets us exploit the repetitiveness and overlap of many of the rows to
+     reduce the size of the tables by about 49%.
+    Our overlap search strategy is pretty simple, so we could probably do even
+     better, but further improvement would be small.*/
+  for(mi=0;mi<OC_NSHAPES;mi++){
+    double e[8][8];
+    int    api;
+    na[mi]=oc_calc_ext8(e,ap[mi],ac[mi],masks[mi],OC_PAD_MASK_GAIN[masks[mi]]);
+    for(zpi=na[mi];zpi<8;zpi++){
+      int cr[8];
+      int best_ci;
+      int best_cre;
+      int cj;
+      for(api=0;api<na[mi];api++){
+        cr[api]=(int)(e[zpi][api]*(1<<OC_EXT_SHIFT)+(e[zpi][api]<0?-0.5F:0.5F));
+      }
+      best_ci=ncoeffs;
+      best_cre=0;
+      for(ci=0;ci<ncoeffs;ci++){
+        for(cj=ci;cj<ncoeffs&&cj-ci<na[mi]&&coeffs[cj]==cr[cj-ci];cj++);
+        if(cj-ci>best_cre){
+          if(cj-ci<na[mi]){
+            /*If we need to extend the range, check to make sure we aren't
+               interrupting any other range.*/
+            for(cri=0;cri<ncr&&(crstart[cri]>=cj||crend[cri]<=cj);cri++);
+            if(cri>=ncr){
+              /*No conflicting ranges were found, so we can insert the
+                 necessary coefficients.*/
+              best_ci=ci;
+              best_cre=cj-ci;
+            }
+          }
+          else{
+            /*Otherwise, we have a complete match, so we can stop searching.*/
+            best_ci=ci;
+            best_cre=cj-ci;
+            break;
+          }
+        }
+      }
+      if(best_cre<na[mi]){
+        /*We don't have a complete match, so we need to insert the remaining
+           coefficients.*/
+        memmove(coeffs+best_ci+na[mi],coeffs+best_ci+best_cre,
+         sizeof(coeffs[0])*(ncoeffs-best_ci-best_cre));
+        /*And now we need to update all the ranges that start after the
+           insertion point.*/
+        for(cri=0;cri<ncr;cri++){
+          if(crstart[cri]>=best_ci+best_cre){
+            crstart[cri]+=na[mi]-best_cre;
+            crend[cri]+=na[mi]-best_cre;
+          }
+        }
+      }
+      /*Actually add the coefficients.*/
+      for(cj=best_cre;cj<na[mi];cj++)coeffs[best_ci+cj]=cr[cj];
+      ncoeffs+=na[mi]-best_cre;
+      /*Store the endpoints of the range.*/
+      crstart[ncr]=best_ci;
+      crend[ncr]=best_ci+na[mi];
+      ncr++;
+    }
+
+  }
+  printf("/*The precision at which the matrix coefficients are stored.*/\n");
+  printf("#define OC_EXT_SHIFT (%i)\n\n",OC_EXT_SHIFT);
+  printf("/*The number of shapes we need.*/\n");
+  printf("#define OC_NSHAPES   (%i)\n\n",OC_NSHAPES);
+  printf("static const ogg_int32_t OC_EXT_COEFFS[%i]={",ncoeffs);
+  for(ci=0;ci<ncoeffs;ci++){
+    if((ci&7)==0)printf("\n  ");
+    printf("%c0x%05X",coeffs[ci]<0?'-':' ',abs(coeffs[ci]));
+    if(ci+1<ncoeffs)printf(",");
+  }
+  printf("\n};\n\n");
+  /*Now we repeat the same overlap strategy on the row pointers.
+    This only gets us around 45% space reduction, but that's still worth it.*/
+  noffsets=nor=0;
+  for(cri=mi=0;mi<OC_NSHAPES;mi++){
+    int best_oi;
+    int best_ore;
+    int oj;
+    int nz;
+    best_oi=noffsets;
+    best_ore=0;
+    nz=8-na[mi];
+    for(oi=0;oi<noffsets;oi++){
+      for(oj=oi;oj<noffsets&&oj-oi<nz&&offsets[oj]==crstart[cri+oj-oi];
+       oj++);
+      if(oj-oi>best_ore){
+        if(oj-oi<nz){
+          /*If we need to extend the range, check to make sure we aren't
+             interrupting any other range.*/
+          for(ori=0;ori<nor&&(orstart[ori]>=oj||orend[ori]<=oj);ori++);
+          if(ori>=nor){
+            /*No conflicting ranges were found, so we can insert the
+               necessary coefficients.*/
+            best_oi=oi;
+            best_ore=oj-oi;
+          }
+        }
+        else{
+          /*Otherwise, we have a complete match, so we can stop searching.*/
+          best_oi=oi;
+          best_ore=oj-oi;
+          break;
+        }
+      }
+    }
+    if(best_ore<nz){
+      /*We don't have a complete match, so we need to insert the remaining
+         offsets.*/
+      memmove(offsets+best_oi+nz,offsets+best_oi+best_ore,
+       sizeof(offsets[0])*(noffsets-best_oi-best_ore));
+      /*And now we need to update all the ranges that start after the
+         insertion point.*/
+      for(ori=0;ori<nor;ori++){
+        if(orstart[ori]>=best_oi+best_ore){
+          orstart[ori]+=nz-best_ore;
+          orend[ori]+=nz-best_ore;
+        }
+      }
+    }
+    /*Actually add the offsets.*/
+    for(oj=best_ore;oj<nz;oj++)offsets[best_oi+oj]=crstart[cri+oj];
+    noffsets+=nz-best_ore;
+    /*Store the endpoints of the range.*/
+    orstart[nor]=best_oi;
+    orend[nor]=best_oi+nz;
+    nor++;
+    cri+=nz;
+  }
+  printf("static const ogg_int32_t *const OC_EXT_ROWS[%i]={",noffsets);
+  for(oi=0;oi<noffsets;oi++){
+    if((oi&3)==0)printf("\n  ");
+    printf("OC_EXT_COEFFS+%4i",offsets[oi]);
+    if(oi+1<noffsets)printf(",");
+  }
+  printf("\n};\n\n");
+  printf("static const oc_extension_info OC_EXTENSION_INFO[OC_NSHAPES]={\n");
+  for(mi=0;mi<OC_NSHAPES;mi++){
+    int i;
+    printf("  {");
+    printf("0x%02X,%i,OC_EXT_ROWS+%3i,",masks[mi],na[mi],orstart[mi]);
+    printf("{");
+    for(i=0;i<8;i++){
+      printf("%i",ap[mi][i]);
+      if(i<7)printf(",");
+    }
+    printf("},{");
+    for(i=0;i<8;i++){
+      printf("%i",ac[mi][i]);
+      if(i<7)printf(",");
+    }
+    printf("}}");
+    if(mi+1<OC_NSHAPES)printf(",");
+    printf("\n");
+  }
+  printf("};\n");
+  return 0;
+}