[xiph-commits] r8006 - in experimental/derf/theora-exp: lib tools
tterribe at motherfish-iii.xiph.org
tterribe at motherfish-iii.xiph.org
Thu Oct 14 23:58:04 PDT 2004
Author: tterribe
Date: 2004-10-14 23:58:04 -0700 (Thu, 14 Oct 2004)
New Revision: 8006
Added:
experimental/derf/theora-exp/tools/extgen.c
Modified:
experimental/derf/theora-exp/lib/encint.h
experimental/derf/theora-exp/lib/encode.c
experimental/derf/theora-exp/lib/fdct.c
experimental/derf/theora-exp/lib/fdct.h
Log:
Commit a much faster implementation of boundary padding.
The total cost is now no more than twice that of a normal fDCT, and needs only
around 1.5K of global side information, instead of around 64K per encoder
instance.
Performance should be indistinguishable.
Also add another part of the zero-length packet for dropped frames.
Not sure why this wasn't committed before.
Modified: experimental/derf/theora-exp/lib/encint.h
===================================================================
--- experimental/derf/theora-exp/lib/encint.h 2004-10-15 06:55:05 UTC (rev 8005)
+++ experimental/derf/theora-exp/lib/encint.h 2004-10-15 06:58:04 UTC (rev 8006)
@@ -6,7 +6,6 @@
typedef struct oc_fragment_enc_info oc_fragment_enc_info;
typedef struct oc_mb_enc_info oc_mb_enc_info;
-typedef struct oc_border_enc_info oc_border_enc_info;
typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser;
typedef struct oc_impmap_ctx oc_impmap_ctx;
typedef struct oc_mcenc_ctx oc_mcenc_ctx;
@@ -76,19 +75,6 @@
int aerror4mv;
};
-/*Information used to extend fragments which straddle the cropping border in
- an optimal manner.*/
-struct oc_border_enc_info{
- /*The indices of pixels with valid values first and then padding pixels.*/
- unsigned char pi[64];
- /*The amount to scale pixel values by before multiplying by the extension
- matrix.*/
- int shift;
- /*The matrix used to extend the fragment into the padding region, in signed
- 1.15 fixed-point format.*/
- ogg_int16_t **ext_matrix;
-};
-
/*A structure used to track the optimal mode coding scheme, for use in
estimating the cost of coding each mode label during the mode selection
process.*/
@@ -143,8 +129,6 @@
oc_fragment_enc_info *frinfo;
/*Encoder-specific macro block information.*/
oc_mb_enc_info *mbinfo;
- /*Encoder-specific border information.*/
- oc_border_enc_info border_info[16];
/*Minimum psychovisual tolerance for the DC coefficients in each plane.*/
unsigned dc_tol_mins[3];
/*The qi value lists selected for each potential frame type.*/
Modified: experimental/derf/theora-exp/lib/encode.c
===================================================================
--- experimental/derf/theora-exp/lib/encode.c 2004-10-15 06:55:05 UTC (rev 8005)
+++ experimental/derf/theora-exp/lib/encode.c 2004-10-15 06:58:04 UTC (rev 8006)
@@ -850,7 +850,6 @@
pixels.*/
if(_frag->border!=NULL){
ogg_int64_t mask;
- int bii;
mask=_frag->border->mask;
for(pixi=y=0;y<8;y++){
for(x=0;x<8;x++,pixi++){
@@ -868,8 +867,7 @@
}
pixels+=_ystride;
}
- bii=_frag->border-_enc->state.borders;
- oc_fdct8x8_border(_frag->border,_enc->border_info+bii,_dct_vals,pix_buf);
+ oc_fdct8x8_border(_frag->border,_dct_vals,pix_buf);
}
/*Otherwise, copy all the pixels in the fragment and do a normal DCT.*/
else{
@@ -897,7 +895,6 @@
int ref_framei;
int mvoffset0;
int mvoffset1;
- int bii;
int y;
int x;
src_ystride=_enc->state.input[_pli].ystride;
@@ -921,9 +918,7 @@
ref0+=ref_ystride;
ref1+=ref_ystride;
}
- bii=_frag->border-_enc->state.borders;
- oc_fdct8x8_border(_frag->border,_enc->border_info+bii,_dct_vals,
- pix_buf);
+ oc_fdct8x8_border(_frag->border,_dct_vals,pix_buf);
}
else{
for(pixi=y=0;y<8;y++){
@@ -950,9 +945,7 @@
src+=src_ystride;
ref0+=ref_ystride;
}
- bii=_frag->border-_enc->state.borders;
- oc_fdct8x8_border(_frag->border,_enc->border_info+bii,_dct_vals,
- pix_buf);
+ oc_fdct8x8_border(_frag->border,_dct_vals,pix_buf);
}
else{
for(pixi=y=0;y<8;y++){
@@ -2437,24 +2430,30 @@
}
static int oc_enc_deltaframe(oc_enc_ctx *_enc){
- oc_enc_mark_coded(_enc);
- oc_enc_quant_sel_quality(_enc,0);
- _enc->state.frame_type=oc_enc_choose_mbmodes(_enc);
- if(_enc->state.frame_type==OC_INTER_FRAME)oc_enc_do_inter_dcts(_enc);
- oc_enc_quant_dc(_enc);
- oc_enc_residual_tokenize(_enc);
oggpackB_reset(&_enc->opb);
- oc_enc_frame_header_pack(_enc);
- if(_enc->state.frame_type==OC_INTER_FRAME){
- oggpackB_writecopy(&_enc->opb,
- oggpackB_get_buffer(&_enc->opb_coded_flags),
- oggpackB_bits(&_enc->opb_coded_flags));
- oc_enc_mb_modes_pack(_enc);
- oc_enc_mvs_pack(_enc);
+ oc_enc_mark_coded(_enc);
+ /*Only proceed if we have some coded blocks.
+ No coded blocks -> dropped frame -> 0 byte packet.*/
+ if(_enc->state.ncoded_fragis[0]!=0||
+ _enc->state.ncoded_fragis[1]!=0||
+ _enc->state.ncoded_fragis[2]!=0){
+ oc_enc_quant_sel_quality(_enc,0);
+ _enc->state.frame_type=oc_enc_choose_mbmodes(_enc);
+ if(_enc->state.frame_type==OC_INTER_FRAME)oc_enc_do_inter_dcts(_enc);
+ oc_enc_quant_dc(_enc);
+ oc_enc_residual_tokenize(_enc);
+ oc_enc_frame_header_pack(_enc);
+ if(_enc->state.frame_type==OC_INTER_FRAME){
+ oggpackB_writecopy(&_enc->opb,
+ oggpackB_get_buffer(&_enc->opb_coded_flags),
+ oggpackB_bits(&_enc->opb_coded_flags));
+ oc_enc_mb_modes_pack(_enc);
+ oc_enc_mvs_pack(_enc);
+ }
+ oc_enc_block_qis_pack(_enc);
+ /*Pack the quantized DCT coefficients.*/
+ oc_enc_residual_tokens_pack(_enc);
}
- oc_enc_block_qis_pack(_enc);
- /*Pack the quantized DCT coefficients.*/
- oc_enc_residual_tokens_pack(_enc);
/*Success: Mark the packet as ready to be flushed.*/
_enc->packet_state=OC_PACKET_READY;
return 0;
@@ -2462,17 +2461,12 @@
static int oc_enc_init(oc_enc_ctx *_enc,const theora_info *_info){
- int bii;
int ret;
/*Initialize the shared encoder/decoder state.*/
ret=oc_state_init(&_enc->state,_info);
if(ret<0)return ret;
_enc->block_coded_flags=_ogg_calloc(_enc->state.nfrags,
sizeof(_enc->block_coded_flags[0]));
- /*Compute the matrices needed to extend the frame into the border regions.*/
- for(bii=0;bii<_enc->state.nborders;bii++){
- oc_border_enc_info_init(_enc->border_info+bii,_enc->state.borders+bii);
- }
/*Initialize our packet buffers.*/
oggpackB_writeinit(&_enc->opb);
oggpackB_writeinit(&_enc->opb_coded_flags);
Modified: experimental/derf/theora-exp/lib/fdct.c
===================================================================
--- experimental/derf/theora-exp/lib/fdct.c 2004-10-15 06:55:05 UTC (rev 8005)
+++ experimental/derf/theora-exp/lib/fdct.c 2004-10-15 06:58:04 UTC (rev 8006)
@@ -2,66 +2,9 @@
#include <string.h>
#include "dct.h"
#include "fdct.h"
-#include "idct.h"
-/*The matrix representation of the Theora's actual iDCT.
- This is computed using the 16-bit approximations of the sines and cosines
- that Theora uses, but using floating point values and assuming that no
- truncation occurs after division (i.e., pretending that the transform is
- linear like the real DCT).*/
-static const float OC_IDCT_MATRIX[8][8]={
- {
- +0.7071075439453125F,+0.9807891845703125F,
- +0.9238739013671875F,+0.8314666748046875F,
- +0.7071075439453125F,+0.555572509765625F,
- +0.3826904296875F, +0.1950836181640625F
- },
- {
- +0.7071075439453125F,+0.8314685295335948F,
- +0.3826904296875F, -0.1950868454296142F,
- -0.7071075439453125F,-0.9807858711574227F,
- -0.9238739013671875F,-0.5555783333256841F
- },
- {
- +0.7071075439453125F,+0.5555783333256841F,
- -0.3826904296875F, -0.9807858711574227F,
- -0.7071075439453125F,+0.1950868454296142F,
- 0.9238739013671875F,+0.8314685295335948F
- },
- {
- +0.7071075439453125F,+0.1950836181640625F,
- -0.9238739013671875F,-0.555572509765625F,
- +0.7071075439453125F,+0.8314666748046875F,
- -0.3826904296875F, -0.9807891845703125F
- },
- {
- +0.7071075439453125F,-0.1950836181640625F,
- -0.9238739013671875F,+0.555572509765625F,
- +0.7071075439453125F,-0.8314666748046875F,
- -0.3826904296875F, +0.9807891845703125F
- },
- {
- +0.7071075439453125F,-0.5555783333256841F,
- -0.3826904296875F, 0.9807858711574227F,
- -0.7071075439453125F,-0.1950868454296142F,
- +0.9238739013671875F,-0.8314685295335948F
- },
- {
- +0.7071075439453125F,-0.8314685295335948F,
- +0.3826904296875, +0.1950868454296142F,
- -0.7071075439453125F,+0.9807858711574227F,
- -0.9238739013671875F,+0.5555783333256841F
- },
- {
- +0.7071075439453125F,-0.9807891845703125F,
- +0.9238739013671875F,-0.8314666748046875F,
- +0.7071075439453125F,-0.555572509765625F,
- +0.3826904296875F, -0.1950836181640625F
- }
-};
-
/*Performs a forward 8 point Type-II DCT transform.
The output is scaled by a factor of 2 from the orthonormal version of the
transform.
@@ -126,7 +69,6 @@
/*Performs a forward 8x8 Type-II DCT transform.
The output is scaled by a factor of 4 relative to the orthonormal version
of the transform.
- TODO: What is the maximum dynamic range of the input? output?
_y: The buffer to store the result in.
This may be the same as _x.
_x: The input coefficients. */
@@ -141,376 +83,238 @@
for(in=w,out=_y,end=out+8;out<end;in+=8,out++)fdct8(out,in);
}
+/*Information needed to pad boundary blocks.
+ We multiply each row/column by an extension matrix that fills in the padding
+ values as a linear combination of the active values, so that an equivalent
+ number of coefficients are forced to zero.
+ This costs at most 16 multiplies, the same as a 1-D fDCT itself, and as
+ little as 7 multiplies.
+ We compute the extension matrices for every possible shape in advance, as
+ there are only 35.
+ The coefficients for all matrices are stored in a single array to take
+ advantage of the overlap and repetitiveness of many of the shapes.
+ A similar technique is applied to the offsets into this array.
+ This reduces the required table storage by about 48%.
+ See tools/extgen.c for details.
+ We could conceivably do the same for all 256 possible shapes.*/
+typedef struct oc_extension_info{
+ /*The mask of the active pixels in the shape.*/
+ short mask;
+ /*The number of active pixels in the shape.*/
+ short na;
+ /*The extension matrix.
+ This is (8-na)xna*/
+ const ogg_int32_t *const *ext;
+ /*The pixel indices: na active pixels followed by 8-na padding pixels.*/
+ unsigned char pi[8];
+ /*The coefficient indices: na unconstrained coefficients followed by 8-na
+ coefficients to be forced to zero.*/
+ unsigned char ci[8];
+}oc_extension_info;
-/*A general low-pass extension technique:
- The idea behind low-pass signal extension is to extend a signal beyond its
- boundary in such a way that the signal energy is concentrated in the
- low-frequency coefficients (i.e., is easy to compress).
- There are several different ways to go about this.
- The obvious one is to select the coefficients at the end of the zig-zag run,
- and select padding values that force these coefficients to zero.
- This can be done in an iterative manner, by filling in the padding values
- with some initial guess (e.g., the average non-padding pixel value),
- performing a DCT, zeroing the desired coefficients, inverting the DCT, and
- updating the padding values with the result of the inverse (keeping the
- non-padding values fixed).
- I remember reading about this method somewhere, but I can no longer find the
- reference for it.
- In practice, however, this method does not actually work when implemented as
- just described.
+/*The precision at which the matrix coefficients are stored.*/
+#define OC_EXT_SHIFT (19)
- The reason for this becomes clear when examining the non-iterative
- formulation of the problem as a system of linear equations.
- One wants to solve for the padding values that zero out the desired set of
- DCT coefficients.
- This can be done by extracting the appropriate submatrix from the general
- 2D DCT transformation matrix and inverting it, then multiplying this
- inverse by the actual values of these coefficients computed with some
- arbitrary initial padding value (e.g., 0).
+/*The number of shapes we need.*/
+#define OC_NSHAPES (35)
- Unfortunately, the linear system derived in this manner is ill-conditioned.
- Due to the symmetry of the basis functions, the submatrices are often
- singular.
- One could instead use the pseudoinverse to find a least-squares solution
- for the coefficient values, however the system is also fairly stiff,
- containing large order of magnitude differences in the singular values.
- This causes very large values to appear in the pseudoinverse, producing
- interpolated pixel values that are well outside the normal [-255,255]
- range and creating generally large coefficient values in the non-zero
- coefficients.
+static const ogg_int32_t OC_EXT_COEFFS[231]={
+ 0x80000,-0x1E085,-0x96FC9,-0x55871, 0x55871, 0x96FC9, 0x1E085, 0x96FC9,
+ 0x55871,-0x55871,-0x96FC9,-0x1E085, 0x80000, 0x00000, 0x00000, 0x00000,
+ 0x80000, 0x00000, 0x00000, 0x80000,-0x80000, 0x80000, 0x00000, 0x00000,
+ 0x80000,-0x1E086, 0x1E086,-0x4F591,-0x55E37, 0x337C7, 0x8C148, 0x43448,
+ 0x2266F, 0x43448, 0x8C148, 0x337C7,-0x55E37,-0x4F591,-0x75743, 0x4F591,
+ 0x03B45,-0x1D29D, 0x929E0, 0x2CF2A, 0x929E0,-0x1D29D, 0x03B45, 0x4F591,
+ -0x75743, 0x11033, 0x7AEF5, 0x5224C,-0x209FA,-0x3D77A,-0x209FA, 0x5224C,
+ 0x7AEF5, 0x11033, 0x668A4,-0x29124, 0x3A15B, 0x0E6BD,-0x05F98, 0x0E6BD,
+ 0x3A15B,-0x29124, 0x668A4, 0x2A78F, 0x24019,-0x67F0F, 0x50F49, 0x4881E,
+ 0x50F49,-0x67F0F, 0x24019, 0x2A78F,-0x06898, 0x27678, 0x5F220, 0x27678,
+ -0x06898, 0x1F910, 0x76C13,-0x16523, 0x76C13, 0x1F910, 0x9EB2C,-0x2E7B1,
+ 0x0FC86,-0x2E7B1, 0x9EB2C, 0x4F594, 0x43452,-0x129E6, 0x43452, 0x4F594,
+ -0x0A8C0, 0x5D997, 0x2CF29, 0x5D997,-0x0A8C0, 0x30FBD,-0x0B7E5,-0x6AC38,
+ -0x153C8, 0x8B7E5, 0x4F043, 0x8B7E5,-0x153C8,-0x6AC38,-0x0B7E5, 0x30FBD,
+ -0x5A827, 0x153C8, 0x6AC38, 0x3C7A2, 0x1E085, 0x3C7A2, 0x6AC38, 0x153C8,
+ -0x5A827, 0x5A827, 0x6AC38, 0x153C8,-0x3C7A2,-0x1E085,-0x3C7A2, 0x153C8,
+ 0x6AC38, 0x5A827, 0x30FBB, 0x4F045, 0x273D7,-0x273D7, 0x273D7, 0x1E08B,
+ 0x61F75, 0x1E08B, 0x273D7,-0x273D7, 0x273D7, 0x4F045, 0x30FBB,-0x273D7,
+ 0x273D7, 0x9E08B,-0x1E08B, 0x9E08B, 0x273D7,-0x273D7, 0x4F045, 0x30FBB,
+ -0x273D7, 0x273D7,-0x273D7, 0x30FBB, 0x4F045, 0x1FC86, 0x67AC8, 0x18538,
+ -0x1FC86, 0x18538, 0x67AC8, 0x1FC86, 0x45460,-0x1FC87, 0x1FC87, 0x3ABA0,
+ 0x1FC87,-0x1FC87, 0x45460, 0x35052, 0x5586D,-0x0A8BF, 0x5586D, 0x35052,
+ -0x43EF2, 0x78F43, 0x4AFAE,-0x070BD, 0x3C10E, 0x3C10E,-0x070BD, 0x4AFAE,
+ 0x78F43,-0x43EF2, 0x3C10E, 0x78F43,-0x35052, 0x78F43, 0x3C10E, 0x070BD,
+ 0x236D6, 0x5586D, 0x236D6, 0x070BD,-0x07755, 0x3C79F, 0x4AFB6,-0x190D2,
+ 0x4E11C, 0x9FC86, 0x153C4,-0x3504A, 0xAC38E, 0x08CBB, 0x1E086,-0x1E086,
+ 0x80000, 0x08CBB, 0xAC38E,-0x3504A, 0x153C4, 0x9FC86, 0x4E11C,-0x190D2,
+ 0x4AFB6, 0x3C79F,-0x07755,-0x5A818, 0xDA818,-0x5A818,-0x101C31, 0x181C31,
+ -0x101C31,-0xD0C67, 0x150C67,-0xD0C67,-0x7643F, 0xF643F,-0x7643F
+};
- Another obvious choices of coefficients to zero is the m highest frequency
- values in each column, where m is the number of rows that have at least as
- many as 8-ci padding values, where ci is the column index.
- This selects the highest frequency coefficients with some non-zero support
- in the padding region, and ensures that every pixel in the padding region
- is covered by at least one such coefficient's basis function.
- However, these conditions alone are not enough to guarantee that the
- corresponding submatrix of the DCT transform is non-singular, nor does it
- even help reduce the stiffness of the least squares problem.
+static const ogg_int32_t *const OC_EXT_ROWS[96]={
+ OC_EXT_COEFFS+ 0,OC_EXT_COEFFS+ 0,OC_EXT_COEFFS+ 0,OC_EXT_COEFFS+ 0,
+ OC_EXT_COEFFS+ 0,OC_EXT_COEFFS+ 0,OC_EXT_COEFFS+ 0,OC_EXT_COEFFS+ 6,
+ OC_EXT_COEFFS+ 27,OC_EXT_COEFFS+ 38,OC_EXT_COEFFS+ 43,OC_EXT_COEFFS+ 32,
+ OC_EXT_COEFFS+ 49,OC_EXT_COEFFS+ 58,OC_EXT_COEFFS+ 67,OC_EXT_COEFFS+ 71,
+ OC_EXT_COEFFS+ 62,OC_EXT_COEFFS+ 53,OC_EXT_COEFFS+ 12,OC_EXT_COEFFS+ 15,
+ OC_EXT_COEFFS+ 14,OC_EXT_COEFFS+ 13,OC_EXT_COEFFS+ 76,OC_EXT_COEFFS+ 81,
+ OC_EXT_COEFFS+ 86,OC_EXT_COEFFS+ 91,OC_EXT_COEFFS+ 96,OC_EXT_COEFFS+ 98,
+ OC_EXT_COEFFS+ 93,OC_EXT_COEFFS+ 88,OC_EXT_COEFFS+ 83,OC_EXT_COEFFS+ 78,
+ OC_EXT_COEFFS+ 12,OC_EXT_COEFFS+ 15,OC_EXT_COEFFS+ 15,OC_EXT_COEFFS+ 12,
+ OC_EXT_COEFFS+ 12,OC_EXT_COEFFS+ 15,OC_EXT_COEFFS+ 12,OC_EXT_COEFFS+ 15,
+ OC_EXT_COEFFS+ 15,OC_EXT_COEFFS+ 12,OC_EXT_COEFFS+ 101,OC_EXT_COEFFS+ 106,
+ OC_EXT_COEFFS+ 112,OC_EXT_COEFFS+ 16,OC_EXT_COEFFS+ 121,OC_EXT_COEFFS+ 125,
+ OC_EXT_COEFFS+ 20,OC_EXT_COEFFS+ 116,OC_EXT_COEFFS+ 130,OC_EXT_COEFFS+ 133,
+ OC_EXT_COEFFS+ 143,OC_EXT_COEFFS+ 150,OC_EXT_COEFFS+ 157,OC_EXT_COEFFS+ 164,
+ OC_EXT_COEFFS+ 167,OC_EXT_COEFFS+ 160,OC_EXT_COEFFS+ 153,OC_EXT_COEFFS+ 146,
+ OC_EXT_COEFFS+ 136,OC_EXT_COEFFS+ 139,OC_EXT_COEFFS+ 171,OC_EXT_COEFFS+ 176,
+ OC_EXT_COEFFS+ 181,OC_EXT_COEFFS+ 186,OC_EXT_COEFFS+ 191,OC_EXT_COEFFS+ 196,
+ OC_EXT_COEFFS+ 201,OC_EXT_COEFFS+ 206,OC_EXT_COEFFS+ 209,OC_EXT_COEFFS+ 214,
+ OC_EXT_COEFFS+ 198,OC_EXT_COEFFS+ 203,OC_EXT_COEFFS+ 24,OC_EXT_COEFFS+ 211,
+ OC_EXT_COEFFS+ 216,OC_EXT_COEFFS+ 193,OC_EXT_COEFFS+ 188,OC_EXT_COEFFS+ 178,
+ OC_EXT_COEFFS+ 183,OC_EXT_COEFFS+ 173,OC_EXT_COEFFS+ 219,OC_EXT_COEFFS+ 220,
+ OC_EXT_COEFFS+ 220,OC_EXT_COEFFS+ 12,OC_EXT_COEFFS+ 15,OC_EXT_COEFFS+ 219,
+ OC_EXT_COEFFS+ 219,OC_EXT_COEFFS+ 220,OC_EXT_COEFFS+ 222,OC_EXT_COEFFS+ 225,
+ OC_EXT_COEFFS+ 228,OC_EXT_COEFFS+ 229,OC_EXT_COEFFS+ 226,OC_EXT_COEFFS+ 223
+};
- A better strategy is to select a linearly independent set of DCT basis
- functions to represent the pixels inside the boundary, and then extrapolate
- the remaining pixel values using the coefficients derived from these basis
- functions.
- This is the approach taken by Kaup and Aach \cite{KA94}.
- The coefficients corresponding to the basis functions that are not selected
- are implicitly zeroed out by this process.
- One can attempt to select the linearly independent set using Gaussian
- elimination.
- However, experiments show straightforward methods of determining when a
- pivot value is small enough to consider zero do not work reliably, and this
- still doesn't solve the stiffness problem.
+static const oc_extension_info OC_EXTENSION_INFO[OC_NSHAPES]={
+ {0x7F,7,OC_EXT_ROWS+ 0,{0,1,2,3,4,5,6,7},{0,1,2,4,5,6,7,3}},
+ {0xFE,7,OC_EXT_ROWS+ 7,{1,2,3,4,5,6,7,0},{0,1,2,4,5,6,7,3}},
+ {0x3F,6,OC_EXT_ROWS+ 8,{0,1,2,3,4,5,7,6},{0,1,3,4,6,7,5,2}},
+ {0xFC,6,OC_EXT_ROWS+ 10,{2,3,4,5,6,7,1,0},{0,1,3,4,6,7,5,2}},
+ {0x1F,5,OC_EXT_ROWS+ 12,{0,1,2,3,4,7,6,5},{0,2,3,5,7,6,4,1}},
+ {0xF8,5,OC_EXT_ROWS+ 15,{3,4,5,6,7,2,1,0},{0,2,3,5,7,6,4,1}},
+ {0x0F,4,OC_EXT_ROWS+ 18,{0,1,2,3,7,6,5,4},{0,2,4,6,7,5,3,1}},
+ {0xF0,4,OC_EXT_ROWS+ 18,{4,5,6,7,3,2,1,0},{0,2,4,6,7,5,3,1}},
+ {0x07,3,OC_EXT_ROWS+ 22,{0,1,2,7,6,5,4,3},{0,3,6,7,5,4,2,1}},
+ {0xE0,3,OC_EXT_ROWS+ 27,{5,6,7,4,3,2,1,0},{0,3,6,7,5,4,2,1}},
+ {0x03,2,OC_EXT_ROWS+ 32,{0,1,7,6,5,4,3,2},{0,4,7,6,5,3,2,1}},
+ {0xC0,2,OC_EXT_ROWS+ 32,{6,7,5,4,3,2,1,0},{0,4,7,6,5,3,2,1}},
+ {0x01,1,OC_EXT_ROWS+ 0,{0,7,6,5,4,3,2,1},{0,7,6,5,4,3,2,1}},
+ {0x80,1,OC_EXT_ROWS+ 0,{7,6,5,4,3,2,1,0},{0,7,6,5,4,3,2,1}},
+ {0x7E,6,OC_EXT_ROWS+ 42,{1,2,3,4,5,6,7,0},{0,1,2,5,6,7,4,3}},
+ {0x7C,5,OC_EXT_ROWS+ 44,{2,3,4,5,6,7,1,0},{0,1,4,5,7,6,3,2}},
+ {0x3E,5,OC_EXT_ROWS+ 47,{1,2,3,4,5,7,6,0},{0,1,4,5,7,6,3,2}},
+ {0x78,4,OC_EXT_ROWS+ 50,{3,4,5,6,7,2,1,0},{0,4,5,7,6,3,2,1}},
+ {0x3C,4,OC_EXT_ROWS+ 54,{2,3,4,5,7,6,1,0},{0,3,4,7,6,5,2,1}},
+ {0x1E,4,OC_EXT_ROWS+ 58,{1,2,3,4,7,6,5,0},{0,4,5,7,6,3,2,1}},
+ {0x70,3,OC_EXT_ROWS+ 62,{4,5,6,7,3,2,1,0},{0,5,7,6,4,3,2,1}},
+ {0x38,3,OC_EXT_ROWS+ 67,{3,4,5,7,6,2,1,0},{0,5,6,7,4,3,2,1}},
+ {0x1C,3,OC_EXT_ROWS+ 72,{2,3,4,7,6,5,1,0},{0,5,6,7,4,3,2,1}},
+ {0x0E,3,OC_EXT_ROWS+ 77,{1,2,3,7,6,5,4,0},{0,5,7,6,4,3,2,1}},
+ {0x60,2,OC_EXT_ROWS+ 82,{5,6,7,4,3,2,1,0},{0,2,7,6,5,4,3,1}},
+ {0x30,2,OC_EXT_ROWS+ 36,{4,5,7,6,3,2,1,0},{0,4,7,6,5,3,2,1}},
+ {0x18,2,OC_EXT_ROWS+ 90,{3,4,7,6,5,2,1,0},{0,1,7,6,5,4,3,2}},
+ {0x0C,2,OC_EXT_ROWS+ 34,{2,3,7,6,5,4,1,0},{0,4,7,6,5,3,2,1}},
+ {0x06,2,OC_EXT_ROWS+ 84,{1,2,7,6,5,4,3,0},{0,2,7,6,5,4,3,1}},
+ {0x40,1,OC_EXT_ROWS+ 0,{6,7,5,4,3,2,1,0},{0,7,6,5,4,3,2,1}},
+ {0x20,1,OC_EXT_ROWS+ 0,{5,7,6,4,3,2,1,0},{0,7,6,5,4,3,2,1}},
+ {0x10,1,OC_EXT_ROWS+ 0,{4,7,6,5,3,2,1,0},{0,7,6,5,4,3,2,1}},
+ {0x08,1,OC_EXT_ROWS+ 0,{3,7,6,5,4,2,1,0},{0,7,6,5,4,3,2,1}},
+ {0x04,1,OC_EXT_ROWS+ 0,{2,7,6,5,4,3,1,0},{0,7,6,5,4,3,2,1}},
+ {0x02,1,OC_EXT_ROWS+ 0,{1,7,6,5,4,3,2,0},{0,7,6,5,4,3,2,1}}
+};
- Kaup and Aach take the approach of constructing the set in an iterative
- manner by starting with only the DC coefficient and adding the basis
- function which minimizes the approximation error at each step.
- Let G(i,j) be the coefficient of the ith pixel in the jth basis function in
- the 2D iDCT, c(j) the DCT coefficients in the current approximation, A the
- set of non-padding pixels, f(i) the pixel values, and
- r(i)=f(i)-G(i,:) c(:) the reconstruction error in pixel i.
- Then the metric to minimize is
- E(j) = \frac{\[\sum_{i\in A} r(i)*G(i,j)\]^2}{\sum_{i\in A} G(i,j)^2}
- This formulation guarantees a linearly independent set of vectors is chosen.
- It avoids ill-conditioning by stopping the iteration when the residual
- energy or the change in residual energy falls below a threshold.
- The coefficients which minimize the residual at each stage can be computed
- by formulating the least squares solution G^T G c(:) = G^T f(:).
- Because G^T G is Hermitian, this can be solved by finding the Cholesky
- decomposition L such that L L^T = G^T G and L is lower-triangular.
- Then c(:) = L^-T L^-1 G^T f(:), where the multiplication by L^-T and L^-1 can
- be done by forward and backward substitution.
+#include <stdio.h>
- Unfortunately, this method now depends on the actual pixel values.
- While it has the advantage of actually working, the other methods above
- can all be reduced to finding a fixed matrix for a given shape to multiply
- the existing pixel values by.
- Thus, so long as the shape does not change, the processing can all be done
- in advance and run-time computation is reduced to a simple matrix
- multiplication.
- This dependency can be avoided by choosing some sample pixel values f(n)
- to drive approximation instead of using the actual pixel values.
- While that approach works very well in some cases, it can still
- occaisionally select a set of basis vectors with small singular values.
- There is a more robust method, however, that does not require choosing
- sample pixel values.
-
- One thing to note is that as each basis vector is added to G, a single row
- is added to the Cholesky decompositing L L^T of G^T G.
- This row only depends on the previous rows of G, and does not change as more
- rows are added to G.
- When computing c(:) from f(:), division by a diagonal element of L is
- performed twice, once during forward substitution and once during backward
- substitution.
- These are the only divisions in this computation.
- Thus it is small values of these diagonal elements which lead to the
- numerical instability.
- So, instead of trying to minimize the residual energy at each stage, we can
- instead try to maximize the next diagonal element of L.
- This is a measure of how orthogonal the next basis function is to the
- subspace spanned by the existing basis functions.
- Because the divisions do not accumulate in the formulation of the Cholesky
- decomposition, the values on the diagonal are well-conditioned, unlike the
- pivot values in Gaussian elimination.
-
- This method turns out to work very well in practice, and is implemented
- below.
- The greedy optimization approach appears sufficient to avoid any of the
- numerical instabilities present in the other methods discussed.
- A threshold is no longer required for a stopping criteria, since a
- sufficiently independent set of basis functions can always be found in the
- DCT.
-
- Once the set of basis functions has been selected, the linear system
- f'(:) = G' (L^-T L^-1) G^T f(:) is solved, where f'(:) is the vector of
- extrapolated pixel values, and G' is the portion of the 2D iDCT basis
- functions used in G that corresponds to these pixels.
- This results in a simple m by n matrix, where m is the number of padding
- pixels and n is the number of non-padding pixels.
- The entries of this matrix are scaled down by a power of two to ensure their
- magnitude is less than 1, and then they are stored as signed 1.15
- fixed-point values.
- During the encode, the non-padding pixel values are scaled by this power of
- two before multiplication.
- The total additonal cost of computing the padding is at most 16 multiplies
- per pixel.
- By comparison, Theora's fDCT implementation requires 4 multiplies per pixel.
-
- The complexity can be further reduced by noting that if the selected basis
- vectors are all orthogonal, then (L^-T L^-1) G^T =
- (G^-1 G^-T) G^T = G^-1 = D^-2 G^T, where D is a diagonal matrix consisting
- of the magnitude of the rows of G.
- Thus computation of padding pixel values can be avoided completely.
- The entire operation reduces to padding with 0's, performing an fDCT, and
- scaling the output.
- To reiterate, this only works when the rows of G are orthogonal, and so this
- is not done here.
-
- There is no longer any bias in this method towards including low-frequency
- coefficients in the set.
- However, such a bias is neither always possible, nor strictly necessary.
- If the signal statistics already tend to send high frequency coefficients to
- small values, this will not change when some of those high frequencies are
- forced to zero.
- Thus, most of the high frequency coefficients will still be quantized to
- zero, even if they do happen to be included in the basis.
- The important part is that all of the coefficients not in the basis will be
- guaranteed to be zero.
- Zeroing out low-frequency coefficients can be as beneficial to compression
- as high frequency components.
- Although they do not often form parts of long zero runs, they tend to have
- larger magnitudes that require more bits to store.
-
- @INPROCEEDINGS{KA94,
- author="Andr\'{e} Kaup and Til Aach",
- title="Segment-Oriented Coding of Textured Images Based on Successive
- Approximation",
- booktitle="Proceedings of the International Symposium on Speech, Image
- Processing, and Neural Networks",
- pages="197--200",
- address="Hong Kong",
- month="Apr.",
- year=1994,
- URL="http://www.lnt.de/~kaup/paper/issipnn-94.pdf"
- }*/
-
-/*Adds the last row/column to the symmetric matrix G^T G, and then updates the
- Cholesky factorization of it, L.
- The diagonal element is left squared, since sometimes we only care about the
- magnitude of this element relative to other choices.
- Thus callers must manually take the square root when the actual Cholesky
- factorization is needed.
- _l: The L matrix to update.
- _ac: The set of basis functions used for each row.
- _nac: The number of rows.
- _ap: The set of pixels that are not padding.
- _nap: The number of pixels.*/
-static void oc_update_l(float _l[64][64],int _ac[64],int _nac,
- int _ap[64],int _nap){
- int aci;
- int acj;
- int ack;
- int api;
- int cri;
- int cci;
- int crj;
- int ccj;
- int pri;
- int pci;
- /*Step 1: Add the next row/column of G^T G.*/
- aci=_nac-1;
- cri=_ac[aci]>>3;
- cci=_ac[aci]&7;
- for(acj=0;acj<=aci;acj++){
- crj=_ac[acj]>>3;
- ccj=_ac[acj]&7;
- _l[aci][acj]=0;
- for(api=0;api<_nap;api++){
- pri=_ap[api]>>3;
- pci=_ap[api]&7;
- _l[aci][acj]+=OC_IDCT_MATRIX[pci][ccj]*OC_IDCT_MATRIX[pri][crj]*
- OC_IDCT_MATRIX[pci][cci]*OC_IDCT_MATRIX[pri][cri];
- }
+/*Pads a single row of a partial block and then performs a forward Type-II DCT
+ on the result.
+ The output is scaled by a factor of 2 from the orthonormal version of the
+ transform.
+ _y: The buffer to store the result in.
+ Data will be placed in every 8th entry (e.g., in a column of an 8x8
+ block).
+ _x: The input coefficients.
+ The first 8 entries are used (e.g., from a row of an 8x8 block).
+ _e: The extension information for the shape.*/
+static void fdct8_ext(ogg_int16_t *_y,ogg_int16_t _x[8],
+ const oc_extension_info *_e){
+ if(_e->na==1){
+ int ci;
+ /*While the branch below is still correct for shapes with na==1, we can
+ perform the entire transform with just 1 multiply in this case instead
+ of 23.*/
+ _y[0]=(ogg_int16_t)(OC_DIV2_16(OC_C4S4*(_x[_e->pi[0]]<<3)));
+ for(ci=8;ci<64;ci+=8)_y[ci]=0;
}
- /*Step 2: Convert the newly added row to the corresponding row of the
- Cholesky factorization.*/
- for(acj=0;acj<aci;acj++){
- for(ack=0;ack<acj;ack++)_l[aci][acj]-=_l[aci][ack]*_l[acj][ack];
- _l[aci][acj]/=_l[acj][acj];
- }
- for(ack=0;ack<aci;ack++)_l[aci][aci]-=_l[aci][ack]*_l[aci][ack];
-}
-
-/*Computes the set of basis functions to use to synthesize a partial block.
- These basis functions are then used to derive a matrix that extends the
- signal into the missing region from the pixel values that are present.
- _border_info: The border info structure to initialize with details about the
- selected basis set.
- _border: A description of the partial block.*/
-void oc_border_enc_info_init(oc_border_enc_info *_border_info,
- const oc_border_info *_border){
- float ext_matrix[64][64];
- float b[64];
- float l[64][64];
- float best_l_nn2;
- float scale;
- float w;
- int ap[64];
- int zp[64];
- int ac[64];
- int best_aci;
- int shift;
- int nap;
- int nzp;
- int nac;
- int api;
- int zpi;
- int aci;
- int acj;
- int cri;
- int cci;
- int pri;
- int pci;
- int pi;
- int ci;
- for(zpi=api=pi=0;pi<64;pi++){
- if((int)(_border->mask>>pi)&1)ap[api++]=pi;
- else zp[zpi++]=pi;
- }
- nap=api;
- nzp=zpi;
- for(api=0;api<nap;api++)_border_info->pi[api]=(unsigned char)ap[api];
- for(zpi=0;zpi<nzp;zpi++)_border_info->pi[nap+zpi]=(unsigned char)zp[zpi];
- /*Step 1: Select a linearly independent set of basis functions.*/
- for(ci=0;ci<64;ci++)ac[OC_IZIG_ZAG[ci]]=ci;
- /*Always include the DC coefficient.*/
- for(nac=1;;nac++){
- /*Update L with the most recently selected basis function.*/
- oc_update_l(l,ac,nac,ap,nap);
- l[nac-1][nac-1]=OC_SQRTF(l[nac-1][nac-1]);
- if(nac>=nap)break;
- /*Choose the next basis function to add based on its potential entry on
- the diagonal of the Cholesky factorization L.*/
- best_aci=nac;
- best_l_nn2=0;
- for(aci=nac;aci<64;aci++){
- ci=ac[nac];
- ac[nac]=ac[aci];
- ac[aci]=ci;
- oc_update_l(l,ac,nac+1,ap,nap);
- if(l[nac][nac]>best_l_nn2){
- best_l_nn2=l[nac][nac];
- best_aci=aci;
- }
- ci=ac[nac];
- ac[nac]=ac[aci];
- ac[aci]=ci;
+ else{
+ int zpi;
+ int api;
+ int nz;
+ /*First multiply by the extension matrix to compute the padding values.*/
+ nz=8-_e->na;
+ for(zpi=0;zpi<nz;zpi++){
+ ogg_int32_t v;
+ v=0;
+ for(api=0;api<_e->na;api++)v+=_e->ext[zpi][api]*_x[_e->pi[api]];
+ _x[_e->pi[zpi+_e->na]]=
+ (ogg_int16_t)OC_DIV_ROUND_POW2(v,OC_EXT_SHIFT,1<<OC_EXT_SHIFT-1);
}
- ci=ac[nac];
- ac[nac]=ac[best_aci];
- ac[best_aci]=ci;
+ fdct8(_y,_x);
}
- /*Step 2: Compute the extrapolation matrix.
- We also compute a shift value to apply to all the non-zero pixel values.
- We scale the input pixels by this amount and then divide the matrix
- entries by it, so that we're guaranteed that the entries are always less
- than one and can be approximated by a signed 1.15 fixed point value.*/
- shift=0;
- for(api=0;api<nap;api++){
- pri=ap[api]>>3;
- pci=ap[api]&7;
- for(aci=0;aci<nac;aci++){
- cri=ac[aci]>>3;
- cci=ac[aci]&7;
- b[aci]=OC_IDCT_MATRIX[pci][cci]*OC_IDCT_MATRIX[pri][cri];
- }
- for(aci=0;aci<nac;aci++){
- for(acj=0;acj<aci;acj++)b[aci]-=l[aci][acj]*b[acj];
- b[aci]/=l[aci][aci];
- }
- for(aci=nac;aci-->0;){
- for(acj=aci+1;acj<nac;acj++)b[aci]-=l[acj][aci]*b[acj];
- b[aci]/=l[aci][aci];
- }
- for(zpi=0;zpi<nzp;zpi++){
- pri=zp[zpi]>>3;
- pci=zp[zpi]&7;
- w=0;
- for(aci=0;aci<nac;aci++){
- cri=ac[aci]>>3;
- cci=ac[aci]&7;
- w+=OC_IDCT_MATRIX[pci][cci]*OC_IDCT_MATRIX[pri][cri]*b[aci];
- }
- ext_matrix[zpi][api]=w;
- w=OC_FABSF(w);
- while(w>=(1<<shift))shift++;
- }
- }
- _border_info->shift=shift;
- scale=1.0F/(1<<shift);
- _border_info->ext_matrix=(ogg_int16_t **)oc_malloc_2d(nzp,nap,
- sizeof(_border_info->ext_matrix[0][0]));
- for(zpi=0;zpi<nzp;zpi++){
- for(api=0;api<nap;api++){
- w=ext_matrix[zpi][api]*scale;
- _border_info->ext_matrix[zpi][api]=(ogg_int16_t)
- OC_CLAMPI(-32768,(int)(32768*w+(w<0?-0.5F:0.5F)),32767);
- }
- }
}
/*Performs a forward 8x8 Type-II DCT transform on blocks which overlap the
- border of the displayable region.
+ border of the picture region.
+ This method ONLY works with rectangular regions.
_border: A description of which pixels are inside the border.
- _border_info: Information needed to extrapolate pixel values outside the
- border.
_y: The buffer to store the result in.
This may be the same as _x.
_x: The input coefficients.
Pixel values outside the border will be modified.*/
-void oc_fdct8x8_border(const oc_border_info *_border,
- const oc_border_enc_info *_border_info,
- ogg_int16_t _y[64],ogg_int16_t _x[64]){
- ogg_int32_t ap[64];
- int nap;
- int nzp;
- int api;
- int zpi;
- nap=_border->npixels;
- nzp=64-nap;
- for(api=0;api<nap;api++){
- ap[api]=_x[_border_info->pi[api]]<<_border_info->shift;
+void oc_fdct8x8_border(const oc_border_info *_border,ogg_int16_t _y[64],
+ ogg_int16_t _x[64]){
+ ogg_int16_t *in;
+ ogg_int16_t *end;
+ ogg_int16_t *out;
+ ogg_int16_t w[64];
+ ogg_int64_t mask;
+ const oc_extension_info *rext;
+ const oc_extension_info *cext;
+ int rmask;
+ int cmask;
+ int ri;
+ /*Identify the shapes of the non-zero rows and columns.*/
+ rmask=cmask=0;
+ mask=_border->mask;
+ for(ri=0;ri<8;ri++){
+ rmask|=mask&0xFF;
+ cmask|=((mask&0xFF)!=0)<<ri;
+ mask>>=8;
}
- for(zpi=0;zpi<nzp;zpi++){
- ogg_int32_t v;
- for(v=api=0;api<nap;api++){
- v+=_border_info->ext_matrix[zpi][api]*ap[api];
+ /*Find the associated extension info for these shapes.*/
+ if(rmask==0xFF)rext=NULL;
+ else for(rext=OC_EXTENSION_INFO;rext->mask!=rmask;){
+ /*If we somehow can't find the shape, then just do an unpadded fDCT.
+ It won't be efficient, but it should still be correct.*/
+ if(++rext>=OC_EXTENSION_INFO+OC_NSHAPES){
+ oc_fdct8x8(_y,_x);
+ return;
}
- _x[_border_info->pi[nap+zpi]]=(ogg_int16_t)OC_DIV_ROUND_POW2(v,15,16384);
}
- oc_fdct8x8(_y,_x);
+ if(cmask==0xFF)cext=NULL;
+ else for(cext=OC_EXTENSION_INFO;cext->mask!=cmask;){
+ /*If we somehow can't find the shape, then just do an unpadded fDCT.
+ It won't be efficient, but it should still be correct.*/
+ if(++cext>=OC_EXTENSION_INFO+OC_NSHAPES){
+ oc_fdct8x8(_y,_x);
+ return;
+ }
+ }
+ /*Transform the rows.
+ We can ignore zero rows without a problem.*/
+ if(rext==NULL)for(in=_x,out=w,end=out+8;out<end;in+=8,out++)fdct8(out,in);
+ else for(in=_x,out=w,end=out+8,ri=cmask;out<end;in+=8,out++,ri>>=1){
+ if(ri&1)fdct8_ext(out,in,rext);
+ }
+ /*Transform the columns.
+ We transform even columns that are supposedly zero, because rounding errors
+ may make them slightly non-zero, and this will give a more precise
+ reconstruction with very small quantizers.*/
+ if(cext==NULL)for(in=w,out=_y,end=out+8;out<end;in+=8,out++)fdct8(out,in);
+ else for(in=w,out=_y,end=out+8;out<end;in+=8,out++)fdct8_ext(out,in,cext);
}
Modified: experimental/derf/theora-exp/lib/fdct.h
===================================================================
--- experimental/derf/theora-exp/lib/fdct.h 2004-10-15 06:55:05 UTC (rev 8005)
+++ experimental/derf/theora-exp/lib/fdct.h 2004-10-15 06:58:04 UTC (rev 8006)
@@ -4,10 +4,7 @@
# define _fdct_H (1)
void oc_fdct8x8(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
-void oc_border_enc_info_init(oc_border_enc_info *_border_info,
- const oc_border_info *_border);
-void oc_fdct8x8_border(const oc_border_info *_border,
- const oc_border_enc_info *_border_info,ogg_int16_t _y[64],
+void oc_fdct8x8_border(const oc_border_info *_border,ogg_int16_t _y[64],
ogg_int16_t _x[64]);
#endif
Added: experimental/derf/theora-exp/tools/extgen.c
===================================================================
--- experimental/derf/theora-exp/tools/extgen.c 2004-10-15 06:55:05 UTC (rev 8005)
+++ experimental/derf/theora-exp/tools/extgen.c 2004-10-15 06:58:04 UTC (rev 8006)
@@ -0,0 +1,406 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <float.h>
+#include <string.h>
+
+
+
+/*This utility generates the extrapolation matrices used to pad incomplete
+ blocks when given frame sizes that are not a multiple of two.
+ Although the technique used here is generalizable to non-rectangular shapes,
+ the storage needed for the matrices becomes larger (exactly 13.75K, if we
+ discount the superfluous mask, na, pi, and ci members of the
+ oc_coeffcient_info structure and use a 256-byte mask-padding lookup table
+ instead).*/
+
+
+
+/*The matrix representation of the Theora's actual iDCT.
+ This is computed using the 16-bit approximations of the sines and cosines
+ that Theora uses, but using floating point values and assuming that no
+ truncation occurs after division (i.e., pretending that the transform is
+ linear like the real DCT).*/
+static const double OC_IDCT_MATRIX[8][8]={
+ {
+ +0.7071075439453125,+0.9807891845703125,
+ +0.9238739013671875,+0.8314666748046875,
+ +0.7071075439453125,+0.555572509765625,
+ +0.3826904296875, +0.1950836181640625
+ },
+ {
+ +0.7071075439453125,+0.8314685295335948,
+ +0.3826904296875, -0.1950868454296142,
+ -0.7071075439453125,-0.9807858711574227,
+ -0.9238739013671875,-0.5555783333256841
+ },
+ {
+ +0.7071075439453125,+0.5555783333256841,
+ -0.3826904296875, -0.9807858711574227,
+ -0.7071075439453125,+0.1950868454296142,
+ 0.9238739013671875,+0.8314685295335948
+ },
+ {
+ +0.7071075439453125,+0.1950836181640625,
+ -0.9238739013671875,-0.555572509765625,
+ +0.7071075439453125,+0.8314666748046875,
+ -0.3826904296875, -0.9807891845703125
+ },
+ {
+ +0.7071075439453125,-0.1950836181640625,
+ -0.9238739013671875,+0.555572509765625,
+ +0.7071075439453125,-0.8314666748046875,
+ -0.3826904296875, +0.9807891845703125
+ },
+ {
+ +0.7071075439453125,-0.5555783333256841,
+ -0.3826904296875, 0.9807858711574227,
+ -0.7071075439453125,-0.1950868454296142,
+ +0.9238739013671875,-0.8314685295335948
+ },
+ {
+ +0.7071075439453125,-0.8314685295335948,
+ +0.3826904296875, +0.1950868454296142,
+ -0.7071075439453125,+0.9807858711574227,
+ -0.9238739013671875,+0.5555783333256841
+ },
+ {
+ +0.7071075439453125,-0.9807891845703125,
+ +0.9238739013671875,-0.8314666748046875,
+ +0.7071075439453125,-0.555572509765625,
+ +0.3826904296875, -0.1950836181640625
+ }
+};
+
+/*A table of the non-zero coefficients to keep for each possible shape.
+ Our basis selection is chosen to optimize the coding gain.
+ This gives us marginally better performance than other optimization criteria
+ for the border extension case (and significantly better performance in the
+ general case).*/
+static const unsigned char OC_PAD_MASK_GAIN[256]={
+ 0x00,0x01,0x01,0x11,0x01,0x11,0x05,0x49,
+ 0x01,0x05,0x11,0x15,0x11,0x15,0xA1,0x55,
+ 0x01,0x05,0x11,0x15,0x11,0x29,0x51,0x55,
+ 0x03,0x85,0x19,0xA5,0x61,0x35,0xB1,0xAD,
+ 0x01,0x11,0x05,0x23,0x03,0x85,0x0D,0x2B,
+ 0x11,0x15,0x61,0x55,0x13,0x1B,0xA3,0x5B,
+ 0x11,0x83,0x83,0x55,0x13,0x93,0xC3,0xCB,
+ 0x61,0xC3,0x33,0x3B,0x99,0xCB,0xB3,0xDB,
+ 0x01,0x11,0x03,0x0B,0x05,0x0B,0x07,0x4B,
+ 0x11,0x07,0x07,0x1B,0x83,0x8B,0x87,0x57,
+ 0x11,0x23,0x07,0x33,0x61,0x2B,0x0F,0x57,
+ 0x19,0x87,0x1B,0x2F,0x33,0x8F,0x8F,0xB7,
+ 0x05,0x83,0x07,0x63,0x0D,0x87,0x87,0x6B,
+ 0x51,0x47,0x0F,0x67,0xC3,0xCB,0xE3,0xD7,
+ 0xA1,0xA3,0x87,0x73,0xA3,0x6B,0xE3,0xCF,
+ 0xB1,0x8F,0x8F,0xE7,0xB3,0xCF,0xE7,0xF7,
+ 0x01,0x03,0x11,0x43,0x11,0x31,0x83,0x93,
+ 0x05,0x07,0x23,0x27,0x83,0x87,0xA3,0x57,
+ 0x05,0x07,0x07,0xC3,0x15,0x0F,0x47,0x4F,
+ 0x85,0x87,0x87,0xA7,0xC3,0x8F,0x8F,0xAF,
+ 0x11,0x31,0x0B,0x1B,0x85,0x1B,0x87,0x2F,
+ 0x29,0x0F,0x2B,0x1F,0x93,0x1F,0x6B,0x5F,
+ 0x15,0x87,0x8B,0x57,0x1B,0x1F,0xCB,0x5F,
+ 0x35,0x8F,0x8F,0x3F,0xCB,0x9F,0xCF,0xDF,
+ 0x11,0x43,0x0B,0x33,0x23,0x1B,0x63,0x9B,
+ 0x15,0xC3,0x33,0x37,0x55,0x57,0x73,0x77,
+ 0x15,0x27,0x1B,0x37,0x55,0x1F,0x67,0x3F,
+ 0xA5,0xA7,0x2F,0xB7,0x3B,0x3F,0xE7,0xBF,
+ 0x49,0x93,0x4B,0x9B,0x2B,0x2F,0x6B,0x7B,
+ 0x55,0x4F,0x57,0x3F,0xCB,0x5F,0xCF,0x7F,
+ 0x55,0x57,0x57,0x77,0x5B,0x5F,0xD7,0x7F,
+ 0xAD,0xAF,0xB7,0xBF,0xDB,0xDF,0xF7,0xFF
+};
+
+
+
+/*Computes the Cholesky factorization L L^T of the matrix G^T G, where G is the
+ currently selected bass functions restricted to the region of spatial
+ support.
+ The reciprocal of the diagonal element is stored instead of the diagonal
+ itself, so that the division only needs to be done once.
+ _l: The L matrix to compute.
+ _ac: The set of basis functions used for each row.
+ _ap: The set of pixels that are not padding.
+ _na: The number of active pixels/coefficients.*/
+static void oc_cholesky8(double _l[8][8],int _ac[8],int _ap[8],int _na){
+ int aci;
+ int acj;
+ int ack;
+ int api;
+ int ci;
+ int cj;
+ int pi;
+ for(aci=0;aci<_na;aci++){
+ /*Step 1: Add the next row/column of G^T G.*/
+ ci=_ac[aci];
+ for(acj=0;acj<=aci;acj++){
+ cj=_ac[acj];
+ _l[aci][acj]=0;
+ for(api=0;api<_na;api++){
+ pi=_ap[api];
+ _l[aci][acj]+=OC_IDCT_MATRIX[pi][cj]*OC_IDCT_MATRIX[pi][ci];
+ }
+ }
+ /*Step 2: Convert the newly added row to the corresponding row of the
+ Cholesky factorization.*/
+ for(acj=0;acj<aci;acj++){
+ for(ack=0;ack<acj;ack++)_l[aci][acj]-=_l[aci][ack]*_l[acj][ack];
+ _l[aci][acj]*=_l[acj][acj];
+ }
+ for(ack=0;ack<aci;ack++)_l[aci][aci]-=_l[aci][ack]*_l[aci][ack];
+ _l[aci][aci]=1/sqrt(_l[aci][aci]);
+ }
+}
+
+/*Computes the padding extrapolation matrix for a single 1-D 8-point DCT.*/
+int oc_calc_ext8(double _e[8][8],int _ap[8],int _ac[8],int _mask,int _pad){
+ double b[8];
+ double l[8][8];
+ double w;
+ int api;
+ int zpi;
+ int aci;
+ int acj;
+ int ci;
+ int pi;
+ int na;
+ int nz;
+ na=nz=0;
+ for(pi=0;pi<8;pi++){
+ if(_mask&1<<pi)_ap[na++]=pi;
+ else _ap[8-++nz]=pi;
+ }
+ na=nz=0;
+ for(ci=0;ci<8;ci++){
+ if(_pad&1<<ci)_ac[na++]=ci;
+ else _ac[8-++nz]=ci;
+ }
+ oc_cholesky8(l,_ac,_ap,na);
+ for(api=0;api<na;api++){
+ pi=_ap[api];
+ for(aci=0;aci<na;aci++){
+ b[aci]=OC_IDCT_MATRIX[pi][_ac[aci]];
+ for(acj=0;acj<aci;acj++)b[aci]-=l[aci][acj]*b[acj];
+ b[aci]*=l[aci][aci];
+ }
+ for(aci=na;aci-->0;){
+ for(acj=aci+1;acj<na;acj++)b[aci]-=l[acj][aci]*b[acj];
+ b[aci]*=l[aci][aci];
+ }
+ for(zpi=na;zpi<8;zpi++){
+ pi=_ap[zpi];
+ w=0;
+ for(aci=0;aci<na;aci++)w+=OC_IDCT_MATRIX[pi][_ac[aci]]*b[aci];
+ _e[zpi][api]=w;
+ }
+ }
+ return na;
+}
+
+/*The precision at which the matrix coefficients are stored.*/
+#define OC_EXT_SHIFT (19)
+
+/*The number of shapes we need.*/
+#define OC_NSHAPES (35)
+
+/*These are all the possible shapes that could be encountered in border
+ extension.
+ The first 14 correspond to the left and right borders (or top and bottom when
+ considered vertically), and the next 21 arise from the combination of the
+ two, to handle the case of a video less than 8 pixels wide (or tall).
+ Order matters here, because we will search this array linearly for the entry
+ to use during the actual transform.*/
+static int masks[OC_NSHAPES]={
+ 0x7F,0xFE,0x3F,0xFC,0x1F,0xF8,0x0F,0xF0,0x07,0xE0,0x03,0xC0,0x01,0x80,
+ 0x7E,
+ 0x7C,0x3E,
+ 0x78,0x3C,0x1E,
+ 0x70,0x38,0x1C,0x0E,
+ 0x60,0x30,0x18,0x0C,0x06,
+ 0x40,0x20,0x10,0x08,0x04,0x02
+};
+
+int main(void){
+ int coeffs[8192];
+ int ncoeffs;
+ int ci;
+ int crstart[OC_NSHAPES*8];
+ int crend[OC_NSHAPES*8];
+ int ncr;
+ int cri;
+ int offsets[1024];
+ int noffsets;
+ int oi;
+ int orstart[OC_NSHAPES*8];
+ int orend[OC_NSHAPES*8];
+ int nor;
+ int ori;
+ int ap[OC_NSHAPES][8];
+ int ac[OC_NSHAPES][8];
+ int na[OC_NSHAPES];
+ int zpi;
+ int mi;
+ ncoeffs=ncr=0;
+ /*We store all the coefficients from the extension matrices in one large
+ table.
+ This lets us exploit the repetitiveness and overlap of many of the rows to
+ reduce the size of the tables by about 49%.
+ Our overlap search strategy is pretty simple, so we could probably do even
+ better, but further improvement would be small.*/
+ for(mi=0;mi<OC_NSHAPES;mi++){
+ double e[8][8];
+ int api;
+ na[mi]=oc_calc_ext8(e,ap[mi],ac[mi],masks[mi],OC_PAD_MASK_GAIN[masks[mi]]);
+ for(zpi=na[mi];zpi<8;zpi++){
+ int cr[8];
+ int best_ci;
+ int best_cre;
+ int cj;
+ for(api=0;api<na[mi];api++){
+ cr[api]=(int)(e[zpi][api]*(1<<OC_EXT_SHIFT)+(e[zpi][api]<0?-0.5F:0.5F));
+ }
+ best_ci=ncoeffs;
+ best_cre=0;
+ for(ci=0;ci<ncoeffs;ci++){
+ for(cj=ci;cj<ncoeffs&&cj-ci<na[mi]&&coeffs[cj]==cr[cj-ci];cj++);
+ if(cj-ci>best_cre){
+ if(cj-ci<na[mi]){
+ /*If we need to extend the range, check to make sure we aren't
+ interrupting any other range.*/
+ for(cri=0;cri<ncr&&(crstart[cri]>=cj||crend[cri]<=cj);cri++);
+ if(cri>=ncr){
+ /*No conflicting ranges were found, so we can insert the
+ necessary coefficients.*/
+ best_ci=ci;
+ best_cre=cj-ci;
+ }
+ }
+ else{
+ /*Otherwise, we have a complete match, so we can stop searching.*/
+ best_ci=ci;
+ best_cre=cj-ci;
+ break;
+ }
+ }
+ }
+ if(best_cre<na[mi]){
+ /*We don't have a complete match, so we need to insert the remaining
+ coefficients.*/
+ memmove(coeffs+best_ci+na[mi],coeffs+best_ci+best_cre,
+ sizeof(coeffs[0])*(ncoeffs-best_ci-best_cre));
+ /*And now we need to update all the ranges that start after the
+ insertion point.*/
+ for(cri=0;cri<ncr;cri++){
+ if(crstart[cri]>=best_ci+best_cre){
+ crstart[cri]+=na[mi]-best_cre;
+ crend[cri]+=na[mi]-best_cre;
+ }
+ }
+ }
+ /*Actually add the coefficients.*/
+ for(cj=best_cre;cj<na[mi];cj++)coeffs[best_ci+cj]=cr[cj];
+ ncoeffs+=na[mi]-best_cre;
+ /*Store the endpoints of the range.*/
+ crstart[ncr]=best_ci;
+ crend[ncr]=best_ci+na[mi];
+ ncr++;
+ }
+
+ }
+ printf("/*The precision at which the matrix coefficients are stored.*/\n");
+ printf("#define OC_EXT_SHIFT (%i)\n\n",OC_EXT_SHIFT);
+ printf("/*The number of shapes we need.*/\n");
+ printf("#define OC_NSHAPES (%i)\n\n",OC_NSHAPES);
+ printf("static const ogg_int32_t OC_EXT_COEFFS[%i]={",ncoeffs);
+ for(ci=0;ci<ncoeffs;ci++){
+ if((ci&7)==0)printf("\n ");
+ printf("%c0x%05X",coeffs[ci]<0?'-':' ',abs(coeffs[ci]));
+ if(ci+1<ncoeffs)printf(",");
+ }
+ printf("\n};\n\n");
+ /*Now we repeat the same overlap strategy on the row pointers.
+ This only gets us around 45% space reduction, but that's still worth it.*/
+ noffsets=nor=0;
+ for(cri=mi=0;mi<OC_NSHAPES;mi++){
+ int best_oi;
+ int best_ore;
+ int oj;
+ int nz;
+ best_oi=noffsets;
+ best_ore=0;
+ nz=8-na[mi];
+ for(oi=0;oi<noffsets;oi++){
+ for(oj=oi;oj<noffsets&&oj-oi<nz&&offsets[oj]==crstart[cri+oj-oi];
+ oj++);
+ if(oj-oi>best_ore){
+ if(oj-oi<nz){
+ /*If we need to extend the range, check to make sure we aren't
+ interrupting any other range.*/
+ for(ori=0;ori<nor&&(orstart[ori]>=oj||orend[ori]<=oj);ori++);
+ if(ori>=nor){
+ /*No conflicting ranges were found, so we can insert the
+ necessary coefficients.*/
+ best_oi=oi;
+ best_ore=oj-oi;
+ }
+ }
+ else{
+ /*Otherwise, we have a complete match, so we can stop searching.*/
+ best_oi=oi;
+ best_ore=oj-oi;
+ break;
+ }
+ }
+ }
+ if(best_ore<nz){
+ /*We don't have a complete match, so we need to insert the remaining
+ offsets.*/
+ memmove(offsets+best_oi+nz,offsets+best_oi+best_ore,
+ sizeof(offsets[0])*(noffsets-best_oi-best_ore));
+ /*And now we need to update all the ranges that start after the
+ insertion point.*/
+ for(ori=0;ori<nor;ori++){
+ if(orstart[ori]>=best_oi+best_ore){
+ orstart[ori]+=nz-best_ore;
+ orend[ori]+=nz-best_ore;
+ }
+ }
+ }
+ /*Actually add the offsets.*/
+ for(oj=best_ore;oj<nz;oj++)offsets[best_oi+oj]=crstart[cri+oj];
+ noffsets+=nz-best_ore;
+ /*Store the endpoints of the range.*/
+ orstart[nor]=best_oi;
+ orend[nor]=best_oi+nz;
+ nor++;
+ cri+=nz;
+ }
+ printf("static const ogg_int32_t *const OC_EXT_ROWS[%i]={",noffsets);
+ for(oi=0;oi<noffsets;oi++){
+ if((oi&3)==0)printf("\n ");
+ printf("OC_EXT_COEFFS+%4i",offsets[oi]);
+ if(oi+1<noffsets)printf(",");
+ }
+ printf("\n};\n\n");
+ printf("static const oc_extension_info OC_EXTENSION_INFO[OC_NSHAPES]={\n");
+ for(mi=0;mi<OC_NSHAPES;mi++){
+ int i;
+ printf(" {");
+ printf("0x%02X,%i,OC_EXT_ROWS+%3i,",masks[mi],na[mi],orstart[mi]);
+ printf("{");
+ for(i=0;i<8;i++){
+ printf("%i",ap[mi][i]);
+ if(i<7)printf(",");
+ }
+ printf("},{");
+ for(i=0;i<8;i++){
+ printf("%i",ac[mi][i]);
+ if(i<7)printf(",");
+ }
+ printf("}}");
+ if(mi+1<OC_NSHAPES)printf(",");
+ printf("\n");
+ }
+ printf("};\n");
+ return 0;
+}
More information about the commits
mailing list