[Flac-dev] libFLAC bitbuffer optimizations
Eric Wong
eric at petta-tech.com
Sat Jan 1 17:36:16 PST 2005
Josh Coalson <xflac at yahoo.com> wrote:
> thanks for the patch.
No prob :)
> also, if you have miroslav's patch again a more updated version
> of bitbuffer.c that would be great. I have been meaning to get
> around to applying it for a long time.
This is Miroslav's patch, from the mailing list post I dug up in the archives:
--- orig/src/libFLAC/bitbuffer.c
+++ mod/src/libFLAC/bitbuffer.c
@@ -62,6 +62,24 @@
* keeping in mind the limit from the first paragraph.
*/
static const unsigned FLAC__BITBUFFER_DEFAULT_CAPACITY = ((65536 - 64) * 8) / FLAC__BITS_PER_BLURB; /* blurbs */
+static const unsigned char byte_to_unary_table[] = {
+ 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
#if FLAC__BITS_PER_BLURB == 8
#define FLAC__BITS_PER_BLURB_LOG2 3
@@ -70,6 +88,7 @@
#define FLAC__BLURB_TOP_BIT_ONE ((FLAC__byte)0x80)
#define BLURB_BIT_TO_MASK(b) (((FLAC__blurb)'\x80') >> (b))
#define CRC16_UPDATE_BLURB(bb, blurb, crc) FLAC__CRC16_UPDATE((blurb), (crc));
+#define FLAC__ALIGNED_BLURB_UNARY(blurb) (byte_to_unary_table[blurb])
#elif FLAC__BITS_PER_BLURB == 32
#define FLAC__BITS_PER_BLURB_LOG2 5
#define FLAC__BYTES_PER_BLURB 4
@@ -77,6 +96,13 @@
#define FLAC__BLURB_TOP_BIT_ONE ((FLAC__uint32)0x80000000)
#define BLURB_BIT_TO_MASK(b) (((FLAC__blurb)0x80000000) >> (b))
#define CRC16_UPDATE_BLURB(bb, blurb, crc) crc16_update_blurb((bb), (blurb));
+#define FLAC__ALIGNED_BLURB_UNARY(blurb) ((blurb) <= 0xff \
+ ? byte_to_unary_table[blurb] + 24 \
+ : ((blurb) <= 0xffff \
+ ? byte_to_unary_table[(blurb) >> 8] + 16 \
+ : ((blurb) <=0xffffff \
+ ? byte_to_unary_table[(blurb) >> 16] + 8 \
+ : byte_to_unary_table[(blurb) >> 24])))
#else
/* ERROR, only sizes of 8 and 32 are supported */
#endif
@@ -2109,114 +2135,18 @@
if(nvals == 0)
return true;
+ cbits = bb->consumed_bits;
i = bb->consumed_blurbs;
- /*
- * We unroll the main loop to take care of partially consumed blurbs here.
- */
- if(bb->consumed_bits > 0) {
- save_blurb = blurb = buffer[i];
- cbits = bb->consumed_bits;
- blurb <<= cbits;
-
- while(1) {
- if(state == 0) {
- if(blurb) {
- for(j = 0; !(blurb & FLAC__BLURB_TOP_BIT_ONE); j++)
- blurb <<= 1;
- msbs += j;
-
- /* dispose of the unary end bit */
- blurb <<= 1;
- j++;
- cbits += j;
-
- uval = 0;
- lsbs_left = parameter;
- state++;
- if(cbits == FLAC__BITS_PER_BLURB) {
- cbits = 0;
- CRC16_UPDATE_BLURB(bb, save_blurb, bb->read_crc16);
- break;
- }
- }
- else {
- msbs += FLAC__BITS_PER_BLURB - cbits;
- cbits = 0;
- CRC16_UPDATE_BLURB(bb, save_blurb, bb->read_crc16);
- break;
- }
- }
- else {
- const unsigned available_bits = FLAC__BITS_PER_BLURB - cbits;
- if(lsbs_left >= available_bits) {
- uval <<= available_bits;
- uval |= (blurb >> cbits);
- cbits = 0;
- CRC16_UPDATE_BLURB(bb, save_blurb, bb->read_crc16);
-
- if(lsbs_left == available_bits) {
- /* compose the value */
- uval |= (msbs << parameter);
- if(uval & 1)
- vals[val_i++] = -((int)(uval >> 1)) - 1;
- else
- vals[val_i++] = (int)(uval >> 1);
- if(val_i == nvals)
- break;
-
- msbs = 0;
- state = 0;
- }
-
- lsbs_left -= available_bits;
- break;
- }
- else {
- uval <<= lsbs_left;
- uval |= (blurb >> (FLAC__BITS_PER_BLURB - lsbs_left));
- blurb <<= lsbs_left;
- cbits += lsbs_left;
-
- /* compose the value */
- uval |= (msbs << parameter);
- if(uval & 1)
- vals[val_i++] = -((int)(uval >> 1)) - 1;
- else
- vals[val_i++] = (int)(uval >> 1);
- if(val_i == nvals) {
- /* back up one if we exited the for loop because we read all nvals but the end came in the middle of a blurb */
- i--;
- break;
- }
-
- msbs = 0;
- state = 0;
- }
- }
- }
- i++;
-
- bb->consumed_blurbs = i;
- bb->consumed_bits = cbits;
- bb->total_consumed_bits = (i << FLAC__BITS_PER_BLURB_LOG2) | cbits;
- }
-
- /*
- * Now that we are blurb-aligned the logic is slightly simpler
- */
+
while(val_i < nvals) {
- for( ; i < bb->blurbs && val_i < nvals; i++) {
- save_blurb = blurb = buffer[i];
- cbits = 0;
+ for( ; i < bb->blurbs; i++) {
+ blurb = (save_blurb = buffer[i]) << cbits;
while(1) {
if(state == 0) {
if(blurb) {
- for(j = 0; !(blurb & FLAC__BLURB_TOP_BIT_ONE); j++)
- blurb <<= 1;
+ j = FLAC__ALIGNED_BLURB_UNARY(blurb);
msbs += j;
- /* dispose of the unary end bit */
- blurb <<= 1;
j++;
cbits += j;
@@ -2228,6 +2158,7 @@
CRC16_UPDATE_BLURB(bb, save_blurb, bb->read_crc16);
break;
}
+ blurb <<= j;
}
else {
msbs += FLAC__BITS_PER_BLURB - cbits;
@@ -2247,12 +2178,11 @@
if(lsbs_left == available_bits) {
/* compose the value */
uval |= (msbs << parameter);
- if(uval & 1)
- vals[val_i++] = -((int)(uval >> 1)) - 1;
- else
- vals[val_i++] = (int)(uval >> 1);
- if(val_i == nvals)
- break;
+ vals[val_i++] = (int)(uval >> 1 ^ -(int)(uval & 1));
+ if(val_i == nvals) {
+ i++;
+ goto break2;
+ }
msbs = 0;
state = 0;
@@ -2262,29 +2192,23 @@
break;
}
else {
+ cbits += lsbs_left;
uval <<= lsbs_left;
uval |= (blurb >> (FLAC__BITS_PER_BLURB - lsbs_left));
blurb <<= lsbs_left;
- cbits += lsbs_left;
/* compose the value */
uval |= (msbs << parameter);
- if(uval & 1)
- vals[val_i++] = -((int)(uval >> 1)) - 1;
- else
- vals[val_i++] = (int)(uval >> 1);
- if(val_i == nvals) {
- /* back up one if we exited the for loop because we read all nvals but the end came in the middle of a blurb */
- i--;
- break;
- }
-
+ vals[val_i++] = (int)(uval >> 1 ^ -(int)(uval & 1));
+ if(val_i == nvals)
+ goto break2;
msbs = 0;
state = 0;
}
}
}
}
+break2:
bb->consumed_blurbs = i;
bb->consumed_bits = cbits;
bb->total_consumed_bits = (i << FLAC__BITS_PER_BLURB_LOG2) | cbits;
This is my patch on top of Miroslavs, which micro optimizes away some indexing
and comparison overhead.
--- orig/src/libFLAC/bitbuffer.c
+++ mod/src/libFLAC/bitbuffer.c
@@ -2123,7 +2123,7 @@
{
const FLAC__blurb *buffer = bb->buffer;
- unsigned i, j, val_i = 0;
+ unsigned i, j, val_i = nvals;
unsigned cbits = 0, uval = 0, msbs = 0, lsbs_left = 0;
FLAC__blurb blurb, save_blurb;
unsigned state = 0; /* 0 = getting unary MSBs, 1 = getting binary LSBs */
@@ -2138,7 +2138,7 @@
cbits = bb->consumed_bits;
i = bb->consumed_blurbs;
- while(val_i < nvals) {
+ while(val_i != 0) {
for( ; i < bb->blurbs; i++) {
blurb = (save_blurb = buffer[i]) << cbits;
while(1) {
@@ -2178,11 +2178,13 @@
if(lsbs_left == available_bits) {
/* compose the value */
uval |= (msbs << parameter);
- vals[val_i++] = (int)(uval >> 1 ^ -(int)(uval & 1));
- if(val_i == nvals) {
+ *vals = (int)(uval >> 1 ^ -(int)(uval & 1));
+ --val_i;
+ if(val_i == 0) {
i++;
goto break2;
}
+ *(++vals);
msbs = 0;
state = 0;
@@ -2199,9 +2201,12 @@
/* compose the value */
uval |= (msbs << parameter);
- vals[val_i++] = (int)(uval >> 1 ^ -(int)(uval & 1));
- if(val_i == nvals)
+ *vals = (int)(uval >> 1 ^ -(int)(uval & 1));
+ --val_i;
+ if(val_i == 0)
goto break2;
+ *(++vals);
+
msbs = 0;
state = 0;
}
@@ -2212,7 +2217,7 @@
bb->consumed_blurbs = i;
bb->consumed_bits = cbits;
bb->total_consumed_bits = (i << FLAC__BITS_PER_BLURB_LOG2) | cbits;
- if(val_i < nvals) {
+ if(val_i > 0) {
if(!bitbuffer_read_from_client_(bb, read_callback, client_data))
return false;
/* these must be zero because we can only get here if we got to the end of the buffer */
>
> btw how are you playing it on the ipod?
With a modified version of Music Player Daemon (MPD, www.musicpd.org).
http://www.ipodlinux.org/MPD has more info about it.
> not sure how to help out with lpc_restore_signal(). there are
> x86 and PPC versions of the routines in CVS that might be good
> references. it is basically a multiply-accumulate loop but you
> have to be careful about overflow. an ARM7 version of this
> function would help the speed on several other FLAC-supported
> devices.
This is my current C version of lpc_restore_signal. It only works when
(order<=8) (not a problem for me right now), but the inner loop for larger
orders could be done using Duff's device and still be fast without getting much
bigger. I reading up on ARM assembly right now.
void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
unsigned i;
FLAC__int32 sum;
const FLAC__int32 *history, *qlp;
const int tmp = (0 - order - 1);
for(i = data_len; i != 0; --i) {
sum = 0;
qlp = &qlp_coeff[order];
history = &data[tmp];
switch (order) {
case 8: sum += (*(--qlp)) * (*(++history));
case 7: sum += (*(--qlp)) * (*(++history));
case 6: sum += (*(--qlp)) * (*(++history));
case 5: sum += (*(--qlp)) * (*(++history));
case 4: sum += (*(--qlp)) * (*(++history));
case 3: sum += (*(--qlp)) * (*(++history));
case 2: sum += (*(--qlp)) * (*(++history));
case 1: sum += (*(--qlp)) * (*(++history));
break;
}
*(data++) = *(residual++) + (sum >> lp_quantization);
}
}
--
Eric Wong
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
Url : http://lists.xiph.org/pipermail/flac-dev/attachments/20050101/90fa859a/attachment.pgp
More information about the Flac-dev
mailing list