[Flac-dev] libFLAC bitbuffer optimizations

Eric Wong eric at petta-tech.com
Sat Jan 1 17:36:16 PST 2005


Josh Coalson <xflac at yahoo.com> wrote:
> thanks for the patch.

No prob :)

> also, if you have miroslav's patch again a more updated version
> of bitbuffer.c that would be great.  I have been meaning to get
> around to applying it for a long time.

This is Miroslav's patch, from the mailing list post I dug up in the archives:

--- orig/src/libFLAC/bitbuffer.c
+++ mod/src/libFLAC/bitbuffer.c
@@ -62,6 +62,24 @@
  * keeping in mind the limit from the first paragraph.
  */
 static const unsigned FLAC__BITBUFFER_DEFAULT_CAPACITY = ((65536 - 64) * 8) / FLAC__BITS_PER_BLURB; /* blurbs */
+static const unsigned char byte_to_unary_table[] = {
+      8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
 
 #if FLAC__BITS_PER_BLURB == 8
 #define FLAC__BITS_PER_BLURB_LOG2 3
@@ -70,6 +88,7 @@
 #define FLAC__BLURB_TOP_BIT_ONE ((FLAC__byte)0x80)
 #define BLURB_BIT_TO_MASK(b) (((FLAC__blurb)'\x80') >> (b))
 #define CRC16_UPDATE_BLURB(bb, blurb, crc) FLAC__CRC16_UPDATE((blurb), (crc));
+#define FLAC__ALIGNED_BLURB_UNARY(blurb) (byte_to_unary_table[blurb])
 #elif FLAC__BITS_PER_BLURB == 32
 #define FLAC__BITS_PER_BLURB_LOG2 5
 #define FLAC__BYTES_PER_BLURB 4
@@ -77,6 +96,13 @@
 #define FLAC__BLURB_TOP_BIT_ONE ((FLAC__uint32)0x80000000)
 #define BLURB_BIT_TO_MASK(b) (((FLAC__blurb)0x80000000) >> (b))
 #define CRC16_UPDATE_BLURB(bb, blurb, crc) crc16_update_blurb((bb), (blurb));
+#define FLAC__ALIGNED_BLURB_UNARY(blurb) ((blurb) <= 0xff \
+		? byte_to_unary_table[blurb] + 24 \
+		: ((blurb) <= 0xffff \
+			? byte_to_unary_table[(blurb) >> 8] + 16 \
+			: ((blurb) <=0xffffff \
+				? byte_to_unary_table[(blurb) >> 16] + 8 \
+				: byte_to_unary_table[(blurb) >> 24])))
 #else
 /* ERROR, only sizes of 8 and 32 are supported */
 #endif
@@ -2109,114 +2135,18 @@
 	if(nvals == 0)
 		return true;
 
+	cbits = bb->consumed_bits;
 	i = bb->consumed_blurbs;
-	/*
-	 * We unroll the main loop to take care of partially consumed blurbs here.
-	 */
-	if(bb->consumed_bits > 0) {
-		save_blurb = blurb = buffer[i];
-		cbits = bb->consumed_bits;
-		blurb <<= cbits;
-
-		while(1) {
-			if(state == 0) {
-				if(blurb) {
-					for(j = 0; !(blurb & FLAC__BLURB_TOP_BIT_ONE); j++)
-						blurb <<= 1;
-					msbs += j;
-
-					/* dispose of the unary end bit */
-					blurb <<= 1;
-					j++;
-					cbits += j;
-
-					uval = 0;
-					lsbs_left = parameter;
-					state++;
-					if(cbits == FLAC__BITS_PER_BLURB) {
-						cbits = 0;
-						CRC16_UPDATE_BLURB(bb, save_blurb, bb->read_crc16);
-						break;
-					}
-				}
-				else {
-					msbs += FLAC__BITS_PER_BLURB - cbits;
-					cbits = 0;
-					CRC16_UPDATE_BLURB(bb, save_blurb, bb->read_crc16);
-					break;
-				}
-			}
-			else {
-				const unsigned available_bits = FLAC__BITS_PER_BLURB - cbits;
-				if(lsbs_left >= available_bits) {
-					uval <<= available_bits;
-					uval |= (blurb >> cbits);
-					cbits = 0;
-					CRC16_UPDATE_BLURB(bb, save_blurb, bb->read_crc16);
-
-					if(lsbs_left == available_bits) {
-						/* compose the value */
-						uval |= (msbs << parameter);
-						if(uval & 1)
-							vals[val_i++] = -((int)(uval >> 1)) - 1;
-						else
-							vals[val_i++] = (int)(uval >> 1);
-						if(val_i == nvals)
-							break;
-
-						msbs = 0;
-						state = 0;
-					}
-
-					lsbs_left -= available_bits;
-					break;
-				}
-				else {
-					uval <<= lsbs_left;
-					uval |= (blurb >> (FLAC__BITS_PER_BLURB - lsbs_left));
-					blurb <<= lsbs_left;
-					cbits += lsbs_left;
-
-					/* compose the value */
-					uval |= (msbs << parameter);
-					if(uval & 1)
-						vals[val_i++] = -((int)(uval >> 1)) - 1;
-					else
-						vals[val_i++] = (int)(uval >> 1);
-					if(val_i == nvals) {
-						/* back up one if we exited the for loop because we read all nvals but the end came in the middle of a blurb */
-						i--;
-						break;
-					}
-
-					msbs = 0;
-					state = 0;
-				}
-			}
-		}
-		i++;
-
-		bb->consumed_blurbs = i;
-		bb->consumed_bits = cbits;
-		bb->total_consumed_bits = (i << FLAC__BITS_PER_BLURB_LOG2) | cbits;
-	}
-
-	/*
-	 * Now that we are blurb-aligned the logic is slightly simpler
-	 */
+	
 	while(val_i < nvals) {
-		for( ; i < bb->blurbs && val_i < nvals; i++) {
-			save_blurb = blurb = buffer[i];
-			cbits = 0;
+		for( ; i < bb->blurbs; i++) {
+			blurb = (save_blurb = buffer[i]) << cbits;
 			while(1) {
 				if(state == 0) {
 					if(blurb) {
-						for(j = 0; !(blurb & FLAC__BLURB_TOP_BIT_ONE); j++)
-							blurb <<= 1;
+						j = FLAC__ALIGNED_BLURB_UNARY(blurb);
 						msbs += j;
 
-						/* dispose of the unary end bit */
-						blurb <<= 1;
 						j++;
 						cbits += j;
 
@@ -2228,6 +2158,7 @@
 							CRC16_UPDATE_BLURB(bb, save_blurb, bb->read_crc16);
 							break;
 						}
+						blurb <<= j;
 					}
 					else {
 						msbs += FLAC__BITS_PER_BLURB - cbits;
@@ -2247,12 +2178,11 @@
 						if(lsbs_left == available_bits) {
 							/* compose the value */
 							uval |= (msbs << parameter);
-							if(uval & 1)
-								vals[val_i++] = -((int)(uval >> 1)) - 1;
-							else
-								vals[val_i++] = (int)(uval >> 1);
-							if(val_i == nvals)
-								break;
+							vals[val_i++] = (int)(uval >> 1 ^ -(int)(uval & 1));
+							if(val_i == nvals) {
+								i++;
+								goto break2;
+							}
 
 							msbs = 0;
 							state = 0;
@@ -2262,29 +2192,23 @@
 						break;
 					}
 					else {
+						cbits += lsbs_left;
 						uval <<= lsbs_left;
 						uval |= (blurb >> (FLAC__BITS_PER_BLURB - lsbs_left));
 						blurb <<= lsbs_left;
-						cbits += lsbs_left;
 
 						/* compose the value */
 						uval |= (msbs << parameter);
-						if(uval & 1)
-							vals[val_i++] = -((int)(uval >> 1)) - 1;
-						else
-							vals[val_i++] = (int)(uval >> 1);
-						if(val_i == nvals) {
-							/* back up one if we exited the for loop because we read all nvals but the end came in the middle of a blurb */
-							i--;
-							break;
-						}
-
+						vals[val_i++] = (int)(uval >> 1 ^ -(int)(uval & 1));
+						if(val_i == nvals)
+							goto break2;
 						msbs = 0;
 						state = 0;
 					}
 				}
 			}
 		}
+break2:
 		bb->consumed_blurbs = i;
 		bb->consumed_bits = cbits;
 		bb->total_consumed_bits = (i << FLAC__BITS_PER_BLURB_LOG2) | cbits;




This is my patch on top of Miroslavs, which micro optimizes away some indexing
and comparison overhead.

--- orig/src/libFLAC/bitbuffer.c
+++ mod/src/libFLAC/bitbuffer.c
@@ -2123,7 +2123,7 @@
 {
 	const FLAC__blurb *buffer = bb->buffer;
 
-	unsigned i, j, val_i = 0;
+	unsigned i, j, val_i = nvals;
 	unsigned cbits = 0, uval = 0, msbs = 0, lsbs_left = 0;
 	FLAC__blurb blurb, save_blurb;
 	unsigned state = 0; /* 0 = getting unary MSBs, 1 = getting binary LSBs */
@@ -2138,7 +2138,7 @@
 	cbits = bb->consumed_bits;
 	i = bb->consumed_blurbs;
 	
-	while(val_i < nvals) {
+	while(val_i != 0) {
 		for( ; i < bb->blurbs; i++) {
 			blurb = (save_blurb = buffer[i]) << cbits;
 			while(1) {
@@ -2178,11 +2178,13 @@
 						if(lsbs_left == available_bits) {
 							/* compose the value */
 							uval |= (msbs << parameter);
-							vals[val_i++] = (int)(uval >> 1 ^ -(int)(uval & 1));
-							if(val_i == nvals) {
+							*vals = (int)(uval >> 1 ^ -(int)(uval & 1));
+							--val_i;
+							if(val_i == 0) {
 								i++;
 								goto break2;
 							}
+							*(++vals);
 
 							msbs = 0;
 							state = 0;
@@ -2199,9 +2201,12 @@
 
 						/* compose the value */
 						uval |= (msbs << parameter);
-						vals[val_i++] = (int)(uval >> 1 ^ -(int)(uval & 1));
-						if(val_i == nvals)
+						*vals = (int)(uval >> 1 ^ -(int)(uval & 1));
+						--val_i;
+						if(val_i == 0)
 							goto break2;
+						*(++vals);
+
 						msbs = 0;
 						state = 0;
 					}
@@ -2212,7 +2217,7 @@
 		bb->consumed_blurbs = i;
 		bb->consumed_bits = cbits;
 		bb->total_consumed_bits = (i << FLAC__BITS_PER_BLURB_LOG2) | cbits;
-		if(val_i < nvals) {
+		if(val_i > 0) {
 			if(!bitbuffer_read_from_client_(bb, read_callback, client_data))
 				return false;
 			/* these must be zero because we can only get here if we got to the end of the buffer */



> 
> btw how are you playing it on the ipod?

With a modified version of Music Player Daemon (MPD, www.musicpd.org).
http://www.ipodlinux.org/MPD has more info about it.

> not sure how to help out with lpc_restore_signal().  there are
> x86 and PPC versions of the routines in CVS that might be good
> references.  it is basically a multiply-accumulate loop but you
> have to be careful about overflow.  an ARM7 version of this
> function would help the speed on several other FLAC-supported
> devices.

This is my current C version of lpc_restore_signal.  It only works when
(order<=8) (not a problem for me right now), but the inner loop for larger
orders could be done using Duff's device and still be fast without getting much
bigger.  I reading up on ARM assembly right now.

void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
	unsigned i;
	FLAC__int32 sum;
	const FLAC__int32 *history, *qlp;
	const int tmp = (0 - order - 1);
	
	for(i = data_len; i != 0; --i) {
		sum = 0;
		qlp = &qlp_coeff[order];
		history = &data[tmp];
		switch (order) {
			case 8: sum += (*(--qlp)) * (*(++history));
			case 7: sum += (*(--qlp)) * (*(++history));
			case 6: sum += (*(--qlp)) * (*(++history));
			case 5: sum += (*(--qlp)) * (*(++history));
			case 4: sum += (*(--qlp)) * (*(++history));
			case 3: sum += (*(--qlp)) * (*(++history));
			case 2: sum += (*(--qlp)) * (*(++history));
			case 1: sum += (*(--qlp)) * (*(++history));
				break;
		}
		*(data++) = *(residual++) + (sum >> lp_quantization);
	}
}

-- 
Eric Wong
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
Url : http://lists.xiph.org/pipermail/flac-dev/attachments/20050101/90fa859a/attachment.pgp


More information about the Flac-dev mailing list