[Flac-dev] bitreader optimizations

Miroslav Lichvar lichvarm at gmail.com
Fri Mar 14 11:36:31 PDT 2008


Hi,

attached are patches that improve decoding speed a bit. The first
patch improves the bit scan macro used for decoding unary values, the
second one adds a GCC inline assembly for bswap and the third patch
replaces the read_rice_block function.

In my testing it turned out to be even faster than the _ia32_bswap
function. If the code produced by MSVC is faster as well, I'd suggest
to remove the assembly function completely and avoid the problems with
text relocation (sf bug #1793536).

-- 
Miroslav Lichvar
-------------- next part --------------
Index: src/libFLAC/bitreader.c
===================================================================
RCS file: /cvsroot/flac/flac/src/libFLAC/bitreader.c,v
retrieving revision 1.15
diff -u -r1.15 bitreader.c
--- src/libFLAC/bitreader.c	28 Feb 2008 05:34:26 -0000	1.15
+++ src/libFLAC/bitreader.c	14 Mar 2008 11:07:07 -0000
@@ -69,13 +69,12 @@
 #endif
 /* counts the # of zero MSBs in a word */
 #define COUNT_ZERO_MSBS(word) ( \
-	(word) <= 0xffff ? \
-		( (word) <= 0xff? byte_to_unary_table[word] + 24 : byte_to_unary_table[(word) >> 8] + 16 ) : \
-		( (word) <= 0xffffff? byte_to_unary_table[word >> 16] + 8 : byte_to_unary_table[(word) >> 24] ) \
+	word > 0xffffff ? byte_to_unary_table[(word) >> 24] : \
+	!word ? 32 : \
+	word > 0xffff ? byte_to_unary_table[word >> 16] + 8 : \
+	word > 0xff ? byte_to_unary_table[(word) >> 8] + 16 : \
+	byte_to_unary_table[word] + 24 \
 )
-/* this alternate might be slightly faster on some systems/compilers: */
-#define COUNT_ZERO_MSBS2(word) ( (word) <= 0xff ? byte_to_unary_table[word] + 24 : ((word) <= 0xffff ? byte_to_unary_table[(word) >> 8] + 16 : ((word) <= 0xffffff ? byte_to_unary_table[(word) >> 16] + 8 : byte_to_unary_table[(word) >> 24])) )
-
 
 /*
  * This should be at least twice as large as the largest number of words
-------------- next part --------------
Index: src/libFLAC/bitreader.c
===================================================================
RCS file: /cvsroot/flac/flac/src/libFLAC/bitreader.c,v
retrieving revision 1.15
diff -u -r1.15 bitreader.c
--- src/libFLAC/bitreader.c	28 Feb 2008 05:34:26 -0000	1.15
+++ src/libFLAC/bitreader.c	14 Mar 2008 13:19:46 -0000
@@ -149,6 +148,7 @@
 	FLAC__CPUInfo cpu_info;
 };
 
+#if FLAC__BYTES_PER_WORD == 4 && FLAC__CPU_IA32
 #ifdef _MSC_VER
 /* OPT: an MSVC built-in would be better */
 static _inline FLAC__uint32 local_swap32_(FLAC__uint32 x)
@@ -173,6 +173,15 @@
 done1:
 	}
 }
+#elif __GNUC__
+static void local_swap32_block_(FLAC__uint32 *start, FLAC__uint32 len)
+{
+	FLAC__uint32 *end;
+
+	for(end = start + len; start < end; start++)
+		asm ("bswap %0" : "=r"(*start) : "0"(*start));
+}
+#endif
 #endif
 
 static FLaC__INLINE void crc16_update_word_(FLAC__BitReader *br, brword word)
@@ -263,7 +272,7 @@
 #if WORDS_BIGENDIAN
 #else
 	end = (br->words*FLAC__BYTES_PER_WORD + br->bytes + bytes + (FLAC__BYTES_PER_WORD-1)) / FLAC__BYTES_PER_WORD;
-# if defined(_MSC_VER) && (FLAC__BYTES_PER_WORD == 4)
+# if FLAC__CPU_IA32 && (__GNUC__ || defined(_MSC_VER)) && FLAC__BYTES_PER_WORD == 4
 	if(br->cpu_info.type == FLAC__CPUINFO_TYPE_IA32 && br->cpu_info.data.ia32.bswap) {
 		start = br->words;
 		local_swap32_block_(br->buffer + start, end - start);
-------------- next part --------------
Index: src/libFLAC/bitreader.c
===================================================================
RCS file: /cvsroot/flac/flac/src/libFLAC/bitreader.c,v
retrieving revision 1.15
diff -u -r1.15 bitreader.c
--- src/libFLAC/bitreader.c	28 Feb 2008 05:34:26 -0000	1.15
+++ src/libFLAC/bitreader.c	14 Mar 2008 12:44:27 -0000
@@ -803,379 +802,141 @@
 }
 
 /* this is by far the most heavily used reader call.  it ain't pretty but it's fast */
-/* a lot of the logic is copied, then adapted, from FLAC__bitreader_read_unary_unsigned() and FLAC__bitreader_read_raw_uint32() */
 FLAC__bool FLAC__bitreader_read_rice_signed_block(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter)
-/* OPT: possibly faster version for use with MSVC */
-#ifdef _MSC_VER
 {
-	unsigned i;
-	unsigned uval = 0;
-	unsigned bits; /* the # of binary LSBs left to read to finish a rice codeword */
-
 	/* try and get br->consumed_words and br->consumed_bits into register;
 	 * must remember to flush them back to *br before calling other
-	 * bitwriter functions that use them, and before returning */
-	register unsigned cwords;
-	register unsigned cbits;
+	 * bitreader functions that use them, and before returning */
+	unsigned cwords, words, lsbs, msbs, x, y;
+	unsigned ucbits; /* keep track of the number of unconsumed bits in word */
+	brword b;
+	int *val, *end;
 
 	FLAC__ASSERT(0 != br);
 	FLAC__ASSERT(0 != br->buffer);
 	/* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */
 	FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);
 	FLAC__ASSERT(parameter < 32);
-	/* the above two asserts also guarantee that the binary part never straddles more that 2 words, so we don't have to loop to read it */
-
-	if(nvals == 0)
-		return true;
+	/* the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it */
 
-	cbits = br->consumed_bits;
-	cwords = br->consumed_words;
+	val = vals;
+	end = vals + nvals;
 
-	while(1) {
-
-		/* read unary part */
-		while(1) {
-			while(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
-				brword b = br->buffer[cwords] << cbits;
-				if(b) {
-#if 0 /* slower, probably due to bad register allocation... */ && defined FLAC__CPU_IA32 && !defined FLAC__NO_ASM && FLAC__BITS_PER_WORD == 32
-					__asm {
-						bsr eax, b
-						not eax
-						and eax, 31
-						mov i, eax
-					}
-#else
-					i = COUNT_ZERO_MSBS(b);
-#endif
-					uval += i;
-					bits = parameter;
-					i++;
-					cbits += i;
-					if(cbits == FLAC__BITS_PER_WORD) {
-						crc16_update_word_(br, br->buffer[cwords]);
-						cwords++;
-						cbits = 0;
-					}
-					goto break1;
-				}
-				else {
-					uval += FLAC__BITS_PER_WORD - cbits;
-					crc16_update_word_(br, br->buffer[cwords]);
-					cwords++;
-					cbits = 0;
-					/* didn't find stop bit yet, have to keep going... */
-				}
-			}
-			/* at this point we've eaten up all the whole words; have to try
-			 * reading through any tail bytes before calling the read callback.
-			 * this is a repeat of the above logic adjusted for the fact we
-			 * don't have a whole word.  note though if the client is feeding
-			 * us data a byte at a time (unlikely), br->consumed_bits may not
-			 * be zero.
-			 */
-			if(br->bytes) {
-				const unsigned end = br->bytes * 8;
-				brword b = (br->buffer[cwords] & (FLAC__WORD_ALL_ONES << (FLAC__BITS_PER_WORD-end))) << cbits;
-				if(b) {
-					i = COUNT_ZERO_MSBS(b);
-					uval += i;
-					bits = parameter;
-					i++;
-					cbits += i;
-					FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
-					goto break1;
-				}
-				else {
-					uval += end - cbits;
-					cbits += end;
-					FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
-					/* didn't find stop bit yet, have to keep going... */
-				}
-			}
-			/* flush registers and read; bitreader_read_from_client_() does
-			 * not touch br->consumed_bits at all but we still need to set
-			 * it in case it fails and we have to return false.
-			 */
-			br->consumed_bits = cbits;
-			br->consumed_words = cwords;
-			if(!bitreader_read_from_client_(br))
+	if(parameter == 0) {
+		while(val < end) {
+			/* read the unary MSBs and end bit */
+			if(!FLAC__bitreader_read_unary_unsigned(br, &msbs))
 				return false;
-			cwords = br->consumed_words;
-		}
-break1:
-		/* read binary part */
-		FLAC__ASSERT(cwords <= br->words);
-
-		if(bits) {
-			while((br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits < bits) {
-				/* flush registers and read; bitreader_read_from_client_() does
-				 * not touch br->consumed_bits at all but we still need to set
-				 * it in case it fails and we have to return false.
-				 */
-				br->consumed_bits = cbits;
-				br->consumed_words = cwords;
-				if(!bitreader_read_from_client_(br))
-					return false;
-				cwords = br->consumed_words;
-			}
-			if(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
-				if(cbits) {
-					/* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
-					const unsigned n = FLAC__BITS_PER_WORD - cbits;
-					const brword word = br->buffer[cwords];
-					if(bits < n) {
-						uval <<= bits;
-						uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-bits);
-						cbits += bits;
-						goto break2;
-					}
-					uval <<= n;
-					uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
-					bits -= n;
-					crc16_update_word_(br, word);
-					cwords++;
-					cbits = 0;
-					if(bits) { /* if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
-						uval <<= bits;
-						uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-bits));
-						cbits = bits;
-					}
-					goto break2;
-				}
-				else {
-					FLAC__ASSERT(bits < FLAC__BITS_PER_WORD);
-					uval <<= bits;
-					uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-bits);
-					cbits = bits;
-					goto break2;
-				}
-			}
-			else {
-				/* in this case we're starting our read at a partial tail word;
-				 * the reader has guaranteed that we have at least 'bits' bits
-				 * available to read, which makes this case simpler.
-				 */
-				uval <<= bits;
-				if(cbits) {
-					/* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
-					FLAC__ASSERT(cbits + bits <= br->bytes*8);
-					uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-bits);
-					cbits += bits;
-					goto break2;
-				}
-				else {
-					uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-bits);
-					cbits += bits;
-					goto break2;
-				}
-			}
-		}
-break2:
-		/* compose the value */
-		*vals = (int)(uval >> 1 ^ -(int)(uval & 1));
 
-		/* are we done? */
-		--nvals;
-		if(nvals == 0) {
-			br->consumed_bits = cbits;
-			br->consumed_words = cwords;
-			return true;
+			*val++ = (int)(msbs >> 1) ^ -(int)(msbs & 1);
 		}
 
-		uval = 0;
-		++vals;
-
+		return true;
 	}
-}
-#else
-{
-	unsigned i;
-	unsigned uval = 0;
-
-	/* try and get br->consumed_words and br->consumed_bits into register;
-	 * must remember to flush them back to *br before calling other
-	 * bitwriter functions that use them, and before returning */
-	register unsigned cwords;
-	register unsigned cbits;
-	unsigned ucbits; /* keep track of the number of unconsumed bits in the buffer */
-
-	FLAC__ASSERT(0 != br);
-	FLAC__ASSERT(0 != br->buffer);
-	/* WATCHOUT: code does not work with <32bit words; we can make things much faster with this assertion */
-	FLAC__ASSERT(FLAC__BITS_PER_WORD >= 32);
-	FLAC__ASSERT(parameter < 32);
-	/* the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it */
 
-	if(nvals == 0)
-		return true;
+	FLAC__ASSERT(parameter > 0);
 
-	cbits = br->consumed_bits;
 	cwords = br->consumed_words;
-	ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
+	words = br->words;
+	ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
+	b = br->buffer[cwords] << br->consumed_bits;  /* keep unconsumed bits aligned to left */
+
+	/* if we've not consumed up to a partial tail word... */
+	if(cwords >= words) {
+		x = 0;
+		goto process_tail;
+	}
+
+	while(val < end) {
+		/* read the unary MSBs and end bit */
+		x = y = COUNT_ZERO_MSBS(b);
+		if(x == FLAC__BITS_PER_WORD) {
+			x = ucbits;
+			do {
+				/* didn't find stop bit yet, have to keep going... */
+				crc16_update_word_(br, br->buffer[cwords++]);
+				if (cwords >= words)
+					goto incomplete_msbs;
+				b = br->buffer[cwords];
+				y = COUNT_ZERO_MSBS(b);
+				x += y;
+			} while(y == FLAC__BITS_PER_WORD);
+		}
+		b <<= y;
+		b <<= 1; /* account for stop bit */
+		ucbits = (ucbits - x - 1) % FLAC__BITS_PER_WORD;
+		msbs = x;
+
+		/* read the binary LSBs */
+		x = b >> (FLAC__BITS_PER_WORD - parameter);
+		if(parameter <= ucbits) {
+			ucbits -= parameter;
+			b <<= parameter;
+		} else {
+			/* there are still bits left to read, they will all be in the next word */
+			crc16_update_word_(br, br->buffer[cwords++]);
+			if (cwords >= words)
+				goto incomplete_lsbs;
+			b = br->buffer[cwords];
+			ucbits += FLAC__BITS_PER_WORD - parameter;
+			x |= b >> ucbits;
+			b <<= FLAC__BITS_PER_WORD - ucbits;
+		}
+		lsbs = x;
 
-	while(1) {
+		/* compose the value */
+		x = (msbs << parameter) | lsbs;
+		*val++ = (int)(x >> 1) ^ -(int)(x & 1);
 
-		/* read unary part */
-		while(1) {
-			while(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
-				brword b = br->buffer[cwords] << cbits;
-				if(b) {
-#if 0 /* is not discernably faster... */ && defined FLAC__CPU_IA32 && !defined FLAC__NO_ASM && FLAC__BITS_PER_WORD == 32 && defined __GNUC__
-					asm volatile (
-						"bsrl %1, %0;"
-						"notl %0;"
-						"andl $31, %0;"
-						: "=r"(i)
-						: "r"(b)
-					);
-#else
-					i = COUNT_ZERO_MSBS(b);
-#endif
-					uval += i;
-					cbits += i;
-					cbits++; /* skip over stop bit */
-					if(cbits >= FLAC__BITS_PER_WORD) { /* faster way of testing if(cbits == FLAC__BITS_PER_WORD) */
-						crc16_update_word_(br, br->buffer[cwords]);
-						cwords++;
-						cbits = 0;
-					}
-					goto break1;
-				}
-				else {
-					uval += FLAC__BITS_PER_WORD - cbits;
-					crc16_update_word_(br, br->buffer[cwords]);
-					cwords++;
-					cbits = 0;
-					/* didn't find stop bit yet, have to keep going... */
-				}
-			}
-			/* at this point we've eaten up all the whole words; have to try
-			 * reading through any tail bytes before calling the read callback.
-			 * this is a repeat of the above logic adjusted for the fact we
-			 * don't have a whole word.  note though if the client is feeding
-			 * us data a byte at a time (unlikely), br->consumed_bits may not
-			 * be zero.
-			 */
-			if(br->bytes) {
-				const unsigned end = br->bytes * 8;
-				brword b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end)) << cbits;
-				if(b) {
-					i = COUNT_ZERO_MSBS(b);
-					uval += i;
-					cbits += i;
-					cbits++; /* skip over stop bit */
-					FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
-					goto break1;
-				}
-				else {
-					uval += end - cbits;
-					cbits += end;
-					FLAC__ASSERT(cbits < FLAC__BITS_PER_WORD);
-					/* didn't find stop bit yet, have to keep going... */
-				}
-			}
-			/* flush registers and read; bitreader_read_from_client_() does
-			 * not touch br->consumed_bits at all but we still need to set
-			 * it in case it fails and we have to return false.
-			 */
-			br->consumed_bits = cbits;
+		continue;
+
+		/* at this point we've eaten up all the whole words */
+incomplete_msbs:
+		br->consumed_bits = 0;
+process_tail:
+		do {
 			br->consumed_words = cwords;
-			if(!bitreader_read_from_client_(br))
+			
+			/* read the unary MSBs and end bit */
+			if(!FLAC__bitreader_read_unary_unsigned(br, &msbs))
 				return false;
-			cwords = br->consumed_words;
-			ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits + uval;
-			/* + uval to offset our count by the # of unary bits already
-			 * consumed before the read, because we will add these back
-			 * in all at once at break1
-			 */
-		}
-break1:
-		ucbits -= uval;
-		ucbits--; /* account for stop bit */
-
-		/* read binary part */
-		FLAC__ASSERT(cwords <= br->words);
-
-		if(parameter) {
-			while(ucbits < parameter) {
-				/* flush registers and read; bitreader_read_from_client_() does
-				 * not touch br->consumed_bits at all but we still need to set
-				 * it in case it fails and we have to return false.
-				 */
-				br->consumed_bits = cbits;
+			msbs += x;
+			x = ucbits = 0;
+
+			if(0) {
+incomplete_lsbs:
+				br->consumed_bits = 0;
 				br->consumed_words = cwords;
-				if(!bitreader_read_from_client_(br))
-					return false;
-				cwords = br->consumed_words;
-				ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
-			}
-			if(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
-				if(cbits) {
-					/* this also works when consumed_bits==0, it's just slower than necessary for that case */
-					const unsigned n = FLAC__BITS_PER_WORD - cbits;
-					const brword word = br->buffer[cwords];
-					if(parameter < n) {
-						uval <<= parameter;
-						uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-parameter);
-						cbits += parameter;
-					}
-					else {
-						uval <<= n;
-						uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
-						crc16_update_word_(br, word);
-						cwords++;
-						cbits = parameter - n;
-						if(cbits) { /* parameter > n, i.e. if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
-							uval <<= cbits;
-							uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
-						}
-					}
-				}
-				else {
-					cbits = parameter;
-					uval <<= parameter;
-					uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
-				}
 			}
-			else {
-				/* in this case we're starting our read at a partial tail word;
-				 * the reader has guaranteed that we have at least 'parameter'
-				 * bits available to read, which makes this case simpler.
-				 */
-				uval <<= parameter;
-				if(cbits) {
-					/* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
-					FLAC__ASSERT(cbits + parameter <= br->bytes*8);
-					uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-parameter);
-					cbits += parameter;
-				}
-				else {
-					cbits = parameter;
-					uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
-				}
-			}
-		}
-
-		ucbits -= parameter;
 
-		/* compose the value */
-		*vals = (int)(uval >> 1 ^ -(int)(uval & 1));
+			/* read the binary LSBs */
+			if(!FLAC__bitreader_read_raw_uint32(br, &lsbs, parameter - ucbits))
+				return false;
+			lsbs = x | lsbs;
 
-		/* are we done? */
-		--nvals;
-		if(nvals == 0) {
-			br->consumed_bits = cbits;
-			br->consumed_words = cwords;
-			return true;
-		}
+			/* compose the value */
+			x = (msbs << parameter) | lsbs;
+			*val++ = (int)(x >> 1) ^ -(int)(x & 1);
+			x = 0;
 
-		uval = 0;
-		++vals;
+			cwords = br->consumed_words;
+			words = br->words;
+			ucbits = FLAC__BITS_PER_WORD - br->consumed_bits;
+			b = br->buffer[cwords] << br->consumed_bits;
+		} while(cwords >= words && val < end);
+	}
 
+	if(ucbits == 0 && cwords < words) {
+		/* don't leave the head word with no unconsumed bits */
+		crc16_update_word_(br, br->buffer[cwords++]);
+		ucbits = FLAC__BITS_PER_WORD;
 	}
+
+	br->consumed_bits = FLAC__BITS_PER_WORD - ucbits;
+	br->consumed_words = cwords;
+
+	return true;
 }
-#endif
 
 #if 0 /* UNUSED */
 FLAC__bool FLAC__bitreader_read_golomb_signed(FLAC__BitReader *br, int *val, unsigned parameter)
Index: src/libFLAC/stream_decoder.c
===================================================================
RCS file: /cvsroot/flac/flac/src/libFLAC/stream_decoder.c,v
retrieving revision 1.148
diff -u -r1.148 stream_decoder.c
--- src/libFLAC/stream_decoder.c	28 Feb 2008 05:34:26 -0000	1.148
+++ src/libFLAC/stream_decoder.c	14 Mar 2008 12:44:28 -0000
@@ -421,7 +421,7 @@
 #ifdef FLAC__CPU_IA32
 		FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32);
 #ifdef FLAC__HAS_NASM
-#if 1 /*@@@@@@ OPT: not clearly faster, needs more testing */
+#if 0 /*@@@@@@ OPT: not clearly faster, needs more testing */
 		if(decoder->private_->cpuinfo.data.ia32.bswap)
 			decoder->private_->local_bitreader_read_rice_signed_block = FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap;
 #endif


More information about the Flac-dev mailing list