[Flac-dev] two small-ish optimizations (death by a thousand cuts)

Wed Feb 2 23:19:26 PST 2005

This lpc_restore_signal was partially inspired by Miroslav's affd, though
my (not very great) ARM asm version resembled this as well.

The other two reduce CPU array indexing overhead in loops a little.

Additionally, a request for help:

 My not very optimized lpc_restore_signal is at the URL below.  I
 couldn't get the ldm* instructions to work as advertised, even though
 I've talked to several ARM asm hackers who said they looked right.  I
 can use the fp as a regular register since I'm compiling without
 it.  Comments within should explain what I'm having trouble with:
 
 http://archzoom.sourcecontrol.net/archzoom.cgi/eric@petta-tech.com--2005a-normalperson/flac--ipod--1.1.0--patch-19/src/libFLAC/arm/lpc_asm.s

-- 
Eric Wong

--- orig/src/libFLAC/lpc.c
+++ mod/src/libFLAC/lpc.c
@@ -293,6 +293,209 @@
 
 void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
 {
+	register const FLAC__int32 *qlp0 = &qlp_coeff[(order-1)];
+	register FLAC__int32 sum;
+	register const FLAC__int32 *history, *qlp;
+
+	history = &data[(-order)];
+
+	switch (order) {
+	case 12:
+		for( ; data_len != 0; --data_len) {
+			sum = (qlp0[0] * history[0])
+			    + (qlp0[-1] * history[1])
+			    + (qlp0[-2] * history[2])
+			    + (qlp0[-3] * history[3])
+			    + (qlp0[-4] * history[4])
+			    + (qlp0[-5] * history[5])
+			    + (qlp0[-6] * history[6])
+			    + (qlp0[-7] * history[7])
+			    + (qlp0[-8] * history[8])
+			    + (qlp0[-9] * history[9])
+			    + (qlp0[-10] * history[10])
+			    + (qlp0[-11] * history[11])
+			    ;
+			++history;
+			*(data++) = *(residual++) + (sum >> lp_quantization);
+		}
+		return;
+	case 11:
+		for( ; data_len != 0; --data_len) {
+			sum = (qlp0[0] * history[0])
+			    + (qlp0[-1] * history[1])
+			    + (qlp0[-2] * history[2])
+			    + (qlp0[-3] * history[3])
+			    + (qlp0[-4] * history[4])
+			    + (qlp0[-5] * history[5])
+			    + (qlp0[-6] * history[6])
+			    + (qlp0[-7] * history[7])
+			    + (qlp0[-8] * history[8])
+			    + (qlp0[-9] * history[9])
+			    + (qlp0[-10] * history[10])
+			    ;
+			++history;
+			*(data++) = *(residual++) + (sum >> lp_quantization);
+		}
+		return;
+	case 10:
+		for( ; data_len != 0; --data_len) {
+			sum = (qlp0[0] * history[0])
+			    + (qlp0[-1] * history[1])
+			    + (qlp0[-2] * history[2])
+			    + (qlp0[-3] * history[3])
+			    + (qlp0[-4] * history[4])
+			    + (qlp0[-5] * history[5])
+			    + (qlp0[-6] * history[6])
+			    + (qlp0[-7] * history[7])
+			    + (qlp0[-8] * history[8])
+			    + (qlp0[-9] * history[9])
+			    ;
+			++history;
+			*(data++) = *(residual++) + (sum >> lp_quantization);
+		}
+		return;
+	case 9:
+		for( ; data_len != 0; --data_len) {
+			sum = (qlp0[0] * history[0])
+			    + (qlp0[-1] * history[1])
+			    + (qlp0[-2] * history[2])
+			    + (qlp0[-3] * history[3])
+			    + (qlp0[-4] * history[4])
+			    + (qlp0[-5] * history[5])
+			    + (qlp0[-6] * history[6])
+			    + (qlp0[-7] * history[7])
+			    + (qlp0[-8] * history[8])
+			    ;
+			++history;
+			*(data++) = *(residual++) + (sum >> lp_quantization);
+		}
+		return;
+	case 8:
+		for( ; data_len != 0; --data_len) {
+			sum = (qlp0[0] * history[0])
+			    + (qlp0[-1] * history[1])
+			    + (qlp0[-2] * history[2])
+			    + (qlp0[-3] * history[3])
+			    + (qlp0[-4] * history[4])
+			    + (qlp0[-5] * history[5])
+			    + (qlp0[-6] * history[6])
+			    + (qlp0[-7] * history[7])
+			    ;
+			++history;
+			*(data++) = *(residual++) + (sum >> lp_quantization);
+		}
+		return;
+	case 7:
+		for( ; data_len != 0; --data_len) {
+			sum = (qlp0[0] * history[0])
+			    + (qlp0[-1] * history[1])
+			    + (qlp0[-2] * history[2])
+			    + (qlp0[-3] * history[3])
+			    + (qlp0[-4] * history[4])
+			    + (qlp0[-5] * history[5])
+			    + (qlp0[-6] * history[6])
+			    ;
+			++history;
+			*(data++) = *(residual++) + (sum >> lp_quantization);
+		} 
+		return;
+	case 6:
+		for( ; data_len != 0; --data_len) {
+			sum = (qlp0[0] * history[0])
+			    + (qlp0[-1] * history[1])
+			    + (qlp0[-2] * history[2])
+			    + (qlp0[-3] * history[3])
+			    + (qlp0[-4] * history[4])
+			    + (qlp0[-5] * history[5])
+			    ;
+			++history;
+			*(data++) = *(residual++) + (sum >> lp_quantization);
+		} 
+		return;
+	case 5:
+		for( ; data_len != 0; --data_len) {
+			sum = (qlp0[0] * history[0])
+			    + (qlp0[-1] * history[1])
+			    + (qlp0[-2] * history[2])
+			    + (qlp0[-3] * history[3])
+			    + (qlp0[-4] * history[4])
+			    ;
+			++history;
+			*(data++) = *(residual++) + (sum >> lp_quantization);
+		} 
+		return;
+	case 4:
+		for( ; data_len != 0; --data_len) {
+			sum = (qlp0[0] * history[0])
+			    + (qlp0[-1] * history[1])
+			    + (qlp0[-2] * history[2])
+			    + (qlp0[-3] * history[3])
+			    ;
+			++history;
+			*(data++) = *(residual++) + (sum >> lp_quantization);
+		} 
+		return;
+	case 3:
+		for( ; data_len != 0; --data_len) {
+			sum = (qlp0[0] * history[0])
+			    + (qlp0[-1] * history[1])
+			    + (qlp0[-2] * history[2])
+			    ;
+			++history;
+			*(data++) = *(residual++) + (sum >> lp_quantization);
+		}
+		return;
+	case 2:
+		for( ; data_len != 0; --data_len) {
+			sum = (qlp0[0] * history[0])
+			    + (qlp0[-1] * history[1])
+			    ;
+			++history;
+			*(data++) = *(residual++) + (sum >> lp_quantization);
+		}
+		return;
+	case 1:
+		for( ; data_len != 0; --data_len) {
+			sum = (qlp0[0] * (*(history++)));
+			*(data++) = *(residual++) + (sum >> lp_quantization);
+		} 
+		return;
+	default:
+		{ 
+			/* handle everything else: (order > 12)
+			 * with Duff's Device to reduce jumps */
+			const unsigned n0 = (order + 7)/8;
+			const int tmp = 0 - order - 1;
+			register const FLAC__int32 *qlpd = &qlp_coeff[order];
+			for( ; data_len != 0; --data_len) {
+				register unsigned n = n0;
+				sum = 0;
+				qlp = qlpd;
+				history = &data[tmp];
+			
+				switch(order%8) {
+				case 0: do {
+					sum += (*(--qlp)) * (*(++history));
+				case 7: sum += (*(--qlp)) * (*(++history));
+				case 6: sum += (*(--qlp)) * (*(++history));
+				case 5: sum += (*(--qlp)) * (*(++history));
+				case 4: sum += (*(--qlp)) * (*(++history));
+				case 3: sum += (*(--qlp)) * (*(++history));
+				case 2: sum += (*(--qlp)) * (*(++history));
+				case 1: sum += (*(--qlp)) * (*(++history));
+					} while (--n);     
+				}
+				
+				*(data++) = *(residual++) + (sum >> lp_quantization);
+			}
+			return;
+		}
+	}
+}
+
+#if 0
+void FLAC__lpc_restore_signal_orig(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
+{
 #ifdef FLAC__OVERFLOW_DETECT
 	FLAC__int64 sumo;
 #endif
@@ -339,6 +542,7 @@
 	}
 	*/
 }
+#endif /* 0 */
 
 void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
 {

--- orig/src/libFLAC/bitbuffer.c
+++ mod/src/libFLAC/bitbuffer.c
@@ -1466,6 +1469,7 @@
 {
 	unsigned i, bits_ = bits;
 	FLAC__uint32 v = 0;
+	FLAC__blurb *bbb;
 
 	FLAC__ASSERT(0 != bb);
 	FLAC__ASSERT(0 != bb->buffer);
@@ -1485,18 +1489,20 @@
 #if FLAC__BITS_PER_BLURB > 8
 	if(bb->bits == 0 || bb->consumed_blurbs < bb->blurbs) { /*@@@ comment on why this is here*/
 #endif
+		bbb = &bb->buffer[bb->consumed_blurbs];
 		if(bb->consumed_bits) {
 			i = FLAC__BITS_PER_BLURB - bb->consumed_bits;
 			if(i <= bits_) {
-				v = bb->buffer[bb->consumed_blurbs] & (FLAC__BLURB_ALL_ONES >> bb->consumed_bits);
+				v = (*bbb) & (FLAC__BLURB_ALL_ONES >> bb->consumed_bits);
 				bits_ -= i;
-				CRC16_UPDATE_BLURB(bb, bb->buffer[bb->consumed_blurbs], bb->read_crc16);
+				CRC16_UPDATE_BLURB(bb, (*bbb), bb->read_crc16);
+				++bbb;
 				bb->consumed_blurbs++;
 				bb->consumed_bits = 0;
 				/* we hold off updating bb->total_consumed_bits until the end */
 			}
 			else {
-				*val = (bb->buffer[bb->consumed_blurbs] & (FLAC__BLURB_ALL_ONES >> bb->consumed_bits)) >> (i-bits_);
+				*val = ((*bbb) & (FLAC__BLURB_ALL_ONES >> bb->consumed_bits)) >> (i-bits_);
 				bb->consumed_bits += bits_;
 				bb->total_consumed_bits += bits_;
 				return true;
@@ -1516,9 +1522,10 @@
 #else
 		while(bits_ >= FLAC__BITS_PER_BLURB) {
 			v <<= FLAC__BITS_PER_BLURB;
-			v |= bb->buffer[bb->consumed_blurbs];
+			v |= (*bbb);
 			bits_ -= FLAC__BITS_PER_BLURB;
-			CRC16_UPDATE_BLURB(bb, bb->buffer[bb->consumed_blurbs], bb->read_crc16);
+			CRC16_UPDATE_BLURB(bb, (*bbb), bb->read_crc16);
+			++bbb;
 			bb->consumed_blurbs++;
 			/* bb->consumed_bits is already 0 */
 			/* we hold off updating bb->total_consumed_bits until the end */
@@ -1526,7 +1533,7 @@
 #endif
 		if(bits_ > 0) {
 			v <<= bits_;
-			v |= (bb->buffer[bb->consumed_blurbs] >> (FLAC__BITS_PER_BLURB-bits_));
+			v |= ((*bbb) >> (FLAC__BITS_PER_BLURB-bits_));
 			bb->consumed_bits = bits_;
 			/* we hold off updating bb->total_consumed_bits until the end */
 		}
--- orig/src/libFLAC/stream_decoder.c
+++ mod/src/libFLAC/stream_decoder.c
@@ -74,6 +74,7 @@
  ***********************************************************************/
 
 static void set_defaults_(FLAC__StreamDecoder *decoder);
+static inline void read_channel_coding(FLAC__StreamDecoder *decoder);
 static FLAC__bool allocate_output_(FLAC__StreamDecoder *decoder, unsigned size, unsigned channels);
 static FLAC__bool has_id_filtered_(FLAC__StreamDecoder *decoder, FLAC__byte *id);
 static FLAC__bool find_metadata_(FLAC__StreamDecoder *decoder);
@@ -776,6 +768,54 @@
 	decoder->private_->metadata_filter_ids_count = 0;
 }
 
+/* Undo any special channel coding */
+static inline void read_channel_coding(FLAC__StreamDecoder *decoder)
+{
+	register FLAC__int32 left, right;
+	register unsigned i;
+	register FLAC__int32 *lchan, *rchan;
+	switch(decoder->private_->frame.header.channel_assignment) {
+		case FLAC__CHANNEL_ASSIGNMENT_INDEPENDENT:
+			/* do nothing */
+			break;
+		case FLAC__CHANNEL_ASSIGNMENT_LEFT_SIDE:
+			FLAC__ASSERT(decoder->private_->frame.header.channels == 2);
+			lchan = &(decoder->private_->output[0])[0];
+			rchan = &(decoder->private_->output[1])[0];
+			for(i = decoder->private_->frame.header.blocksize; i != 0; --i) {
+				*rchan = *(lchan++) - *rchan;
+				++rchan;
+			}
+			break;
+		case FLAC__CHANNEL_ASSIGNMENT_RIGHT_SIDE:
+			FLAC__ASSERT(decoder->private_->frame.header.channels == 2);
+			lchan = &(decoder->private_->output[0])[0];
+			rchan = &(decoder->private_->output[1])[0];
+			for(i = decoder->private_->frame.header.blocksize; i != 0; --i)
+				*(lchan++) += *(rchan++);
+			break;
+		case FLAC__CHANNEL_ASSIGNMENT_MID_SIDE:
+			FLAC__ASSERT(decoder->private_->frame.header.channels == 2);
+			lchan = &(decoder->private_->output[0])[0];
+			rchan = &(decoder->private_->output[1])[0];
+			for(i = decoder->private_->frame.header.blocksize; i != 0; --i) {
+				register FLAC__int32 mid = *lchan;
+				register FLAC__int32 side = *rchan;
+				mid <<= 1;
+				if(side & 1) /* i.e. if 'side' is odd... */
+					++mid;
+				left = mid + side;
+				right = mid - side;
+				*(lchan++) = left >> 1;
+				*(rchan++) = right >> 1;
+			}
+			break;
+		default:
+			FLAC__ASSERT(0);
+			break;
+	}
+}
+
 FLAC__bool allocate_output_(FLAC__StreamDecoder *decoder, unsigned size, unsigned channels)
 {
 	unsigned i;
@@ -1380,8 +1418,6 @@
 FLAC__bool read_frame_(FLAC__StreamDecoder *decoder, FLAC__bool *got_a_frame, FLAC__bool do_full_decode)
 {
 	unsigned channel;
-	unsigned i;
-	FLAC__int32 mid, side, left, right;
 	FLAC__uint16 frame_crc; /* the one we calculate from the input stream */
 	FLAC__uint32 x;
 
@@ -1446,41 +1482,9 @@
 	if(!FLAC__bitbuffer_read_raw_uint32(decoder->private_->input, &x, FLAC__FRAME_FOOTER_CRC_LEN, read_callback_, decoder))
 		return false; /* the read_callback_ sets the state for us */
 	if(frame_crc == (FLAC__uint16)x) {
-		if(do_full_decode) {
-			/* Undo any special channel coding */
-			switch(decoder->private_->frame.header.channel_assignment) {
-				case FLAC__CHANNEL_ASSIGNMENT_INDEPENDENT:
-					/* do nothing */
-					break;
-				case FLAC__CHANNEL_ASSIGNMENT_LEFT_SIDE:
-					FLAC__ASSERT(decoder->private_->frame.header.channels == 2);
-					for(i = 0; i < decoder->private_->frame.header.blocksize; i++)
-						decoder->private_->output[1][i] = decoder->private_->output[0][i] - decoder->private_->output[1][i];
-					break;
-				case FLAC__CHANNEL_ASSIGNMENT_RIGHT_SIDE:
-					FLAC__ASSERT(decoder->private_->frame.header.channels == 2);
-					for(i = 0; i < decoder->private_->frame.header.blocksize; i++)
-						decoder->private_->output[0][i] += decoder->private_->output[1][i];
-					break;
-				case FLAC__CHANNEL_ASSIGNMENT_MID_SIDE:
-					FLAC__ASSERT(decoder->private_->frame.header.channels == 2);
-					for(i = 0; i < decoder->private_->frame.header.blocksize; i++) {
-						mid = decoder->private_->output[0][i];
-						side = decoder->private_->output[1][i];
-						mid <<= 1;
-						if(side & 1) /* i.e. if 'side' is odd... */
-							mid++;
-						left = mid + side;
-						right = mid - side;
-						decoder->private_->output[0][i] = left >> 1;
-						decoder->private_->output[1][i] = right >> 1;
-					}
-					break;
-				default:
-					FLAC__ASSERT(0);
-					break;
-			}
-		}
+		if(do_full_decode)
+			read_channel_coding(decoder);
+		
 	}
 	else {
 		/* Bad frame, emit error and zero the output signal */


/EOF
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
Url : http://lists.xiph.org/pipermail/flac-dev/attachments/20050202/d001db2c/attachment.pgp