[Flac-dev] two small-ish optimizations (death by a thousand cuts)
Eric Wong
eric at petta-tech.com
Wed Feb 2 23:19:26 PST 2005
This lpc_restore_signal rewrite was partially inspired by Miroslav's affd,
though my (not very great) ARM asm version resembled it as well.
The other two reduce CPU array indexing overhead in loops a little.
Additionally, a request for help:
My not very optimized lpc_restore_signal is at the URL below. I
couldn't get the ldm* instructions to work as advertised, even though
I've talked to several ARM asm hackers who said they looked right. I
can use the fp as a regular register since I'm compiling without a
frame pointer. Comments within should explain what I'm having trouble with:
http://archzoom.sourcecontrol.net/archzoom.cgi/eric@petta-tech.com--2005a-normalperson/flac--ipod--1.1.0--patch-19/src/libFLAC/arm/lpc_asm.s
--
Eric Wong
--- orig/src/libFLAC/lpc.c
+++ mod/src/libFLAC/lpc.c
@@ -293,6 +293,209 @@
void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
+ register const FLAC__int32 *qlp0 = &qlp_coeff[(order-1)];
+ register FLAC__int32 sum;
+ register const FLAC__int32 *history, *qlp;
+
+ history = &data[(-order)];
+
+ switch (order) {
+ case 12:
+ for( ; data_len != 0; --data_len) {
+ sum = (qlp0[0] * history[0])
+ + (qlp0[-1] * history[1])
+ + (qlp0[-2] * history[2])
+ + (qlp0[-3] * history[3])
+ + (qlp0[-4] * history[4])
+ + (qlp0[-5] * history[5])
+ + (qlp0[-6] * history[6])
+ + (qlp0[-7] * history[7])
+ + (qlp0[-8] * history[8])
+ + (qlp0[-9] * history[9])
+ + (qlp0[-10] * history[10])
+ + (qlp0[-11] * history[11])
+ ;
+ ++history;
+ *(data++) = *(residual++) + (sum >> lp_quantization);
+ }
+ return;
+ case 11:
+ for( ; data_len != 0; --data_len) {
+ sum = (qlp0[0] * history[0])
+ + (qlp0[-1] * history[1])
+ + (qlp0[-2] * history[2])
+ + (qlp0[-3] * history[3])
+ + (qlp0[-4] * history[4])
+ + (qlp0[-5] * history[5])
+ + (qlp0[-6] * history[6])
+ + (qlp0[-7] * history[7])
+ + (qlp0[-8] * history[8])
+ + (qlp0[-9] * history[9])
+ + (qlp0[-10] * history[10])
+ ;
+ ++history;
+ *(data++) = *(residual++) + (sum >> lp_quantization);
+ }
+ return;
+ case 10:
+ for( ; data_len != 0; --data_len) {
+ sum = (qlp0[0] * history[0])
+ + (qlp0[-1] * history[1])
+ + (qlp0[-2] * history[2])
+ + (qlp0[-3] * history[3])
+ + (qlp0[-4] * history[4])
+ + (qlp0[-5] * history[5])
+ + (qlp0[-6] * history[6])
+ + (qlp0[-7] * history[7])
+ + (qlp0[-8] * history[8])
+ + (qlp0[-9] * history[9])
+ ;
+ ++history;
+ *(data++) = *(residual++) + (sum >> lp_quantization);
+ }
+ return;
+ case 9:
+ for( ; data_len != 0; --data_len) {
+ sum = (qlp0[0] * history[0])
+ + (qlp0[-1] * history[1])
+ + (qlp0[-2] * history[2])
+ + (qlp0[-3] * history[3])
+ + (qlp0[-4] * history[4])
+ + (qlp0[-5] * history[5])
+ + (qlp0[-6] * history[6])
+ + (qlp0[-7] * history[7])
+ + (qlp0[-8] * history[8])
+ ;
+ ++history;
+ *(data++) = *(residual++) + (sum >> lp_quantization);
+ }
+ return;
+ case 8:
+ for( ; data_len != 0; --data_len) {
+ sum = (qlp0[0] * history[0])
+ + (qlp0[-1] * history[1])
+ + (qlp0[-2] * history[2])
+ + (qlp0[-3] * history[3])
+ + (qlp0[-4] * history[4])
+ + (qlp0[-5] * history[5])
+ + (qlp0[-6] * history[6])
+ + (qlp0[-7] * history[7])
+ ;
+ ++history;
+ *(data++) = *(residual++) + (sum >> lp_quantization);
+ }
+ return;
+ case 7:
+ for( ; data_len != 0; --data_len) {
+ sum = (qlp0[0] * history[0])
+ + (qlp0[-1] * history[1])
+ + (qlp0[-2] * history[2])
+ + (qlp0[-3] * history[3])
+ + (qlp0[-4] * history[4])
+ + (qlp0[-5] * history[5])
+ + (qlp0[-6] * history[6])
+ ;
+ ++history;
+ *(data++) = *(residual++) + (sum >> lp_quantization);
+ }
+ return;
+ case 6:
+ for( ; data_len != 0; --data_len) {
+ sum = (qlp0[0] * history[0])
+ + (qlp0[-1] * history[1])
+ + (qlp0[-2] * history[2])
+ + (qlp0[-3] * history[3])
+ + (qlp0[-4] * history[4])
+ + (qlp0[-5] * history[5])
+ ;
+ ++history;
+ *(data++) = *(residual++) + (sum >> lp_quantization);
+ }
+ return;
+ case 5:
+ for( ; data_len != 0; --data_len) {
+ sum = (qlp0[0] * history[0])
+ + (qlp0[-1] * history[1])
+ + (qlp0[-2] * history[2])
+ + (qlp0[-3] * history[3])
+ + (qlp0[-4] * history[4])
+ ;
+ ++history;
+ *(data++) = *(residual++) + (sum >> lp_quantization);
+ }
+ return;
+ case 4:
+ for( ; data_len != 0; --data_len) {
+ sum = (qlp0[0] * history[0])
+ + (qlp0[-1] * history[1])
+ + (qlp0[-2] * history[2])
+ + (qlp0[-3] * history[3])
+ ;
+ ++history;
+ *(data++) = *(residual++) + (sum >> lp_quantization);
+ }
+ return;
+ case 3:
+ for( ; data_len != 0; --data_len) {
+ sum = (qlp0[0] * history[0])
+ + (qlp0[-1] * history[1])
+ + (qlp0[-2] * history[2])
+ ;
+ ++history;
+ *(data++) = *(residual++) + (sum >> lp_quantization);
+ }
+ return;
+ case 2:
+ for( ; data_len != 0; --data_len) {
+ sum = (qlp0[0] * history[0])
+ + (qlp0[-1] * history[1])
+ ;
+ ++history;
+ *(data++) = *(residual++) + (sum >> lp_quantization);
+ }
+ return;
+ case 1:
+ for( ; data_len != 0; --data_len) {
+ sum = (qlp0[0] * (*(history++)));
+ *(data++) = *(residual++) + (sum >> lp_quantization);
+ }
+ return;
+ default:
+ {
+ /* handle everything else: (order > 12)
+ * with Duff's Device to reduce jumps */
+ const unsigned n0 = (order + 7)/8;
+ const int tmp = 0 - order - 1;
+ register const FLAC__int32 *qlpd = &qlp_coeff[order];
+ for( ; data_len != 0; --data_len) {
+ register unsigned n = n0;
+ sum = 0;
+ qlp = qlpd;
+ history = &data[tmp];
+
+ switch(order%8) {
+ case 0: do {
+ sum += (*(--qlp)) * (*(++history));
+ case 7: sum += (*(--qlp)) * (*(++history));
+ case 6: sum += (*(--qlp)) * (*(++history));
+ case 5: sum += (*(--qlp)) * (*(++history));
+ case 4: sum += (*(--qlp)) * (*(++history));
+ case 3: sum += (*(--qlp)) * (*(++history));
+ case 2: sum += (*(--qlp)) * (*(++history));
+ case 1: sum += (*(--qlp)) * (*(++history));
+ } while (--n);
+ }
+
+ *(data++) = *(residual++) + (sum >> lp_quantization);
+ }
+ return;
+ }
+ }
+}
+
+#if 0
+void FLAC__lpc_restore_signal_orig(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
+{
#ifdef FLAC__OVERFLOW_DETECT
FLAC__int64 sumo;
#endif
@@ -339,6 +542,7 @@
}
*/
}
+#endif /* 0 */
void FLAC__lpc_restore_signal_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
--- orig/src/libFLAC/bitbuffer.c
+++ mod/src/libFLAC/bitbuffer.c
@@ -1466,6 +1469,7 @@
{
unsigned i, bits_ = bits;
FLAC__uint32 v = 0;
+ FLAC__blurb *bbb;
FLAC__ASSERT(0 != bb);
FLAC__ASSERT(0 != bb->buffer);
@@ -1485,18 +1489,20 @@
#if FLAC__BITS_PER_BLURB > 8
if(bb->bits == 0 || bb->consumed_blurbs < bb->blurbs) { /*@@@ comment on why this is here*/
#endif
+ bbb = &bb->buffer[bb->consumed_blurbs];
if(bb->consumed_bits) {
i = FLAC__BITS_PER_BLURB - bb->consumed_bits;
if(i <= bits_) {
- v = bb->buffer[bb->consumed_blurbs] & (FLAC__BLURB_ALL_ONES >> bb->consumed_bits);
+ v = (*bbb) & (FLAC__BLURB_ALL_ONES >> bb->consumed_bits);
bits_ -= i;
- CRC16_UPDATE_BLURB(bb, bb->buffer[bb->consumed_blurbs], bb->read_crc16);
+ CRC16_UPDATE_BLURB(bb, (*bbb), bb->read_crc16);
+ ++bbb;
bb->consumed_blurbs++;
bb->consumed_bits = 0;
/* we hold off updating bb->total_consumed_bits until the end */
}
else {
- *val = (bb->buffer[bb->consumed_blurbs] & (FLAC__BLURB_ALL_ONES >> bb->consumed_bits)) >> (i-bits_);
+ *val = ((*bbb) & (FLAC__BLURB_ALL_ONES >> bb->consumed_bits)) >> (i-bits_);
bb->consumed_bits += bits_;
bb->total_consumed_bits += bits_;
return true;
@@ -1516,9 +1522,10 @@
#else
while(bits_ >= FLAC__BITS_PER_BLURB) {
v <<= FLAC__BITS_PER_BLURB;
- v |= bb->buffer[bb->consumed_blurbs];
+ v |= (*bbb);
bits_ -= FLAC__BITS_PER_BLURB;
- CRC16_UPDATE_BLURB(bb, bb->buffer[bb->consumed_blurbs], bb->read_crc16);
+ CRC16_UPDATE_BLURB(bb, (*bbb), bb->read_crc16);
+ ++bbb;
bb->consumed_blurbs++;
/* bb->consumed_bits is already 0 */
/* we hold off updating bb->total_consumed_bits until the end */
@@ -1526,7 +1533,7 @@
#endif
if(bits_ > 0) {
v <<= bits_;
- v |= (bb->buffer[bb->consumed_blurbs] >> (FLAC__BITS_PER_BLURB-bits_));
+ v |= ((*bbb) >> (FLAC__BITS_PER_BLURB-bits_));
bb->consumed_bits = bits_;
/* we hold off updating bb->total_consumed_bits until the end */
}
--- orig/src/libFLAC/stream_decoder.c
+++ mod/src/libFLAC/stream_decoder.c
@@ -74,6 +74,7 @@
***********************************************************************/
static void set_defaults_(FLAC__StreamDecoder *decoder);
+static inline void read_channel_coding(FLAC__StreamDecoder *decoder);
static FLAC__bool allocate_output_(FLAC__StreamDecoder *decoder, unsigned size, unsigned channels);
static FLAC__bool has_id_filtered_(FLAC__StreamDecoder *decoder, FLAC__byte *id);
static FLAC__bool find_metadata_(FLAC__StreamDecoder *decoder);
@@ -776,6 +768,54 @@
decoder->private_->metadata_filter_ids_count = 0;
}
+/* Undo any special channel coding */
+static inline void read_channel_coding(FLAC__StreamDecoder *decoder)
+{
+ register FLAC__int32 left, right;
+ register unsigned i;
+ register FLAC__int32 *lchan, *rchan;
+ switch(decoder->private_->frame.header.channel_assignment) {
+ case FLAC__CHANNEL_ASSIGNMENT_INDEPENDENT:
+ /* do nothing */
+ break;
+ case FLAC__CHANNEL_ASSIGNMENT_LEFT_SIDE:
+ FLAC__ASSERT(decoder->private_->frame.header.channels == 2);
+ lchan = &(decoder->private_->output[0])[0];
+ rchan = &(decoder->private_->output[1])[0];
+ for(i = decoder->private_->frame.header.blocksize; i != 0; --i) {
+ *rchan = *(lchan++) - *rchan;
+ ++rchan;
+ }
+ break;
+ case FLAC__CHANNEL_ASSIGNMENT_RIGHT_SIDE:
+ FLAC__ASSERT(decoder->private_->frame.header.channels == 2);
+ lchan = &(decoder->private_->output[0])[0];
+ rchan = &(decoder->private_->output[1])[0];
+ for(i = decoder->private_->frame.header.blocksize; i != 0; --i)
+ *(lchan++) += *(rchan++);
+ break;
+ case FLAC__CHANNEL_ASSIGNMENT_MID_SIDE:
+ FLAC__ASSERT(decoder->private_->frame.header.channels == 2);
+ lchan = &(decoder->private_->output[0])[0];
+ rchan = &(decoder->private_->output[1])[0];
+ for(i = decoder->private_->frame.header.blocksize; i != 0; --i) {
+ register FLAC__int32 mid = *lchan;
+ register FLAC__int32 side = *rchan;
+ mid <<= 1;
+ if(side & 1) /* i.e. if 'side' is odd... */
+ ++mid;
+ left = mid + side;
+ right = mid - side;
+ *(lchan++) = left >> 1;
+ *(rchan++) = right >> 1;
+ }
+ break;
+ default:
+ FLAC__ASSERT(0);
+ break;
+ }
+}
+
FLAC__bool allocate_output_(FLAC__StreamDecoder *decoder, unsigned size, unsigned channels)
{
unsigned i;
@@ -1380,8 +1418,6 @@
FLAC__bool read_frame_(FLAC__StreamDecoder *decoder, FLAC__bool *got_a_frame, FLAC__bool do_full_decode)
{
unsigned channel;
- unsigned i;
- FLAC__int32 mid, side, left, right;
FLAC__uint16 frame_crc; /* the one we calculate from the input stream */
FLAC__uint32 x;
@@ -1446,41 +1482,9 @@
if(!FLAC__bitbuffer_read_raw_uint32(decoder->private_->input, &x, FLAC__FRAME_FOOTER_CRC_LEN, read_callback_, decoder))
return false; /* the read_callback_ sets the state for us */
if(frame_crc == (FLAC__uint16)x) {
- if(do_full_decode) {
- /* Undo any special channel coding */
- switch(decoder->private_->frame.header.channel_assignment) {
- case FLAC__CHANNEL_ASSIGNMENT_INDEPENDENT:
- /* do nothing */
- break;
- case FLAC__CHANNEL_ASSIGNMENT_LEFT_SIDE:
- FLAC__ASSERT(decoder->private_->frame.header.channels == 2);
- for(i = 0; i < decoder->private_->frame.header.blocksize; i++)
- decoder->private_->output[1][i] = decoder->private_->output[0][i] - decoder->private_->output[1][i];
- break;
- case FLAC__CHANNEL_ASSIGNMENT_RIGHT_SIDE:
- FLAC__ASSERT(decoder->private_->frame.header.channels == 2);
- for(i = 0; i < decoder->private_->frame.header.blocksize; i++)
- decoder->private_->output[0][i] += decoder->private_->output[1][i];
- break;
- case FLAC__CHANNEL_ASSIGNMENT_MID_SIDE:
- FLAC__ASSERT(decoder->private_->frame.header.channels == 2);
- for(i = 0; i < decoder->private_->frame.header.blocksize; i++) {
- mid = decoder->private_->output[0][i];
- side = decoder->private_->output[1][i];
- mid <<= 1;
- if(side & 1) /* i.e. if 'side' is odd... */
- mid++;
- left = mid + side;
- right = mid - side;
- decoder->private_->output[0][i] = left >> 1;
- decoder->private_->output[1][i] = right >> 1;
- }
- break;
- default:
- FLAC__ASSERT(0);
- break;
- }
- }
+ if(do_full_decode)
+ read_channel_coding(decoder);
+
}
else {
/* Bad frame, emit error and zero the output signal */
/EOF
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: Digital signature
Url : http://lists.xiph.org/pipermail/flac-dev/attachments/20050202/d001db2c/attachment.pgp
More information about the Flac-dev
mailing list