[opus] 2 patches related to silk_biquad_alt() optimization

Linfeng Zhang linfengz at google.com
Wed Apr 26 21:15:40 UTC 2017


On Tue, Apr 25, 2017 at 10:31 PM, Jean-Marc Valin <jmvalin at jmvalin.ca>
wrote:

>
> > A_Q28 is split into two 14-bit (or 16-bit, whatever) integers, to keep the
> > multiplication operations within 32 bits. NEON can do 32-bit x 32-bit =
> > 64-bit using 'int64x2_t vmull_s32(int32x2_t a, int32x2_t b)', and it
> > could possibly be faster and have fewer rounding/shifting errors than the
> > above C code. But it may increase difficulties for other CPUs that do not
> > support such 32-bit multiplications.
>
> OK, so I'm not totally opposed to that, but it increases the
> testing/maintenance cost so it needs to be worth it. So the question is
> how much speedup can you get and how close you can make the result to
> the original function. If you can make the output always stay within one
> or two LSBs of the C version, then the asm check can simply be a little
> bit more lax than usual. Otherwise it becomes more complicated. This
> isn't a function that scares me too much about going non-bitexact, but
> it's also not one of the big complexity costs either. In any case, let
> me know what you find.
>
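
(For context: the 2x14-bit split mentioned in the quote is the one used by the
current silk_biquad_alt() C code. From memory it looks roughly like the
fragment below, with the B_Q28 terms omitted; treat it as a sketch rather than
the exact code.)

static void biquad_alt_A_split_sketch( const opus_int32 *A_Q28, opus_int32 out32_Q14, opus_int32 *S )
{
    opus_int32 A0_L_Q28, A0_U_Q28, A1_L_Q28, A1_U_Q28;

    /* Negate A_Q28[] and split each value into a 14-bit lower part and an upper part,
       so every product fits the 16-bit second operand of silk_SMULWB()/silk_SMLAWB(). */
    A0_L_Q28 = ( -A_Q28[ 0 ] ) & 0x00003FFF;
    A0_U_Q28 = silk_RSHIFT( -A_Q28[ 0 ], 14 );
    A1_L_Q28 = ( -A_Q28[ 1 ] ) & 0x00003FFF;
    A1_U_Q28 = silk_RSHIFT( -A_Q28[ 1 ], 14 );

    /* Each AR coefficient then costs two 32x16-bit MACs instead of one 32x32-bit multiply. */
    S[ 0 ] = S[ 1 ] + silk_RSHIFT_ROUND( silk_SMULWB( out32_Q14, A0_L_Q28 ), 14 );
    S[ 0 ] = silk_SMLAWB( S[ 0 ], out32_Q14, A0_U_Q28 );
    S[ 1 ] = silk_RSHIFT_ROUND( silk_SMULWB( out32_Q14, A1_L_Q28 ), 14 );
    S[ 1 ] = silk_SMLAWB( S[ 1 ], out32_Q14, A1_U_Q28 );
}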

I tested the proposed NEON optimization change. It speeds up the whole
encoder by about 1% at Complexity 8 and 10 for stride = 2 (compared with the
NEON optimization in the previous patch), and by 0.7% at Complexity 8 and
1.1% at Complexity 10 for stride = 1 (compared with the original C code).

Unfortunately, the truncation error accumulates in S[], and its difference
from the C version cannot be easily bounded. The difference in out[] stayed
within 5 in my quick testing, but that is not guaranteed for other inputs. So
it may be better to compare for bit exactness against the
silk_biquad_alt_c_MulSingleAQ28() below.
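
To make that concrete, the check I have in mind would be along these lines
(biquad_alt_outputs_match() and silk_biquad_alt_neon() are illustrative names,
not code that exists in the tree):

/* Run the reference C version and the NEON version from identical states and
   require identical outputs and final states (needs <string.h> for memcmp()). */
static int biquad_alt_outputs_match( const opus_int16 *in, const opus_int32 *B_Q28,
    const opus_int32 *A_Q28, opus_int32 len, opus_int stride,
    opus_int16 *out_c, opus_int16 *out_neon )
{
    opus_int32 S_c[ 4 ] = { 0 }, S_neon[ 4 ] = { 0 };   /* 2 * stride state words */

    silk_biquad_alt_c_MulSingleAQ28( in, B_Q28, A_Q28, S_c,    out_c,    len, stride );
    silk_biquad_alt_neon(            in, B_Q28, A_Q28, S_neon, out_neon, len, stride );

    return memcmp( out_c, out_neon, len * stride * sizeof( out_c[ 0 ] ) ) == 0 &&
           memcmp( S_c,   S_neon,   2 * stride * sizeof( S_c[ 0 ] ) ) == 0;
}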

Please let me know your decision (whether to keep the original NEON patch
(stride 2 only) or take the new NEON version (both stride 1 and 2), which
optimizes the silk_biquad_alt_c_MulSingleAQ28() below), and I'll wrap up the
patch.

Attached here is the corresponding C code of silk_biquad_alt_c_MulSingleAQ28()
for your information. It uses 64-bit multiplications and is about 0.6% - 0.9%
slower (for the whole encoder) than the original C code using 32-bit
multiplications (for both strides, with the same stride-2 unrolling).

void silk_biquad_alt_c_MulSingleAQ28(
    const opus_int16            *in,                /* I     input signal                               */
    const opus_int32            *B_Q28,             /* I     MA coefficients [3]                        */
    const opus_int32            *A_Q28,             /* I     AR coefficients [2]                        */
    opus_int32                  *S,                 /* I/O   State vector [2*stride]                    */
    opus_int16                  *out,               /* O     output signal                              */
    const opus_int32            len,                /* I     signal length (must be even)               */
    opus_int                    stride              /* I     Operate on interleaved signal if > 1       */
)
{
    /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */
    opus_int   k;

    silk_assert( ( stride == 1 ) || ( stride == 2 ) );

    if( stride == 1 ) {
        opus_int32 out32_Q14;
        for( k = 0; k < len; k++ ) {
            /* S[ 0 ], S[ 1 ]: Q12 */
            out32_Q14 = silk_LSHIFT( silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], in[ k ] ), 2 );

            S[ 0 ] = S[ 1 ] + silk_RSHIFT_ROUND( (opus_int64)out32_Q14 * (-A_Q28[ 0 ]), 30 );
            S[ 0 ] = silk_SMLAWB( S[ 0 ], B_Q28[ 1 ], in[ k ] );

            S[ 1 ] = silk_RSHIFT_ROUND( (opus_int64)out32_Q14 * (-A_Q28[ 1 ]), 30 );
            S[ 1 ] = silk_SMLAWB( S[ 1 ], B_Q28[ 2 ], in[ k ] );

            /* Scale back to Q0 and saturate */
            out[ k ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14 + (1<<14) - 1, 14 ) );
        }
    } else {
        opus_int32 out32_Q14[ 2 ];
        for( k = 0; k < len; k++ ) {
            /* S[ 0 ], S[ 1 ]: Q12 */
            out32_Q14[ 0 ] = silk_LSHIFT( silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], in[ k * 2 + 0 ] ), 2 );
            out32_Q14[ 1 ] = silk_LSHIFT( silk_SMLAWB( S[ 2 ], B_Q28[ 0 ], in[ k * 2 + 1 ] ), 2 );

            S[ 0 ] = S[ 1 ] + silk_RSHIFT_ROUND( (opus_int64)out32_Q14[ 0 ] * (-A_Q28[ 0 ]), 30 );
            S[ 2 ] = S[ 3 ] + silk_RSHIFT_ROUND( (opus_int64)out32_Q14[ 1 ] * (-A_Q28[ 0 ]), 30 );
            S[ 0 ] = silk_SMLAWB( S[ 0 ], B_Q28[ 1 ], in[ k * 2 + 0 ] );
            S[ 2 ] = silk_SMLAWB( S[ 2 ], B_Q28[ 1 ], in[ k * 2 + 1 ] );

            S[ 1 ] = silk_RSHIFT_ROUND( (opus_int64)out32_Q14[ 0 ] * (-A_Q28[ 1 ]), 30 );
            S[ 3 ] = silk_RSHIFT_ROUND( (opus_int64)out32_Q14[ 1 ] * (-A_Q28[ 1 ]), 30 );
            S[ 1 ] = silk_SMLAWB( S[ 1 ], B_Q28[ 2 ], in[ k * 2 + 0 ] );
            S[ 3 ] = silk_SMLAWB( S[ 3 ], B_Q28[ 2 ], in[ k * 2 + 1 ] );

            /* Scale back to Q0 and saturate */
            out[ k * 2 + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14[ 0 ] + (1<<14) - 1, 14 ) );
            out[ k * 2 + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14[ 1 ] + (1<<14) - 1, 14 ) );
        }
    }
}
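
The signature matches the existing silk_biquad_alt(), so the call site stays
the same; for example, for an interleaved stereo signal (the names below are
just placeholders for whatever the caller already passes):

opus_int32 S[ 4 ] = { 0 };   /* 2 * stride state words, cleared before the first call */
silk_biquad_alt_c_MulSingleAQ28( in, B_Q28, A_Q28, S, out, frame_length, 2 );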

Here are the NEON kernels, which use vqrdmulh_lane_s32() / vqrdmulhq_s32() to
do the multiplication and rounding; A_Q28_s32x{2,4} stores doubled -A_Q28[]
values (see the scalar sketch after the kernels for why the doubling works):

static inline void silk_biquad_alt_stride1_kernel(const int32x2_t A_Q28_s32x2, const int32x4_t t_s32x4, int32x2_t *S_s32x2, int32x2_t *out32_Q14_s32x2)
{
    int32x2_t t_s32x2;

    *out32_Q14_s32x2 = vadd_s32(*S_s32x2, vget_low_s32(t_s32x4));                            /* silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], in[ k ] )                                */
    *S_s32x2         = vreinterpret_s32_u64(vshr_n_u64(vreinterpret_u64_s32(*S_s32x2), 32)); /* S[ 0 ] = S[ 1 ]; S[ 1 ] = 0;                                              */
    *out32_Q14_s32x2 = vshl_n_s32(*out32_Q14_s32x2, 2);                                      /* out32_Q14 = silk_LSHIFT( silk_SMLAWB( S[ 0 ], B_Q28[ 0 ], in[ k ] ), 2 ); */
    t_s32x2          = vqrdmulh_lane_s32(A_Q28_s32x2, *out32_Q14_s32x2, 0);                  /* silk_RSHIFT_ROUND( (opus_int64)out32_Q14 * (-A_Q28[ {0,1} ]), 30 )        */
    *S_s32x2         = vadd_s32(*S_s32x2, t_s32x2);                                          /* S[ {0,1} ] = {S[ 1 ], 0} + silk_RSHIFT_ROUND( );                          */
    *S_s32x2         = vadd_s32(*S_s32x2, vget_high_s32(t_s32x4));                           /* S[ {0,1} ] = silk_SMLAWB( S[ {0,1} ], B_Q28[ {1,2} ], in[ k ] );          */
}

static inline void silk_biquad_alt_stride2_kernel(const int32x4_t A_Q28_s32x4, const int32x4_t B_Q28_s32x4, const int32x2_t t_s32x2, const int32x4_t inval_s32x4, int32x4_t *S_s32x4, int32x2_t *out32_Q14_s32x2)
{
    int32x4_t t_s32x4, out32_Q14_s32x4;

    *out32_Q14_s32x2 = vadd_s32(vget_low_s32(*S_s32x4), t_s32x2);            /* silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in[ k * 2 + {0,1} ] )                                      */
    *S_s32x4         = vcombine_s32(vget_high_s32(*S_s32x4), vdup_n_s32(0)); /* S{0,1} = S{2,3}; S{2,3} = 0;                                                                */
    *out32_Q14_s32x2 = vshl_n_s32(*out32_Q14_s32x2, 2);                      /* out32_Q14_{0,1} = silk_LSHIFT( silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in[ k * 2 + {0,1} ] ), 2 ); */
    out32_Q14_s32x4  = vcombine_s32(*out32_Q14_s32x2, *out32_Q14_s32x2);     /* out32_Q14_{0,1,0,1}                                                                         */
    t_s32x4          = vqrdmulhq_s32(out32_Q14_s32x4, A_Q28_s32x4);          /* silk_RSHIFT_ROUND( (opus_int64)out32_Q14[ {0,1,0,1} ] * (-A_Q28[ {0,0,1,1} ]), 30 )         */
    *S_s32x4         = vaddq_s32(*S_s32x4, t_s32x4);                         /* S[ {0,1,2,3} ] = {S[ {2,3} ], 0, 0} + silk_RSHIFT_ROUND( );                                 */
    t_s32x4          = vqdmulhq_s32(inval_s32x4, B_Q28_s32x4);               /* silk_SMULWB( B_Q28[ {1,1,2,2} ], in[ k * 2 + {0,1,0,1} ] )                                  */
    *S_s32x4         = vaddq_s32(*S_s32x4, t_s32x4);                         /* S[ {0,1,2,3} ] = silk_SMLAWB( S[ {0,1,2,3} ], B_Q28[ {1,1,2,2} ], in[ k * 2 + {0,1,0,1} ] );*/
}

Thanks,
Linfeng