ref: 95d4c9f960c9469961781c923ccfdb8c3eba0562
parent: 68688651a4c3ed1fc4345c1bfb3932658e51f0b4
author: Linfeng Zhang <linfengz@google.com>
date: Wed Jul 13 13:25:49 EDT 2016
Optimize silk_LPC_inverse_pred_gain() for ARM NEON The optimization is bit exact with C function. Change-Id: Ib3bdc26a5a4ebe02e7f24be85104e8e9a2a9a738 Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
--- a/silk/CNG.c
+++ b/silk/CNG.c
@@ -142,7 +142,7 @@
silk_CNG_exc( CNG_sig_Q14 + MAX_LPC_ORDER, psCNG->CNG_exc_buf_Q14, length, &psCNG->rand_seed );
/* Convert CNG NLSF to filter representation */
- silk_NLSF2A( A_Q12, psCNG->CNG_smth_NLSF_Q15, psDec->LPC_order );
+ silk_NLSF2A( A_Q12, psCNG->CNG_smth_NLSF_Q15, psDec->LPC_order, psDec->arch );
/* Generate CNG signal, by synthesis filtering */
silk_memcpy( CNG_sig_Q14, psCNG->CNG_synth_state, MAX_LPC_ORDER * sizeof( opus_int32 ) );
--- a/silk/LPC_inv_pred_gain.c
+++ b/silk/LPC_inv_pred_gain.c
@@ -39,7 +39,7 @@
/* Compute inverse of LPC prediction gain, and */
/* test if LPC coefficients are stable (all poles within unit circle) */
-static opus_int32 LPC_inverse_pred_gain_QA( /* O Returns inverse prediction gain in energy domain, Q30 */
+static opus_int32 LPC_inverse_pred_gain_QA_c( /* O Returns inverse prediction gain in energy domain, Q30 */
opus_int32 A_QA[ SILK_MAX_ORDER_LPC ], /* I Prediction coefficients */
const opus_int order /* I Prediction order */
)
@@ -119,7 +119,7 @@
}
/* For input in Q12 domain */
-opus_int32 silk_LPC_inverse_pred_gain( /* O Returns inverse prediction gain in energy domain, Q30 */
+opus_int32 silk_LPC_inverse_pred_gain_c( /* O Returns inverse prediction gain in energy domain, Q30 */
const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */
const opus_int order /* I Prediction order */
)
@@ -137,5 +137,5 @@
if( DC_resp >= 4096 ) {
return 0;
}
- return LPC_inverse_pred_gain_QA( Atmp_QA, order );
+ return LPC_inverse_pred_gain_QA_c( Atmp_QA, order );
}
--- a/silk/NLSF2A.c
+++ b/silk/NLSF2A.c
@@ -66,7 +66,8 @@
void silk_NLSF2A(
opus_int16 *a_Q12, /* O monic whitening filter coefficients in Q12, [ d ] */
const opus_int16 *NLSF, /* I normalized line spectral frequencies in Q15, [ d ] */
- const opus_int d /* I filter order (should be even) */
+ const opus_int d, /* I filter order (should be even) */
+ int arch /* I Run-time architecture */
)
{
/* This ordering was found to maximize quality. It improves numerical accuracy of
@@ -128,7 +129,7 @@
/* Convert int32 coefficients to Q12 int16 coefs */
silk_LPC_fit( a_Q12, a32_QA1, 12, QA + 1, d );
- for( i = 0; silk_LPC_inverse_pred_gain( a_Q12, d ) == 0 && i < MAX_LPC_STABILIZE_ITERATIONS; i++ ) {
+ for( i = 0; silk_LPC_inverse_pred_gain( a_Q12, d, arch ) == 0 && i < MAX_LPC_STABILIZE_ITERATIONS; i++ ) {
/* Prediction coefficients are (too close to) unstable; apply bandwidth expansion */
/* on the unscaled coefficients, convert to Q12 and measure again */
silk_bwexpander_32( a32_QA1, d, 65536 - silk_LSHIFT( 2, i ) );
--- a/silk/PLC.c
+++ b/silk/PLC.c
@@ -275,7 +275,7 @@
/* Reduce random noise for unvoiced frames with high LPC gain */
opus_int32 invGain_Q30, down_scale_Q30;
- invGain_Q30 = silk_LPC_inverse_pred_gain( psPLC->prevLPC_Q12, psDec->LPC_order );
+ invGain_Q30 = silk_LPC_inverse_pred_gain( psPLC->prevLPC_Q12, psDec->LPC_order, arch );
down_scale_Q30 = silk_min_32( silk_RSHIFT( (opus_int32)1 << 30, LOG2_INV_LPC_GAIN_HIGH_THRES ), invGain_Q30 );
down_scale_Q30 = silk_max_32( silk_RSHIFT( (opus_int32)1 << 30, LOG2_INV_LPC_GAIN_LOW_THRES ), down_scale_Q30 );
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -47,6 +47,10 @@
#include "x86/SigProc_FIX_sse.h"
#endif
+#if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#include "arm/LPC_inv_pred_gain_arm.h"
+#endif
+
/********************************************************************/
/* SIGNAL PROCESSING FUNCTIONS */
/********************************************************************/
@@ -132,7 +136,7 @@
/* Compute inverse of LPC prediction gain, and */
/* test if LPC coefficients are stable (all poles within unit circle) */
-opus_int32 silk_LPC_inverse_pred_gain( /* O Returns inverse prediction gain in energy domain, Q30 */
+opus_int32 silk_LPC_inverse_pred_gain_c( /* O Returns inverse prediction gain in energy domain, Q30 */
const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */
const opus_int order /* I Prediction order */
);
@@ -146,6 +150,10 @@
const opus_int32 N /* I Number of input samples */
);
+#if !defined(OVERRIDE_silk_LPC_inverse_pred_gain) /* Generic fallback: no platform header supplied an override, so arch is evaluated and discarded and the C version is called */
+#define silk_LPC_inverse_pred_gain(A_Q12, order, arch) ((void)(arch), silk_LPC_inverse_pred_gain_c(A_Q12, order))
+#endif
+
/********************************************************************/
/* SCALAR FUNCTIONS */
/********************************************************************/
@@ -265,7 +273,8 @@
void silk_NLSF2A(
opus_int16 *a_Q12, /* O monic whitening filter coefficients in Q12, [ d ] */
const opus_int16 *NLSF, /* I normalized line spectral frequencies in Q15, [ d ] */
- const opus_int d /* I filter order (should be even) */
+ const opus_int d, /* I filter order (should be even) */
+ int arch /* I Run-time architecture */
);
/* Convert int32 coefficients to int16 coefs and make sure there's no wrap-around */
--- /dev/null
+++ b/silk/arm/LPC_inv_pred_gain_arm.h
@@ -1,0 +1,57 @@
+/***********************************************************************
+Copyright (c) 2017 Google Inc.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifndef SILK_LPC_INV_PRED_GAIN_ARM_H
+# define SILK_LPC_INV_PRED_GAIN_ARM_H
+
+# include "celt/arm/armcpu.h"
+
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) /* the NEON intrinsics implementation may be built: declare it */
+opus_int32 silk_LPC_inverse_pred_gain_neon( /* O Returns inverse prediction gain in energy domain, Q30 */
+ const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */
+ const opus_int order /* I Prediction order */
+);
+
+# if !defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_PRESUME_NEON) /* no run-time CPU detection and NEON presumed: bind through PRESUME_NEON() at compile time */
+# define OVERRIDE_silk_LPC_inverse_pred_gain (1)
+# define silk_LPC_inverse_pred_gain(A_Q12, order, arch) ((void)(arch), PRESUME_NEON(silk_LPC_inverse_pred_gain)(A_Q12, order))
+# endif
+# endif /* OPUS_ARM_MAY_HAVE_NEON_INTR */
+
+# if !defined(OVERRIDE_silk_LPC_inverse_pred_gain)
+/*Is run-time CPU detection enabled on this platform? If so, dispatch through a table indexed by (arch)&OPUS_ARCHMASK.*/
+# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK+1])(const opus_int16 *A_Q12, const opus_int order);
+# define OVERRIDE_silk_LPC_inverse_pred_gain (1)
+# define silk_LPC_inverse_pred_gain(A_Q12, order, arch) ((*SILK_LPC_INVERSE_PRED_GAIN_IMPL[(arch)&OPUS_ARCHMASK])(A_Q12, order))
+# elif defined(OPUS_ARM_PRESUME_NEON_INTR) /* NEON intrinsics presumed available: call the NEON version directly, arch is evaluated and discarded */
+# define OVERRIDE_silk_LPC_inverse_pred_gain (1)
+# define silk_LPC_inverse_pred_gain(A_Q12, order, arch) ((void)(arch), silk_LPC_inverse_pred_gain_neon(A_Q12, order))
+# endif
+# endif /* !OVERRIDE_silk_LPC_inverse_pred_gain; if still undefined after this header, SigProc_FIX.h supplies the C fallback macro */
+
+#endif /* end SILK_LPC_INV_PRED_GAIN_ARM_H */
--- /dev/null
+++ b/silk/arm/LPC_inv_pred_gain_neon_intr.c
@@ -1,0 +1,280 @@
+/***********************************************************************
+Copyright (c) 2017 Google Inc.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+- Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+- Neither the name of Internet Society, IETF or IETF Trust, nor the
+names of specific contributors, may be used to endorse or promote
+products derived from this software without specific prior written
+permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <arm_neon.h>
+#include "SigProc_FIX.h"
+#include "define.h"
+
+#define QA 24
+#define A_LIMIT SILK_FIX_CONST( 0.99975, QA )
+
+#define MUL32_FRAC_Q(a32, b32, Q) ((opus_int32)(silk_RSHIFT_ROUND64(silk_SMULL(a32, b32), Q)))
+
+/* The difficulty is how to judge whether a 64-bit signed integer tmp64 overflows 32 bits,
+ * since NEON has no 64-bit min, max or comparison instructions.
+ * A failed idea is to compare the results of vmovn(tmp64) and vqmovn(tmp64) and check whether they are equal.
+ * However, this idea fails when tmp64 is something like 0xFFFFFFF980000000.
+ * Here we know that mult2Q >= 1, so the highest bit (bit 63, sign bit) of tmp64 must be equal to bit 62.
+ * Shift tmp64 left by 1 to get tmp64'. If high_half(tmp64') != 0 and high_half(tmp64') != -1,
+ * then bits 31 to 63 of tmp64 cannot all be equal to the sign bit, and therefore tmp64 overflows 32 bits.
+ * That is, we judge whether tmp64' > 0x00000000FFFFFFFF, or tmp64' < 0xFFFFFFFF00000000.
+ * We use a narrowing right shift of tmp64 by 31 bits to get tmp32' = high_half(tmp64'), which saves data bandwidth and instructions.
+ * That is, we judge whether tmp32' > 0x00000000, or tmp32' < 0xFFFFFFFF (as signed values: tmp32' > 0 or tmp32' < -1).
+ */
+
+/* Compute inverse of LPC prediction gain, and test if LPC coefficients are stable (all poles within unit circle). */
+/* A_QA is overwritten in place by the order-reduction loop below; every rejection path (limit exceeded, gain too high, 32-bit overflow) returns 0. */
+static OPUS_INLINE opus_int32 LPC_inverse_pred_gain_QA_neon( /* O Returns inverse prediction gain in energy domain, Q30 */
+ opus_int32 A_QA[ SILK_MAX_ORDER_LPC ], /* I Prediction coefficients in Q(QA); modified in place */
+ const opus_int order /* I Prediction order */
+)
+{
+ opus_int k, n, mult2Q;
+ opus_int32 invGain_Q30, rc_Q31, rc_mult1_Q30, rc_mult2, tmp1, tmp2;
+ opus_int32 max, min;
+ int32x4_t max_s32x4, min_s32x4;
+ int32x2_t max_s32x2, min_s32x2;
+
+ max_s32x4 = vdupq_n_s32( silk_int32_MIN ); /* running max of the vector-loop overflow indicators; this start value can never trip the ( max > 0 ) test */
+ min_s32x4 = vdupq_n_s32( silk_int32_MAX ); /* running min of the same indicators; this start value can never trip the ( min < -1 ) test */
+ invGain_Q30 = SILK_FIX_CONST( 1, 30 );
+ for( k = order - 1; k > 0; k-- ) {
+ int32x2_t rc_Q31_s32x2, rc_mult2_s32x2;
+ int64x2_t mult2Q_s64x2;
+
+ /* Check for stability */
+ if( ( A_QA[ k ] > A_LIMIT ) || ( A_QA[ k ] < -A_LIMIT ) ) {
+ return 0;
+ }
+
+ /* Set RC equal to negated AR coef */
+ rc_Q31 = -silk_LSHIFT( A_QA[ k ], 31 - QA );
+
+ /* rc_mult1_Q30 range: [ 1 : 2^30 ] */
+ rc_mult1_Q30 = silk_SUB32( SILK_FIX_CONST( 1, 30 ), silk_SMMUL( rc_Q31, rc_Q31 ) );
+ silk_assert( rc_mult1_Q30 > ( 1 << 15 ) ); /* reduce A_LIMIT if fails */
+ silk_assert( rc_mult1_Q30 <= ( 1 << 30 ) );
+
+ /* Update inverse gain */
+ /* invGain_Q30 range: [ 0 : 2^30 ] */
+ invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 );
+ silk_assert( invGain_Q30 >= 0 );
+ silk_assert( invGain_Q30 <= ( 1 << 30 ) );
+ if( invGain_Q30 < SILK_FIX_CONST( 1.0f / MAX_PREDICTION_POWER_GAIN, 30 ) ) {
+ return 0;
+ }
+
+ /* rc_mult2 range: [ 2^30 : silk_int32_MAX ] */
+ mult2Q = 32 - silk_CLZ32( silk_abs( rc_mult1_Q30 ) );
+ rc_mult2 = silk_INVERSE32_varQ( rc_mult1_Q30, mult2Q + 30 );
+
+ /* Update AR coefficient: broadcast the per-k scalars for the vector loop; mult2Q is negated so that vrshlq_s64 performs a rounding right shift */
+ rc_Q31_s32x2 = vdup_n_s32( rc_Q31 );
+ mult2Q_s64x2 = vdupq_n_s64( -mult2Q );
+ rc_mult2_s32x2 = vdup_n_s32( rc_mult2 );
+
+ for( n = 0; n < ( ( k + 1 ) >> 1 ) - 3; n += 4 ) {
+ /* Four coefficient pairs ( A_QA[ n + i ], A_QA[ k - n - 1 - i ] ), i = 0..3, per iteration; the bound only admits full groups of four, the rest goes to the scalar loop below. NOTE(review): an earlier wording said extra elements are always computed when ( k % 4 ) != 0, which this bound does not do - confirm. */
+ int32x4_t tmp1_s32x4, tmp2_s32x4, t0_s32x4, t1_s32x4, s0_s32x4, s1_s32x4, t_QA0_s32x4, t_QA1_s32x4;
+ int64x2_t t0_s64x2, t1_s64x2, t2_s64x2, t3_s64x2;
+ tmp1_s32x4 = vld1q_s32( A_QA + n ); /* front elements A_QA[ n .. n+3 ] */
+ tmp2_s32x4 = vld1q_s32( A_QA + k - n - 4 ); /* mirrored partners A_QA[ k-n-4 .. k-n-1 ] */
+ tmp2_s32x4 = vrev64q_s32( tmp2_s32x4 ); /* this and the next line reverse the four lanes ... */
+ tmp2_s32x4 = vcombine_s32( vget_high_s32( tmp2_s32x4 ), vget_low_s32( tmp2_s32x4 ) ); /* ... so that lane i holds A_QA[ k - n - 1 - i ] */
+ t0_s32x4 = vqrdmulhq_lane_s32( tmp2_s32x4, rc_Q31_s32x2, 0 ); /* vector counterpart of MUL32_FRAC_Q( tmp2, rc_Q31, 31 ) in the scalar loop */
+ t1_s32x4 = vqrdmulhq_lane_s32( tmp1_s32x4, rc_Q31_s32x2, 0 ); /* vector counterpart of MUL32_FRAC_Q( tmp1, rc_Q31, 31 ) */
+ t_QA0_s32x4 = vqsubq_s32( tmp1_s32x4, t0_s32x4 ); /* saturating subtract, as silk_SUB_SAT32 in the scalar loop */
+ t_QA1_s32x4 = vqsubq_s32( tmp2_s32x4, t1_s32x4 );
+ t0_s64x2 = vmull_s32( vget_low_s32 ( t_QA0_s32x4 ), rc_mult2_s32x2 ); /* widening 32x32 -> 64 multiply by rc_mult2, as silk_SMULL in the scalar loop */
+ t1_s64x2 = vmull_s32( vget_high_s32( t_QA0_s32x4 ), rc_mult2_s32x2 );
+ t2_s64x2 = vmull_s32( vget_low_s32 ( t_QA1_s32x4 ), rc_mult2_s32x2 );
+ t3_s64x2 = vmull_s32( vget_high_s32( t_QA1_s32x4 ), rc_mult2_s32x2 );
+ t0_s64x2 = vrshlq_s64( t0_s64x2, mult2Q_s64x2 ); /* rounding right shift by mult2Q, as silk_RSHIFT_ROUND64 in the scalar loop */
+ t1_s64x2 = vrshlq_s64( t1_s64x2, mult2Q_s64x2 );
+ t2_s64x2 = vrshlq_s64( t2_s64x2, mult2Q_s64x2 );
+ t3_s64x2 = vrshlq_s64( t3_s64x2, mult2Q_s64x2 );
+ t0_s32x4 = vcombine_s32( vmovn_s64( t0_s64x2 ), vmovn_s64( t1_s64x2 ) ); /* low 32 bits: the updated front coefficients */
+ t1_s32x4 = vcombine_s32( vmovn_s64( t2_s64x2 ), vmovn_s64( t3_s64x2 ) ); /* low 32 bits: the updated mirrored coefficients */
+ s0_s32x4 = vcombine_s32( vshrn_n_s64( t0_s64x2, 31 ), vshrn_n_s64( t1_s64x2, 31 ) ); /* overflow indicator: bits 31..62 of each 64-bit result, 0 or -1 when it fits in 32 bits */
+ s1_s32x4 = vcombine_s32( vshrn_n_s64( t2_s64x2, 31 ), vshrn_n_s64( t3_s64x2, 31 ) );
+ max_s32x4 = vmaxq_s32( max_s32x4, s0_s32x4 ); /* accumulate the indicators; they are only tested once, after the k loop */
+ min_s32x4 = vminq_s32( min_s32x4, s0_s32x4 );
+ max_s32x4 = vmaxq_s32( max_s32x4, s1_s32x4 );
+ min_s32x4 = vminq_s32( min_s32x4, s1_s32x4 );
+ t1_s32x4 = vrev64q_s32( t1_s32x4 ); /* this and the next line restore memory order for the mirrored half */
+ t1_s32x4 = vcombine_s32( vget_high_s32( t1_s32x4 ), vget_low_s32( t1_s32x4 ) );
+ vst1q_s32( A_QA + n, t0_s32x4 );
+ vst1q_s32( A_QA + k - n - 4, t1_s32x4 );
+ }
+ for( ; n < (k + 1) >> 1; n++ ) { /* scalar loop for the pairs the vector loop did not take; overflow returns 0 immediately here */
+ opus_int64 tmp64;
+ tmp1 = A_QA[ n ];
+ tmp2 = A_QA[ k - n - 1 ];
+ tmp64 = silk_RSHIFT_ROUND64( silk_SMULL( silk_SUB_SAT32(tmp1,
+ MUL32_FRAC_Q( tmp2, rc_Q31, 31 ) ), rc_mult2 ), mult2Q);
+ if( tmp64 > silk_int32_MAX || tmp64 < silk_int32_MIN ) {
+ return 0;
+ }
+ A_QA[ n ] = ( opus_int32 )tmp64;
+ tmp64 = silk_RSHIFT_ROUND64( silk_SMULL( silk_SUB_SAT32(tmp2,
+ MUL32_FRAC_Q( tmp1, rc_Q31, 31 ) ), rc_mult2), mult2Q);
+ if( tmp64 > silk_int32_MAX || tmp64 < silk_int32_MIN ) {
+ return 0;
+ }
+ A_QA[ k - n - 1 ] = ( opus_int32 )tmp64;
+ }
+ }
+
+ /* Check for stability (the loop above exits with k == 0 for any order >= 1, so this tests A_QA[ 0 ]) */
+ if( ( A_QA[ k ] > A_LIMIT ) || ( A_QA[ k ] < -A_LIMIT ) ) {
+ return 0;
+ }
+
+ max_s32x2 = vmax_s32( vget_low_s32( max_s32x4 ), vget_high_s32( max_s32x4 ) ); /* horizontal reduction: 4 lanes -> 2 lanes */
+ min_s32x2 = vmin_s32( vget_low_s32( min_s32x4 ), vget_high_s32( min_s32x4 ) );
+ max_s32x2 = vmax_s32( max_s32x2, vreinterpret_s32_s64( vshr_n_s64( vreinterpret_s64_s32( max_s32x2 ), 32 ) ) ); /* bring lane 1 down to lane 0 and combine: 2 lanes -> lane 0 */
+ min_s32x2 = vmin_s32( min_s32x2, vreinterpret_s32_s64( vshr_n_s64( vreinterpret_s64_s32( min_s32x2 ), 32 ) ) );
+ max = vget_lane_s32( max_s32x2, 0 );
+ min = vget_lane_s32( min_s32x2, 0 );
+ if( ( max > 0 ) || ( min < -1 ) ) { /* some vector-loop result had an indicator other than 0 or -1, i.e. it did not fit in 32 bits */
+ return 0;
+ }
+
+ /* Set RC equal to negated AR coef */
+ rc_Q31 = -silk_LSHIFT( A_QA[ 0 ], 31 - QA );
+
+ /* Range: [ 1 : 2^30 ] */
+ rc_mult1_Q30 = silk_SUB32( SILK_FIX_CONST( 1, 30 ), silk_SMMUL( rc_Q31, rc_Q31 ) );
+
+ /* Update inverse gain */
+ /* Range: [ 0 : 2^30 ] */
+ invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 );
+ silk_assert( invGain_Q30 >= 0 );
+ silk_assert( invGain_Q30 <= ( 1 << 30 ) );
+ if( invGain_Q30 < SILK_FIX_CONST( 1.0f / MAX_PREDICTION_POWER_GAIN, 30 ) ) {
+ return 0;
+ }
+
+ return invGain_Q30;
+}
+
+/* For input in Q12 domain. Falls back to silk_LPC_inverse_pred_gain_c() for odd orders or when SILK_MAX_ORDER_LPC != 24. */
+opus_int32 silk_LPC_inverse_pred_gain_neon( /* O Returns inverse prediction gain in energy domain, Q30 */
+ const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */
+ const opus_int order /* I Prediction order */
+)
+{
+#ifdef OPUS_CHECK_ASM /* checking build: compute the C result first so it can be compared at the end */
+ const opus_int32 invGain_Q30_c = silk_LPC_inverse_pred_gain_c( A_Q12, order );
+#endif
+
+ opus_int32 invGain_Q30;
+ if( ( SILK_MAX_ORDER_LPC != 24 ) || ( order & 1 )) { /* the vector path below is written for a 24-entry buffer and even orders only */
+ invGain_Q30 = silk_LPC_inverse_pred_gain_c( A_Q12, order );
+ }
+ else {
+ opus_int32 Atmp_QA[ SILK_MAX_ORDER_LPC ];
+ opus_int32 DC_resp;
+ int16x8_t t0_s16x8, t1_s16x8, t2_s16x8;
+ int32x4_t t0_s32x4;
+ const opus_int leftover = order & 7; /* coefficients beyond the last full group of 8 (0, 2, 4 or 6 since order is even) */
+
+ /* Increase Q domain of the AR coefficients. NOTE(review): the three loads below read A_Q12[ 0..23 ] whatever the order is - presumably every caller's buffer holds at least 24 elements; confirm. */
+ t0_s16x8 = vld1q_s16( A_Q12 + 0 );
+ t1_s16x8 = vld1q_s16( A_Q12 + 8 );
+ t2_s16x8 = vld1q_s16( A_Q12 + 16 );
+ t0_s32x4 = vpaddlq_s16( t0_s16x8 ); /* pairwise widened sums of A_Q12[ 0..7 ]: start of the DC response */
+
+ switch( order - leftover ) /* number of coefficients covered by full groups of 8 */
+ {
+ case 24:
+ t0_s32x4 = vpadalq_s16( t0_s32x4, t2_s16x8 ); /* add A_Q12[ 16..23 ] to the DC sum */
+ /* Intend to fall through */
+
+ case 16:
+ t0_s32x4 = vpadalq_s16( t0_s32x4, t1_s16x8 ); /* add A_Q12[ 8..15 ] to the DC sum */
+ vst1q_s32( Atmp_QA + 16, vshll_n_s16( vget_low_s16 ( t2_s16x8 ), QA - 12 ) ); /* widen to 32 bits and shift from Q12 up to Q(QA) */
+ vst1q_s32( Atmp_QA + 20, vshll_n_s16( vget_high_s16( t2_s16x8 ), QA - 12 ) );
+ /* Intend to fall through */
+
+ case 8:
+ {
+ const int32x2_t t_s32x2 = vpadd_s32( vget_low_s32( t0_s32x4 ), vget_high_s32( t0_s32x4 ) ); /* horizontal sum: 4 lanes -> 2 lanes */
+ const int64x1_t t_s64x1 = vpaddl_s32( t_s32x2 ); /* 2 lanes -> one 64-bit sum */
+ DC_resp = vget_lane_s32( vreinterpret_s32_s64( t_s64x1 ), 0 ); /* lane 0 of the 64-bit sum viewed as two 32-bit lanes */
+ vst1q_s32( Atmp_QA + 8, vshll_n_s16( vget_low_s16 ( t1_s16x8 ), QA - 12 ) );
+ vst1q_s32( Atmp_QA + 12, vshll_n_s16( vget_high_s16( t1_s16x8 ), QA - 12 ) );
+ }
+ break;
+
+ default: /* order < 8: no full group, the whole DC sum comes from the scalar switch below */
+ DC_resp = 0;
+ break;
+ }
+ A_Q12 += order - leftover; /* step past the coefficients already summed by the vector code */
+
+ switch( leftover ) /* add the remaining 6, 4 or 2 coefficients one by one */
+ {
+ case 6:
+ DC_resp += (opus_int32)A_Q12[ 5 ];
+ DC_resp += (opus_int32)A_Q12[ 4 ];
+ /* Intend to fall through */
+
+ case 4:
+ DC_resp += (opus_int32)A_Q12[ 3 ];
+ DC_resp += (opus_int32)A_Q12[ 2 ];
+ /* Intend to fall through */
+
+ case 2:
+ DC_resp += (opus_int32)A_Q12[ 1 ];
+ DC_resp += (opus_int32)A_Q12[ 0 ];
+ /* Intend to fall through */
+
+ default:
+ break;
+ }
+
+ /* If the DC is unstable, we don't even need to do the full calculations */
+ if( DC_resp >= 4096 ) {
+ invGain_Q30 = 0;
+ } else {
+ vst1q_s32( Atmp_QA + 0, vshll_n_s16( vget_low_s16 ( t0_s16x8 ), QA - 12 ) ); /* the first 8 coefficients are only converted once the DC test has passed */
+ vst1q_s32( Atmp_QA + 4, vshll_n_s16( vget_high_s16( t0_s16x8 ), QA - 12 ) );
+ invGain_Q30 = LPC_inverse_pred_gain_QA_neon( Atmp_QA, order );
+ }
+ }
+
+#ifdef OPUS_CHECK_ASM /* checking build: the NEON path must match the C result exactly */
+ silk_assert( invGain_Q30_c == invGain_Q30 );
+#endif
+
+ return invGain_Q30;
+}
--- a/silk/arm/arm_silk_map.c
+++ b/silk/arm/arm_silk_map.c
@@ -30,11 +30,22 @@
#include "main_FIX.h"
#include "NSQ.h"
+#include "SigProc_FIX.h"
#if defined(OPUS_HAVE_RTCD)
# if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && \
!defined(OPUS_ARM_PRESUME_NEON_INTR))
+
+opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK + 1])( /* O Returns inverse prediction gain in energy domain, Q30 */
+ const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */
+ const opus_int order /* I Prediction order */
+) = { /* one entry per arch level; the silk_LPC_inverse_pred_gain() macro in LPC_inv_pred_gain_arm.h indexes this with (arch)&OPUS_ARCHMASK */
+ silk_LPC_inverse_pred_gain_c, /* ARMv4 */
+ silk_LPC_inverse_pred_gain_c, /* EDSP */
+ silk_LPC_inverse_pred_gain_c, /* Media */
+ MAY_HAVE_NEON(silk_LPC_inverse_pred_gain), /* Neon */
+};
void (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(
const silk_encoder_state *psEncC, /* I Encoder State */
--- a/silk/decode_parameters.c
+++ b/silk/decode_parameters.c
@@ -52,7 +52,7 @@
silk_NLSF_decode( pNLSF_Q15, psDec->indices.NLSFIndices, psDec->psNLSF_CB );
/* Convert NLSF parameters to AR prediction filter coefficients */
- silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 1 ], pNLSF_Q15, psDec->LPC_order );
+ silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 1 ], pNLSF_Q15, psDec->LPC_order, psDec->arch );
/* If just reset, e.g., because internal Fs changed, do not allow interpolation */
/* improves the case of packet loss in the first frame after a switch */
@@ -69,7 +69,7 @@
}
/* Convert NLSF parameters to AR prediction filter coefficients */
- silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 0 ], pNLSF0_Q15, psDec->LPC_order );
+ silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 0 ], pNLSF0_Q15, psDec->LPC_order, psDec->arch );
} else {
/* Copy LPC coefficients for first half from second half */
silk_memcpy( psDecCtrl->PredCoef_Q12[ 0 ], psDecCtrl->PredCoef_Q12[ 1 ], psDec->LPC_order * sizeof( opus_int16 ) );
--- a/silk/fixed/find_LPC_FIX.c
+++ b/silk/fixed/find_LPC_FIX.c
@@ -92,7 +92,7 @@
silk_interpolate( NLSF0_Q15, psEncC->prev_NLSFq_Q15, NLSF_Q15, k, psEncC->predictLPCOrder );
/* Convert to LPC for residual energy evaluation */
- silk_NLSF2A( a_tmp_Q12, NLSF0_Q15, psEncC->predictLPCOrder );
+ silk_NLSF2A( a_tmp_Q12, NLSF0_Q15, psEncC->predictLPCOrder, psEncC->arch );
/* Calculate residual energy with NLSF interpolation */
silk_LPC_analysis_filter( LPC_res, x, a_tmp_Q12, 2 * subfr_length, psEncC->predictLPCOrder, psEncC->arch );
--- a/silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h
+++ b/silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h
@@ -224,8 +224,8 @@
silk_bwexpander_32( AR1_Q24, psEnc->sCmn.shapingLPCOrder, BWExp1_Q16 );
/* Ratio of prediction gains, in energy domain */
- pre_nrg_Q30 = silk_LPC_inverse_pred_gain_Q24( AR2_Q24, psEnc->sCmn.shapingLPCOrder );
- nrg = silk_LPC_inverse_pred_gain_Q24( AR1_Q24, psEnc->sCmn.shapingLPCOrder );
+ pre_nrg_Q30 = silk_LPC_inverse_pred_gain_Q24( AR2_Q24, psEnc->sCmn.shapingLPCOrder, arch );
+ nrg = silk_LPC_inverse_pred_gain_Q24( AR1_Q24, psEnc->sCmn.shapingLPCOrder, arch );
/*psEncCtrl->GainsPre[ k ] = 1.0f - 0.7f * ( 1.0f - pre_nrg / nrg ) = 0.3f + 0.7f * pre_nrg / nrg;*/
pre_nrg_Q30 = silk_LSHIFT32( silk_SMULWB( pre_nrg_Q30, SILK_FIX_CONST( 0.7, 15 ) ), 1 );
--- a/silk/float/find_LPC_FLP.c
+++ b/silk/float/find_LPC_FLP.c
@@ -73,7 +73,7 @@
silk_interpolate( NLSF0_Q15, psEncC->prev_NLSFq_Q15, NLSF_Q15, k, psEncC->predictLPCOrder );
/* Convert to LPC for residual energy evaluation */
- silk_NLSF2A_FLP( a_tmp, NLSF0_Q15, psEncC->predictLPCOrder );
+ silk_NLSF2A_FLP( a_tmp, NLSF0_Q15, psEncC->predictLPCOrder, psEncC->arch );
/* Calculate residual energy with LSF interpolation */
silk_LPC_analysis_filter_FLP( LPC_res, a_tmp, x, 2 * subfr_length, psEncC->predictLPCOrder );
--- a/silk/float/main_FLP.h
+++ b/silk/float/main_FLP.h
@@ -256,7 +256,8 @@
void silk_NLSF2A_FLP(
silk_float *pAR, /* O LPC coefficients [ LPC_order ] */
const opus_int16 *NLSF_Q15, /* I NLSF vector [ LPC_order ] */
- const opus_int LPC_order /* I LPC order */
+ const opus_int LPC_order, /* I LPC order */
+ int arch /* I Run-time architecture */
);
/* Limit, stabilize, and quantize NLSFs */
--- a/silk/float/wrappers_FLP.c
+++ b/silk/float/wrappers_FLP.c
@@ -54,13 +54,14 @@
void silk_NLSF2A_FLP(
silk_float *pAR, /* O LPC coefficients [ LPC_order ] */
const opus_int16 *NLSF_Q15, /* I NLSF vector [ LPC_order ] */
- const opus_int LPC_order /* I LPC order */
+ const opus_int LPC_order, /* I LPC order */
+ int arch /* I Run-time architecture */
)
{
opus_int i;
opus_int16 a_fix_Q12[ MAX_LPC_ORDER ];
- silk_NLSF2A( a_fix_Q12, NLSF_Q15, LPC_order );
+ silk_NLSF2A( a_fix_Q12, NLSF_Q15, LPC_order, arch );
for( i = 0; i < LPC_order; i++ ) {
pAR[ i ] = ( silk_float )a_fix_Q12[ i ] * ( 1.0f / 4096.0f );
--- a/silk/init_decoder.c
+++ b/silk/init_decoder.c
@@ -44,6 +44,7 @@
/* Used to deactivate LSF interpolation */
psDec->first_frame_after_reset = 1;
psDec->prev_gain_Q16 = 65536;
+ psDec->arch = opus_select_arch();
/* Reset CNG state */
silk_CNG_Reset( psDec );
--- a/silk/process_NLSFs.c
+++ b/silk/process_NLSFs.c
@@ -89,7 +89,7 @@
NLSF_mu_Q20, psEncC->NLSF_MSVQ_Survivors, psEncC->indices.signalType );
/* Convert quantized NLSFs back to LPC coefficients */
- silk_NLSF2A( PredCoef_Q12[ 1 ], pNLSF_Q15, psEncC->predictLPCOrder );
+ silk_NLSF2A( PredCoef_Q12[ 1 ], pNLSF_Q15, psEncC->predictLPCOrder, psEncC->arch );
if( doInterpolate ) {
/* Calculate the interpolated, quantized LSF vector for the first half */
@@ -97,7 +97,7 @@
psEncC->indices.NLSFInterpCoef_Q2, psEncC->predictLPCOrder );
/* Convert back to LPC coefficients */
- silk_NLSF2A( PredCoef_Q12[ 0 ], pNLSF0_temp_Q15, psEncC->predictLPCOrder );
+ silk_NLSF2A( PredCoef_Q12[ 0 ], pNLSF0_temp_Q15, psEncC->predictLPCOrder, psEncC->arch );
} else {
/* Copy LPC coefficients for first half from second half */
--- a/silk/structs.h
+++ b/silk/structs.h
@@ -301,6 +301,7 @@
/* Stuff used for PLC */
opus_int lossCnt;
opus_int prevSignalType;
+ int arch;
silk_PLC_struct sPLC;
--- a/silk/tests/test_unit_LPC_inv_pred_gain.c
+++ b/silk/tests/test_unit_LPC_inv_pred_gain.c
@@ -78,6 +78,7 @@
}
int main(void) {
+ const int arch = opus_select_arch();
/* Set to 10000 so all branches in C function are triggered */
const int loop_num = 10000;
int count = 0;
@@ -100,7 +101,7 @@
for( i = 0; i < SILK_MAX_ORDER_LPC; i++ ) {
A_Q12[i] = ((opus_int16)rand()) >> shift;
}
- gain = silk_LPC_inverse_pred_gain(A_Q12, order);
+ gain = silk_LPC_inverse_pred_gain(A_Q12, order, arch);
/* Look for filters that silk_LPC_inverse_pred_gain() thinks are
stable but definitely aren't. */
if( gain != 0 && !check_stability(A_Q12, order) ) {
--- a/silk_headers.mk
+++ b/silk_headers.mk
@@ -22,6 +22,7 @@
silk/resampler_structs.h \
silk/SigProc_FIX.h \
silk/x86/SigProc_FIX_sse.h \
+silk/arm/LPC_inv_pred_gain_arm.h \
silk/arm/macros_armv4.h \
silk/arm/macros_armv5e.h \
silk/arm/macros_arm64.h \
--- a/silk_sources.mk
+++ b/silk_sources.mk
@@ -85,6 +85,7 @@
SILK_SOURCES_ARM_NEON_INTR = \
silk/arm/arm_silk_map.c \
+silk/arm/LPC_inv_pred_gain_neon_intr.c \
silk/arm/NSQ_del_dec_neon_intr.c \
silk/arm/NSQ_neon.c