shithub: opus

--- a/silk/CNG.c

+++ b/silk/CNG.c

@@ -142,7 +142,7 @@

         silk_CNG_exc( CNG_sig_Q14 + MAX_LPC_ORDER, psCNG->CNG_exc_buf_Q14, length, &psCNG->rand_seed );

         /* Convert CNG NLSF to filter representation */

-        silk_NLSF2A( A_Q12, psCNG->CNG_smth_NLSF_Q15, psDec->LPC_order );

+        silk_NLSF2A( A_Q12, psCNG->CNG_smth_NLSF_Q15, psDec->LPC_order, psDec->arch );

         /* Generate CNG signal, by synthesis filtering */

         silk_memcpy( CNG_sig_Q14, psCNG->CNG_synth_state, MAX_LPC_ORDER * sizeof( opus_int32 ) );

--- a/silk/LPC_inv_pred_gain.c

+++ b/silk/LPC_inv_pred_gain.c

@@ -39,7 +39,7 @@

 /* Compute inverse of LPC prediction gain, and                          */

 /* test if LPC coefficients are stable (all poles within unit circle)   */

-static opus_int32 LPC_inverse_pred_gain_QA(                 /* O   Returns inverse prediction gain in energy domain, Q30    */

+static opus_int32 LPC_inverse_pred_gain_QA_c(               /* O   Returns inverse prediction gain in energy domain, Q30    */

     opus_int32           A_QA[ SILK_MAX_ORDER_LPC ],        /* I   Prediction coefficients                                  */

     const opus_int       order                              /* I   Prediction order                                         */

@@ -119,7 +119,7 @@

 /* For input in Q12 domain */

-opus_int32 silk_LPC_inverse_pred_gain(              /* O   Returns inverse prediction gain in energy domain, Q30        */

+opus_int32 silk_LPC_inverse_pred_gain_c(            /* O   Returns inverse prediction gain in energy domain, Q30        */

     const opus_int16            *A_Q12,             /* I   Prediction coefficients, Q12 [order]                         */

     const opus_int              order               /* I   Prediction order                                             */

@@ -137,5 +137,5 @@

     if( DC_resp >= 4096 ) {

         return 0;

-    return LPC_inverse_pred_gain_QA( Atmp_QA, order );

+    return LPC_inverse_pred_gain_QA_c( Atmp_QA, order );

--- a/silk/NLSF2A.c

+++ b/silk/NLSF2A.c

@@ -66,7 +66,8 @@

 void silk_NLSF2A(

     opus_int16                  *a_Q12,             /* O    monic whitening filter coefficients in Q12,  [ d ]          */

     const opus_int16            *NLSF,              /* I    normalized line spectral frequencies in Q15, [ d ]          */

-    const opus_int              d                   /* I    filter order (should be even)                               */

+    const opus_int              d,                  /* I    filter order (should be even)                               */

+    int                         arch                /* I    Run-time architecture                                       */

     /* This ordering was found to maximize quality. It improves numerical accuracy of

@@ -128,7 +129,7 @@

     /* Convert int32 coefficients to Q12 int16 coefs */

     silk_LPC_fit( a_Q12, a32_QA1, 12, QA + 1, d );

-    for( i = 0; silk_LPC_inverse_pred_gain( a_Q12, d ) == 0 && i < MAX_LPC_STABILIZE_ITERATIONS; i++ ) {

+    for( i = 0; silk_LPC_inverse_pred_gain( a_Q12, d, arch ) == 0 && i < MAX_LPC_STABILIZE_ITERATIONS; i++ ) {

         /* Prediction coefficients are (too close to) unstable; apply bandwidth expansion   */

         /* on the unscaled coefficients, convert to Q12 and measure again                   */

         silk_bwexpander_32( a32_QA1, d, 65536 - silk_LSHIFT( 2, i ) );

--- a/silk/PLC.c

+++ b/silk/PLC.c

@@ -275,7 +275,7 @@

             /* Reduce random noise for unvoiced frames with high LPC gain */

             opus_int32 invGain_Q30, down_scale_Q30;

-            invGain_Q30 = silk_LPC_inverse_pred_gain( psPLC->prevLPC_Q12, psDec->LPC_order );

+            invGain_Q30 = silk_LPC_inverse_pred_gain( psPLC->prevLPC_Q12, psDec->LPC_order, arch );

             down_scale_Q30 = silk_min_32( silk_RSHIFT( (opus_int32)1 << 30, LOG2_INV_LPC_GAIN_HIGH_THRES ), invGain_Q30 );

             down_scale_Q30 = silk_max_32( silk_RSHIFT( (opus_int32)1 << 30, LOG2_INV_LPC_GAIN_LOW_THRES ), down_scale_Q30 );

--- a/silk/SigProc_FIX.h

+++ b/silk/SigProc_FIX.h

@@ -47,6 +47,10 @@

 #include "x86/SigProc_FIX_sse.h"

 #endif

+#if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))

+#include "arm/LPC_inv_pred_gain_arm.h"

+#endif

 /********************************************************************/

 /*                    SIGNAL PROCESSING FUNCTIONS                   */

 /********************************************************************/

@@ -132,7 +136,7 @@

 /* Compute inverse of LPC prediction gain, and                           */

 /* test if LPC coefficients are stable (all poles within unit circle)    */

-opus_int32 silk_LPC_inverse_pred_gain(              /* O   Returns inverse prediction gain in energy domain, Q30        */

+opus_int32 silk_LPC_inverse_pred_gain_c(            /* O   Returns inverse prediction gain in energy domain, Q30        */

     const opus_int16            *A_Q12,             /* I   Prediction coefficients, Q12 [order]                         */

     const opus_int              order               /* I   Prediction order                                             */

);

@@ -146,6 +150,10 @@

     const opus_int32            N                   /* I    Number of input samples                                     */

);

+#if !defined(OVERRIDE_silk_LPC_inverse_pred_gain) /* no platform header supplied an override: use the C version */

+#define silk_LPC_inverse_pred_gain(A_Q12, order, arch)     ((void)(arch), silk_LPC_inverse_pred_gain_c(A_Q12, order))

+#endif /* !OVERRIDE_silk_LPC_inverse_pred_gain; arch is evaluated and discarded in the fallback */

 /********************************************************************/

 /*                        SCALAR FUNCTIONS                          */

 /********************************************************************/

@@ -265,7 +273,8 @@

 void silk_NLSF2A(

     opus_int16                  *a_Q12,             /* O    monic whitening filter coefficients in Q12,  [ d ]          */

     const opus_int16            *NLSF,              /* I    normalized line spectral frequencies in Q15, [ d ]          */

-    const opus_int              d                   /* I    filter order (should be even)                               */

+    const opus_int              d,                  /* I    filter order (should be even)                               */

+    int                         arch                /* I    Run-time architecture                                       */

);

 /* Convert int32 coefficients to int16 coefs and make sure there's no wrap-around */

--- /dev/null

+++ b/silk/arm/LPC_inv_pred_gain_arm.h

@@ -1,0 +1,57 @@

+/***********************************************************************

+Copyright (c) 2017 Google Inc.

+Redistribution and use in source and binary forms, with or without

+modification, are permitted provided that the following conditions

+are met:

+- Redistributions of source code must retain the above copyright notice,

+this list of conditions and the following disclaimer.

+- Redistributions in binary form must reproduce the above copyright

+notice, this list of conditions and the following disclaimer in the

+documentation and/or other materials provided with the distribution.

+- Neither the name of Internet Society, IETF or IETF Trust, nor the

+names of specific contributors, may be used to endorse or promote

+products derived from this software without specific prior written

+permission.

+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

+POSSIBILITY OF SUCH DAMAGE.

+***********************************************************************/

+#ifndef SILK_LPC_INV_PRED_GAIN_ARM_H

+# define SILK_LPC_INV_PRED_GAIN_ARM_H

+# include "celt/arm/armcpu.h"

+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) /* NEON intrinsics version may be built: declare it */

+opus_int32 silk_LPC_inverse_pred_gain_neon(         /* O   Returns inverse prediction gain in energy domain, Q30        */

+    const opus_int16            *A_Q12,             /* I   Prediction coefficients, Q12 [order]                         */

+    const opus_int              order               /* I   Prediction order                                             */

+);

+#  if !defined(OPUS_HAVE_RTCD) && defined(OPUS_ARM_PRESUME_NEON) /* no run-time detection and NEON presumed: bind at compile time */

+#   define OVERRIDE_silk_LPC_inverse_pred_gain            (1)

+#   define silk_LPC_inverse_pred_gain(A_Q12, order, arch) ((void)(arch), PRESUME_NEON(silk_LPC_inverse_pred_gain)(A_Q12, order))

+#  endif /* !OPUS_HAVE_RTCD && OPUS_ARM_PRESUME_NEON */

+# endif /* OPUS_ARM_MAY_HAVE_NEON_INTR */

+# if !defined(OVERRIDE_silk_LPC_inverse_pred_gain) /* only when the branch above did not already bind the macro */

+/* Is run-time CPU detection enabled on this platform? If so, dispatch through a table indexed by arch. */

+#  if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))

+extern opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK+1])(const opus_int16 *A_Q12, const opus_int order);

+#   define OVERRIDE_silk_LPC_inverse_pred_gain            (1)

+#   define silk_LPC_inverse_pred_gain(A_Q12, order, arch) ((*SILK_LPC_INVERSE_PRED_GAIN_IMPL[(arch)&OPUS_ARCHMASK])(A_Q12, order))

+#  elif defined(OPUS_ARM_PRESUME_NEON_INTR) /* NEON intrinsics presumed: call the NEON version directly, arch is ignored */

+#   define OVERRIDE_silk_LPC_inverse_pred_gain            (1)

+#   define silk_LPC_inverse_pred_gain(A_Q12, order, arch) ((void)(arch), silk_LPC_inverse_pred_gain_neon(A_Q12, order))

+#  endif /* RTCD table dispatch / presumed NEON intrinsics */

+# endif /* !OVERRIDE_silk_LPC_inverse_pred_gain */

+#endif /* end SILK_LPC_INV_PRED_GAIN_ARM_H */

--- /dev/null

+++ b/silk/arm/LPC_inv_pred_gain_neon_intr.c

@@ -1,0 +1,280 @@

+/***********************************************************************

+Copyright (c) 2017 Google Inc.

+Redistribution and use in source and binary forms, with or without

+modification, are permitted provided that the following conditions

+are met:

+- Redistributions of source code must retain the above copyright notice,

+this list of conditions and the following disclaimer.

+- Redistributions in binary form must reproduce the above copyright

+notice, this list of conditions and the following disclaimer in the

+documentation and/or other materials provided with the distribution.

+- Neither the name of Internet Society, IETF or IETF Trust, nor the

+names of specific contributors, may be used to endorse or promote

+products derived from this software without specific prior written

+permission.

+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"

+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE

+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR

+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF

+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN

+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)

+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

+POSSIBILITY OF SUCH DAMAGE.

+***********************************************************************/

+#ifdef HAVE_CONFIG_H

+#include "config.h"

+#endif

+#include <arm_neon.h>

+#include "SigProc_FIX.h"

+#include "define.h"

+#define QA                          24

+#define A_LIMIT                     SILK_FIX_CONST( 0.99975, QA )

+#define MUL32_FRAC_Q(a32, b32, Q)   ((opus_int32)(silk_RSHIFT_ROUND64(silk_SMULL(a32, b32), Q)))

+/* The difficulty is how to judge whether a 64-bit signed integer tmp64 overflows 32 bits,

+ * since NEON has no 64-bit min, max or comparison instructions.

+ * A failed idea is to check whether the results of vmovn(tmp64) and vqmovn(tmp64) are equal.

+ * However, this idea fails when the tmp64 is something like 0xFFFFFFF980000000.

+ * Here we know that mult2Q >= 1, so the highest bit (bit 63, sign bit) of tmp64 must equal bit 62.

+ * tmp64 was shifted left by 1 and we got tmp64'. If high_half(tmp64') != 0 and high_half(tmp64') != -1,

+ * then we know that bit 31 to bit 63 of tmp64 can not all be the sign bit, and therefore tmp64 is 32-bit overflowed.

+ * That is, we judge if tmp64' > 0x00000000FFFFFFFF, or tmp64' < 0xFFFFFFFF00000000.

+ * We use narrowing shift right 31 bits to tmp32' to save data bandwidth and instructions.

+ * That is, we judge if tmp32' > 0x00000000, or tmp32' < 0xFFFFFFFF (i.e. tmp32' > 0 or tmp32' < -1).

+ */

+/* Compute inverse of LPC prediction gain, and                          */

+/* test if LPC coefficients are stable (all poles within unit circle)   */

+static OPUS_INLINE opus_int32 LPC_inverse_pred_gain_QA_neon( /* O   Returns inverse prediction gain in energy domain, Q30; 0 if judged unstable */

+    opus_int32           A_QA[ SILK_MAX_ORDER_LPC ],         /* I/O Prediction coefficients in QA format; overwritten in place */

+    const opus_int       order                               /* I   Prediction order                                         */

+)

+{

+    opus_int   k, n, mult2Q;

+    opus_int32 invGain_Q30, rc_Q31, rc_mult1_Q30, rc_mult2, tmp1, tmp2;

+    opus_int32 max, min;

+    int32x4_t  max_s32x4, min_s32x4;

+    int32x2_t  max_s32x2, min_s32x2;

+    max_s32x4 = vdupq_n_s32( silk_int32_MIN ); /* running maximum of the vector-path overflow indicators */

+    min_s32x4 = vdupq_n_s32( silk_int32_MAX ); /* running minimum of the vector-path overflow indicators */

+    invGain_Q30 = SILK_FIX_CONST( 1, 30 );

+    for( k = order - 1; k > 0; k-- ) { /* step down from the highest-order coefficient to index 1 */

+        int32x2_t rc_Q31_s32x2, rc_mult2_s32x2;

+        int64x2_t mult2Q_s64x2;

+        /* Check for stability: reject as soon as the current top coefficient exceeds +/-A_LIMIT */

+        if( ( A_QA[ k ] > A_LIMIT ) || ( A_QA[ k ] < -A_LIMIT ) ) {

+            return 0;

+        }

+        /* Set RC equal to negated AR coef */

+        rc_Q31 = -silk_LSHIFT( A_QA[ k ], 31 - QA );

+        /* rc_mult1_Q30 range: [ 1 : 2^30 ] */

+        rc_mult1_Q30 = silk_SUB32( SILK_FIX_CONST( 1, 30 ), silk_SMMUL( rc_Q31, rc_Q31 ) );

+        silk_assert( rc_mult1_Q30 > ( 1 << 15 ) );                   /* reduce A_LIMIT if fails */

+        silk_assert( rc_mult1_Q30 <= ( 1 << 30 ) );

+        /* Update inverse gain */

+        /* invGain_Q30 range: [ 0 : 2^30 ] */

+        invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 );

+        silk_assert( invGain_Q30 >= 0           );

+        silk_assert( invGain_Q30 <= ( 1 << 30 ) );

+        if( invGain_Q30 < SILK_FIX_CONST( 1.0f / MAX_PREDICTION_POWER_GAIN, 30 ) ) {

+            return 0;

+        }

+        /* rc_mult2 range: [ 2^30 : silk_int32_MAX ] */

+        mult2Q = 32 - silk_CLZ32( silk_abs( rc_mult1_Q30 ) );

+        rc_mult2 = silk_INVERSE32_varQ( rc_mult1_Q30, mult2Q + 30 );

+        /* Update AR coefficient: broadcast the per-iteration scalars for the vector loop below */

+        rc_Q31_s32x2   = vdup_n_s32( rc_Q31 );

+        mult2Q_s64x2   = vdupq_n_s64( -mult2Q ); /* negative count: vrshlq_s64 then acts as a rounding right shift by mult2Q */

+        rc_mult2_s32x2 = vdup_n_s32( rc_mult2 );

+        for( n = 0; n < ( ( k + 1 ) >> 1 ) - 3; n += 4 ) { /* four coefficients from each end per pass */

+            /* We always calculate extra elements of A_QA buffer when ( k % 4 ) != 0, to take the advantage of SIMD parallelization. */

+            int32x4_t tmp1_s32x4, tmp2_s32x4, t0_s32x4, t1_s32x4, s0_s32x4, s1_s32x4, t_QA0_s32x4, t_QA1_s32x4;

+            int64x2_t t0_s64x2, t1_s64x2, t2_s64x2, t3_s64x2;

+            tmp1_s32x4  = vld1q_s32( A_QA + n );         /* A_QA[ n .. n + 3 ] */

+            tmp2_s32x4  = vld1q_s32( A_QA + k - n - 4 ); /* A_QA[ k - n - 4 .. k - n - 1 ] */

+            tmp2_s32x4  = vrev64q_s32( tmp2_s32x4 );     /* with the next line: reverse the four lanes so lane i pairs with A_QA[ k - n - 1 - i ] */

+            tmp2_s32x4  = vcombine_s32( vget_high_s32( tmp2_s32x4 ), vget_low_s32( tmp2_s32x4 ) );

+            t0_s32x4    = vqrdmulhq_lane_s32( tmp2_s32x4, rc_Q31_s32x2, 0 );

+            t1_s32x4    = vqrdmulhq_lane_s32( tmp1_s32x4, rc_Q31_s32x2, 0 );

+            t_QA0_s32x4 = vqsubq_s32( tmp1_s32x4, t0_s32x4 ); /* saturating subtract, as silk_SUB_SAT32 in the scalar loop */

+            t_QA1_s32x4 = vqsubq_s32( tmp2_s32x4, t1_s32x4 );

+            t0_s64x2    = vmull_s32( vget_low_s32 ( t_QA0_s32x4 ), rc_mult2_s32x2 ); /* widen to 64 bits before scaling by rc_mult2 */

+            t1_s64x2    = vmull_s32( vget_high_s32( t_QA0_s32x4 ), rc_mult2_s32x2 );

+            t2_s64x2    = vmull_s32( vget_low_s32 ( t_QA1_s32x4 ), rc_mult2_s32x2 );

+            t3_s64x2    = vmull_s32( vget_high_s32( t_QA1_s32x4 ), rc_mult2_s32x2 );

+            t0_s64x2    = vrshlq_s64( t0_s64x2, mult2Q_s64x2 );

+            t1_s64x2    = vrshlq_s64( t1_s64x2, mult2Q_s64x2 );

+            t2_s64x2    = vrshlq_s64( t2_s64x2, mult2Q_s64x2 );

+            t3_s64x2    = vrshlq_s64( t3_s64x2, mult2Q_s64x2 );

+            t0_s32x4    = vcombine_s32( vmovn_s64( t0_s64x2 ), vmovn_s64( t1_s64x2 ) ); /* low 32 bits: the new coefficients */

+            t1_s32x4    = vcombine_s32( vmovn_s64( t2_s64x2 ), vmovn_s64( t3_s64x2 ) );

+            s0_s32x4    = vcombine_s32( vshrn_n_s64( t0_s64x2, 31 ), vshrn_n_s64( t1_s64x2, 31 ) ); /* bits 31..62: overflow indicators */

+            s1_s32x4    = vcombine_s32( vshrn_n_s64( t2_s64x2, 31 ), vshrn_n_s64( t3_s64x2, 31 ) );

+            max_s32x4   = vmaxq_s32( max_s32x4, s0_s32x4 ); /* overflow is only accumulated here; it is tested once after the outer loop */

+            min_s32x4   = vminq_s32( min_s32x4, s0_s32x4 );

+            max_s32x4   = vmaxq_s32( max_s32x4, s1_s32x4 );

+            min_s32x4   = vminq_s32( min_s32x4, s1_s32x4 );

+            t1_s32x4    = vrev64q_s32( t1_s32x4 ); /* with the next line: undo the lane reversal before storing */

+            t1_s32x4    = vcombine_s32( vget_high_s32( t1_s32x4 ), vget_low_s32( t1_s32x4 ) );

+            vst1q_s32( A_QA + n,         t0_s32x4 );

+            vst1q_s32( A_QA + k - n - 4, t1_s32x4 );

+        }

+        for( ; n < (k + 1) >> 1; n++ ) { /* scalar tail for the pairs the vector loop did not cover */

+            opus_int64 tmp64;

+            tmp1 = A_QA[ n ];

+            tmp2 = A_QA[ k - n - 1 ];

+            tmp64 = silk_RSHIFT_ROUND64( silk_SMULL( silk_SUB_SAT32(tmp1,

+                  MUL32_FRAC_Q( tmp2, rc_Q31, 31 ) ), rc_mult2 ), mult2Q);

+            if( tmp64 > silk_int32_MAX || tmp64 < silk_int32_MIN ) { /* scalar path detects 32-bit overflow immediately */

+               return 0;

+            }

+            A_QA[ n ] = ( opus_int32 )tmp64;

+            tmp64 = silk_RSHIFT_ROUND64( silk_SMULL( silk_SUB_SAT32(tmp2,

+                  MUL32_FRAC_Q( tmp1, rc_Q31, 31 ) ), rc_mult2), mult2Q);

+            if( tmp64 > silk_int32_MAX || tmp64 < silk_int32_MIN ) {

+               return 0;

+            }

+            A_QA[ k - n - 1 ] = ( opus_int32 )tmp64;

+        }

+    }

+    /* Check for stability of the last remaining coefficient (k is 0 here when order >= 1) */

+    if( ( A_QA[ k ] > A_LIMIT ) || ( A_QA[ k ] < -A_LIMIT ) ) {

+        return 0;

+    }

+    max_s32x2 = vmax_s32( vget_low_s32( max_s32x4 ), vget_high_s32( max_s32x4 ) ); /* reduce four lanes to two */

+    min_s32x2 = vmin_s32( vget_low_s32( min_s32x4 ), vget_high_s32( min_s32x4 ) );

+    max_s32x2 = vmax_s32( max_s32x2, vreinterpret_s32_s64( vshr_n_s64( vreinterpret_s64_s32( max_s32x2 ), 32 ) ) ); /* fold the upper lane into lane 0 */

+    min_s32x2 = vmin_s32( min_s32x2, vreinterpret_s32_s64( vshr_n_s64( vreinterpret_s64_s32( min_s32x2 ), 32 ) ) );

+    max = vget_lane_s32( max_s32x2, 0 );

+    min = vget_lane_s32( min_s32x2, 0 );

+    if( ( max > 0 ) || ( min < -1 ) ) { /* an indicator other than 0 or -1 means some vector-path result overflowed 32 bits */

+        return 0;

+    }

+    /* Set RC equal to negated AR coef */

+    rc_Q31 = -silk_LSHIFT( A_QA[ 0 ], 31 - QA );

+    /* Range: [ 1 : 2^30 ] */

+    rc_mult1_Q30 = silk_SUB32( SILK_FIX_CONST( 1, 30 ), silk_SMMUL( rc_Q31, rc_Q31 ) );

+    /* Update inverse gain */

+    /* Range: [ 0 : 2^30 ] */

+    invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 );

+    silk_assert( invGain_Q30 >= 0           );

+    silk_assert( invGain_Q30 <= ( 1 << 30 ) );

+    if( invGain_Q30 < SILK_FIX_CONST( 1.0f / MAX_PREDICTION_POWER_GAIN, 30 ) ) {

+        return 0;

+    }

+    return invGain_Q30;

+}

+/* For input in Q12 domain.                                                              */

+/* Widens the Q12 coefficients to QA, rejects the filter early when its DC response is   */

+/* unstable, and otherwise runs the NEON step-down recursion. Odd orders, and builds     */

+/* where SILK_MAX_ORDER_LPC != 24, are delegated to the C implementation.                */

+/* Only A_Q12[ 0 .. order - 1 ] is read, so callers may pass a buffer of exactly         */

+/* [order] elements as documented on the parameter.                                      */

+opus_int32 silk_LPC_inverse_pred_gain_neon(         /* O   Returns inverse prediction gain in energy domain, Q30        */

+    const opus_int16            *A_Q12,             /* I   Prediction coefficients, Q12 [order]                         */

+    const opus_int              order               /* I   Prediction order                                             */

+)

+{

+#ifdef OPUS_CHECK_ASM

+    const opus_int32 invGain_Q30_c = silk_LPC_inverse_pred_gain_c( A_Q12, order );

+#endif

+    opus_int32 invGain_Q30;

+    if( ( SILK_MAX_ORDER_LPC != 24 ) || ( order & 1 )) {

+        invGain_Q30 = silk_LPC_inverse_pred_gain_c( A_Q12, order );

+    }

+    else {

+        opus_int32 Atmp_QA[ SILK_MAX_ORDER_LPC ];

+        opus_int32 DC_resp;

+        opus_int   k;

+        int32x4_t  sum_s32x4 = vdupq_n_s32( 0 );

+        const opus_int leftover = order & 7;

+        const opus_int vec_len  = order - leftover;   /* largest multiple of 8 not exceeding order */

+        /* Increase Q domain of the AR coefficients and accumulate their sum, eight at a time. */

+        /* Loads never go past A_Q12[ vec_len - 1 ], so nothing beyond [order] is touched.     */

+        for( k = 0; k < vec_len; k += 8 ) {

+            const int16x8_t a_s16x8 = vld1q_s16( A_Q12 + k );

+            sum_s32x4 = vpadalq_s16( sum_s32x4, a_s16x8 );

+            vst1q_s32( Atmp_QA + k,     vshll_n_s16( vget_low_s16 ( a_s16x8 ), QA - 12 ) );

+            vst1q_s32( Atmp_QA + k + 4, vshll_n_s16( vget_high_s16( a_s16x8 ), QA - 12 ) );

+        }

+        /* Horizontal add of the four partial sums (all zero when vec_len == 0) */

+        {

+            const int32x2_t t_s32x2 = vpadd_s32( vget_low_s32( sum_s32x4 ), vget_high_s32( sum_s32x4 ) );

+            const int64x1_t t_s64x1 = vpaddl_s32( t_s32x2 );

+            DC_resp = vget_lane_s32( vreinterpret_s32_s64( t_s64x1 ), 0 );

+        }

+        /* Remaining 0, 2, 4 or 6 coefficients are handled with scalar code */

+        for( k = vec_len; k < order; k++ ) {

+            DC_resp += (opus_int32)A_Q12[ k ];

+            Atmp_QA[ k ] = silk_LSHIFT( (opus_int32)A_Q12[ k ], QA - 12 );

+        }

+        /* If the DC is unstable, we don't even need to do the full calculations */

+        if( DC_resp >= 4096 ) {

+            invGain_Q30 = 0;

+        } else {

+            invGain_Q30 = LPC_inverse_pred_gain_QA_neon( Atmp_QA, order );

+        }

+    }

+#ifdef OPUS_CHECK_ASM

+    silk_assert( invGain_Q30_c == invGain_Q30 );

+#endif

+    return invGain_Q30;

+}

--- a/silk/arm/arm_silk_map.c

+++ b/silk/arm/arm_silk_map.c

@@ -30,11 +30,22 @@

 #include "main_FIX.h"

 #include "NSQ.h"

+#include "SigProc_FIX.h"

 #if defined(OPUS_HAVE_RTCD)

 # if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && \

  !defined(OPUS_ARM_PRESUME_NEON_INTR))

+opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK + 1])( /* O   Returns inverse prediction gain in energy domain, Q30        */

+        const opus_int16            *A_Q12,                             /* I   Prediction coefficients, Q12 [order]                         */

+        const opus_int              order                               /* I   Prediction order                                             */

+) = { /* run-time dispatch table: one implementation per architecture level */

+      silk_LPC_inverse_pred_gain_c,              /* ARMv4: C implementation */

+      silk_LPC_inverse_pred_gain_c,              /* EDSP:  C implementation */

+      silk_LPC_inverse_pred_gain_c,              /* Media: C implementation */

+      MAY_HAVE_NEON(silk_LPC_inverse_pred_gain), /* Neon:  implementation chosen by the MAY_HAVE_NEON() macro */

+};

 void  (*const SILK_NSQ_DEL_DEC_IMPL[OPUS_ARCHMASK + 1])(

         const silk_encoder_state    *psEncC,                                    /* I    Encoder State                   */

--- a/silk/decode_parameters.c

+++ b/silk/decode_parameters.c

@@ -52,7 +52,7 @@

     silk_NLSF_decode( pNLSF_Q15, psDec->indices.NLSFIndices, psDec->psNLSF_CB );

     /* Convert NLSF parameters to AR prediction filter coefficients */

-    silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 1 ], pNLSF_Q15, psDec->LPC_order );

+    silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 1 ], pNLSF_Q15, psDec->LPC_order, psDec->arch );

     /* If just reset, e.g., because internal Fs changed, do not allow interpolation */

     /* improves the case of packet loss in the first frame after a switch           */

@@ -69,7 +69,7 @@

         /* Convert NLSF parameters to AR prediction filter coefficients */

-        silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 0 ], pNLSF0_Q15, psDec->LPC_order );

+        silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 0 ], pNLSF0_Q15, psDec->LPC_order, psDec->arch );

     } else {

         /* Copy LPC coefficients for first half from second half */

         silk_memcpy( psDecCtrl->PredCoef_Q12[ 0 ], psDecCtrl->PredCoef_Q12[ 1 ], psDec->LPC_order * sizeof( opus_int16 ) );

--- a/silk/fixed/find_LPC_FIX.c

+++ b/silk/fixed/find_LPC_FIX.c

@@ -92,7 +92,7 @@

             silk_interpolate( NLSF0_Q15, psEncC->prev_NLSFq_Q15, NLSF_Q15, k, psEncC->predictLPCOrder );

             /* Convert to LPC for residual energy evaluation */

-            silk_NLSF2A( a_tmp_Q12, NLSF0_Q15, psEncC->predictLPCOrder );

+            silk_NLSF2A( a_tmp_Q12, NLSF0_Q15, psEncC->predictLPCOrder, psEncC->arch );

             /* Calculate residual energy with NLSF interpolation */

             silk_LPC_analysis_filter( LPC_res, x, a_tmp_Q12, 2 * subfr_length, psEncC->predictLPCOrder, psEncC->arch );

--- a/silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h

+++ b/silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h

@@ -224,8 +224,8 @@

         silk_bwexpander_32( AR1_Q24, psEnc->sCmn.shapingLPCOrder, BWExp1_Q16 );

         /* Ratio of prediction gains, in energy domain */

-        pre_nrg_Q30 = silk_LPC_inverse_pred_gain_Q24( AR2_Q24, psEnc->sCmn.shapingLPCOrder );

-        nrg         = silk_LPC_inverse_pred_gain_Q24( AR1_Q24, psEnc->sCmn.shapingLPCOrder );

+        pre_nrg_Q30 = silk_LPC_inverse_pred_gain_Q24( AR2_Q24, psEnc->sCmn.shapingLPCOrder, arch );

+        nrg         = silk_LPC_inverse_pred_gain_Q24( AR1_Q24, psEnc->sCmn.shapingLPCOrder, arch );

         /*psEncCtrl->GainsPre[ k ] = 1.0f - 0.7f * ( 1.0f - pre_nrg / nrg ) = 0.3f + 0.7f * pre_nrg / nrg;*/

         pre_nrg_Q30 = silk_LSHIFT32( silk_SMULWB( pre_nrg_Q30, SILK_FIX_CONST( 0.7, 15 ) ), 1 );

--- a/silk/float/find_LPC_FLP.c

+++ b/silk/float/find_LPC_FLP.c

@@ -73,7 +73,7 @@

             silk_interpolate( NLSF0_Q15, psEncC->prev_NLSFq_Q15, NLSF_Q15, k, psEncC->predictLPCOrder );

             /* Convert to LPC for residual energy evaluation */

-            silk_NLSF2A_FLP( a_tmp, NLSF0_Q15, psEncC->predictLPCOrder );

+            silk_NLSF2A_FLP( a_tmp, NLSF0_Q15, psEncC->predictLPCOrder, psEncC->arch );

             /* Calculate residual energy with LSF interpolation */

             silk_LPC_analysis_filter_FLP( LPC_res, a_tmp, x, 2 * subfr_length, psEncC->predictLPCOrder );

--- a/silk/float/main_FLP.h

+++ b/silk/float/main_FLP.h

@@ -256,7 +256,8 @@

 void silk_NLSF2A_FLP(

     silk_float                      *pAR,                               /* O    LPC coefficients [ LPC_order ]              */

     const opus_int16                *NLSF_Q15,                          /* I    NLSF vector      [ LPC_order ]              */

-    const opus_int                  LPC_order                           /* I    LPC order                                   */

+    const opus_int                  LPC_order,                          /* I    LPC order                                   */

+    int                             arch                                /* I    Run-time architecture                       */

);

 /* Limit, stabilize, and quantize NLSFs */

--- a/silk/float/wrappers_FLP.c

+++ b/silk/float/wrappers_FLP.c

@@ -54,13 +54,14 @@

 void silk_NLSF2A_FLP(

     silk_float                      *pAR,                               /* O    LPC coefficients [ LPC_order ]              */

     const opus_int16                *NLSF_Q15,                          /* I    NLSF vector      [ LPC_order ]              */

-    const opus_int                  LPC_order                           /* I    LPC order                                   */

+    const opus_int                  LPC_order,                          /* I    LPC order                                   */

+    int                             arch                                /* I    Run-time architecture                       */

     opus_int   i;

     opus_int16 a_fix_Q12[ MAX_LPC_ORDER ];

-    silk_NLSF2A( a_fix_Q12, NLSF_Q15, LPC_order );

+    silk_NLSF2A( a_fix_Q12, NLSF_Q15, LPC_order, arch );

     for( i = 0; i < LPC_order; i++ ) {

         pAR[ i ] = ( silk_float )a_fix_Q12[ i ] * ( 1.0f / 4096.0f );

--- a/silk/init_decoder.c

+++ b/silk/init_decoder.c

@@ -44,6 +44,7 @@

     /* Used to deactivate LSF interpolation */

     psDec->first_frame_after_reset = 1;

     psDec->prev_gain_Q16 = 65536;

+    psDec->arch = opus_select_arch();

     /* Reset CNG state */

     silk_CNG_Reset( psDec );

--- a/silk/process_NLSFs.c

+++ b/silk/process_NLSFs.c

@@ -89,7 +89,7 @@

         NLSF_mu_Q20, psEncC->NLSF_MSVQ_Survivors, psEncC->indices.signalType );

     /* Convert quantized NLSFs back to LPC coefficients */

-    silk_NLSF2A( PredCoef_Q12[ 1 ], pNLSF_Q15, psEncC->predictLPCOrder );

+    silk_NLSF2A( PredCoef_Q12[ 1 ], pNLSF_Q15, psEncC->predictLPCOrder, psEncC->arch );

     if( doInterpolate ) {

         /* Calculate the interpolated, quantized LSF vector for the first half */

@@ -97,7 +97,7 @@

             psEncC->indices.NLSFInterpCoef_Q2, psEncC->predictLPCOrder );

         /* Convert back to LPC coefficients */

-        silk_NLSF2A( PredCoef_Q12[ 0 ], pNLSF0_temp_Q15, psEncC->predictLPCOrder );

+        silk_NLSF2A( PredCoef_Q12[ 0 ], pNLSF0_temp_Q15, psEncC->predictLPCOrder, psEncC->arch );

     } else {

         /* Copy LPC coefficients for first half from second half */

--- a/silk/structs.h

+++ b/silk/structs.h

@@ -301,6 +301,7 @@

     /* Stuff used for PLC */

     opus_int                    lossCnt;

     opus_int                    prevSignalType;

+    int                         arch;

     silk_PLC_struct sPLC;

--- a/silk/tests/test_unit_LPC_inv_pred_gain.c

+++ b/silk/tests/test_unit_LPC_inv_pred_gain.c

@@ -78,6 +78,7 @@

 int main(void) {

+    const int arch = opus_select_arch();

     /* Set to 10000 so all branches in C function are triggered */

     const int loop_num = 10000;

     int count = 0;

@@ -100,7 +101,7 @@

                 for( i = 0; i < SILK_MAX_ORDER_LPC; i++ ) {

                     A_Q12[i] = ((opus_int16)rand()) >> shift;

-                gain = silk_LPC_inverse_pred_gain(A_Q12, order);

+                gain = silk_LPC_inverse_pred_gain(A_Q12, order, arch);

                 /* Look for filters that silk_LPC_inverse_pred_gain() thinks are

                    stable but definitely aren't. */

                 if( gain != 0 && !check_stability(A_Q12, order) ) {

--- a/silk_headers.mk

+++ b/silk_headers.mk

@@ -22,6 +22,7 @@

 silk/resampler_structs.h \

 silk/SigProc_FIX.h \

 silk/x86/SigProc_FIX_sse.h \

+silk/arm/LPC_inv_pred_gain_arm.h \

 silk/arm/macros_armv4.h \

 silk/arm/macros_armv5e.h \

 silk/arm/macros_arm64.h \

--- a/silk_sources.mk

+++ b/silk_sources.mk

@@ -85,6 +85,7 @@

 SILK_SOURCES_ARM_NEON_INTR = \

 silk/arm/arm_silk_map.c \

+silk/arm/LPC_inv_pred_gain_neon_intr.c \

 silk/arm/NSQ_del_dec_neon_intr.c \

 silk/arm/NSQ_neon.c

--

⑨