ref: 833688e65dda31b6cfc319b5dcd0a54f3a2ef616
parent: 57901a6758c3bdc7481d61669812bde13d2085b8
author: Jan Buethe <jbuethe@amazon.de>
date: Wed Feb 21 12:27:54 EST 2024
bit-exact overflow fixes in silk/arm/NSQ_del_dec_neon_intr.c
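
ALLOC() returns uninitialized stack memory, and the NEON path always
computes all NEON_MAX_DEL_DEC_STATES (4) lanes even when fewer
delay-decision states are active, so stack garbage in the dead lanes
could enter the 16- and 32-bit vector arithmetic and wrap. The fixes
clear the allocated structs up front, rearrange the branchless quantizer
math so that discarded lanes add a masked-to-zero offset instead of a
live one, and re-zero the unused tail of RD_Q10 each sample before the
expired-state penalty is applied. Selected lanes compute exactly the
same values as before, hence bit-exact.

A minimal scalar sketch of the wrap class removed below (values
hypothetical; the 32767 cap on Lambda_Q10 is asserted in the code):

    opus_int16 q1_Q10 = 32000;              /* plausible Q10 lane value */
    opus_int16 rdo_offset = 32767/2 - 512;  /* 15871, the asserted cap  */
    opus_int16 r = (opus_int16)( q1_Q10 + rdo_offset ); /* -17665: wrapped */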
--- a/silk/arm/NSQ_del_dec_neon_intr.c
+++ b/silk/arm/NSQ_del_dec_neon_intr.c
@@ -35,6 +35,7 @@
#endif
#include "main.h"
#include "stack_alloc.h"
+#include "os_support.h"
/* NEON intrinsics optimization now can only parallelize up to 4 delay decision states. */
/* If there are more states, C function is called, and this optimization must be expanded. */
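
The new header supplies OPUS_CLEAR(); unless a build overrides it,
os_support.h defines it as a zeroing memset over n elements of the
destination's element type:

    /* os_support.h (modulo override hooks): */
    #define OPUS_CLEAR(dst, n) (memset((dst), 0, (n)*sizeof(*(dst))))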
@@ -279,6 +280,7 @@

/* Initialize delayed decision states */
ALLOC( psDelDec, 1, NSQ_del_decs_struct );
+ OPUS_CLEAR(psDelDec, 1);
/* Only RandState and RD_Q10 need to be initialized to 0. */
silk_memset( psDelDec->RandState, 0, sizeof( psDelDec->RandState ) );
vst1q_s32( psDelDec->RD_Q10, vdupq_n_s32( 0 ) );
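
Before this change only RandState and RD_Q10 were zeroed; every other
NSQ_del_decs_struct field in lanes beyond nStatesDelayedDecision kept
whatever was on the stack, and those lanes still travel through the full
4-wide arithmetic. Clearing the whole struct first is bit-exact because
the winner search only compares indices k < nStatesDelayedDecision; per
the macro above, the added line is equivalent to:

    memset( psDelDec, 0, sizeof( NSQ_del_decs_struct ) );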
@@ -587,6 +589,7 @@
silk_assert( nStatesDelayedDecision > 0 );
silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
ALLOC( psSampleState, 2, NSQ_samples_struct );
+ OPUS_CLEAR(psSampleState, 2);

shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
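
psSampleState holds the per-state sample candidates for the two
quantization levels tried at each sample, so both elements are cleared;
by the same macro this is:

    memset( psSampleState, 0, 2 * sizeof( NSQ_samples_struct ) );

As with psDelDec, the vector code below updates all four lanes of these
structs regardless of nStatesDelayedDecision, so the dead lanes must
start from defined zeros rather than stack garbage.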
@@ -711,12 +714,13 @@
const int rdo_offset = Lambda_Q10/2 - 512;
const uint16x4_t greaterThanRdo = vcgt_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) );
const uint16x4_t lessThanMinusRdo = vclt_s16( q1_Q10_s16x4, vdup_n_s16( -rdo_offset ) );
+ int16x4_t signed_offset = vbsl_s16( greaterThanRdo, vdup_n_s16( -rdo_offset ), vdup_n_s16( 0 ) );
+ signed_offset = vbsl_s16( lessThanMinusRdo, vdup_n_s16( rdo_offset ), signed_offset );
/* If Lambda_Q10 > 32767, then q1_Q0, q1_Q10 and q2_Q10 must change to 32-bit. */
silk_assert( Lambda_Q10 <= 32767 );
q1_Q0_s16x4 = vreinterpret_s16_u16( vclt_s16( q1_Q10_s16x4, vdup_n_s16( 0 ) ) );
- q1_Q0_s16x4 = vbsl_s16( greaterThanRdo, vsub_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
- q1_Q0_s16x4 = vbsl_s16( lessThanMinusRdo, vadd_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
+ q1_Q0_s16x4 = vbsl_s16(vorr_u16(greaterThanRdo, lessThanMinusRdo), vadd_s16( q1_Q10_s16x4 , signed_offset), q1_Q0_s16x4);
q1_Q0_s16x4 = vshr_n_s16( q1_Q0_s16x4, 10 );
}
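
Per lane the rewrite is equivalent to the scalar logic below, and it is
bit-exact with the old two-vbsl version because greaterThanRdo and
lessThanMinusRdo are mutually exclusive; the difference is that
unselected lanes now add zero instead of evaluating both
q1_Q10 - rdo_offset and q1_Q10 + rdo_offset, sums that could leave the
int16 range in lanes whose results were about to be discarded. A
hypothetical single-lane model (not code from the file):

    static inline opus_int16 rdo_lane( opus_int16 q1_Q10, int rdo_offset ) {
        int gt = q1_Q10 >  rdo_offset;    /* greaterThanRdo lane mask   */
        int lt = q1_Q10 < -rdo_offset;    /* lessThanMinusRdo lane mask */
        opus_int16 off   = gt ? (opus_int16)-rdo_offset
                              : ( lt ? (opus_int16)rdo_offset : 0 );
        opus_int16 q1_Q0 = ( gt || lt ) ? (opus_int16)( q1_Q10 + off )
                                        : ( q1_Q10 < 0 ? -1 : 0 ); /* vclt mask */
        return (opus_int16)( q1_Q0 >> 10 );
    }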
@@ -723,11 +727,13 @@
{
const uint16x4_t equal0_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( 0 ) );
const uint16x4_t equalMinus1_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
const uint16x4_t lessThanMinus1_u16x4 = vclt_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
- int16x4_t tmp1_s16x4, tmp2_s16x4;
+ int16x4_t tmp1_s16x4, tmp2_s16x4, tmp_summand_s16x4;
q1_Q10_s16x4 = vshl_n_s16( q1_Q0_s16x4, 10 );
- tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) );
- q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ) );
+ tmp_summand_s16x4 = vand_s16( vreinterpret_s16_u16(vcgez_s16(q1_Q0_s16x4)), vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) );
+ tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, tmp_summand_s16x4 );
+ tmp_summand_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ), vdup_n_s16(0) );
+ q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, tmp_summand_s16x4);
q1_Q10_s16x4 = vbsl_s16( lessThanMinus1_u16x4, q1_Q10_s16x4, tmp1_s16x4 );
q1_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 ), q1_Q10_s16x4 );
q1_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ) ), q1_Q10_s16x4 );
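
The old version added offset_Q10 - QUANT_LEVEL_ADJUST_Q10 and
offset_Q10 + QUANT_LEVEL_ADJUST_Q10 to every lane and let the vbsl_s16()
chain pick the surviving value; with q1_Q10 = q1_Q0 << 10 at the int16
extremes, the unconditional sums could wrap in lanes the selection was
about to overwrite (e.g. q1_Q10 = -32768 plus a negative
offset_Q10 - QUANT_LEVEL_ADJUST_Q10 summand). The masked summands apply
each constant only where its result can survive the selection. A
hypothetical single-lane model (not code from the file):

    static inline opus_int16 level_lane( opus_int16 q1_Q0, opus_int16 offset_Q10 ) {
        opus_int16 q1_Q10 = (opus_int16)( q1_Q0 << 10 );
        opus_int16 tmp1   = (opus_int16)( q1_Q10 +      /* vcgez mask */
                ( q1_Q0 >= 0 ? offset_Q10 - QUANT_LEVEL_ADJUST_Q10 : 0 ) );
        q1_Q10 = (opus_int16)( q1_Q10 +                 /* lessThanMinus1 mask */
                ( q1_Q0 < -1 ? offset_Q10 + QUANT_LEVEL_ADJUST_Q10 : 0 ) );
        q1_Q10 = ( q1_Q0 < -1 ) ? q1_Q10 : tmp1;        /* selection chain, */
        if( q1_Q0 == 0 )  q1_Q10 = offset_Q10;          /* unchanged by the */
        if( q1_Q0 == -1 ) q1_Q10 = (opus_int16)         /* patch            */
                ( offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ) );
        return q1_Q10;
    }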
@@ -816,6 +822,13 @@
RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ k ];
Winner_ind = k;
}
+ }
+
+ /* clear unused part of RD_Q10 to avoid overflows */
+ if( nStatesDelayedDecision < NEON_MAX_DEL_DEC_STATES )
+ {
+ OPUS_CLEAR(psSampleState[0].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision);
+ OPUS_CLEAR(psSampleState[1].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision);
}

/* Increase RD values of expired states */
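
This matters because of the step named in the comment above: expired
states get a large constant penalty (silk_int32_MAX >> 4) added to
RD_Q10 through a full-width vector add, dead lanes included. A dead lane
that is never reset by the winner logic accumulates that penalty every
sample and wraps int32 after a handful of samples; zeroing the unused
tail each iteration keeps those lanes bounded, and is bit-exact because
the winner search above only reads k < nStatesDelayedDecision. A minimal
illustration of the accumulation wrap:

    opus_int32 rd = 0;
    int i;
    for( i = 0; i < 17; i++ )
        rd += 0x7FFFFFFF >> 4;   /* 16 adds fit; the 17th wraps int32 */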
--