shithub: opus

Download patch

ref: 833688e65dda31b6cfc319b5dcd0a54f3a2ef616
parent: 57901a6758c3bdc7481d61669812bde13d2085b8
author: Jan Buethe <jbuethe@amazon.de>
date: Wed Feb 21 12:27:54 EST 2024

bit-exact overflow fixes in silk/arm/NSQ_del_dec_neon_intr.c

--- a/silk/arm/NSQ_del_dec_neon_intr.c
+++ b/silk/arm/NSQ_del_dec_neon_intr.c
@@ -35,6 +35,7 @@
 #endif
 #include "main.h"
 #include "stack_alloc.h"
+#include "os_support.h"
 
 /* NEON intrinsics optimization now can only parallelize up to 4 delay decision states.    */
 /* If there are more states, C function is called, and this optimization must be expanded. */
@@ -279,6 +280,7 @@
 
         /* Initialize delayed decision states */
         ALLOC( psDelDec, 1, NSQ_del_decs_struct );
+        OPUS_CLEAR(psDelDec, 1);
         /* Only RandState and RD_Q10 need to be initialized to 0. */
         silk_memset( psDelDec->RandState, 0, sizeof( psDelDec->RandState ) );
         vst1q_s32( psDelDec->RD_Q10, vdupq_n_s32( 0 ) );
@@ -587,6 +589,7 @@
     silk_assert( nStatesDelayedDecision > 0 );
     silk_assert( ( shapingLPCOrder & 1 ) == 0 );   /* check that order is even */
     ALLOC( psSampleState, 2, NSQ_samples_struct );
+    OPUS_CLEAR(psSampleState, 2);
 
     shp_lag_ptr  = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
     pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
@@ -711,12 +714,13 @@
                 const int rdo_offset = Lambda_Q10/2 - 512;
                 const uint16x4_t greaterThanRdo = vcgt_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) );
                 const uint16x4_t lessThanMinusRdo = vclt_s16( q1_Q10_s16x4, vdup_n_s16( -rdo_offset ) );
+                int16x4_t signed_offset = vbsl_s16( greaterThanRdo, vdup_n_s16( -rdo_offset ), vdup_n_s16( 0 ) );
+                signed_offset = vbsl_s16( lessThanMinusRdo, vdup_n_s16( rdo_offset ), signed_offset );
                 /* If Lambda_Q10 > 32767, then q1_Q0, q1_Q10 and q2_Q10 must change to 32-bit. */
                 silk_assert( Lambda_Q10 <= 32767 );
 
                 q1_Q0_s16x4 = vreinterpret_s16_u16( vclt_s16( q1_Q10_s16x4, vdup_n_s16( 0 ) ) );
-                q1_Q0_s16x4 = vbsl_s16( greaterThanRdo, vsub_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
-                q1_Q0_s16x4 = vbsl_s16( lessThanMinusRdo, vadd_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
+                q1_Q0_s16x4 = vbsl_s16(vorr_u16(greaterThanRdo, lessThanMinusRdo), vadd_s16( q1_Q10_s16x4 , signed_offset), q1_Q0_s16x4);
                 q1_Q0_s16x4 = vshr_n_s16( q1_Q0_s16x4, 10 );
             }
             {
@@ -723,11 +727,13 @@
                 const uint16x4_t equal0_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( 0 ) );
                 const uint16x4_t equalMinus1_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
                 const uint16x4_t lessThanMinus1_u16x4 = vclt_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
-                int16x4_t tmp1_s16x4, tmp2_s16x4;
+                int16x4_t tmp1_s16x4, tmp2_s16x4, tmp_summand_s16x4;
 
                 q1_Q10_s16x4 = vshl_n_s16( q1_Q0_s16x4, 10 );
-                tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) );
-                q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ) );
+                tmp_summand_s16x4 = vand_s16( vreinterpret_s16_u16(vcgez_s16(q1_Q0_s16x4)), vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) );
+                tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, tmp_summand_s16x4 );
+                tmp_summand_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ), vdup_n_s16(0) );
+                q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4,  tmp_summand_s16x4);
                 q1_Q10_s16x4 = vbsl_s16( lessThanMinus1_u16x4, q1_Q10_s16x4, tmp1_s16x4 );
                 q1_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 ), q1_Q10_s16x4 );
                 q1_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ) ), q1_Q10_s16x4 );
@@ -816,6 +822,13 @@
                 RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ k ];
                 Winner_ind = k;
             }
+        }
+
+        /* clear unused part of RD_Q10 to avoid overflows */
+        if( nStatesDelayedDecision < NEON_MAX_DEL_DEC_STATES )
+        {
+            OPUS_CLEAR(psSampleState[0].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision);
+            OPUS_CLEAR(psSampleState[1].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision);
         }
 
         /* Increase RD values of expired states */
--