ref: 833688e65dda31b6cfc319b5dcd0a54f3a2ef616
parent: 57901a6758c3bdc7481d61669812bde13d2085b8
author: Jan Buethe <jbuethe@amazon.de>
date: Wed Feb 21 12:27:54 EST 2024
bit-exact overflow fixes in silk/arm/NSQ_del_dec_neon_intr.c
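
ALLOC() returns uninitialized stack memory, and the NEON path always
computes all NEON_MAX_DEL_DEC_STATES (4) lanes even when fewer
delay-decision states are active, so stack garbage in the dead lanes
could enter the 16- and 32-bit vector arithmetic and wrap. The fixes
clear the allocated structs up front, rearrange the branchless quantizer
math so that discarded lanes add a masked-to-zero offset instead of a
live one, and re-zero the unused tail of RD_Q10 each sample before the
expired-state penalty is applied. Selected lanes compute exactly the
same values as before, hence bit-exact.

A minimal scalar sketch of the wrap class removed below (values
hypothetical; the 32767 cap on Lambda_Q10 is asserted in the code):

    opus_int16 q1_Q10 = 32000;              /* plausible Q10 lane value */
    opus_int16 rdo_offset = 32767/2 - 512;  /* 15871, the asserted cap  */
    opus_int16 r = (opus_int16)( q1_Q10 + rdo_offset ); /* -17665: wrapped */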
--- a/silk/arm/NSQ_del_dec_neon_intr.c
+++ b/silk/arm/NSQ_del_dec_neon_intr.c
@@ -35,6 +35,7 @@
#endif
#include "main.h"
#include "stack_alloc.h"
+#include "os_support.h"
/* NEON intrinsics optimization now can only parallelize up to 4 delay decision states. */
/* If there are more states, C function is called, and this optimization must be expanded. */
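
The new header supplies OPUS_CLEAR(); unless a build overrides it,
os_support.h defines it as a zeroing memset over n elements of the
destination's element type:

    /* os_support.h (modulo override hooks): */
    #define OPUS_CLEAR(dst, n) (memset((dst), 0, (n)*sizeof(*(dst))))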
@@ -279,6 +280,7 @@

/* Initialize delayed decision states */
ALLOC( psDelDec, 1, NSQ_del_decs_struct );
+ OPUS_CLEAR(psDelDec, 1);
/* Only RandState and RD_Q10 need to be initialized to 0. */
silk_memset( psDelDec->RandState, 0, sizeof( psDelDec->RandState ) );
vst1q_s32( psDelDec->RD_Q10, vdupq_n_s32( 0 ) );
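
Before this change only RandState and RD_Q10 were zeroed; every other
NSQ_del_decs_struct field in lanes beyond nStatesDelayedDecision kept
whatever was on the stack, and those lanes still travel through the full
4-wide arithmetic. Clearing the whole struct first is bit-exact because
the winner search only compares indices k < nStatesDelayedDecision; per
the macro above, the added line is equivalent to:

    memset( psDelDec, 0, sizeof( NSQ_del_decs_struct ) );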
@@ -587,6 +589,7 @@
silk_assert( nStatesDelayedDecision > 0 );
silk_assert( ( shapingLPCOrder & 1 ) == 0 ); /* check that order is even */
ALLOC( psSampleState, 2, NSQ_samples_struct );
+ OPUS_CLEAR(psSampleState, 2);

shp_lag_ptr = &NSQ->sLTP_shp_Q14[ NSQ->sLTP_shp_buf_idx - lag + HARM_SHAPE_FIR_TAPS / 2 ];
pred_lag_ptr = &sLTP_Q15[ NSQ->sLTP_buf_idx - lag + LTP_ORDER / 2 ];
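
psSampleState holds the per-state sample candidates for the two
quantization levels tried at each sample, so both elements are cleared;
by the same macro this is:

    memset( psSampleState, 0, 2 * sizeof( NSQ_samples_struct ) );

As with psDelDec, the vector code below updates all four lanes of these
structs regardless of nStatesDelayedDecision, so the dead lanes must
start from defined zeros rather than stack garbage.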
@@ -711,12 +714,13 @@
const int rdo_offset = Lambda_Q10/2 - 512;
const uint16x4_t greaterThanRdo = vcgt_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) );
const uint16x4_t lessThanMinusRdo = vclt_s16( q1_Q10_s16x4, vdup_n_s16( -rdo_offset ) );
+ int16x4_t signed_offset = vbsl_s16( greaterThanRdo, vdup_n_s16( -rdo_offset ), vdup_n_s16( 0 ) );
+ signed_offset = vbsl_s16( lessThanMinusRdo, vdup_n_s16( rdo_offset ), signed_offset );
/* If Lambda_Q10 > 32767, then q1_Q0, q1_Q10 and q2_Q10 must change to 32-bit. */
silk_assert( Lambda_Q10 <= 32767 );
q1_Q0_s16x4 = vreinterpret_s16_u16( vclt_s16( q1_Q10_s16x4, vdup_n_s16( 0 ) ) );
- q1_Q0_s16x4 = vbsl_s16( greaterThanRdo, vsub_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
- q1_Q0_s16x4 = vbsl_s16( lessThanMinusRdo, vadd_s16( q1_Q10_s16x4, vdup_n_s16( rdo_offset ) ), q1_Q0_s16x4 );
+ q1_Q0_s16x4 = vbsl_s16(vorr_u16(greaterThanRdo, lessThanMinusRdo), vadd_s16( q1_Q10_s16x4 , signed_offset), q1_Q0_s16x4);
q1_Q0_s16x4 = vshr_n_s16( q1_Q0_s16x4, 10 );
}
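
Per lane the rewrite is equivalent to the scalar logic below, and it is
bit-exact with the old two-vbsl version because greaterThanRdo and
lessThanMinusRdo are mutually exclusive; the difference is that
unselected lanes now add zero instead of evaluating both
q1_Q10 - rdo_offset and q1_Q10 + rdo_offset, sums that could leave the
int16 range in lanes whose results were about to be discarded. A
hypothetical single-lane model (not code from the file):

    static inline opus_int16 rdo_lane( opus_int16 q1_Q10, int rdo_offset ) {
        int gt = q1_Q10 >  rdo_offset;    /* greaterThanRdo lane mask   */
        int lt = q1_Q10 < -rdo_offset;    /* lessThanMinusRdo lane mask */
        opus_int16 off   = gt ? (opus_int16)-rdo_offset
                              : ( lt ? (opus_int16)rdo_offset : 0 );
        opus_int16 q1_Q0 = ( gt || lt ) ? (opus_int16)( q1_Q10 + off )
                                        : ( q1_Q10 < 0 ? -1 : 0 ); /* vclt mask */
        return (opus_int16)( q1_Q0 >> 10 );
    }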
@@ -723,11 +727,13 @@
{
const uint16x4_t equal0_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( 0 ) );
const uint16x4_t equalMinus1_u16x4 = vceq_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
const uint16x4_t lessThanMinus1_u16x4 = vclt_s16( q1_Q0_s16x4, vdup_n_s16( -1 ) );
- int16x4_t tmp1_s16x4, tmp2_s16x4;
+ int16x4_t tmp1_s16x4, tmp2_s16x4, tmp_summand_s16x4;
q1_Q10_s16x4 = vshl_n_s16( q1_Q0_s16x4, 10 );
- tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) );
- q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ) );
+ tmp_summand_s16x4 = vand_s16( vreinterpret_s16_u16(vcgez_s16(q1_Q0_s16x4)), vdup_n_s16( offset_Q10 - QUANT_LEVEL_ADJUST_Q10 ) );
+ tmp1_s16x4 = vadd_s16( q1_Q10_s16x4, tmp_summand_s16x4 );
+ tmp_summand_s16x4 = vbsl_s16( lessThanMinus1_u16x4, vdup_n_s16( offset_Q10 + QUANT_LEVEL_ADJUST_Q10 ), vdup_n_s16(0) );
+ q1_Q10_s16x4 = vadd_s16( q1_Q10_s16x4, tmp_summand_s16x4);
q1_Q10_s16x4 = vbsl_s16( lessThanMinus1_u16x4, q1_Q10_s16x4, tmp1_s16x4 );
q1_Q10_s16x4 = vbsl_s16( equal0_u16x4, vdup_n_s16( offset_Q10 ), q1_Q10_s16x4 );
q1_Q10_s16x4 = vbsl_s16( equalMinus1_u16x4, vdup_n_s16( offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ) ), q1_Q10_s16x4 );
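
The old version added offset_Q10 - QUANT_LEVEL_ADJUST_Q10 and
offset_Q10 + QUANT_LEVEL_ADJUST_Q10 to every lane and let the vbsl_s16()
chain pick the surviving value; with q1_Q10 = q1_Q0 << 10 at the int16
extremes, the unconditional sums could wrap in lanes the selection was
about to overwrite (e.g. q1_Q10 = -32768 plus a negative
offset_Q10 - QUANT_LEVEL_ADJUST_Q10 summand). The masked summands apply
each constant only where its result can survive the selection. A
hypothetical single-lane model (not code from the file):

    static inline opus_int16 level_lane( opus_int16 q1_Q0, opus_int16 offset_Q10 ) {
        opus_int16 q1_Q10 = (opus_int16)( q1_Q0 << 10 );
        opus_int16 tmp1   = (opus_int16)( q1_Q10 +      /* vcgez mask */
                ( q1_Q0 >= 0 ? offset_Q10 - QUANT_LEVEL_ADJUST_Q10 : 0 ) );
        q1_Q10 = (opus_int16)( q1_Q10 +                 /* lessThanMinus1 mask */
                ( q1_Q0 < -1 ? offset_Q10 + QUANT_LEVEL_ADJUST_Q10 : 0 ) );
        q1_Q10 = ( q1_Q0 < -1 ) ? q1_Q10 : tmp1;        /* selection chain, */
        if( q1_Q0 == 0 )  q1_Q10 = offset_Q10;          /* unchanged by the */
        if( q1_Q0 == -1 ) q1_Q10 = (opus_int16)         /* patch            */
                ( offset_Q10 - ( 1024 - QUANT_LEVEL_ADJUST_Q10 ) );
        return q1_Q10;
    }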
@@ -816,6 +822,13 @@
RDmin_Q10 = psSampleState[ 0 ].RD_Q10[ k ];
Winner_ind = k;
}
+ }
+
+ /* clear unused part of RD_Q10 to avoid overflows */
+ if( nStatesDelayedDecision < NEON_MAX_DEL_DEC_STATES )
+ {
+ OPUS_CLEAR(psSampleState[0].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision);
+ OPUS_CLEAR(psSampleState[1].RD_Q10 + nStatesDelayedDecision, NEON_MAX_DEL_DEC_STATES - nStatesDelayedDecision);
}

/* Increase RD values of expired states */
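
This matters because of the step named in the comment above: expired
states get a large constant penalty (silk_int32_MAX >> 4) added to
RD_Q10 through a full-width vector add, dead lanes included. A dead lane
that is never reset by the winner logic accumulates that penalty every
sample and wraps int32 after a handful of samples; zeroing the unused
tail each iteration keeps those lanes bounded, and is bit-exact because
the winner search above only reads k < nStatesDelayedDecision. A minimal
illustration of the accumulation wrap:

    opus_int32 rd = 0;
    int i;
    for( i = 0; i < 17; i++ )
        rd += 0x7FFFFFFF >> 4;   /* 16 adds fit; the 17th wraps int32 */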
--