ref: 2941f08a616290f291579df9c8dbc0801d8d3f18
parent: 4df773d092c2c7d63af2326ebc397cc84e65162e
author: Jean-Marc Valin <jeanmarcv@google.com>
date: Tue May 13 08:29:42 EDT 2025
Adaptive IMDCT scaling to maximize accuracy
--- a/celt/mdct.c
+++ b/celt/mdct.c
@@ -271,6 +271,9 @@
int i;
int N, N2, N4;
const kiss_twiddle_scalar *trig;
+#ifdef FIXED_POINT
+ int pre_shift, post_shift, fft_shift;
+#endif
(void) arch;
N = l->n;
@@ -283,6 +286,21 @@
N2 = N>>1;
N4 = N>>2;
+#ifdef FIXED_POINT
+ {
+ opus_val32 sumval=N2;
+ opus_val32 maxval=0;
+ for (i=0;i<N2;i++) {
+ maxval = MAX32(maxval, ABS32(in[i*stride]));
+ sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],4)));
+ }
+ pre_shift = IMAX(0, 29-celt_ilog2(1+SHR32(maxval,2)*3));
+ /* Worst-case where all the energy goes to a single sample. */
+ post_shift = IMAX(0, 26-celt_ilog2(ABS32(sumval)));
+ post_shift = IMIN(post_shift, pre_shift);
+ fft_shift = pre_shift - post_shift;
+ }
+#endif
/* Pre-rotate */
{
/* Temp pointers to make it really clear to the compiler what we're doing */
@@ -297,8 +315,8 @@
kiss_fft_scalar yr, yi;
opus_val32 x1, x2;
rev = *bitrev++;
- x1 = SHL32_ovflw(*xp1, IMDCT_HEADROOM);
- x2 = SHL32_ovflw(*xp2, IMDCT_HEADROOM);
+ x1 = SHL32_ovflw(*xp1, pre_shift);
+ x2 = SHL32_ovflw(*xp2, pre_shift);
yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i]));
yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i]));
/* We swap real and imag because we use an FFT instead of an IFFT. */
@@ -310,7 +328,7 @@
}
}
- opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(0));
+ opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift));
/* Post-rotate and de-shuffle from both ends of the buffer at once to make
it in-place. */
@@ -330,8 +348,8 @@
t0 = t[i];
t1 = t[N4+i];
/* We'd scale up by 2 here, but instead it's done when mixing the windows */
- yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), IMDCT_HEADROOM);
- yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), IMDCT_HEADROOM);
+ yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
+ yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
/* We swap real and imag because we're using an FFT instead of an IFFT. */
re = yp1[1];
im = yp1[0];
@@ -341,8 +359,8 @@
t0 = t[(N4-i-1)];
t1 = t[(N2-i-1)];
/* We'd scale up by 2 here, but instead it's done when mixing the windows */
- yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), IMDCT_HEADROOM);
- yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), IMDCT_HEADROOM);
+ yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
+ yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
yp1[0] = yr;
yp0[1] = yi;
yp0 += 2;
--- a/celt/mdct.h
+++ b/celt/mdct.h
@@ -57,10 +57,6 @@
#include "arm/mdct_arm.h"
#endif
-/* There should be 2 bits of headroom in the IMDCT which we can take
- advantage of to maximize accuracy. */
-#define IMDCT_HEADROOM 2
-
int clt_mdct_init(mdct_lookup *l,int N, int maxshift, int arch);
void clt_mdct_clear(mdct_lookup *l, int arch);
--
⑨