shithub: opus

Download patch

ref: 2941f08a616290f291579df9c8dbc0801d8d3f18
parent: 4df773d092c2c7d63af2326ebc397cc84e65162e
author: Jean-Marc Valin <jeanmarcv@google.com>
date: Tue May 13 08:29:42 EDT 2025

Adaptive IMDCT scaling to maximize accuracy

--- a/celt/mdct.c
+++ b/celt/mdct.c
@@ -271,6 +271,9 @@
    int i;
    int N, N2, N4;
    const kiss_twiddle_scalar *trig;
+#ifdef FIXED_POINT
+   int pre_shift, post_shift, fft_shift;
+#endif
    (void) arch;
 
    N = l->n;
@@ -283,6 +286,21 @@
    N2 = N>>1;
    N4 = N>>2;
 
+#ifdef FIXED_POINT
+   {
+      opus_val32 sumval=N2;
+      opus_val32 maxval=0;
+      for (i=0;i<N2;i++) {
+         maxval = MAX32(maxval, ABS32(in[i*stride]));
+         sumval = ADD32_ovflw(sumval, ABS32(SHR32(in[i*stride],4)));
+      }
+      pre_shift = IMAX(0, 29-celt_ilog2(1+SHR32(maxval,2)*3));
+      /* Worst-case where all the energy goes to a single sample. */
+      post_shift = IMAX(0, 26-celt_ilog2(ABS32(sumval)));
+      post_shift = IMIN(post_shift, pre_shift);
+      fft_shift = pre_shift - post_shift;
+   }
+#endif
    /* Pre-rotate */
    {
       /* Temp pointers to make it really clear to the compiler what we're doing */
@@ -297,8 +315,8 @@
          kiss_fft_scalar yr, yi;
          opus_val32 x1, x2;
          rev = *bitrev++;
-         x1 = SHL32_ovflw(*xp1, IMDCT_HEADROOM);
-         x2 = SHL32_ovflw(*xp2, IMDCT_HEADROOM);
+         x1 = SHL32_ovflw(*xp1, pre_shift);
+         x2 = SHL32_ovflw(*xp2, pre_shift);
          yr = ADD32_ovflw(S_MUL(x2, t[i]), S_MUL(x1, t[N4+i]));
          yi = SUB32_ovflw(S_MUL(x1, t[i]), S_MUL(x2, t[N4+i]));
          /* We swap real and imag because we use an FFT instead of an IFFT. */
@@ -310,7 +328,7 @@
       }
    }
 
-   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(0));
+   opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)) ARG_FIXED(fft_shift));
 
    /* Post-rotate and de-shuffle from both ends of the buffer at once to make
       it in-place. */
@@ -330,8 +348,8 @@
          t0 = t[i];
          t1 = t[N4+i];
          /* We'd scale up by 2 here, but instead it's done when mixing the windows */
-         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), IMDCT_HEADROOM);
-         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), IMDCT_HEADROOM);
+         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
+         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
          /* We swap real and imag because we're using an FFT instead of an IFFT. */
          re = yp1[1];
          im = yp1[0];
@@ -341,8 +359,8 @@
          t0 = t[(N4-i-1)];
          t1 = t[(N2-i-1)];
          /* We'd scale up by 2 here, but instead it's done when mixing the windows */
-         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), IMDCT_HEADROOM);
-         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), IMDCT_HEADROOM);
+         yr = PSHR32_ovflw(ADD32_ovflw(S_MUL(re,t0), S_MUL(im,t1)), post_shift);
+         yi = PSHR32_ovflw(SUB32_ovflw(S_MUL(re,t1), S_MUL(im,t0)), post_shift);
          yp1[0] = yr;
          yp0[1] = yi;
          yp0 += 2;
--- a/celt/mdct.h
+++ b/celt/mdct.h
@@ -57,10 +57,6 @@
 #include "arm/mdct_arm.h"
 #endif
 
-/* There should be 2 bits of headroom in the IMDCT which we can take
-   advantage of to maximize accuracy. */
-#define IMDCT_HEADROOM 2
-
 int clt_mdct_init(mdct_lookup *l,int N, int maxshift, int arch);
 void clt_mdct_clear(mdct_lookup *l, int arch);
 
--