shithub: opus

--- /dev/null

+++ b/celt/arm/celt_fft_ne10.c

@@ -1,0 +1,173 @@

+/* Copyright (c) 2015 Xiph.Org Foundation

+   Written by Viswanath Puttagunta */

+/**

+   @file celt_fft_ne10.c

+   @brief ARM Neon optimizations for fft using NE10 library

+ */

+/*

+   Redistribution and use in source and binary forms, with or without

+   modification, are permitted provided that the following conditions

+   are met:

+   - Redistributions of source code must retain the above copyright

+   notice, this list of conditions and the following disclaimer.

+   - Redistributions in binary form must reproduce the above copyright

+   notice, this list of conditions and the following disclaimer in the

+   documentation and/or other materials provided with the distribution.

+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER

+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+*/

+#ifndef SKIP_CONFIG_H

+#ifdef HAVE_CONFIG_H

+#include "config.h"

+#endif

+#endif

+#include <NE10_dsp.h>

+#include "os_support.h"

+#include "kiss_fft.h"

+#include "stack_alloc.h"

+/* Bind the type-generic NE10_FFT_* names used in this file to NE10's

+   float32 API for floating-point builds, or to its int32 API when

+   FIXED_POINT is defined. Both branches define the same six names, each

+   exactly once, and all of them as plain (object-like) aliases. */

+#if !defined(FIXED_POINT)

+# define NE10_FFT_ALLOC_C2C_TYPE_NEON ne10_fft_alloc_c2c_float32_neon

+# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_float32_t

+# define NE10_FFT_STATE_TYPE_T ne10_fft_state_float32_t

+# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_float32

+# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_float32_t

+# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_float32_neon

+#else

+# define NE10_FFT_ALLOC_C2C_TYPE_NEON ne10_fft_alloc_c2c_int32_neon

+# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_int32_t

+# define NE10_FFT_STATE_TYPE_T ne10_fft_state_int32_t

+# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_int32

+# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_int32_t

+# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_int32_neon

+#endif

+#if defined(CUSTOM_MODES)

+/* nfft lengths in NE10 that support scaled fft.

+   opus_fft_alloc_arm_neon() compares st->nfft against every entry; a

+   length that is not listed is flagged as unsupported. */

+/* Number of entries in ne10_fft_scaled_support[]. */

+# define NE10_FFTSCALED_SUPPORT_MAX 4

+static const int ne10_fft_scaled_support[NE10_FFTSCALED_SUPPORT_MAX] = {

+   480, 240, 120, 60

+};

+/* Allocate the per-FFT architecture state and attach it to st->arch_fft.

+   When st->nfft is one of the lengths in ne10_fft_scaled_support[], an NE10

+   configuration is also allocated and stored in arch_fft->priv; otherwise

+   the state is marked unsupported and priv is NULL.

+   Returns 0 on success and -1 if either allocation fails. On the second

+   failure st->arch_fft is left allocated (it is not released here). */

+int opus_fft_alloc_arm_neon(kiss_fft_state *st)

+{

+   int idx;

+   int found = 0;

+   arch_fft_state *arch_state;

+   arch_state = (arch_fft_state *)opus_alloc(sizeof(struct arch_fft_state));

+   st->arch_fft = arch_state;

+   if (arch_state == NULL)

+      return -1;

+   /* Look for st->nfft among the lengths NE10 can handle. */

+   for (idx = 0; idx < NE10_FFTSCALED_SUPPORT_MAX && !found; idx++)

+      found = (st->nfft == ne10_fft_scaled_support[idx]);

+   if (!found) {

+      /* This nfft length (scaled fft) is not supported in NE10 */

+      arch_state->is_supported = 0;

+      arch_state->priv = NULL;

+      return 0;

+   }

+   arch_state->is_supported = 1;

+   arch_state->priv = (void *)NE10_FFT_ALLOC_C2C_TYPE_NEON(st->nfft);

+   return (arch_state->priv == NULL) ? -1 : 0;

+}

+/* Release what opus_fft_alloc_arm_neon() attached to st: the NE10

+   configuration held in arch_fft->priv (if any), then arch_fft itself.

+   Does nothing when st->arch_fft is NULL. */

+void opus_fft_free_arm_neon(kiss_fft_state *st)

+{

+   if (st->arch_fft != NULL) {

+      NE10_FFT_CFG_TYPE_T ne10_cfg = (NE10_FFT_CFG_TYPE_T)st->arch_fft->priv;

+      if (ne10_cfg != NULL) {

+         NE10_FFT_DESTROY_C2C_TYPE(ne10_cfg);

+      }

+      opus_free(st->arch_fft);

+   }

+}

+#endif

+/* Forward complex FFT of st->nfft points from fin to fout.

+   Uses NE10 when st->arch_fft->is_supported is set and falls back to

+   opus_fft_c() otherwise. The NE10 state stored in arch_fft->priv is copied

+   into a local so that a per-call scratch buffer can be plugged in without

+   writing through the const st. */

+void opus_fft_neon(const kiss_fft_state *st,

+                   const kiss_fft_cpx *fin,

+                   kiss_fft_cpx *fout)

+{

+   NE10_FFT_STATE_TYPE_T local_state;

+   NE10_FFT_CFG_TYPE_T local_cfg = &local_state;

+   VARDECL(NE10_FFT_CPX_TYPE_T, scratch);

+   SAVE_STACK;

+   ALLOC(scratch, st->nfft, NE10_FFT_CPX_TYPE_T);

+   if (st->arch_fft->is_supported) {

+      /* Work on a private copy of the shared NE10 state. */

+      memcpy(&local_state, st->arch_fft->priv, sizeof(local_state));

+      local_state.buffer = (NE10_FFT_CPX_TYPE_T *)scratch;

+      /* The 0 after the cfg is presumably NE10's "inverse" flag (forward

+         here); the fixed-point call's final 1 presumably requests scaling,

+         mirroring is_forward_scaled -- confirm against the NE10 API. */

+#if !defined(FIXED_POINT)

+      local_state.is_forward_scaled = 1;

+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,

+                                (NE10_FFT_CPX_TYPE_T *)fin,

+                                local_cfg, 0);

+#else

+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,

+                                (NE10_FFT_CPX_TYPE_T *)fin,

+                                local_cfg, 0, 1);

+#endif

+   }

+   else {

+      /* This nfft length (scaled fft) not supported in NE10 */

+      opus_fft_c(st, fin, fout);

+   }

+   RESTORE_STACK;

+}

+/* Inverse complex FFT of st->nfft points from fin to fout.

+   Uses NE10 when st->arch_fft->is_supported is set and falls back to

+   opus_ifft_c() otherwise. The NE10 state stored in arch_fft->priv is

+   copied into a local so that a per-call scratch buffer can be plugged in

+   without writing through the const st. */

+void opus_ifft_neon(const kiss_fft_state *st,

+                    const kiss_fft_cpx *fin,

+                    kiss_fft_cpx *fout)

+{

+   NE10_FFT_STATE_TYPE_T local_state;

+   NE10_FFT_CFG_TYPE_T local_cfg = &local_state;

+   VARDECL(NE10_FFT_CPX_TYPE_T, scratch);

+   SAVE_STACK;

+   ALLOC(scratch, st->nfft, NE10_FFT_CPX_TYPE_T);

+   if (st->arch_fft->is_supported) {

+      /* Work on a private copy of the shared NE10 state. */

+      memcpy(&local_state, st->arch_fft->priv, sizeof(local_state));

+      local_state.buffer = (NE10_FFT_CPX_TYPE_T *)scratch;

+      /* The 1 after the cfg is presumably NE10's "inverse" flag; the

+         fixed-point call's final 0 presumably disables scaling, mirroring

+         is_backward_scaled = 0 -- confirm against the NE10 API. */

+#if !defined(FIXED_POINT)

+      local_state.is_backward_scaled = 0;

+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,

+                                (NE10_FFT_CPX_TYPE_T *)fin,

+                                local_cfg, 1);

+#else

+      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,

+                                (NE10_FFT_CPX_TYPE_T *)fin,

+                                local_cfg, 1, 0);

+#endif

+   }

+   else {

+      /* This nfft length (scaled fft) not supported in NE10 */

+      opus_ifft_c(st, fin, fout);

+   }

+   RESTORE_STACK;

+}

--- /dev/null

+++ b/celt/arm/celt_mdct_ne10.c

@@ -1,0 +1,258 @@

+/* Copyright (c) 2015 Xiph.Org Foundation

+   Written by Viswanath Puttagunta */

+/**

+   @file celt_mdct_ne10.c

+   @brief ARM Neon optimizations for mdct using NE10 library

+ */

+/*

+   Redistribution and use in source and binary forms, with or without

+   modification, are permitted provided that the following conditions

+   are met:

+   - Redistributions of source code must retain the above copyright

+   notice, this list of conditions and the following disclaimer.

+   - Redistributions in binary form must reproduce the above copyright

+   notice, this list of conditions and the following disclaimer in the

+   documentation and/or other materials provided with the distribution.

+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER

+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

+*/

+#ifndef SKIP_CONFIG_H

+#ifdef HAVE_CONFIG_H

+#include "config.h"

+#endif

+#endif

+#include "kiss_fft.h"

+#include "_kiss_fft_guts.h"

+#include "mdct.h"

+#include "stack_alloc.h"

+/* Forward MDCT built around opus_fft().

+   Steps: window and fold `in` into N/4 interleaved (re, im) pairs,

+   pre-rotate them by the twiddles into a complex buffer, run the FFT, then

+   post-rotate the result into `out`.

+   l       - lookup supplying the base size (n), twiddles (trig) and the

+             per-shift FFT states (kfft[]).

+   in      - input samples; read at offsets relative to overlap/2.

+   out     - output; consecutive results are written `stride` elements

+             apart, filled from both ends towards the middle.

+   window  - window coefficients applied over the overlap regions.

+   overlap - overlap length in samples.

+   shift   - number of times l->n is halved; also selects l->kfft[shift].

+   stride  - spacing between successive outputs in `out`.

+   arch    - forwarded unchanged to opus_fft(). */

+void clt_mdct_forward_neon(const mdct_lookup *l,

+                           kiss_fft_scalar *in,

+                           kiss_fft_scalar * OPUS_RESTRICT out,

+                           const opus_val16 *window,

+                           int overlap, int shift, int stride, int arch)

+{

+   int i;

+   int N, N2, N4;

+   VARDECL(kiss_fft_scalar, f);

+   VARDECL(kiss_fft_cpx, f2);

+   const kiss_fft_state *st = l->kfft[shift];

+   const kiss_twiddle_scalar *trig;

+   SAVE_STACK;

+   N = l->n;

+   trig = l->trig;

+   /* Halve N once per shift step, advancing trig past each halved size. */

+   for (i=0;i<shift;i++)

+   {

+      N >>= 1;

+      trig += N;

+   }

+   N2 = N>>1;

+   N4 = N>>2;

+   /* f holds N2 scalars (N4 interleaved pairs); f2 holds N4 complex values. */

+   ALLOC(f, N2, kiss_fft_scalar);

+   ALLOC(f2, N4, kiss_fft_cpx);

+   /* Consider the input to be composed of four blocks: [a, b, c, d] */

+   /* Window, shuffle, fold */

+   {

+      /* Temp pointers to make it really clear to the compiler what we're doing */

+      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);

+      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);

+      kiss_fft_scalar * OPUS_RESTRICT yp = f;

+      const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);

+      const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;

+      /* First (overlap+3)/4 pairs: windowed, window read outwards from its middle. */

+      for(i=0;i<((overlap+3)>>2);i++)

+      {

+         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/

+         *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);

+         *yp++ = MULT16_32_Q15(*wp1, *xp1)    - MULT16_32_Q15(*wp2, xp2[-N2]);

+         xp1+=2;

+         xp2-=2;

+         wp1+=2;

+         wp2-=2;

+      }

+      /* Restart the window pointers at the two ends for the last loop below. */

+      wp1 = window;

+      wp2 = window+overlap-1;

+      /* Middle pairs: copied without any window multiplication. */

+      for(;i<N4-((overlap+3)>>2);i++)

+      {

+         /* Real part arranged as a-bR, Imag part arranged as -c-dR */

+         *yp++ = *xp2;

+         *yp++ = *xp1;

+         xp1+=2;

+         xp2-=2;

+      }

+      /* Remaining pairs up to N4: windowed again, window read inwards from its ends. */

+      for(;i<N4;i++)

+      {

+         /* Real part arranged as a-bR, Imag part arranged as -c-dR */

+         *yp++ =  -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);

+         *yp++ = MULT16_32_Q15(*wp2, *xp1)     + MULT16_32_Q15(*wp1, xp2[N2]);

+         xp1+=2;

+         xp2-=2;

+         wp1+=2;

+         wp2-=2;

+      }

+   }

+   /* Pre-rotation */

+   /* Multiply each folded (re, im) pair by the twiddle pair (t[i], t[N4+i])

+      and store the complex product in f2[i]. */

+   {

+      kiss_fft_scalar * OPUS_RESTRICT yp = f;

+      const kiss_twiddle_scalar *t = &trig[0];

+      for(i=0;i<N4;i++)

+      {

+         kiss_fft_cpx yc;

+         kiss_twiddle_scalar t0, t1;

+         kiss_fft_scalar re, im, yr, yi;

+         t0 = t[i];

+         t1 = t[N4+i];

+         re = *yp++;

+         im = *yp++;

+         yr = S_MUL(re,t0)  -  S_MUL(im,t1);

+         yi = S_MUL(im,t0)  +  S_MUL(re,t1);

+         yc.r = yr;

+         yc.i = yi;

+         f2[i] = yc;

+      }

+   }

+   /* FFT from f2 back into f, which is reused as the complex output buffer. */

+   opus_fft(st, f2, (kiss_fft_cpx *)f, arch);

+   /* Post-rotate */

+   {

+      /* Temp pointers to make it really clear to the compiler what we're doing */

+      const kiss_fft_cpx * OPUS_RESTRICT fp = (kiss_fft_cpx *)f;

+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;

+      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);

+      const kiss_twiddle_scalar *t = &trig[0];

+      /* Each FFT bin yields two outputs: yp1 walks forward from the start of

+         out, yp2 walks backward from its last strided slot. */

+      for(i=0;i<N4;i++)

+      {

+         kiss_fft_scalar yr, yi;

+         yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);

+         yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);

+         *yp1 = yr;

+         *yp2 = yi;

+         fp++;

+         yp1 += 2*stride;

+         yp2 -= 2*stride;

+      }

+   }

+   RESTORE_STACK;

+}

+/* Inverse MDCT built around opus_ifft().

+   Steps: pre-rotate the strided input into a temporary buffer, run the

+   inverse FFT straight into `out` (offset by overlap/2), post-rotate and

+   de-shuffle that region in place, then apply the window to the first

+   `overlap` samples of `out` (TDAC mirror).

+   l       - lookup supplying the base size (n), twiddles (trig) and the

+             per-shift FFT states (kfft[]).

+   in      - input coefficients, `stride` elements apart, read from both

+             ends towards the middle.

+   out     - output buffer; its first `overlap` samples are read as well as

+             written by the final windowing step.

+   window  - window coefficients, `overlap` entries are used.

+   overlap - overlap length in samples.

+   shift   - number of times l->n is halved; also selects l->kfft[shift].

+   stride  - spacing between successive inputs in `in`.

+   arch    - forwarded unchanged to opus_ifft(). */

+void clt_mdct_backward_neon(const mdct_lookup *l,

+                            kiss_fft_scalar *in,

+                            kiss_fft_scalar * OPUS_RESTRICT out,

+                            const opus_val16 * OPUS_RESTRICT window,

+                            int overlap, int shift, int stride, int arch)

+{

+   int i;

+   int N, N2, N4;

+   VARDECL(kiss_fft_scalar, f);

+   const kiss_twiddle_scalar *trig;

+   const kiss_fft_state *st = l->kfft[shift];

+   /* Pairs with RESTORE_STACK at the end, as in clt_mdct_forward_neon();

+      the ALLOC below needs the stack state to be saved first. */

+   SAVE_STACK;

+   N = l->n;

+   trig = l->trig;

+   /* Halve N once per shift step, advancing trig past each halved size. */

+   for (i=0;i<shift;i++)

+   {

+      N >>= 1;

+      trig += N;

+   }

+   N2 = N>>1;

+   N4 = N>>2;

+   /* f holds N4 interleaved (re, im) pairs fed to the inverse FFT. */

+   ALLOC(f, N2, kiss_fft_scalar);

+   /* Pre-rotate */

+   {

+      /* Temp pointers to make it really clear to the compiler what we're doing */

+      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;

+      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);

+      kiss_fft_scalar * OPUS_RESTRICT yp = f;

+      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];

+      /* xp1 walks forward and xp2 backward through the strided input. */

+      for(i=0;i<N4;i++)

+      {

+         kiss_fft_scalar yr, yi;

+         yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]);

+         yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]);

+         yp[2*i] = yr;

+         yp[2*i+1] = yi;

+         xp1+=2*stride;

+         xp2-=2*stride;

+      }

+   }

+   /* Inverse FFT from f directly into out, starting overlap/2 samples in. */

+   opus_ifft(st, (kiss_fft_cpx *)f, (kiss_fft_cpx*)(out+(overlap>>1)), arch);

+   /* Post-rotate and de-shuffle from both ends of the buffer at once to make

+      it in-place. */

+   {

+      kiss_fft_scalar * yp0 = out+(overlap>>1);

+      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;

+      const kiss_twiddle_scalar *t = &trig[0];

+      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the

+         middle pair will be computed twice. */

+      for(i=0;i<(N4+1)>>1;i++)

+      {

+         kiss_fft_scalar re, im, yr, yi;

+         kiss_twiddle_scalar t0, t1;

+         re = yp0[0];

+         im = yp0[1];

+         t0 = t[i];

+         t1 = t[N4+i];

+         /* We'd scale up by 2 here, but instead it's done when mixing the windows */

+         yr = S_MUL(re,t0) + S_MUL(im,t1);

+         yi = S_MUL(re,t1) - S_MUL(im,t0);

+         /* Read the back pair before yp1[1] is overwritten just below. */

+         re = yp1[0];

+         im = yp1[1];

+         yp0[0] = yr;

+         yp1[1] = yi;

+         t0 = t[(N4-i-1)];

+         t1 = t[(N2-i-1)];

+         /* We'd scale up by 2 here, but instead it's done when mixing the windows */

+         yr = S_MUL(re,t0) + S_MUL(im,t1);

+         yi = S_MUL(re,t1) - S_MUL(im,t0);

+         yp1[0] = yr;

+         yp0[1] = yi;

+         yp0 += 2;

+         yp1 -= 2;

+      }

+   }

+   /* Mirror on both sides for TDAC */

+   {

+      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;

+      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;

+      const opus_val16 * OPUS_RESTRICT wp1 = window;

+      const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;

+      /* Combine out[i] with out[overlap-1-i] using the window read from both ends. */

+      for(i = 0; i < overlap/2; i++)

+      {

+         kiss_fft_scalar x1, x2;

+         x1 = *xp1;

+         x2 = *yp1;

+         *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);

+         *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);

+         wp1++;

+         wp2--;

+      }

+   }

+   RESTORE_STACK;

+}

--- a/celt/arm/celt_ne10_fft.c

+++ /dev/null

@@ -1,173 +1,0 @@

-/* Copyright (c) 2015 Xiph.Org Foundation

-   Written by Viswanath Puttagunta */

-/**

-   @file celt_ne10_fft.c

-   @brief ARM Neon optimizations for fft using NE10 library

- */

-/*

-   Redistribution and use in source and binary forms, with or without

-   modification, are permitted provided that the following conditions

-   are met:

-   - Redistributions of source code must retain the above copyright

-   notice, this list of conditions and the following disclaimer.

-   - Redistributions in binary form must reproduce the above copyright

-   notice, this list of conditions and the following disclaimer in the

-   documentation and/or other materials provided with the distribution.

-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

-   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER

-   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-*/

-#ifndef SKIP_CONFIG_H

-#ifdef HAVE_CONFIG_H

-#include "config.h"

-#endif

-#endif

-#include <NE10_dsp.h>

-#include "os_support.h"

-#include "kiss_fft.h"

-#include "stack_alloc.h"

-#if !defined(FIXED_POINT)

-# define NE10_FFT_ALLOC_C2C_TYPE_NEON ne10_fft_alloc_c2c_float32_neon

-# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_float32_t

-# define NE10_FFT_STATE_TYPE_T ne10_fft_state_float32_t

-# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_float32

-# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_float32_t

-# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_float32_neon

-#else

-# define NE10_FFT_ALLOC_C2C_TYPE_NEON(nfft) ne10_fft_alloc_c2c_int32_neon(nfft)

-# define NE10_FFT_CFG_TYPE_T ne10_fft_cfg_int32_t

-# define NE10_FFT_STATE_TYPE_T ne10_fft_state_int32_t

-# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_int32

-# define NE10_FFT_DESTROY_C2C_TYPE ne10_fft_destroy_c2c_int32

-# define NE10_FFT_CPX_TYPE_T ne10_fft_cpx_int32_t

-# define NE10_FFT_C2C_1D_TYPE_NEON ne10_fft_c2c_1d_int32_neon

-#endif

-#if defined(CUSTOM_MODES)

-/* nfft lengths in NE10 that support scaled fft */

-# define NE10_FFTSCALED_SUPPORT_MAX 4

-static const int ne10_fft_scaled_support[NE10_FFTSCALED_SUPPORT_MAX] = {

-   480, 240, 120, 60

-};

-int opus_fft_alloc_arm_neon(kiss_fft_state *st)

-{

-   int i;

-   size_t memneeded = sizeof(struct arch_fft_state);

-   st->arch_fft = (arch_fft_state *)opus_alloc(memneeded);

-   if (!st->arch_fft)

-      return -1;

-   for (i = 0; i < NE10_FFTSCALED_SUPPORT_MAX; i++) {

-      if(st->nfft == ne10_fft_scaled_support[i])

-         break;

-   }

-   if (i == NE10_FFTSCALED_SUPPORT_MAX) {

-      /* This nfft length (scaled fft) is not supported in NE10 */

-      st->arch_fft->is_supported = 0;

-      st->arch_fft->priv = NULL;

-   }

-   else {

-      st->arch_fft->is_supported = 1;

-      st->arch_fft->priv = (void *)NE10_FFT_ALLOC_C2C_TYPE_NEON(st->nfft);

-      if (st->arch_fft->priv == NULL) {

-         return -1;

-      }

-   }

-   return 0;

-}

-void opus_fft_free_arm_neon(kiss_fft_state *st)

-{

-   NE10_FFT_CFG_TYPE_T cfg;

-   if (!st->arch_fft)

-      return;

-   cfg = (NE10_FFT_CFG_TYPE_T)st->arch_fft->priv;

-   if (cfg)

-      NE10_FFT_DESTROY_C2C_TYPE(cfg);

-   opus_free(st->arch_fft);

-}

-#endif

-void opus_fft_neon(const kiss_fft_state *st,

-                   const kiss_fft_cpx *fin,

-                   kiss_fft_cpx *fout)

-{

-   NE10_FFT_STATE_TYPE_T state;

-   NE10_FFT_CFG_TYPE_T cfg = &state;

-   VARDECL(NE10_FFT_CPX_TYPE_T, buffer);

-   SAVE_STACK;

-   ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);

-   if (!st->arch_fft->is_supported) {

-      /* This nfft length (scaled fft) not supported in NE10 */

-      opus_fft_c(st, fin, fout);

-   }

-   else {

-      memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));

-      state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];

-#if !defined(FIXED_POINT)

-      state.is_forward_scaled = 1;

-      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,

-                                (NE10_FFT_CPX_TYPE_T *)fin,

-                                cfg, 0);

-#else

-      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,

-                                (NE10_FFT_CPX_TYPE_T *)fin,

-                                cfg, 0, 1);

-#endif

-   }

-   RESTORE_STACK;

-}

-void opus_ifft_neon(const kiss_fft_state *st,

-                    const kiss_fft_cpx *fin,

-                    kiss_fft_cpx *fout)

-{

-   NE10_FFT_STATE_TYPE_T state;

-   NE10_FFT_CFG_TYPE_T cfg = &state;

-   VARDECL(NE10_FFT_CPX_TYPE_T, buffer);

-   SAVE_STACK;

-   ALLOC(buffer, st->nfft, NE10_FFT_CPX_TYPE_T);

-   if (!st->arch_fft->is_supported) {

-      /* This nfft length (scaled fft) not supported in NE10 */

-      opus_ifft_c(st, fin, fout);

-   }

-   else {

-      memcpy((void *)cfg, st->arch_fft->priv, sizeof(NE10_FFT_STATE_TYPE_T));

-      state.buffer = (NE10_FFT_CPX_TYPE_T *)&buffer[0];

-#if !defined(FIXED_POINT)

-      state.is_backward_scaled = 0;

-      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,

-                                (NE10_FFT_CPX_TYPE_T *)fin,

-                                cfg, 1);

-#else

-      NE10_FFT_C2C_1D_TYPE_NEON((NE10_FFT_CPX_TYPE_T *)fout,

-                                (NE10_FFT_CPX_TYPE_T *)fin,

-                                cfg, 1, 0);

-#endif

-   }

-   RESTORE_STACK;

-}

--- a/celt/arm/celt_ne10_mdct.c

+++ /dev/null

@@ -1,258 +1,0 @@

-/* Copyright (c) 2015 Xiph.Org Foundation

-   Written by Viswanath Puttagunta */

-/**

-   @file celt_ne10_mdct.c

-   @brief ARM Neon optimizations for mdct using NE10 library

- */

-/*

-   Redistribution and use in source and binary forms, with or without

-   modification, are permitted provided that the following conditions

-   are met:

-   - Redistributions of source code must retain the above copyright

-   notice, this list of conditions and the following disclaimer.

-   - Redistributions in binary form must reproduce the above copyright

-   notice, this list of conditions and the following disclaimer in the

-   documentation and/or other materials provided with the distribution.

-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

-   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER

-   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

-   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

-   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

-   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

-   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

-   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-*/

-#ifndef SKIP_CONFIG_H

-#ifdef HAVE_CONFIG_H

-#include "config.h"

-#endif

-#endif

-#include "kiss_fft.h"

-#include "_kiss_fft_guts.h"

-#include "mdct.h"

-#include "stack_alloc.h"

-void clt_mdct_forward_neon(const mdct_lookup *l,

-                           kiss_fft_scalar *in,

-                           kiss_fft_scalar * OPUS_RESTRICT out,

-                           const opus_val16 *window,

-                           int overlap, int shift, int stride, int arch)

-{

-   int i;

-   int N, N2, N4;

-   VARDECL(kiss_fft_scalar, f);

-   VARDECL(kiss_fft_cpx, f2);

-   const kiss_fft_state *st = l->kfft[shift];

-   const kiss_twiddle_scalar *trig;

-   SAVE_STACK;

-   N = l->n;

-   trig = l->trig;

-   for (i=0;i<shift;i++)

-   {

-      N >>= 1;

-      trig += N;

-   }

-   N2 = N>>1;

-   N4 = N>>2;

-   ALLOC(f, N2, kiss_fft_scalar);

-   ALLOC(f2, N4, kiss_fft_cpx);

-   /* Consider the input to be composed of four blocks: [a, b, c, d] */

-   /* Window, shuffle, fold */

-   {

-      /* Temp pointers to make it really clear to the compiler what we're doing */

-      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);

-      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);

-      kiss_fft_scalar * OPUS_RESTRICT yp = f;

-      const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);

-      const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;

-      for(i=0;i<((overlap+3)>>2);i++)

-      {

-         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/

-         *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);

-         *yp++ = MULT16_32_Q15(*wp1, *xp1)    - MULT16_32_Q15(*wp2, xp2[-N2]);

-         xp1+=2;

-         xp2-=2;

-         wp1+=2;

-         wp2-=2;

-      }

-      wp1 = window;

-      wp2 = window+overlap-1;

-      for(;i<N4-((overlap+3)>>2);i++)

-      {

-         /* Real part arranged as a-bR, Imag part arranged as -c-dR */

-         *yp++ = *xp2;

-         *yp++ = *xp1;

-         xp1+=2;

-         xp2-=2;

-      }

-      for(;i<N4;i++)

-      {

-         /* Real part arranged as a-bR, Imag part arranged as -c-dR */

-         *yp++ =  -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);

-         *yp++ = MULT16_32_Q15(*wp2, *xp1)     + MULT16_32_Q15(*wp1, xp2[N2]);

-         xp1+=2;

-         xp2-=2;

-         wp1+=2;

-         wp2-=2;

-      }

-   }

-   /* Pre-rotation */

-   {

-      kiss_fft_scalar * OPUS_RESTRICT yp = f;

-      const kiss_twiddle_scalar *t = &trig[0];

-      for(i=0;i<N4;i++)

-      {

-         kiss_fft_cpx yc;

-         kiss_twiddle_scalar t0, t1;

-         kiss_fft_scalar re, im, yr, yi;

-         t0 = t[i];

-         t1 = t[N4+i];

-         re = *yp++;

-         im = *yp++;

-         yr = S_MUL(re,t0)  -  S_MUL(im,t1);

-         yi = S_MUL(im,t0)  +  S_MUL(re,t1);

-         yc.r = yr;

-         yc.i = yi;

-         f2[i] = yc;

-      }

-   }

-   opus_fft(st, f2, (kiss_fft_cpx *)f, arch);

-   /* Post-rotate */

-   {

-      /* Temp pointers to make it really clear to the compiler what we're doing */

-      const kiss_fft_cpx * OPUS_RESTRICT fp = (kiss_fft_cpx *)f;

-      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;

-      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);

-      const kiss_twiddle_scalar *t = &trig[0];

-      /* Temp pointers to make it really clear to the compiler what we're doing */

-      for(i=0;i<N4;i++)

-      {

-         kiss_fft_scalar yr, yi;

-         yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);

-         yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);

-         *yp1 = yr;

-         *yp2 = yi;

-         fp++;

-         yp1 += 2*stride;

-         yp2 -= 2*stride;

-      }

-   }

-   RESTORE_STACK;

-}

-void clt_mdct_backward_neon(const mdct_lookup *l,

-                            kiss_fft_scalar *in,

-                            kiss_fft_scalar * OPUS_RESTRICT out,

-                            const opus_val16 * OPUS_RESTRICT window,

-                            int overlap, int shift, int stride, int arch)

-{

-   int i;

-   int N, N2, N4;

-   VARDECL(kiss_fft_scalar, f);

-   const kiss_twiddle_scalar *trig;

-   const kiss_fft_state *st = l->kfft[shift];

-   N = l->n;

-   trig = l->trig;

-   for (i=0;i<shift;i++)

-   {

-      N >>= 1;

-      trig += N;

-   }

-   N2 = N>>1;

-   N4 = N>>2;

-   ALLOC(f, N2, kiss_fft_scalar);

-   /* Pre-rotate */

-   {

-      /* Temp pointers to make it really clear to the compiler what we're doing */

-      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;

-      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);

-      kiss_fft_scalar * OPUS_RESTRICT yp = f;

-      const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];

-      for(i=0;i<N4;i++)

-      {

-         kiss_fft_scalar yr, yi;

-         yr = S_MUL(*xp2, t[i]) + S_MUL(*xp1, t[N4+i]);

-         yi = S_MUL(*xp1, t[i]) - S_MUL(*xp2, t[N4+i]);

-         yp[2*i] = yr;

-         yp[2*i+1] = yi;

-         xp1+=2*stride;

-         xp2-=2*stride;

-      }

-   }

-   opus_ifft(st, (kiss_fft_cpx *)f, (kiss_fft_cpx*)(out+(overlap>>1)), arch);

-   /* Post-rotate and de-shuffle from both ends of the buffer at once to make

-      it in-place. */

-   {

-      kiss_fft_scalar * yp0 = out+(overlap>>1);

-      kiss_fft_scalar * yp1 = out+(overlap>>1)+N2-2;

-      const kiss_twiddle_scalar *t = &trig[0];

-      /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the

-         middle pair will be computed twice. */

-      for(i=0;i<(N4+1)>>1;i++)

-      {

-         kiss_fft_scalar re, im, yr, yi;

-         kiss_twiddle_scalar t0, t1;

-         re = yp0[0];

-         im = yp0[1];

-         t0 = t[i];

-         t1 = t[N4+i];

-         /* We'd scale up by 2 here, but instead it's done when mixing the windows */

-         yr = S_MUL(re,t0) + S_MUL(im,t1);

-         yi = S_MUL(re,t1) - S_MUL(im,t0);

-         re = yp1[0];

-         im = yp1[1];

-         yp0[0] = yr;

-         yp1[1] = yi;

-         t0 = t[(N4-i-1)];

-         t1 = t[(N2-i-1)];

-         /* We'd scale up by 2 here, but instead it's done when mixing the windows */

-         yr = S_MUL(re,t0) + S_MUL(im,t1);

-         yi = S_MUL(re,t1) - S_MUL(im,t0);

-         yp1[0] = yr;

-         yp0[1] = yi;

-         yp0 += 2;

-         yp1 -= 2;

-      }

-   }

-   /* Mirror on both sides for TDAC */

-   {

-      kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;

-      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;

-      const opus_val16 * OPUS_RESTRICT wp1 = window;

-      const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;

-      for(i = 0; i < overlap/2; i++)

-      {

-         kiss_fft_scalar x1, x2;

-         x1 = *xp1;

-         x2 = *yp1;

-         *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);

-         *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);

-         wp1++;

-         wp2--;

-      }

-   }

-   RESTORE_STACK;

-}

--- a/celt_sources.mk

+++ b/celt_sources.mk

@@ -45,5 +45,5 @@

 celt/arm/pitch_neon_intr.c

 CELT_SOURCES_ARM_NE10 = \

-celt/arm/celt_ne10_fft.c \

-celt/arm/celt_ne10_mdct.c

+celt/arm/celt_fft_ne10.c \

+celt/arm/celt_mdct_ne10.c

--

⑨