shithub: opus


ref: 7dc696b9a4601b995099c029ef59930292612208
parent: bc108e9aa69bb2f17048c0f9a456f051d88fb6c1
author: David Rowe <david@rowetel.com>
date: Mon Dec 10 16:28:29 EST 2018

refactored for different machines, sgemv_accum16 using NEON intrinsics

Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
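
The patch moves the vector primitives out of nnet.c into per-architecture headers: vec_avx.h when __AVX__ is defined, vec_neon.h when __ARM_NEON__ is defined, and a plain-C vec.h fallback otherwise. All three headers provide the same sgemv_accum16() contract, sketched below as a scalar reference. This sketch is illustrative only and not part of the patch; the name sgemv_accum16_ref is made up here, and the column-major layout assumption is taken from the fallback loop in vec.h.

   /* Scalar reference for what the vectorized kernels compute:
    * out[i] += sum over j of weights[j*col_stride + i] * x[j],
    * with weights stored column-major and rows a multiple of 16. */
   static void sgemv_accum16_ref(float *out, const float *weights,
                                 int rows, int cols, int col_stride,
                                 const float *x)
   {
      int i, j;
      for (j=0;j<cols;j++)
      {
         for (i=0;i<rows;i++)
            out[i] += weights[j*col_stride + i]*x[j];
      }
   }
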

--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -42,350 +42,13 @@
 #define SOFTMAX_HACK
 
 #ifdef __AVX__
-#include <immintrin.h>
-
-
-#ifdef __AVX2__
-static __m256 exp8_approx(__m256 X)
-{
-   const __m256 K0 = _mm256_set1_ps(0.99992522f);
-   const __m256 K1 = _mm256_set1_ps(0.69583354f);
-   const __m256 K2 = _mm256_set1_ps(0.22606716f);
-   const __m256 K3 = _mm256_set1_ps(0.078024523f);
-   const __m256 log2_E = _mm256_set1_ps(1.44269504);
-   const __m256 max_in = _mm256_set1_ps(50.f);
-   const __m256 min_in = _mm256_set1_ps(-50.f);
-   const __m256i mask = _mm256_set1_epi32(0x7fffffff);
-   __m256 XF, Y;
-   __m256i I;
-   X = _mm256_mul_ps(X, log2_E);
-   X = _mm256_max_ps(min_in, _mm256_min_ps(max_in, X));
-   XF = _mm256_floor_ps(X);
-   I = _mm256_cvtps_epi32(XF);
-   X = _mm256_sub_ps(X, XF);
-   Y = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(K3, X, K2), X, K1), X, K0);
-   I = _mm256_slli_epi32(I, 23);
-   Y = _mm256_castsi256_ps(_mm256_and_si256(mask, _mm256_add_epi32(I, _mm256_castps_si256(Y))));
-   return Y;
-}
+#include "vec_avx.h"
+#elif __ARM_NEON__
+#include "vec_neon.h"
 #else
-#define _mm256_fmadd_ps(a,b,c) _mm256_add_ps(_mm256_mul_ps(a, b), c)
-#define _mm_fmadd_ps(a,b,c) _mm_add_ps(_mm_mul_ps(a, b), c)
-static __m128 exp4_approx(__m128 X)
-{
-   const __m128 K0 = _mm_set1_ps(0.99992522f);
-   const __m128 K1 = _mm_set1_ps(0.69583354f);
-   const __m128 K2 = _mm_set1_ps(0.22606716f);
-   const __m128 K3 = _mm_set1_ps(0.078024523f);
-   const __m128 log2_E = _mm_set1_ps(1.44269504);
-   const __m128 max_in = _mm_set1_ps(50.f);
-   const __m128 min_in = _mm_set1_ps(-50.f);
-   const __m128i mask = _mm_set1_epi32(0x7fffffff);
-   __m128 XF, Y;
-   __m128i I;
-   X = _mm_mul_ps(X, log2_E);
-   X = _mm_max_ps(min_in, _mm_min_ps(max_in, X));
-   XF = _mm_floor_ps(X);
-   I = _mm_cvtps_epi32(XF);
-   X = _mm_sub_ps(X, XF);
-   Y = _mm_fmadd_ps(_mm_fmadd_ps(_mm_fmadd_ps(K3, X, K2), X, K1), X, K0);
-   I = _mm_slli_epi32(I, 23);
-   Y = _mm_castsi128_ps(_mm_and_si128(mask, _mm_add_epi32(I, _mm_castps_si128(Y))));
-   return Y;
-}
-static __m256 exp8_approx(__m256 X)
-{
-   __m256 Y;
-   __m128 Xhi, Xlo, Yhi, Ylo;
-   Xhi = _mm256_extractf128_ps(X, 1);
-   Xlo = _mm256_extractf128_ps(X, 0);
-   Yhi = exp4_approx(Xhi);
-   Ylo = exp4_approx(Xlo);
-   Y = _mm256_insertf128_ps(_mm256_setzero_ps(), Yhi, 1);
-   Y = _mm256_insertf128_ps(Y, Ylo, 0);
-   return Y;
-}
-#endif
-
-static float celt_exp(float x)
-{
-   float out[8];
-   __m256 X, Y;
-   X = _mm256_set1_ps(x);
-   Y = exp8_approx(X);
-   _mm256_storeu_ps(out, Y);
-   return out[0];
-}
-
-static void softmax(float *y, const float *x, int N)
-{
-    int i;
-    for (i=0;i<N-7;i+=8)
-    {
-        __m256 X, Y;
-        X = _mm256_loadu_ps(&x[i]);
-        Y = exp8_approx(X);
-        _mm256_storeu_ps(&y[i], Y);
-    }
-    for (;i<N;i++)
-        y[i] = celt_exp(x[i]);
-}
-
-static void vec_tanh(float *y, const float *x, int N)
-{
-    int i;
-    for (i=0;i<N-7;i+=8)
-    {
-        const __m256 two = _mm256_set1_ps(2.f);
-        const __m256 one = _mm256_set1_ps(1.f);
-        __m256 X, Y;
-        X = _mm256_loadu_ps(&x[i]);
-        X = _mm256_mul_ps(X, two);
-        Y = exp8_approx(X);
-        Y = _mm256_mul_ps(_mm256_sub_ps(Y, one),  _mm256_rcp_ps(_mm256_add_ps(Y, one)));
-        _mm256_storeu_ps(&y[i], Y);
-    }
-    for (;i<N;i++)
-    {
-        float ex2;
-        ex2 = celt_exp(2*x[i]);
-        y[i] = (ex2-1)/(ex2+1);
-    }
-}
-
-static void vec_sigmoid(float *y, const float *x, int N)
-{
-    int i;
-    for (i=0;i<N-7;i+=8)
-    {
-        const __m256 one = _mm256_set1_ps(1.f);
-        __m256 X, Y;
-        X = _mm256_loadu_ps(&x[i]);
-        Y = exp8_approx(X);
-        Y = _mm256_mul_ps(Y,  _mm256_rcp_ps(_mm256_add_ps(Y, one)));
-        _mm256_storeu_ps(&y[i], Y);
-    }
-    for (;i<N;i++)
-    {
-        float ex;
-        ex = celt_exp(x[i]);
-        y[i] = (ex)/(ex+1);
-    }
-}
-
-static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
-{
-   int i, j;
-   for (i=0;i<rows;i+=16)
-   {
-      float * restrict y;
-      __m256 vy0, vy8;
-      y = &out[i];
-      vy0 = _mm256_loadu_ps(&y[0]);
-      vy8 = _mm256_loadu_ps(&y[8]);
-      for (j=0;j<cols;j++)
-      {
-         __m256 vxj;
-         __m256 vw;
-         vxj = _mm256_broadcast_ss(&x[j]);
-
-         vw = _mm256_loadu_ps(&weights[j*col_stride + i]);
-         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
-
-         vw = _mm256_loadu_ps(&weights[j*col_stride + i + 8]);
-         vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
-      }
-      _mm256_storeu_ps (&y[0], vy0);
-      _mm256_storeu_ps (&y[8], vy8);
-   }
-}
-static void sparse_sgemv_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
-{
-   int i, j;
-   for (i=0;i<rows;i+=16)
-   {
-      float * restrict y;
-      int cols;
-      __m256 vy0, vy8;
-      y = &out[i];
-      vy0 = _mm256_loadu_ps(&y[0]);
-      vy8 = _mm256_loadu_ps(&y[8]);
-      cols = *idx++;
-      for (j=0;j<cols;j++)
-      {
-         int id;
-         __m256 vxj;
-         __m256 vw;
-         id = *idx++;
-         vxj = _mm256_broadcast_ss(&x[id]);
-
-         vw = _mm256_loadu_ps(&weights[0]);
-         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
-
-         vw = _mm256_loadu_ps(&weights[8]);
-         vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
-         weights += 16;
-      }
-      _mm256_storeu_ps (&y[0], vy0);
-      _mm256_storeu_ps (&y[8], vy8);
-   }
-}
-
-
-#else /* No AVX2/FMA support */
-
-
 #warning Compiling without any vectorization. This code will be very slow
-#warning Try adding -mavx2 -mfma
-
-
-static float celt_exp2(float x)
-{
-   int integer;
-   float frac;
-   union {
-      float f;
-      opus_uint32 i;
-   } res;
-   integer = floor(x);
-   if (integer < -50)
-      return 0;
-   frac = x-integer;
-   /* K0 = 1, K1 = log(2), K2 = 3-4*log(2), K3 = 3*log(2) - 2 */
-   res.f = 0.99992522f + frac * (0.69583354f
-           + frac * (0.22606716f + 0.078024523f*frac));
-   res.i = (res.i + (integer<<23)) & 0x7fffffff;
-   return res.f;
-}
-#define celt_exp(x) celt_exp2((x)*1.44269504f)
-
-static float tansig_approx(float x)
-{
-    int i;
-    float y, dy;
-    float sign=1;
-    /* Tests are reversed to catch NaNs */
-    if (!(x<8))
-        return 1;
-    if (!(x>-8))
-        return -1;
-#ifndef FIXED_POINT
-    /* Another check in case of -ffast-math */
-    if (celt_isnan(x))
-       return 0;
+#include "vec.h"
 #endif
-    if (x<0)
-    {
-       x=-x;
-       sign=-1;
-    }
-    i = (int)floor(.5f+25*x);
-    x -= .04f*i;
-    y = tansig_table[i];
-    dy = 1-y*y;
-    y = y + x*dy*(1 - y*x);
-    return sign*y;
-}
-
-static OPUS_INLINE float sigmoid_approx(float x)
-{
-   return .5f + .5f*tansig_approx(.5f*x);
-}
-
-static void softmax(float *y, const float *x, int N)
-{
-    int i;
-    for (i=0;i<N;i++)
-        y[i] = celt_exp(x[i]);
-}
-
-static void vec_tanh(float *y, const float *x, int N)
-{
-    int i;
-    for (i=0;i<N;i++)
-    {
-        y[i] = tansig_approx(x[i]);
-    }
-}
-
-static void vec_sigmoid(float *y, const float *x, int N)
-{
-    int i;
-    for (i=0;i<N;i++)
-    {
-        y[i] = sigmoid_approx(x[i]);
-    }
-}
-
-static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
-{
-   int i, j;
-   for (i=0;i<rows;i+=16)
-   {
-      for (j=0;j<cols;j++)
-      {
-         const float * restrict w;
-         float * restrict y;
-         float xj;
-         w = &weights[j*col_stride + i];
-         xj = x[j];
-         y = &out[i];
-         y[0] += w[0]*xj;
-         y[1] += w[1]*xj;
-         y[2] += w[2]*xj;
-         y[3] += w[3]*xj;
-         y[4] += w[4]*xj;
-         y[5] += w[5]*xj;
-         y[6] += w[6]*xj;
-         y[7] += w[7]*xj;
-         y[8] += w[8]*xj;
-         y[9] += w[9]*xj;
-         y[10] += w[10]*xj;
-         y[11] += w[11]*xj;
-         y[12] += w[12]*xj;
-         y[13] += w[13]*xj;
-         y[14] += w[14]*xj;
-         y[15] += w[15]*xj;
-      }
-   }
-}
-
-static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
-{
-   int i, j;
-   for (i=0;i<rows;i+=16)
-   {
-      int cols;
-      cols = *idx++;
-      for (j=0;j<cols;j++)
-      {
-         float * restrict y;
-         float xj;
-         xj = x[*idx++];
-         y = &out[i];
-         y[0] += w[0]*xj;
-         y[1] += w[1]*xj;
-         y[2] += w[2]*xj;
-         y[3] += w[3]*xj;
-         y[4] += w[4]*xj;
-         y[5] += w[5]*xj;
-         y[6] += w[6]*xj;
-         y[7] += w[7]*xj;
-         y[8] += w[8]*xj;
-         y[9] += w[9]*xj;
-         y[10] += w[10]*xj;
-         y[11] += w[11]*xj;
-         y[12] += w[12]*xj;
-         y[13] += w[13]*xj;
-         y[14] += w[14]*xj;
-         y[15] += w[15]*xj;
-         w += 16;
-      }
-   }
-}
-#endif
-
-
 
 static OPUS_INLINE float relu(float x)
 {
--- /dev/null
+++ b/dnn/vec.h
@@ -1,0 +1,173 @@
+/* Copyright (c) 2018 Mozilla
+                 2008-2011 Octasic Inc.
+                 2012-2017 Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/* No AVX2/FMA support */
+#ifndef LPCNET_TEST
+static float celt_exp2(float x)
+{
+   int integer;
+   float frac;
+   union {
+      float f;
+      opus_uint32 i;
+   } res;
+   integer = floor(x);
+   if (integer < -50)
+      return 0;
+   frac = x-integer;
+   /* K0 = 1, K1 = log(2), K2 = 3-4*log(2), K3 = 3*log(2) - 2 */
+   res.f = 0.99992522f + frac * (0.69583354f
+           + frac * (0.22606716f + 0.078024523f*frac));
+   res.i = (res.i + (integer<<23)) & 0x7fffffff;
+   return res.f;
+}
+#define celt_exp(x) celt_exp2((x)*1.44269504f)
+
+static float tansig_approx(float x)
+{
+    int i;
+    float y, dy;
+    float sign=1;
+    /* Tests are reversed to catch NaNs */
+    if (!(x<8))
+        return 1;
+    if (!(x>-8))
+        return -1;
+#ifndef FIXED_POINT
+    /* Another check in case of -ffast-math */
+    if (celt_isnan(x))
+       return 0;
+#endif
+    if (x<0)
+    {
+       x=-x;
+       sign=-1;
+    }
+    i = (int)floor(.5f+25*x);
+    x -= .04f*i;
+    y = tansig_table[i];
+    dy = 1-y*y;
+    y = y + x*dy*(1 - y*x);
+    return sign*y;
+}
+
+static OPUS_INLINE float sigmoid_approx(float x)
+{
+   return .5f + .5f*tansig_approx(.5f*x);
+}
+
+static void softmax(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N;i++)
+        y[i] = celt_exp(x[i]);
+}
+
+static void vec_tanh(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N;i++)
+    {
+        y[i] = tansig_approx(x[i]);
+    }
+}
+
+static void vec_sigmoid(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N;i++)
+    {
+        y[i] = sigmoid_approx(x[i]);
+    }
+}
+#endif
+static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+{
+   int i, j;
+   for (i=0;i<rows;i+=16)
+   {
+      for (j=0;j<cols;j++)
+      {
+         const float * restrict w;
+         float * restrict y;
+         float xj;
+         w = &weights[j*col_stride + i];
+         xj = x[j];
+         y = &out[i];
+         y[0] += w[0]*xj;
+         y[1] += w[1]*xj;
+         y[2] += w[2]*xj;
+         y[3] += w[3]*xj;
+         y[4] += w[4]*xj;
+         y[5] += w[5]*xj;
+         y[6] += w[6]*xj;
+         y[7] += w[7]*xj;
+         y[8] += w[8]*xj;
+         y[9] += w[9]*xj;
+         y[10] += w[10]*xj;
+         y[11] += w[11]*xj;
+         y[12] += w[12]*xj;
+         y[13] += w[13]*xj;
+         y[14] += w[14]*xj;
+         y[15] += w[15]*xj;
+      }
+   }
+}
+
+static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+{
+   int i, j;
+   for (i=0;i<rows;i+=16)
+   {
+      int cols;
+      cols = *idx++;
+      for (j=0;j<cols;j++)
+      {
+         float * restrict y;
+         float xj;
+         xj = x[*idx++];
+         y = &out[i];
+         y[0] += w[0]*xj;
+         y[1] += w[1]*xj;
+         y[2] += w[2]*xj;
+         y[3] += w[3]*xj;
+         y[4] += w[4]*xj;
+         y[5] += w[5]*xj;
+         y[6] += w[6]*xj;
+         y[7] += w[7]*xj;
+         y[8] += w[8]*xj;
+         y[9] += w[9]*xj;
+         y[10] += w[10]*xj;
+         y[11] += w[11]*xj;
+         y[12] += w[12]*xj;
+         y[13] += w[13]*xj;
+         y[14] += w[14]*xj;
+         y[15] += w[15]*xj;
+         w += 16;
+      }
+   }
+}
--- /dev/null
+++ b/dnn/vec_avx.h
@@ -1,0 +1,219 @@
+/* Copyright (c) 2018 Mozilla
+                 2012-2017 Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/*
+  AVX implementation of vector operations, compile with -mavx
+  AVX2/FMA implementation of vector operations, compile with -mavx2 -mfma
+*/
+
+#include <immintrin.h>
+
+#ifdef __AVX2__
+static __m256 exp8_approx(__m256 X)
+{
+   const __m256 K0 = _mm256_set1_ps(0.99992522f);
+   const __m256 K1 = _mm256_set1_ps(0.69583354f);
+   const __m256 K2 = _mm256_set1_ps(0.22606716f);
+   const __m256 K3 = _mm256_set1_ps(0.078024523f);
+   const __m256 log2_E = _mm256_set1_ps(1.44269504);
+   const __m256 max_in = _mm256_set1_ps(50.f);
+   const __m256 min_in = _mm256_set1_ps(-50.f);
+   const __m256i mask = _mm256_set1_epi32(0x7fffffff);
+   __m256 XF, Y;
+   __m256i I;
+   X = _mm256_mul_ps(X, log2_E);
+   X = _mm256_max_ps(min_in, _mm256_min_ps(max_in, X));
+   XF = _mm256_floor_ps(X);
+   I = _mm256_cvtps_epi32(XF);
+   X = _mm256_sub_ps(X, XF);
+   Y = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(K3, X, K2), X, K1), X, K0);
+   I = _mm256_slli_epi32(I, 23);
+   Y = _mm256_castsi256_ps(_mm256_and_si256(mask, _mm256_add_epi32(I, _mm256_castps_si256(Y))));
+   return Y;
+}
+#else
+#define _mm256_fmadd_ps(a,b,c) _mm256_add_ps(_mm256_mul_ps(a, b), c)
+#define _mm_fmadd_ps(a,b,c) _mm_add_ps(_mm_mul_ps(a, b), c)
+static __m128 exp4_approx(__m128 X)
+{
+   const __m128 K0 = _mm_set1_ps(0.99992522f);
+   const __m128 K1 = _mm_set1_ps(0.69583354f);
+   const __m128 K2 = _mm_set1_ps(0.22606716f);
+   const __m128 K3 = _mm_set1_ps(0.078024523f);
+   const __m128 log2_E = _mm_set1_ps(1.44269504);
+   const __m128 max_in = _mm_set1_ps(50.f);
+   const __m128 min_in = _mm_set1_ps(-50.f);
+   const __m128i mask = _mm_set1_epi32(0x7fffffff);
+   __m128 XF, Y;
+   __m128i I;
+   X = _mm_mul_ps(X, log2_E);
+   X = _mm_max_ps(min_in, _mm_min_ps(max_in, X));
+   XF = _mm_floor_ps(X);
+   I = _mm_cvtps_epi32(XF);
+   X = _mm_sub_ps(X, XF);
+   Y = _mm_fmadd_ps(_mm_fmadd_ps(_mm_fmadd_ps(K3, X, K2), X, K1), X, K0);
+   I = _mm_slli_epi32(I, 23);
+   Y = _mm_castsi128_ps(_mm_and_si128(mask, _mm_add_epi32(I, _mm_castps_si128(Y))));
+   return Y;
+}
+static __m256 exp8_approx(__m256 X)
+{
+   __m256 Y;
+   __m128 Xhi, Xlo, Yhi, Ylo;
+   Xhi = _mm256_extractf128_ps(X, 1);
+   Xlo = _mm256_extractf128_ps(X, 0);
+   Yhi = exp4_approx(Xhi);
+   Ylo = exp4_approx(Xlo);
+   Y = _mm256_insertf128_ps(_mm256_setzero_ps(), Yhi, 1);
+   Y = _mm256_insertf128_ps(Y, Ylo, 0);
+   return Y;
+}
+#endif
+
+static float celt_exp(float x)
+{
+   float out[8];
+   __m256 X, Y;
+   X = _mm256_set1_ps(x);
+   Y = exp8_approx(X);
+   _mm256_storeu_ps(out, Y);
+   return out[0];
+}
+
+static void softmax(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N-7;i+=8)
+    {
+        __m256 X, Y;
+        X = _mm256_loadu_ps(&x[i]);
+        Y = exp8_approx(X);
+        _mm256_storeu_ps(&y[i], Y);
+    }
+    for (;i<N;i++)
+        y[i] = celt_exp(x[i]);
+}
+
+static void vec_tanh(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N-7;i+=8)
+    {
+        const __m256 two = _mm256_set1_ps(2.f);
+        const __m256 one = _mm256_set1_ps(1.f);
+        __m256 X, Y;
+        X = _mm256_loadu_ps(&x[i]);
+        X = _mm256_mul_ps(X, two);
+        Y = exp8_approx(X);
+        Y = _mm256_mul_ps(_mm256_sub_ps(Y, one),  _mm256_rcp_ps(_mm256_add_ps(Y, one)));
+        _mm256_storeu_ps(&y[i], Y);
+    }
+    for (;i<N;i++)
+    {
+        float ex2;
+        ex2 = celt_exp(2*x[i]);
+        y[i] = (ex2-1)/(ex2+1);
+    }
+}
+
+static void vec_sigmoid(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N-7;i+=8)
+    {
+        const __m256 one = _mm256_set1_ps(1.f);
+        __m256 X, Y;
+        X = _mm256_loadu_ps(&x[i]);
+        Y = exp8_approx(X);
+        Y = _mm256_mul_ps(Y,  _mm256_rcp_ps(_mm256_add_ps(Y, one)));
+        _mm256_storeu_ps(&y[i], Y);
+    }
+    for (;i<N;i++)
+    {
+        float ex;
+        ex = celt_exp(x[i]);
+        y[i] = (ex)/(ex+1);
+    }
+}
+
+static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+{
+   int i, j;
+   for (i=0;i<rows;i+=16)
+   {
+      float * restrict y;
+      __m256 vy0, vy8;
+      y = &out[i];
+      vy0 = _mm256_loadu_ps(&y[0]);
+      vy8 = _mm256_loadu_ps(&y[8]);
+      for (j=0;j<cols;j++)
+      {
+         __m256 vxj;
+         __m256 vw;
+         vxj = _mm256_broadcast_ss(&x[j]);
+
+         vw = _mm256_loadu_ps(&weights[j*col_stride + i]);
+         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+         vw = _mm256_loadu_ps(&weights[j*col_stride + i + 8]);
+         vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
+      }
+      _mm256_storeu_ps (&y[0], vy0);
+      _mm256_storeu_ps (&y[8], vy8);
+   }
+}
+static void sparse_sgemv_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
+{
+   int i, j;
+   for (i=0;i<rows;i+=16)
+   {
+      float * restrict y;
+      int cols;
+      __m256 vy0, vy8;
+      y = &out[i];
+      vy0 = _mm256_loadu_ps(&y[0]);
+      vy8 = _mm256_loadu_ps(&y[8]);
+      cols = *idx++;
+      for (j=0;j<cols;j++)
+      {
+         int id;
+         __m256 vxj;
+         __m256 vw;
+         id = *idx++;
+         vxj = _mm256_broadcast_ss(&x[id]);
+
+         vw = _mm256_loadu_ps(&weights[0]);
+         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+         vw = _mm256_loadu_ps(&weights[8]);
+         vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
+         weights += 16;
+      }
+      _mm256_storeu_ps (&y[0], vy0);
+      _mm256_storeu_ps (&y[8], vy8);
+   }
+}
+
--- /dev/null
+++ b/dnn/vec_neon.h
@@ -1,0 +1,194 @@
+/* Copyright (c) 2018 David Rowe
+                 2018 Mozilla
+                 2008-2011 Octasic Inc.
+                 2012-2017 Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/* NEON support for ARM machines */
+
+#include <arm_neon.h>
+#ifndef LPCNET_TEST
+static float celt_exp2(float x)
+{
+    int integer;
+    float frac;
+    union {
+	float f;
+	opus_uint32 i;
+    } res;
+    integer = floor(x);
+    if (integer < -50)
+	return 0;
+    frac = x-integer;
+    /* K0 = 1, K1 = log(2), K2 = 3-4*log(2), K3 = 3*log(2) - 2 */
+    res.f = 0.99992522f + frac * (0.69583354f
+				  + frac * (0.22606716f + 0.078024523f*frac));
+    res.i = (res.i + (integer<<23)) & 0x7fffffff;
+    return res.f;
+}
+#define celt_exp_neon(x) celt_exp2((x)*1.44269504f)
+
+static float tansig_approx(float x)
+{
+    int i;
+    float y, dy;
+    float sign=1;
+    /* Tests are reversed to catch NaNs */
+    if (!(x<8))
+        return 1;
+    if (!(x>-8))
+        return -1;
+#ifndef FIXED_POINT
+    /* Another check in case of -ffast-math */
+    if (celt_isnan(x))
+	return 0;
+#endif
+    if (x<0)
+    {
+	x=-x;
+	sign=-1;
+    }
+    i = (int)floor(.5f+25*x);
+    x -= .04f*i;
+    y = tansig_table[i];
+    dy = 1-y*y;
+    y = y + x*dy*(1 - y*x);
+    return sign*y;
+}
+
+static OPUS_INLINE float sigmoid_approx(float x)
+{
+    return .5f + .5f*tansig_approx(.5f*x);
+}
+
+static void softmax(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N;i++)
+        y[i] = celt_exp_neon(x[i]);
+}
+
+static void vec_tanh(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N;i++)
+    {
+        y[i] = tansig_approx(x[i]);
+    }
+}
+
+static void vec_sigmoid(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N;i++)
+    {
+        y[i] = sigmoid_approx(x[i]);
+    }
+}
+#endif
+
+static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+{
+    int i, j;
+    for (i=0;i<rows;i+=16)
+    {
+	float * restrict y = &out[i];
+      
+	/* keep y[0..15] in registers for duration of inner loop */
+      
+	float32x4_t y0_3 = vld1q_f32(&y[0]);
+	float32x4_t y4_7 = vld1q_f32(&y[4]);
+	float32x4_t y8_11 = vld1q_f32(&y[8]);
+	float32x4_t y12_15 = vld1q_f32(&y[12]);
+      
+	for (j=0;j<cols;j++)
+	{
+	    const float * restrict w;
+	    float32x4_t wvec0_3, wvec4_7, wvec8_11, wvec12_15;
+	    float32x4_t xj;
+
+	    w = &weights[j*col_stride + i];
+	    wvec0_3 = vld1q_f32(&w[0]);
+	    wvec4_7 = vld1q_f32(&w[4]);
+	    wvec8_11 = vld1q_f32(&w[8]);
+	    wvec12_15 = vld1q_f32(&w[12]);
+	    
+	    xj = vld1q_dup_f32(&x[j]);
+	 
+	    y0_3 = vmlaq_f32(y0_3, wvec0_3, xj);
+	    y4_7 = vmlaq_f32(y4_7, wvec4_7, xj);
+	    y8_11 = vmlaq_f32(y8_11, wvec8_11, xj);
+	    y12_15 = vmlaq_f32(y12_15, wvec12_15, xj);
+	}
+
+	/* save y[0..15] back to memory */
+      
+	vst1q_f32(&y[0], y0_3);
+	vst1q_f32(&y[4], y4_7);
+	vst1q_f32(&y[8], y8_11);
+	vst1q_f32(&y[12], y12_15);
+      
+    }
+}
+
+static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+{
+    int i, j;
+    for (i=0;i<rows;i+=16)
+    {
+	int cols;
+	cols = *idx++;
+	float * restrict y;
+	y = &out[i];
+
+	/* keep y[0..15] in registers for duration of inner loop */
+      
+	float32x4_t y0_3 = vld1q_f32(&y[0]);
+	float32x4_t y4_7 = vld1q_f32(&y[4]);
+	float32x4_t y8_11 = vld1q_f32(&y[8]);
+	float32x4_t y12_15 = vld1q_f32(&y[12]);
+      
+	for (j=0;j<cols;j++)
+	{
+	    float xj= x[*idx++];
+	    float32x4_t wvec;
+	 
+	    wvec = vld1q_f32(&w[0]); y0_3 = vmlaq_n_f32(y0_3, wvec, xj);
+	    wvec = vld1q_f32(&w[4]); y4_7 = vmlaq_n_f32(y4_7, wvec, xj);
+	    wvec = vld1q_f32(&w[8]); y8_11 = vmlaq_n_f32(y8_11, wvec, xj);
+	    wvec = vld1q_f32(&w[12]); y12_15 = vmlaq_n_f32(y12_15, wvec, xj);
+	 
+	    w += 16;
+	}
+
+	/* save y[0..15] back to memory */
+      
+	vst1q_f32(&y[0], y0_3);
+	vst1q_f32(&y[4], y4_7);
+	vst1q_f32(&y[8], y8_11);
+	vst1q_f32(&y[12], y12_15);
+      
+    }
+}
--
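
For reference, a minimal driver for the sparse kernel, assuming one of the vec headers above has been included. The layout follows the loops in sparse_sgemv_accum16: for each block of 16 output rows, idx holds a count followed by that many column indices, and w supplies 16 weights per listed column, advancing 16 floats each time a column is consumed. This example (and the name sparse_sgemv_accum16_example) is illustrative only, not part of the patch.

   #include <string.h>

   /* Exercises sparse_sgemv_accum16 on a single 16-row block
    * that depends on columns 0 and 3 of x. */
   static void sparse_sgemv_accum16_example(void)
   {
      float out[16];
      float x[4] = {1.f, 2.f, 3.f, 4.f};
      float w[2*16];             /* 16 weights per listed column */
      int idx[3] = {2, 0, 3};    /* count, then the column indices */
      memset(out, 0, sizeof(out));
      memset(w, 0, sizeof(w));   /* real weights would be loaded here */
      sparse_sgemv_accum16(out, w, 16, idx, x);
      /* now out[i] == w[i]*x[0] + w[16+i]*x[3] for i = 0..15 */
   }
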