shithub: opus


ref: 7dc696b9a4601b995099c029ef59930292612208
parent: bc108e9aa69bb2f17048c0f9a456f051d88fb6c1
author: David Rowe <david@rowetel.com>
date: Mon Dec 10 16:28:29 EST 2018

refactored for different machines, sgemv_accum16 using NEON intrinsics

Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
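
The patch moves the vector primitives out of nnet.c into per-architecture headers: vec_avx.h when __AVX__ is defined, vec_neon.h when __ARM_NEON__ is defined, and a plain-C vec.h fallback otherwise. All three headers provide the same sgemv_accum16() contract, sketched below as a scalar reference. This sketch is illustrative only and not part of the patch; the name sgemv_accum16_ref is made up here, and the column-major layout assumption is taken from the fallback loop in vec.h.

   /* Scalar reference for what the vectorized kernels compute:
    * out[i] += sum over j of weights[j*col_stride + i] * x[j],
    * with weights stored column-major and rows a multiple of 16. */
   static void sgemv_accum16_ref(float *out, const float *weights,
                                 int rows, int cols, int col_stride,
                                 const float *x)
   {
      int i, j;
      for (j=0;j<cols;j++)
      {
         for (i=0;i<rows;i++)
            out[i] += weights[j*col_stride + i]*x[j];
      }
   }
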

--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -42,350 +42,13 @@
 #define SOFTMAX_HACK
 
 #ifdef __AVX__
-#include <immintrin.h>
-
-
-#ifdef __AVX2__
-static __m256 exp8_approx(__m256 X)
-{
-   const __m256 K0 = _mm256_set1_ps(0.99992522f);
-   const __m256 K1 = _mm256_set1_ps(0.69583354f);
-   const __m256 K2 = _mm256_set1_ps(0.22606716f);
-   const __m256 K3 = _mm256_set1_ps(0.078024523f);
-   const __m256 log2_E = _mm256_set1_ps(1.44269504);
-   const __m256 max_in = _mm256_set1_ps(50.f);
-   const __m256 min_in = _mm256_set1_ps(-50.f);
-   const __m256i mask = _mm256_set1_epi32(0x7fffffff);
-   __m256 XF, Y;
-   __m256i I;
-   X = _mm256_mul_ps(X, log2_E);
-   X = _mm256_max_ps(min_in, _mm256_min_ps(max_in, X));
-   XF = _mm256_floor_ps(X);
-   I = _mm256_cvtps_epi32(XF);
-   X = _mm256_sub_ps(X, XF);
-   Y = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(K3, X, K2), X, K1), X, K0);
-   I = _mm256_slli_epi32(I, 23);
-   Y = _mm256_castsi256_ps(_mm256_and_si256(mask, _mm256_add_epi32(I, _mm256_castps_si256(Y))));
-   return Y;
-}
+#include "vec_avx.h"
+#elif __ARM_NEON__
+#include "vec_neon.h"
 #else
-#define _mm256_fmadd_ps(a,b,c) _mm256_add_ps(_mm256_mul_ps(a, b), c)
-#define _mm_fmadd_ps(a,b,c) _mm_add_ps(_mm_mul_ps(a, b), c)
-static __m128 exp4_approx(__m128 X)
-{
-   const __m128 K0 = _mm_set1_ps(0.99992522f);
-   const __m128 K1 = _mm_set1_ps(0.69583354f);
-   const __m128 K2 = _mm_set1_ps(0.22606716f);
-   const __m128 K3 = _mm_set1_ps(0.078024523f);
-   const __m128 log2_E = _mm_set1_ps(1.44269504);
-   const __m128 max_in = _mm_set1_ps(50.f);
-   const __m128 min_in = _mm_set1_ps(-50.f);
-   const __m128i mask = _mm_set1_epi32(0x7fffffff);
-   __m128 XF, Y;
-   __m128i I;
-   X = _mm_mul_ps(X, log2_E);
-   X = _mm_max_ps(min_in, _mm_min_ps(max_in, X));
-   XF = _mm_floor_ps(X);
-   I = _mm_cvtps_epi32(XF);
-   X = _mm_sub_ps(X, XF);
-   Y = _mm_fmadd_ps(_mm_fmadd_ps(_mm_fmadd_ps(K3, X, K2), X, K1), X, K0);
-   I = _mm_slli_epi32(I, 23);
-   Y = _mm_castsi128_ps(_mm_and_si128(mask, _mm_add_epi32(I, _mm_castps_si128(Y))));
-   return Y;
-}
-static __m256 exp8_approx(__m256 X)
-{
-   __m256 Y;
-   __m128 Xhi, Xlo, Yhi, Ylo;
-   Xhi = _mm256_extractf128_ps(X, 1);
-   Xlo = _mm256_extractf128_ps(X, 0);
-   Yhi = exp4_approx(Xhi);
-   Ylo = exp4_approx(Xlo);
-   Y = _mm256_insertf128_ps(_mm256_setzero_ps(), Yhi, 1);
-   Y = _mm256_insertf128_ps(Y, Ylo, 0);
-   return Y;
-}
-#endif
-
-static float celt_exp(float x)
-{
-   float out[8];
-   __m256 X, Y;
-   X = _mm256_set1_ps(x);
-   Y = exp8_approx(X);
-   _mm256_storeu_ps(out, Y);
-   return out[0];
-}
-
-static void softmax(float *y, const float *x, int N)
-{
-    int i;
-    for (i=0;i<N-7;i+=8)
-    {
-        __m256 X, Y;
-        X = _mm256_loadu_ps(&x[i]);
-        Y = exp8_approx(X);
-        _mm256_storeu_ps(&y[i], Y);
-    }
-    for (;i<N;i++)
-        y[i] = celt_exp(x[i]);
-}
-
-static void vec_tanh(float *y, const float *x, int N)
-{
-    int i;
-    for (i=0;i<N-7;i+=8)
-    {
-        const __m256 two = _mm256_set1_ps(2.f);
-        const __m256 one = _mm256_set1_ps(1.f);
-        __m256 X, Y;
-        X = _mm256_loadu_ps(&x[i]);
-        X = _mm256_mul_ps(X, two);
-        Y = exp8_approx(X);
-        Y = _mm256_mul_ps(_mm256_sub_ps(Y, one),  _mm256_rcp_ps(_mm256_add_ps(Y, one)));
-        _mm256_storeu_ps(&y[i], Y);
-    }
-    for (;i<N;i++)
-    {
-        float ex2;
-        ex2 = celt_exp(2*x[i]);
-        y[i] = (ex2-1)/(ex2+1);
-    }
-}
-
-static void vec_sigmoid(float *y, const float *x, int N)
-{
-    int i;
-    for (i=0;i<N-7;i+=8)
-    {
-        const __m256 one = _mm256_set1_ps(1.f);
-        __m256 X, Y;
-        X = _mm256_loadu_ps(&x[i]);
-        Y = exp8_approx(X);
-        Y = _mm256_mul_ps(Y,  _mm256_rcp_ps(_mm256_add_ps(Y, one)));
-        _mm256_storeu_ps(&y[i], Y);
-    }
-    for (;i<N;i++)
-    {
-        float ex;
-        ex = celt_exp(x[i]);
-        y[i] = (ex)/(ex+1);
-    }
-}
-
-static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
-{
-   int i, j;
-   for (i=0;i<rows;i+=16)
-   {
-      float * restrict y;
-      __m256 vy0, vy8;
-      y = &out[i];
-      vy0 = _mm256_loadu_ps(&y[0]);
-      vy8 = _mm256_loadu_ps(&y[8]);
-      for (j=0;j<cols;j++)
-      {
-         __m256 vxj;
-         __m256 vw;
-         vxj = _mm256_broadcast_ss(&x[j]);
-
-         vw = _mm256_loadu_ps(&weights[j*col_stride + i]);
-         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
-
-         vw = _mm256_loadu_ps(&weights[j*col_stride + i + 8]);
-         vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
-      }
-      _mm256_storeu_ps (&y[0], vy0);
-      _mm256_storeu_ps (&y[8], vy8);
-   }
-}
-static void sparse_sgemv_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
-{
-   int i, j;
-   for (i=0;i<rows;i+=16)
-   {
-      float * restrict y;
-      int cols;
-      __m256 vy0, vy8;
-      y = &out[i];
-      vy0 = _mm256_loadu_ps(&y[0]);
-      vy8 = _mm256_loadu_ps(&y[8]);
-      cols = *idx++;
-      for (j=0;j<cols;j++)
-      {
-         int id;
-         __m256 vxj;
-         __m256 vw;
-         id = *idx++;
-         vxj = _mm256_broadcast_ss(&x[id]);
-
-         vw = _mm256_loadu_ps(&weights[0]);
-         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
-
-         vw = _mm256_loadu_ps(&weights[8]);
-         vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
-         weights += 16;
-      }
-      _mm256_storeu_ps (&y[0], vy0);
-      _mm256_storeu_ps (&y[8], vy8);
-   }
-}
-
-
-#else /* No AVX2/FMA support */
-
-
 #warning Compiling without any vectorization. This code will be very slow
-#warning Try adding -mavx2 -mfma
-
-
-static float celt_exp2(float x)
-{
-   int integer;
-   float frac;
-   union {
-      float f;
-      opus_uint32 i;
-   } res;
-   integer = floor(x);
-   if (integer < -50)
-      return 0;
-   frac = x-integer;
-   /* K0 = 1, K1 = log(2), K2 = 3-4*log(2), K3 = 3*log(2) - 2 */
-   res.f = 0.99992522f + frac * (0.69583354f
-           + frac * (0.22606716f + 0.078024523f*frac));
-   res.i = (res.i + (integer<<23)) & 0x7fffffff;
-   return res.f;
-}
-#define celt_exp(x) celt_exp2((x)*1.44269504f)
-
-static float tansig_approx(float x)
-{
-    int i;
-    float y, dy;
-    float sign=1;
-    /* Tests are reversed to catch NaNs */
-    if (!(x<8))
-        return 1;
-    if (!(x>-8))
-        return -1;
-#ifndef FIXED_POINT
-    /* Another check in case of -ffast-math */
-    if (celt_isnan(x))
-       return 0;
+#include "vec.h"
 #endif
-    if (x<0)
-    {
-       x=-x;
-       sign=-1;
-    }
-    i = (int)floor(.5f+25*x);
-    x -= .04f*i;
-    y = tansig_table[i];
-    dy = 1-y*y;
-    y = y + x*dy*(1 - y*x);
-    return sign*y;
-}
-
-static OPUS_INLINE float sigmoid_approx(float x)
-{
-   return .5f + .5f*tansig_approx(.5f*x);
-}
-
-static void softmax(float *y, const float *x, int N)
-{
-    int i;
-    for (i=0;i<N;i++)
-        y[i] = celt_exp(x[i]);
-}
-
-static void vec_tanh(float *y, const float *x, int N)
-{
-    int i;
-    for (i=0;i<N;i++)
-    {
-        y[i] = tansig_approx(x[i]);
-    }
-}
-
-static void vec_sigmoid(float *y, const float *x, int N)
-{
-    int i;
-    for (i=0;i<N;i++)
-    {
-        y[i] = sigmoid_approx(x[i]);
-    }
-}
-
-static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
-{
-   int i, j;
-   for (i=0;i<rows;i+=16)
-   {
-      for (j=0;j<cols;j++)
-      {
-         const float * restrict w;
-         float * restrict y;
-         float xj;
-         w = &weights[j*col_stride + i];
-         xj = x[j];
-         y = &out[i];
-         y[0] += w[0]*xj;
-         y[1] += w[1]*xj;
-         y[2] += w[2]*xj;
-         y[3] += w[3]*xj;
-         y[4] += w[4]*xj;
-         y[5] += w[5]*xj;
-         y[6] += w[6]*xj;
-         y[7] += w[7]*xj;
-         y[8] += w[8]*xj;
-         y[9] += w[9]*xj;
-         y[10] += w[10]*xj;
-         y[11] += w[11]*xj;
-         y[12] += w[12]*xj;
-         y[13] += w[13]*xj;
-         y[14] += w[14]*xj;
-         y[15] += w[15]*xj;
-      }
-   }
-}
-
-static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
-{
-   int i, j;
-   for (i=0;i<rows;i+=16)
-   {
-      int cols;
-      cols = *idx++;
-      for (j=0;j<cols;j++)
-      {
-         float * restrict y;
-         float xj;
-         xj = x[*idx++];
-         y = &out[i];
-         y[0] += w[0]*xj;
-         y[1] += w[1]*xj;
-         y[2] += w[2]*xj;
-         y[3] += w[3]*xj;
-         y[4] += w[4]*xj;
-         y[5] += w[5]*xj;
-         y[6] += w[6]*xj;
-         y[7] += w[7]*xj;
-         y[8] += w[8]*xj;
-         y[9] += w[9]*xj;
-         y[10] += w[10]*xj;
-         y[11] += w[11]*xj;
-         y[12] += w[12]*xj;
-         y[13] += w[13]*xj;
-         y[14] += w[14]*xj;
-         y[15] += w[15]*xj;
-         w += 16;
-      }
-   }
-}
-#endif
-
-
 
 static OPUS_INLINE float relu(float x)
 {
--- /dev/null
+++ b/dnn/vec.h
@@ -1,0 +1,173 @@
+/* Copyright (c) 2018 Mozilla
+                 2008-2011 Octasic Inc.
+                 2012-2017 Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/* No AVX2/FMA support */
+#ifndef LPCNET_TEST
+static float celt_exp2(float x)
+{
+   int integer;
+   float frac;
+   union {
+      float f;
+      opus_uint32 i;
+   } res;
+   integer = floor(x);
+   if (integer < -50)
+      return 0;
+   frac = x-integer;
+   /* K0 = 1, K1 = log(2), K2 = 3-4*log(2), K3 = 3*log(2) - 2 */
+   res.f = 0.99992522f + frac * (0.69583354f
+           + frac * (0.22606716f + 0.078024523f*frac));
+   res.i = (res.i + (integer<<23)) & 0x7fffffff;
+   return res.f;
+}
+#define celt_exp(x) celt_exp2((x)*1.44269504f)
+
+static float tansig_approx(float x)
+{
+    int i;
+    float y, dy;
+    float sign=1;
+    /* Tests are reversed to catch NaNs */
+    if (!(x<8))
+        return 1;
+    if (!(x>-8))
+        return -1;
+#ifndef FIXED_POINT
+    /* Another check in case of -ffast-math */
+    if (celt_isnan(x))
+       return 0;
+#endif
+    if (x<0)
+    {
+       x=-x;
+       sign=-1;
+    }
+    i = (int)floor(.5f+25*x);
+    x -= .04f*i;
+    y = tansig_table[i];
+    dy = 1-y*y;
+    y = y + x*dy*(1 - y*x);
+    return sign*y;
+}
+
+static OPUS_INLINE float sigmoid_approx(float x)
+{
+   return .5f + .5f*tansig_approx(.5f*x);
+}
+
+static void softmax(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N;i++)
+        y[i] = celt_exp(x[i]);
+}
+
+static void vec_tanh(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N;i++)
+    {
+        y[i] = tansig_approx(x[i]);
+    }
+}
+
+static void vec_sigmoid(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N;i++)
+    {
+        y[i] = sigmoid_approx(x[i]);
+    }
+}
+#endif
+static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+{
+   int i, j;
+   for (i=0;i<rows;i+=16)
+   {
+      for (j=0;j<cols;j++)
+      {
+         const float * restrict w;
+         float * restrict y;
+         float xj;
+         w = &weights[j*col_stride + i];
+         xj = x[j];
+         y = &out[i];
+         y[0] += w[0]*xj;
+         y[1] += w[1]*xj;
+         y[2] += w[2]*xj;
+         y[3] += w[3]*xj;
+         y[4] += w[4]*xj;
+         y[5] += w[5]*xj;
+         y[6] += w[6]*xj;
+         y[7] += w[7]*xj;
+         y[8] += w[8]*xj;
+         y[9] += w[9]*xj;
+         y[10] += w[10]*xj;
+         y[11] += w[11]*xj;
+         y[12] += w[12]*xj;
+         y[13] += w[13]*xj;
+         y[14] += w[14]*xj;
+         y[15] += w[15]*xj;
+      }
+   }
+}
+
+static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+{
+   int i, j;
+   for (i=0;i<rows;i+=16)
+   {
+      int cols;
+      cols = *idx++;
+      for (j=0;j<cols;j++)
+      {
+         float * restrict y;
+         float xj;
+         xj = x[*idx++];
+         y = &out[i];
+         y[0] += w[0]*xj;
+         y[1] += w[1]*xj;
+         y[2] += w[2]*xj;
+         y[3] += w[3]*xj;
+         y[4] += w[4]*xj;
+         y[5] += w[5]*xj;
+         y[6] += w[6]*xj;
+         y[7] += w[7]*xj;
+         y[8] += w[8]*xj;
+         y[9] += w[9]*xj;
+         y[10] += w[10]*xj;
+         y[11] += w[11]*xj;
+         y[12] += w[12]*xj;
+         y[13] += w[13]*xj;
+         y[14] += w[14]*xj;
+         y[15] += w[15]*xj;
+         w += 16;
+      }
+   }
+}
--- /dev/null
+++ b/dnn/vec_avx.h
@@ -1,0 +1,219 @@
+/* Copyright (c) 2018 Mozilla
+                 2012-2017 Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/*
+  AVX implementation of vector operations, compile with -mavx
+  AVX2/FMA implementation of vector operations, compile with -mavx2 -mfma
+*/
+
+#include <immintrin.h>
+
+#ifdef __AVX2__
+static __m256 exp8_approx(__m256 X)
+{
+   const __m256 K0 = _mm256_set1_ps(0.99992522f);
+   const __m256 K1 = _mm256_set1_ps(0.69583354f);
+   const __m256 K2 = _mm256_set1_ps(0.22606716f);
+   const __m256 K3 = _mm256_set1_ps(0.078024523f);
+   const __m256 log2_E = _mm256_set1_ps(1.44269504);
+   const __m256 max_in = _mm256_set1_ps(50.f);
+   const __m256 min_in = _mm256_set1_ps(-50.f);
+   const __m256i mask = _mm256_set1_epi32(0x7fffffff);
+   __m256 XF, Y;
+   __m256i I;
+   X = _mm256_mul_ps(X, log2_E);
+   X = _mm256_max_ps(min_in, _mm256_min_ps(max_in, X));
+   XF = _mm256_floor_ps(X);
+   I = _mm256_cvtps_epi32(XF);
+   X = _mm256_sub_ps(X, XF);
+   Y = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(K3, X, K2), X, K1), X, K0);
+   I = _mm256_slli_epi32(I, 23);
+   Y = _mm256_castsi256_ps(_mm256_and_si256(mask, _mm256_add_epi32(I, _mm256_castps_si256(Y))));
+   return Y;
+}
+#else
+#define _mm256_fmadd_ps(a,b,c) _mm256_add_ps(_mm256_mul_ps(a, b), c)
+#define _mm_fmadd_ps(a,b,c) _mm_add_ps(_mm_mul_ps(a, b), c)
+static __m128 exp4_approx(__m128 X)
+{
+   const __m128 K0 = _mm_set1_ps(0.99992522f);
+   const __m128 K1 = _mm_set1_ps(0.69583354f);
+   const __m128 K2 = _mm_set1_ps(0.22606716f);
+   const __m128 K3 = _mm_set1_ps(0.078024523f);
+   const __m128 log2_E = _mm_set1_ps(1.44269504);
+   const __m128 max_in = _mm_set1_ps(50.f);
+   const __m128 min_in = _mm_set1_ps(-50.f);
+   const __m128i mask = _mm_set1_epi32(0x7fffffff);
+   __m128 XF, Y;
+   __m128i I;
+   X = _mm_mul_ps(X, log2_E);
+   X = _mm_max_ps(min_in, _mm_min_ps(max_in, X));
+   XF = _mm_floor_ps(X);
+   I = _mm_cvtps_epi32(XF);
+   X = _mm_sub_ps(X, XF);
+   Y = _mm_fmadd_ps(_mm_fmadd_ps(_mm_fmadd_ps(K3, X, K2), X, K1), X, K0);
+   I = _mm_slli_epi32(I, 23);
+   Y = _mm_castsi128_ps(_mm_and_si128(mask, _mm_add_epi32(I, _mm_castps_si128(Y))));
+   return Y;
+}
+static __m256 exp8_approx(__m256 X)
+{
+   __m256 Y;
+   __m128 Xhi, Xlo, Yhi, Ylo;
+   Xhi = _mm256_extractf128_ps(X, 1);
+   Xlo = _mm256_extractf128_ps(X, 0);
+   Yhi = exp4_approx(Xhi);
+   Ylo = exp4_approx(Xlo);
+   Y = _mm256_insertf128_ps(_mm256_setzero_ps(), Yhi, 1);
+   Y = _mm256_insertf128_ps(Y, Ylo, 0);
+   return Y;
+}
+#endif
+
+static float celt_exp(float x)
+{
+   float out[8];
+   __m256 X, Y;
+   X = _mm256_set1_ps(x);
+   Y = exp8_approx(X);
+   _mm256_storeu_ps(out, Y);
+   return out[0];
+}
+
+static void softmax(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N-7;i+=8)
+    {
+        __m256 X, Y;
+        X = _mm256_loadu_ps(&x[i]);
+        Y = exp8_approx(X);
+        _mm256_storeu_ps(&y[i], Y);
+    }
+    for (;i<N;i++)
+        y[i] = celt_exp(x[i]);
+}
+
+static void vec_tanh(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N-7;i+=8)
+    {
+        const __m256 two = _mm256_set1_ps(2.f);
+        const __m256 one = _mm256_set1_ps(1.f);
+        __m256 X, Y;
+        X = _mm256_loadu_ps(&x[i]);
+        X = _mm256_mul_ps(X, two);
+        Y = exp8_approx(X);
+        Y = _mm256_mul_ps(_mm256_sub_ps(Y, one),  _mm256_rcp_ps(_mm256_add_ps(Y, one)));
+        _mm256_storeu_ps(&y[i], Y);
+    }
+    for (;i<N;i++)
+    {
+        float ex2;
+        ex2 = celt_exp(2*x[i]);
+        y[i] = (ex2-1)/(ex2+1);
+    }
+}
+
+static void vec_sigmoid(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N-7;i+=8)
+    {
+        const __m256 one = _mm256_set1_ps(1.f);
+        __m256 X, Y;
+        X = _mm256_loadu_ps(&x[i]);
+        Y = exp8_approx(X);
+        Y = _mm256_mul_ps(Y,  _mm256_rcp_ps(_mm256_add_ps(Y, one)));
+        _mm256_storeu_ps(&y[i], Y);
+    }
+    for (;i<N;i++)
+    {
+        float ex;
+        ex = celt_exp(x[i]);
+        y[i] = (ex)/(ex+1);
+    }
+}
+
+static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+{
+   int i, j;
+   for (i=0;i<rows;i+=16)
+   {
+      float * restrict y;
+      __m256 vy0, vy8;
+      y = &out[i];
+      vy0 = _mm256_loadu_ps(&y[0]);
+      vy8 = _mm256_loadu_ps(&y[8]);
+      for (j=0;j<cols;j++)
+      {
+         __m256 vxj;
+         __m256 vw;
+         vxj = _mm256_broadcast_ss(&x[j]);
+
+         vw = _mm256_loadu_ps(&weights[j*col_stride + i]);
+         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+         vw = _mm256_loadu_ps(&weights[j*col_stride + i + 8]);
+         vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
+      }
+      _mm256_storeu_ps (&y[0], vy0);
+      _mm256_storeu_ps (&y[8], vy8);
+   }
+}
+static void sparse_sgemv_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
+{
+   int i, j;
+   for (i=0;i<rows;i+=16)
+   {
+      float * restrict y;
+      int cols;
+      __m256 vy0, vy8;
+      y = &out[i];
+      vy0 = _mm256_loadu_ps(&y[0]);
+      vy8 = _mm256_loadu_ps(&y[8]);
+      cols = *idx++;
+      for (j=0;j<cols;j++)
+      {
+         int id;
+         __m256 vxj;
+         __m256 vw;
+         id = *idx++;
+         vxj = _mm256_broadcast_ss(&x[id]);
+
+         vw = _mm256_loadu_ps(&weights[0]);
+         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+         vw = _mm256_loadu_ps(&weights[8]);
+         vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
+         weights += 16;
+      }
+      _mm256_storeu_ps (&y[0], vy0);
+      _mm256_storeu_ps (&y[8], vy8);
+   }
+}
+
--- /dev/null
+++ b/dnn/vec_neon.h
@@ -1,0 +1,194 @@
+/* Copyright (c) 2018 David Rowe
+                 2018 Mozilla
+                 2008-2011 Octasic Inc.
+                 2012-2017 Jean-Marc Valin */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/* NEON support for ARM machines */
+
+#include <arm_neon.h>
+#ifndef LPCNET_TEST
+static float celt_exp2(float x)
+{
+    int integer;
+    float frac;
+    union {
+	float f;
+	opus_uint32 i;
+    } res;
+    integer = floor(x);
+    if (integer < -50)
+	return 0;
+    frac = x-integer;
+    /* K0 = 1, K1 = log(2), K2 = 3-4*log(2), K3 = 3*log(2) - 2 */
+    res.f = 0.99992522f + frac * (0.69583354f
+				  + frac * (0.22606716f + 0.078024523f*frac));
+    res.i = (res.i + (integer<<23)) & 0x7fffffff;
+    return res.f;
+}
+#define celt_exp_neon(x) celt_exp2((x)*1.44269504f)
+
+static float tansig_approx(float x)
+{
+    int i;
+    float y, dy;
+    float sign=1;
+    /* Tests are reversed to catch NaNs */
+    if (!(x<8))
+        return 1;
+    if (!(x>-8))
+        return -1;
+#ifndef FIXED_POINT
+    /* Another check in case of -ffast-math */
+    if (celt_isnan(x))
+	return 0;
+#endif
+    if (x<0)
+    {
+	x=-x;
+	sign=-1;
+    }
+    i = (int)floor(.5f+25*x);
+    x -= .04f*i;
+    y = tansig_table[i];
+    dy = 1-y*y;
+    y = y + x*dy*(1 - y*x);
+    return sign*y;
+}
+
+static OPUS_INLINE float sigmoid_approx(float x)
+{
+    return .5f + .5f*tansig_approx(.5f*x);
+}
+
+static void softmax(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N;i++)
+        y[i] = celt_exp_neon(x[i]);
+}
+
+static void vec_tanh(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N;i++)
+    {
+        y[i] = tansig_approx(x[i]);
+    }
+}
+
+static void vec_sigmoid(float *y, const float *x, int N)
+{
+    int i;
+    for (i=0;i<N;i++)
+    {
+        y[i] = sigmoid_approx(x[i]);
+    }
+}
+#endif
+
+static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+{
+    int i, j;
+    for (i=0;i<rows;i+=16)
+    {
+	float * restrict y = &out[i];
+      
+	/* keep y[0..15] in registers for duration of inner loop */
+      
+	float32x4_t y0_3 = vld1q_f32(&y[0]);
+	float32x4_t y4_7 = vld1q_f32(&y[4]);
+	float32x4_t y8_11 = vld1q_f32(&y[8]);
+	float32x4_t y12_15 = vld1q_f32(&y[12]);
+      
+	for (j=0;j<cols;j++)
+	{
+	    const float * restrict w;
+	    float32x4_t wvec0_3, wvec4_7, wvec8_11, wvec12_15;
+	    float32x4_t xj;
+
+	    w = &weights[j*col_stride + i];
+	    wvec0_3 = vld1q_f32(&w[0]);
+	    wvec4_7 = vld1q_f32(&w[4]);
+	    wvec8_11 = vld1q_f32(&w[8]);
+	    wvec12_15 = vld1q_f32(&w[12]);
+	    
+	    xj = vld1q_dup_f32(&x[j]);
+	 
+	    y0_3 = vmlaq_f32(y0_3, wvec0_3, xj);
+	    y4_7 = vmlaq_f32(y4_7, wvec4_7, xj);
+	    y8_11 = vmlaq_f32(y8_11, wvec8_11, xj);
+	    y12_15 = vmlaq_f32(y12_15, wvec12_15, xj);
+	}
+
+	/* save y[0..15] back to memory */
+      
+	vst1q_f32(&y[0], y0_3);
+	vst1q_f32(&y[4], y4_7);
+	vst1q_f32(&y[8], y8_11);
+	vst1q_f32(&y[12], y12_15);
+      
+    }
+}
+
+static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+{
+    int i, j;
+    for (i=0;i<rows;i+=16)
+    {
+	int cols;
+	cols = *idx++;
+	float * restrict y;
+	y = &out[i];
+
+	/* keep y[0..15] in registers for duration of inner loop */
+      
+	float32x4_t y0_3 = vld1q_f32(&y[0]);
+	float32x4_t y4_7 = vld1q_f32(&y[4]);
+	float32x4_t y8_11 = vld1q_f32(&y[8]);
+	float32x4_t y12_15 = vld1q_f32(&y[12]);
+      
+	for (j=0;j<cols;j++)
+	{
+	    float xj= x[*idx++];
+	    float32x4_t wvec;
+	 
+	    wvec = vld1q_f32(&w[0]); y0_3 = vmlaq_n_f32(y0_3, wvec, xj);
+	    wvec = vld1q_f32(&w[4]); y4_7 = vmlaq_n_f32(y4_7, wvec, xj);
+	    wvec = vld1q_f32(&w[8]); y8_11 = vmlaq_n_f32(y8_11, wvec, xj);
+	    wvec = vld1q_f32(&w[12]); y12_15 = vmlaq_n_f32(y12_15, wvec, xj);
+	 
+	    w += 16;
+	}
+
+	/* save y[0..15] back to memory */
+      
+	vst1q_f32(&y[0], y0_3);
+	vst1q_f32(&y[4], y4_7);
+	vst1q_f32(&y[8], y8_11);
+	vst1q_f32(&y[12], y12_15);
+      
+    }
+}
--
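
For reference, a minimal driver for the sparse kernel, assuming one of the vec headers above has been included. The layout follows the loops in sparse_sgemv_accum16: for each block of 16 output rows, idx holds a count followed by that many column indices, and w supplies 16 weights per listed column, advancing 16 floats each time a column is consumed. This example (and the name sparse_sgemv_accum16_example) is illustrative only, not part of the patch.

   #include <string.h>

   /* Exercises sparse_sgemv_accum16 on a single 16-row block
    * that depends on columns 0 and 3 of x. */
   static void sparse_sgemv_accum16_example(void)
   {
      float out[16];
      float x[4] = {1.f, 2.f, 3.f, 4.f};
      float w[2*16];             /* 16 weights per listed column */
      int idx[3] = {2, 0, 3};    /* count, then the column indices */
      memset(out, 0, sizeof(out));
      memset(w, 0, sizeof(w));   /* real weights would be loaded here */
      sparse_sgemv_accum16(out, w, 16, idx, x);
      /* now out[i] == w[i]*x[0] + w[16+i]*x[3] for i = 0..15 */
   }
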