ref: 7dc696b9a4601b995099c029ef59930292612208
parent: bc108e9aa69bb2f17048c0f9a456f051d88fb6c1
author: David Rowe <david@rowetel.com>
date: Mon Dec 10 16:28:29 EST 2018
refactored for different machines, sgemv_accum16 using NEON intrinsics

Signed-off-by: Jean-Marc Valin <jmvalin@jmvalin.ca>
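The vector kernels move out of nnet.c into per-machine headers, and nnet.c
now selects one at compile time. A minimal sketch of that dispatch, using
only names that appear in the diff below (callers in nnet.c are unchanged):

    /* dnn/nnet.c: pick a vector back end at build time */
    #ifdef __AVX__
    #include "vec_avx.h"   /* AVX or AVX2+FMA kernels, built with -mavx or -mavx2 -mfma */
    #elif __ARM_NEON__
    #include "vec_neon.h"  /* NEON kernels built on <arm_neon.h> */
    #else
    #warning Compiling without any vectorization. This code will be very slow
    #include "vec.h"       /* plain C fallback */
    #endif

Each header provides the same static helpers (softmax, vec_tanh, vec_sigmoid,
sgemv_accum16, sparse_sgemv_accum16). The NEON sgemv_accum16 holds the 16
accumulators in four float32x4_t registers across the inner loop and updates
them with vmlaq_f32() (vmlaq_n_f32() in the sparse variant).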
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -42,350 +42,13 @@
#define SOFTMAX_HACK
#ifdef __AVX__
-#include <immintrin.h>
-
-
-#ifdef __AVX2__
-static __m256 exp8_approx(__m256 X)
-{
- const __m256 K0 = _mm256_set1_ps(0.99992522f);
- const __m256 K1 = _mm256_set1_ps(0.69583354f);
- const __m256 K2 = _mm256_set1_ps(0.22606716f);
- const __m256 K3 = _mm256_set1_ps(0.078024523f);
- const __m256 log2_E = _mm256_set1_ps(1.44269504);
- const __m256 max_in = _mm256_set1_ps(50.f);
- const __m256 min_in = _mm256_set1_ps(-50.f);
- const __m256i mask = _mm256_set1_epi32(0x7fffffff);
- __m256 XF, Y;
- __m256i I;
- X = _mm256_mul_ps(X, log2_E);
- X = _mm256_max_ps(min_in, _mm256_min_ps(max_in, X));
- XF = _mm256_floor_ps(X);
- I = _mm256_cvtps_epi32(XF);
- X = _mm256_sub_ps(X, XF);
- Y = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(K3, X, K2), X, K1), X, K0);
- I = _mm256_slli_epi32(I, 23);
- Y = _mm256_castsi256_ps(_mm256_and_si256(mask, _mm256_add_epi32(I, _mm256_castps_si256(Y))));
- return Y;
-}
+#include "vec_avx.h"
+#elif __ARM_NEON__
+#include "vec_neon.h"
#else
-#define _mm256_fmadd_ps(a,b,c) _mm256_add_ps(_mm256_mul_ps(a, b), c)
-#define _mm_fmadd_ps(a,b,c) _mm_add_ps(_mm_mul_ps(a, b), c)
-static __m128 exp4_approx(__m128 X)
-{
- const __m128 K0 = _mm_set1_ps(0.99992522f);
- const __m128 K1 = _mm_set1_ps(0.69583354f);
- const __m128 K2 = _mm_set1_ps(0.22606716f);
- const __m128 K3 = _mm_set1_ps(0.078024523f);
- const __m128 log2_E = _mm_set1_ps(1.44269504);
- const __m128 max_in = _mm_set1_ps(50.f);
- const __m128 min_in = _mm_set1_ps(-50.f);
- const __m128i mask = _mm_set1_epi32(0x7fffffff);
- __m128 XF, Y;
- __m128i I;
- X = _mm_mul_ps(X, log2_E);
- X = _mm_max_ps(min_in, _mm_min_ps(max_in, X));
- XF = _mm_floor_ps(X);
- I = _mm_cvtps_epi32(XF);
- X = _mm_sub_ps(X, XF);
- Y = _mm_fmadd_ps(_mm_fmadd_ps(_mm_fmadd_ps(K3, X, K2), X, K1), X, K0);
- I = _mm_slli_epi32(I, 23);
- Y = _mm_castsi128_ps(_mm_and_si128(mask, _mm_add_epi32(I, _mm_castps_si128(Y))));
- return Y;
-}
-static __m256 exp8_approx(__m256 X)
-{
- __m256 Y;
- __m128 Xhi, Xlo, Yhi, Ylo;
- Xhi = _mm256_extractf128_ps(X, 1);
- Xlo = _mm256_extractf128_ps(X, 0);
- Yhi = exp4_approx(Xhi);
- Ylo = exp4_approx(Xlo);
- Y = _mm256_insertf128_ps(_mm256_setzero_ps(), Yhi, 1);
- Y = _mm256_insertf128_ps(Y, Ylo, 0);
- return Y;
-}
-#endif
-
-static float celt_exp(float x)
-{
- float out[8];
- __m256 X, Y;
- X = _mm256_set1_ps(x);
- Y = exp8_approx(X);
- _mm256_storeu_ps(out, Y);
- return out[0];
-}
-
-static void softmax(float *y, const float *x, int N)
-{
- int i;
- for (i=0;i<N-7;i+=8)
- {
- __m256 X, Y;
- X = _mm256_loadu_ps(&x[i]);
- Y = exp8_approx(X);
- _mm256_storeu_ps(&y[i], Y);
- }
- for (;i<N;i++)
- y[i] = celt_exp(x[i]);
-}
-
-static void vec_tanh(float *y, const float *x, int N)
-{
- int i;
- for (i=0;i<N-7;i+=8)
- {
- const __m256 two = _mm256_set1_ps(2.f);
- const __m256 one = _mm256_set1_ps(1.f);
- __m256 X, Y;
- X = _mm256_loadu_ps(&x[i]);
- X = _mm256_mul_ps(X, two);
- Y = exp8_approx(X);
- Y = _mm256_mul_ps(_mm256_sub_ps(Y, one), _mm256_rcp_ps(_mm256_add_ps(Y, one)));
- _mm256_storeu_ps(&y[i], Y);
- }
- for (;i<N;i++)
- {
- float ex2;
- ex2 = celt_exp(2*x[i]);
- y[i] = (ex2-1)/(ex2+1);
- }
-}
-
-static void vec_sigmoid(float *y, const float *x, int N)
-{
- int i;
- for (i=0;i<N-7;i+=8)
- {
- const __m256 one = _mm256_set1_ps(1.f);
- __m256 X, Y;
- X = _mm256_loadu_ps(&x[i]);
- Y = exp8_approx(X);
- Y = _mm256_mul_ps(Y, _mm256_rcp_ps(_mm256_add_ps(Y, one)));
- _mm256_storeu_ps(&y[i], Y);
- }
- for (;i<N;i++)
- {
- float ex;
- ex = celt_exp(x[i]);
- y[i] = (ex)/(ex+1);
- }
-}
-
-static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
-{
- int i, j;
- for (i=0;i<rows;i+=16)
- {
- float * restrict y;
- __m256 vy0, vy8;
- y = &out[i];
- vy0 = _mm256_loadu_ps(&y[0]);
- vy8 = _mm256_loadu_ps(&y[8]);
- for (j=0;j<cols;j++)
- {
- __m256 vxj;
- __m256 vw;
- vxj = _mm256_broadcast_ss(&x[j]);
-
- vw = _mm256_loadu_ps(&weights[j*col_stride + i]);
- vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
-
- vw = _mm256_loadu_ps(&weights[j*col_stride + i + 8]);
- vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
- }
- _mm256_storeu_ps (&y[0], vy0);
- _mm256_storeu_ps (&y[8], vy8);
- }
-}
-static void sparse_sgemv_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
-{
- int i, j;
- for (i=0;i<rows;i+=16)
- {
- float * restrict y;
- int cols;
- __m256 vy0, vy8;
- y = &out[i];
- vy0 = _mm256_loadu_ps(&y[0]);
- vy8 = _mm256_loadu_ps(&y[8]);
- cols = *idx++;
- for (j=0;j<cols;j++)
- {
- int id;
- __m256 vxj;
- __m256 vw;
- id = *idx++;
- vxj = _mm256_broadcast_ss(&x[id]);
-
- vw = _mm256_loadu_ps(&weights[0]);
- vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
-
- vw = _mm256_loadu_ps(&weights[8]);
- vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
- weights += 16;
- }
- _mm256_storeu_ps (&y[0], vy0);
- _mm256_storeu_ps (&y[8], vy8);
- }
-}
-
-
-#else /* No AVX2/FMA support */
-
-
#warning Compiling without any vectorization. This code will be very slow
-#warning Try adding -mavx2 -mfma
-
-
-static float celt_exp2(float x)
-{
- int integer;
- float frac;
- union {
- float f;
- opus_uint32 i;
- } res;
- integer = floor(x);
- if (integer < -50)
- return 0;
- frac = x-integer;
- /* K0 = 1, K1 = log(2), K2 = 3-4*log(2), K3 = 3*log(2) - 2 */
- res.f = 0.99992522f + frac * (0.69583354f
- + frac * (0.22606716f + 0.078024523f*frac));
- res.i = (res.i + (integer<<23)) & 0x7fffffff;
- return res.f;
-}
-#define celt_exp(x) celt_exp2((x)*1.44269504f)
-
-static float tansig_approx(float x)
-{
- int i;
- float y, dy;
- float sign=1;
- /* Tests are reversed to catch NaNs */
- if (!(x<8))
- return 1;
- if (!(x>-8))
- return -1;
-#ifndef FIXED_POINT
- /* Another check in case of -ffast-math */
- if (celt_isnan(x))
- return 0;
+#include "vec.h"
#endif
- if (x<0)
- {
- x=-x;
- sign=-1;
- }
- i = (int)floor(.5f+25*x);
- x -= .04f*i;
- y = tansig_table[i];
- dy = 1-y*y;
- y = y + x*dy*(1 - y*x);
- return sign*y;
-}
-
-static OPUS_INLINE float sigmoid_approx(float x)
-{
- return .5f + .5f*tansig_approx(.5f*x);
-}
-
-static void softmax(float *y, const float *x, int N)
-{
- int i;
- for (i=0;i<N;i++)
- y[i] = celt_exp(x[i]);
-}
-
-static void vec_tanh(float *y, const float *x, int N)
-{
- int i;
- for (i=0;i<N;i++)
- {
- y[i] = tansig_approx(x[i]);
- }
-}
-
-static void vec_sigmoid(float *y, const float *x, int N)
-{
- int i;
- for (i=0;i<N;i++)
- {
- y[i] = sigmoid_approx(x[i]);
- }
-}
-
-static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
-{
- int i, j;
- for (i=0;i<rows;i+=16)
- {
- for (j=0;j<cols;j++)
- {
- const float * restrict w;
- float * restrict y;
- float xj;
- w = &weights[j*col_stride + i];
- xj = x[j];
- y = &out[i];
- y[0] += w[0]*xj;
- y[1] += w[1]*xj;
- y[2] += w[2]*xj;
- y[3] += w[3]*xj;
- y[4] += w[4]*xj;
- y[5] += w[5]*xj;
- y[6] += w[6]*xj;
- y[7] += w[7]*xj;
- y[8] += w[8]*xj;
- y[9] += w[9]*xj;
- y[10] += w[10]*xj;
- y[11] += w[11]*xj;
- y[12] += w[12]*xj;
- y[13] += w[13]*xj;
- y[14] += w[14]*xj;
- y[15] += w[15]*xj;
- }
- }
-}
-
-static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
-{
- int i, j;
- for (i=0;i<rows;i+=16)
- {
- int cols;
- cols = *idx++;
- for (j=0;j<cols;j++)
- {
- float * restrict y;
- float xj;
- xj = x[*idx++];
- y = &out[i];
- y[0] += w[0]*xj;
- y[1] += w[1]*xj;
- y[2] += w[2]*xj;
- y[3] += w[3]*xj;
- y[4] += w[4]*xj;
- y[5] += w[5]*xj;
- y[6] += w[6]*xj;
- y[7] += w[7]*xj;
- y[8] += w[8]*xj;
- y[9] += w[9]*xj;
- y[10] += w[10]*xj;
- y[11] += w[11]*xj;
- y[12] += w[12]*xj;
- y[13] += w[13]*xj;
- y[14] += w[14]*xj;
- y[15] += w[15]*xj;
- w += 16;
- }
- }
-}
-#endif
-
-
static OPUS_INLINE float relu(float x)
{
--- /dev/null
+++ b/dnn/vec.h
@@ -1,0 +1,173 @@
+/* Copyright (c) 2018 Mozilla
+ 2008-2011 Octasic Inc.
+ 2012-2017 Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/* No AVX2/FMA support */
+#ifndef LPCNET_TEST
+static float celt_exp2(float x)
+{
+ int integer;
+ float frac;
+ union {
+ float f;
+ opus_uint32 i;
+ } res;
+ integer = floor(x);
+ if (integer < -50)
+ return 0;
+ frac = x-integer;
+ /* K0 = 1, K1 = log(2), K2 = 3-4*log(2), K3 = 3*log(2) - 2 */
+ res.f = 0.99992522f + frac * (0.69583354f
+ + frac * (0.22606716f + 0.078024523f*frac));
+ res.i = (res.i + (integer<<23)) & 0x7fffffff;
+ return res.f;
+}
+#define celt_exp(x) celt_exp2((x)*1.44269504f)
+
+static float tansig_approx(float x)
+{
+ int i;
+ float y, dy;
+ float sign=1;
+ /* Tests are reversed to catch NaNs */
+ if (!(x<8))
+ return 1;
+ if (!(x>-8))
+ return -1;
+#ifndef FIXED_POINT
+ /* Another check in case of -ffast-math */
+ if (celt_isnan(x))
+ return 0;
+#endif
+ if (x<0)
+ {
+ x=-x;
+ sign=-1;
+ }
+ i = (int)floor(.5f+25*x);
+ x -= .04f*i;
+ y = tansig_table[i];
+ dy = 1-y*y;
+ y = y + x*dy*(1 - y*x);
+ return sign*y;
+}
+
+static OPUS_INLINE float sigmoid_approx(float x)
+{
+ return .5f + .5f*tansig_approx(.5f*x);
+}
+
+static void softmax(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N;i++)
+ y[i] = celt_exp(x[i]);
+}
+
+static void vec_tanh(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N;i++)
+ {
+ y[i] = tansig_approx(x[i]);
+ }
+}
+
+static void vec_sigmoid(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N;i++)
+ {
+ y[i] = sigmoid_approx(x[i]);
+ }
+}
+#endif
+static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+{
+ int i, j;
+ for (i=0;i<rows;i+=16)
+ {
+ for (j=0;j<cols;j++)
+ {
+ const float * restrict w;
+ float * restrict y;
+ float xj;
+ w = &weights[j*col_stride + i];
+ xj = x[j];
+ y = &out[i];
+ y[0] += w[0]*xj;
+ y[1] += w[1]*xj;
+ y[2] += w[2]*xj;
+ y[3] += w[3]*xj;
+ y[4] += w[4]*xj;
+ y[5] += w[5]*xj;
+ y[6] += w[6]*xj;
+ y[7] += w[7]*xj;
+ y[8] += w[8]*xj;
+ y[9] += w[9]*xj;
+ y[10] += w[10]*xj;
+ y[11] += w[11]*xj;
+ y[12] += w[12]*xj;
+ y[13] += w[13]*xj;
+ y[14] += w[14]*xj;
+ y[15] += w[15]*xj;
+ }
+ }
+}
+
+static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+{
+ int i, j;
+ for (i=0;i<rows;i+=16)
+ {
+ int cols;
+ cols = *idx++;
+ for (j=0;j<cols;j++)
+ {
+ float * restrict y;
+ float xj;
+ xj = x[*idx++];
+ y = &out[i];
+ y[0] += w[0]*xj;
+ y[1] += w[1]*xj;
+ y[2] += w[2]*xj;
+ y[3] += w[3]*xj;
+ y[4] += w[4]*xj;
+ y[5] += w[5]*xj;
+ y[6] += w[6]*xj;
+ y[7] += w[7]*xj;
+ y[8] += w[8]*xj;
+ y[9] += w[9]*xj;
+ y[10] += w[10]*xj;
+ y[11] += w[11]*xj;
+ y[12] += w[12]*xj;
+ y[13] += w[13]*xj;
+ y[14] += w[14]*xj;
+ y[15] += w[15]*xj;
+ w += 16;
+ }
+ }
+}
--- /dev/null
+++ b/dnn/vec_avx.h
@@ -1,0 +1,219 @@
+/* Copyright (c) 2018 Mozilla
+ 2012-2017 Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/*
+ AVX implementation of vector operations, compile with -mavx
+ AVX2/FMA implementation of vector operations, compile with -mavx2 -mfma
+*/
+
+#include <immintrin.h>
+
+#ifdef __AVX2__
+static __m256 exp8_approx(__m256 X)
+{
+ const __m256 K0 = _mm256_set1_ps(0.99992522f);
+ const __m256 K1 = _mm256_set1_ps(0.69583354f);
+ const __m256 K2 = _mm256_set1_ps(0.22606716f);
+ const __m256 K3 = _mm256_set1_ps(0.078024523f);
+ const __m256 log2_E = _mm256_set1_ps(1.44269504);
+ const __m256 max_in = _mm256_set1_ps(50.f);
+ const __m256 min_in = _mm256_set1_ps(-50.f);
+ const __m256i mask = _mm256_set1_epi32(0x7fffffff);
+ __m256 XF, Y;
+ __m256i I;
+ X = _mm256_mul_ps(X, log2_E);
+ X = _mm256_max_ps(min_in, _mm256_min_ps(max_in, X));
+ XF = _mm256_floor_ps(X);
+ I = _mm256_cvtps_epi32(XF);
+ X = _mm256_sub_ps(X, XF);
+ Y = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(K3, X, K2), X, K1), X, K0);
+ I = _mm256_slli_epi32(I, 23);
+ Y = _mm256_castsi256_ps(_mm256_and_si256(mask, _mm256_add_epi32(I, _mm256_castps_si256(Y))));
+ return Y;
+}
+#else
+#define _mm256_fmadd_ps(a,b,c) _mm256_add_ps(_mm256_mul_ps(a, b), c)
+#define _mm_fmadd_ps(a,b,c) _mm_add_ps(_mm_mul_ps(a, b), c)
+static __m128 exp4_approx(__m128 X)
+{
+ const __m128 K0 = _mm_set1_ps(0.99992522f);
+ const __m128 K1 = _mm_set1_ps(0.69583354f);
+ const __m128 K2 = _mm_set1_ps(0.22606716f);
+ const __m128 K3 = _mm_set1_ps(0.078024523f);
+ const __m128 log2_E = _mm_set1_ps(1.44269504);
+ const __m128 max_in = _mm_set1_ps(50.f);
+ const __m128 min_in = _mm_set1_ps(-50.f);
+ const __m128i mask = _mm_set1_epi32(0x7fffffff);
+ __m128 XF, Y;
+ __m128i I;
+ X = _mm_mul_ps(X, log2_E);
+ X = _mm_max_ps(min_in, _mm_min_ps(max_in, X));
+ XF = _mm_floor_ps(X);
+ I = _mm_cvtps_epi32(XF);
+ X = _mm_sub_ps(X, XF);
+ Y = _mm_fmadd_ps(_mm_fmadd_ps(_mm_fmadd_ps(K3, X, K2), X, K1), X, K0);
+ I = _mm_slli_epi32(I, 23);
+ Y = _mm_castsi128_ps(_mm_and_si128(mask, _mm_add_epi32(I, _mm_castps_si128(Y))));
+ return Y;
+}
+static __m256 exp8_approx(__m256 X)
+{
+ __m256 Y;
+ __m128 Xhi, Xlo, Yhi, Ylo;
+ Xhi = _mm256_extractf128_ps(X, 1);
+ Xlo = _mm256_extractf128_ps(X, 0);
+ Yhi = exp4_approx(Xhi);
+ Ylo = exp4_approx(Xlo);
+ Y = _mm256_insertf128_ps(_mm256_setzero_ps(), Yhi, 1);
+ Y = _mm256_insertf128_ps(Y, Ylo, 0);
+ return Y;
+}
+#endif
+
+static float celt_exp(float x)
+{
+ float out[8];
+ __m256 X, Y;
+ X = _mm256_set1_ps(x);
+ Y = exp8_approx(X);
+ _mm256_storeu_ps(out, Y);
+ return out[0];
+}
+
+static void softmax(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N-7;i+=8)
+ {
+ __m256 X, Y;
+ X = _mm256_loadu_ps(&x[i]);
+ Y = exp8_approx(X);
+ _mm256_storeu_ps(&y[i], Y);
+ }
+ for (;i<N;i++)
+ y[i] = celt_exp(x[i]);
+}
+
+static void vec_tanh(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N-7;i+=8)
+ {
+ const __m256 two = _mm256_set1_ps(2.f);
+ const __m256 one = _mm256_set1_ps(1.f);
+ __m256 X, Y;
+ X = _mm256_loadu_ps(&x[i]);
+ X = _mm256_mul_ps(X, two);
+ Y = exp8_approx(X);
+ Y = _mm256_mul_ps(_mm256_sub_ps(Y, one), _mm256_rcp_ps(_mm256_add_ps(Y, one)));
+ _mm256_storeu_ps(&y[i], Y);
+ }
+ for (;i<N;i++)
+ {
+ float ex2;
+ ex2 = celt_exp(2*x[i]);
+ y[i] = (ex2-1)/(ex2+1);
+ }
+}
+
+static void vec_sigmoid(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N-7;i+=8)
+ {
+ const __m256 one = _mm256_set1_ps(1.f);
+ __m256 X, Y;
+ X = _mm256_loadu_ps(&x[i]);
+ Y = exp8_approx(X);
+ Y = _mm256_mul_ps(Y, _mm256_rcp_ps(_mm256_add_ps(Y, one)));
+ _mm256_storeu_ps(&y[i], Y);
+ }
+ for (;i<N;i++)
+ {
+ float ex;
+ ex = celt_exp(x[i]);
+ y[i] = (ex)/(ex+1);
+ }
+}
+
+static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+{
+ int i, j;
+ for (i=0;i<rows;i+=16)
+ {
+ float * restrict y;
+ __m256 vy0, vy8;
+ y = &out[i];
+ vy0 = _mm256_loadu_ps(&y[0]);
+ vy8 = _mm256_loadu_ps(&y[8]);
+ for (j=0;j<cols;j++)
+ {
+ __m256 vxj;
+ __m256 vw;
+ vxj = _mm256_broadcast_ss(&x[j]);
+
+ vw = _mm256_loadu_ps(&weights[j*col_stride + i]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ vw = _mm256_loadu_ps(&weights[j*col_stride + i + 8]);
+ vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
+ }
+ _mm256_storeu_ps (&y[0], vy0);
+ _mm256_storeu_ps (&y[8], vy8);
+ }
+}
+static void sparse_sgemv_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
+{
+ int i, j;
+ for (i=0;i<rows;i+=16)
+ {
+ float * restrict y;
+ int cols;
+ __m256 vy0, vy8;
+ y = &out[i];
+ vy0 = _mm256_loadu_ps(&y[0]);
+ vy8 = _mm256_loadu_ps(&y[8]);
+ cols = *idx++;
+ for (j=0;j<cols;j++)
+ {
+ int id;
+ __m256 vxj;
+ __m256 vw;
+ id = *idx++;
+ vxj = _mm256_broadcast_ss(&x[id]);
+
+ vw = _mm256_loadu_ps(&weights[0]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ vw = _mm256_loadu_ps(&weights[8]);
+ vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
+ weights += 16;
+ }
+ _mm256_storeu_ps (&y[0], vy0);
+ _mm256_storeu_ps (&y[8], vy8);
+ }
+}
+
--- /dev/null
+++ b/dnn/vec_neon.h
@@ -1,0 +1,194 @@
+/* Copyright (c) 2018 David Rowe
+ 2018 Mozilla
+ 2008-2011 Octasic Inc.
+ 2012-2017 Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/* NEON support for ARM machines */
+
+#include <arm_neon.h>
+#ifndef LPCNET_TEST
+static float celt_exp2(float x)
+{
+ int integer;
+ float frac;
+ union {
+ float f;
+ opus_uint32 i;
+ } res;
+ integer = floor(x);
+ if (integer < -50)
+ return 0;
+ frac = x-integer;
+ /* K0 = 1, K1 = log(2), K2 = 3-4*log(2), K3 = 3*log(2) - 2 */
+ res.f = 0.99992522f + frac * (0.69583354f
+ + frac * (0.22606716f + 0.078024523f*frac));
+ res.i = (res.i + (integer<<23)) & 0x7fffffff;
+ return res.f;
+}
+#define celt_exp_neon(x) celt_exp2((x)*1.44269504f)
+
+static float tansig_approx(float x)
+{
+ int i;
+ float y, dy;
+ float sign=1;
+ /* Tests are reversed to catch NaNs */
+ if (!(x<8))
+ return 1;
+ if (!(x>-8))
+ return -1;
+#ifndef FIXED_POINT
+ /* Another check in case of -ffast-math */
+ if (celt_isnan(x))
+ return 0;
+#endif
+ if (x<0)
+ {
+ x=-x;
+ sign=-1;
+ }
+ i = (int)floor(.5f+25*x);
+ x -= .04f*i;
+ y = tansig_table[i];
+ dy = 1-y*y;
+ y = y + x*dy*(1 - y*x);
+ return sign*y;
+}
+
+static OPUS_INLINE float sigmoid_approx(float x)
+{
+ return .5f + .5f*tansig_approx(.5f*x);
+}
+
+static void softmax(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N;i++)
+ y[i] = celt_exp_neon(x[i]);
+}
+
+static void vec_tanh(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N;i++)
+ {
+ y[i] = tansig_approx(x[i]);
+ }
+}
+
+static void vec_sigmoid(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N;i++)
+ {
+ y[i] = sigmoid_approx(x[i]);
+ }
+}
+#endif
+
+static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+{
+ int i, j;
+ for (i=0;i<rows;i+=16)
+ {
+ float * restrict y = &out[i];
+
+ /* keep y[0..15] in registers for duration of inner loop */
+
+ float32x4_t y0_3 = vld1q_f32(&y[0]);
+ float32x4_t y4_7 = vld1q_f32(&y[4]);
+ float32x4_t y8_11 = vld1q_f32(&y[8]);
+ float32x4_t y12_15 = vld1q_f32(&y[12]);
+
+ for (j=0;j<cols;j++)
+ {
+ const float * restrict w;
+ float32x4_t wvec0_3, wvec4_7, wvec8_11, wvec12_15;
+ float32x4_t xj;
+
+ w = &weights[j*col_stride + i];
+ wvec0_3 = vld1q_f32(&w[0]);
+ wvec4_7 = vld1q_f32(&w[4]);
+ wvec8_11 = vld1q_f32(&w[8]);
+ wvec12_15 = vld1q_f32(&w[12]);
+
+ xj = vld1q_dup_f32(&x[j]);
+
+ y0_3 = vmlaq_f32(y0_3, wvec0_3, xj);
+ y4_7 = vmlaq_f32(y4_7, wvec4_7, xj);
+ y8_11 = vmlaq_f32(y8_11, wvec8_11, xj);
+ y12_15 = vmlaq_f32(y12_15, wvec12_15, xj);
+ }
+
+ /* save y[0..15] back to memory */
+
+ vst1q_f32(&y[0], y0_3);
+ vst1q_f32(&y[4], y4_7);
+ vst1q_f32(&y[8], y8_11);
+ vst1q_f32(&y[12], y12_15);
+
+ }
+}
+
+static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+{
+ int i, j;
+ for (i=0;i<rows;i+=16)
+ {
+ int cols;
+ cols = *idx++;
+ float * restrict y;
+ y = &out[i];
+
+ /* keep y[0..15] in registers for duration of inner loop */
+
+ float32x4_t y0_3 = vld1q_f32(&y[0]);
+ float32x4_t y4_7 = vld1q_f32(&y[4]);
+ float32x4_t y8_11 = vld1q_f32(&y[8]);
+ float32x4_t y12_15 = vld1q_f32(&y[12]);
+
+ for (j=0;j<cols;j++)
+ {
+ float xj= x[*idx++];
+ float32x4_t wvec;
+
+ wvec = vld1q_f32(&w[0]); y0_3 = vmlaq_n_f32(y0_3, wvec, xj);
+ wvec = vld1q_f32(&w[4]); y4_7 = vmlaq_n_f32(y4_7, wvec, xj);
+ wvec = vld1q_f32(&w[8]); y8_11 = vmlaq_n_f32(y8_11, wvec, xj);
+ wvec = vld1q_f32(&w[12]); y12_15 = vmlaq_n_f32(y12_15, wvec, xj);
+
+ w += 16;
+ }
+
+ /* save y[0..15] back to memory */
+
+ vst1q_f32(&y[0], y0_3);
+ vst1q_f32(&y[4], y4_7);
+ vst1q_f32(&y[8], y8_11);
+ vst1q_f32(&y[12], y12_15);
+
+ }
+}
--