ref: 40b309d92bf735af174e44e657c749bd6b5e92ba
parent: e695355ba5ee48eaab0c5c0f65879ac412210add
author: Jean-Marc Valin <jmvalin@jmvalin.ca>
date: Mon Dec 28 21:35:29 EST 2020
WIP: 8-bit SIMD for GRU B
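
Quantize the GRU B input weights to 8 bits and add a scalar
sgemv_accum8x4() to the generic vec.h path (DOT_PROD is now enabled
there).  dump_lpcnet.py emits the weights as qweight under #ifdef
DOT_PROD, interleaved in blocks of 8 output rows by 4 input columns to
match the access order of sgemv_accum8x4().  The accumulator is
pre-scaled by 128*127 so that products of 128-scaled weights and
127-scaled inputs can be summed directly and rescaled by 1/(128*127)
at the end.  The old compute_gru() is compiled out with #if 0, and
dump_data.c now seeds rand() from the pid.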
--- a/dnn/dump_data.c
+++ b/dnn/dump_data.c
@@ -31,6 +31,7 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
+#include <unistd.h>
#include "kiss_fft.h"
#include "common.h"
#include <math.h>
@@ -141,6 +142,7 @@
int encode = 0;
int decode = 0;
int quantize = 0;
+ srand(getpid());
st = lpcnet_encoder_create();
if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1;
if (argc == 5 && strcmp(argv[1], "-qtrain")==0) {--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -140,6 +140,7 @@
compute_activation(output, output, N, layer->activation);
}
+#if 0
void compute_gru(const GRULayer *gru, float *state, const float *input)
{
   int i;
@@ -201,6 +202,7 @@
for (i=0;i<N;i++)
state[i] = h[i];
}
+#endif
void compute_gru2(const GRULayer *gru, float *state, const float *input)
{
@@ -224,7 +226,11 @@
/* Compute update gate. */
for (i=0;i<3*N;i++)
zrh[i] = gru->bias[i];
+#if 1
+ sgemv_accum8x4(zrh, gru->input_weights, 3*N, M, stride, input);
+#else
sgemv_accum(zrh, gru->input_weights, 3*N, M, stride, input);
+#endif
for (i=0;i<3*N;i++)
recur[i] = gru->bias[3*N + i];
sgemv_accum(recur, gru->recurrent_weights, 3*N, N, stride, state);
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -56,7 +56,7 @@
typedef struct {
   const float *bias;
- const float *input_weights;
+ const qweight *input_weights;
const float *recurrent_weights;
int nb_inputs;
int nb_neurons;
--- a/dnn/training_tf2/dump_lpcnet.py
+++ b/dnn/training_tf2/dump_lpcnet.py
@@ -39,7 +39,10 @@
max_conv_inputs = 1
max_mdense_tmp = 1
-def printVector(f, vector, name, dtype='float'):
+def printVector(f, vector, name, dtype='float', dotp=False):
+ if dotp:
+ vector = vector.reshape((vector.shape[0]//4, 4, vector.shape[1]//8, 8))
+ vector = vector.transpose((2, 0, 3, 1))
v = np.reshape(vector, (-1));
#print('static const float ', name, '[', len(v), '] = \n', file=f)
f.write('static const {} {}[{}] = {{\n '.format(dtype, name, len(v)))
@@ -127,7 +130,12 @@
name = self.name
print("printing layer " + name + " of type " + self.__class__.__name__)weights = self.get_weights()
+ f.write('#ifdef DOT_PROD\n')
+ qweight = np.clip((128*weights[0]).astype('int'), -128, 127)
+ printVector(f, qweight, name + '_weights', dotp=True, dtype='qweight')
+ f.write('#else /*DOT_PROD*/\n')
printVector(f, weights[0], name + '_weights')
+ f.write('#endif /*DOT_PROD*/\n')
printVector(f, weights[1], name + '_recurrent_weights')
printVector(f, weights[-1], name + '_bias')
if hasattr(self, 'activation'):
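
The reshape/transpose in printVector(dotp=True) interleaves the quantized
weights in blocks of 8 output rows by 4 input columns, which is the order
sgemv_accum8x4() in vec.h walks them in.  A minimal C sketch of the
equivalent packing, assuming the dense matrix is column-major
(W[col*rows + row], as the float sgemv_accum uses) with rows a multiple
of 8 and cols a multiple of 4; pack_8x4 is a hypothetical helper, not
part of the patch:

/* Element (row, col) lands at packed index
 *   ((row/8)*(cols/4) + col/4)*32 + (row%8)*4 + (col%4). */
void pack_8x4(signed char *packed, const float *W, int rows, int cols)
{
   int k = 0;
   for (int rb = 0; rb < rows; rb += 8) {       /* block of 8 output rows   */
      for (int cb = 0; cb < cols; cb += 4) {    /* block of 4 input columns */
         for (int r = 0; r < 8; r++) {
            for (int c = 0; c < 4; c++) {
               /* Same quantization as dump_lpcnet.py: truncate 128*w, clip to int8. */
               int q = (int)(128.f*W[(cb + c)*rows + (rb + r)]);
               if (q > 127) q = 127;
               if (q < -128) q = -128;
               packed[k++] = (signed char)q;
            }
         }
      }
   }
}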
--- a/dnn/vec.h
+++ b/dnn/vec.h
@@ -41,10 +41,11 @@
#include "vec_neon.h"
#else
+#define MAX_INPUTS (2048)
#define NO_OPTIMIZATIONS
-//#define DOT_PROD
+#define DOT_PROD
//#define USE_SU_BIAS
#ifdef DOT_PROD
@@ -193,12 +194,46 @@
}
#ifdef DOT_PROD
+#define SCALE (128.f*127.f)
+#define SCALE_1 (1.f/128.f/127.f)
-#define MAX_INPUTS (2048)
+static inline void sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
+{
+ int i, j;
+ signed char x[MAX_INPUTS];
+ (void)col_stride;
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = (int)floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ for (j=0;j<cols;j+=4)
+ {
+ float * restrict y;
+ float xj0, xj1, xj2, xj3;
+ xj0 = x[j+0];
+ xj1 = x[j+1];
+ xj2 = x[j+2];
+ xj3 = x[j+3];
+ y = &out[i];
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ w += 32;
+ }
+ }
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+#else
+#define sgemv_accum8x4 sgemv_accum
+#endif
+#ifdef DOT_PROD
-#define SCALE (128.f*127.f)
-#define SCALE_1 (1.f/128.f/127.f)
#ifdef USE_SU_BIAS
static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
--
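
A small sanity check one could compile against this patch (not part of it):
pack a random matrix with the pack_8x4 sketch above, run the new
sgemv_accum8x4() from vec.h, and compare against a plain float accumulation.
Assumes DOT_PROD is in effect (the patch enables it in the generic vec.h
path), qweight is signed char, and the inputs lie in [-1, 1] so that 127*x
fits in an int8.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>   /* floor() used by the scalar sgemv_accum8x4() */
#include "vec.h"

#define ROWS 24     /* multiple of 8 */
#define COLS 16     /* multiple of 4, <= MAX_INPUTS */

void pack_8x4(signed char *packed, const float *W, int rows, int cols); /* sketch above */

int main(void)
{
   float W[ROWS*COLS], x[COLS], y_ref[ROWS], y_q[ROWS];
   signed char Wq[ROWS*COLS];
   int i, j;
   for (i = 0; i < ROWS*COLS; i++) W[i] = 2.f*rand()/RAND_MAX - 1.f;
   for (j = 0; j < COLS; j++) x[j] = 2.f*rand()/RAND_MAX - 1.f;
   for (i = 0; i < ROWS; i++) y_ref[i] = y_q[i] = .01f*i;   /* stand-in bias */
   pack_8x4(Wq, W, ROWS, COLS);
   /* Float reference: y[i] += sum_j W[j*ROWS + i]*x[j] (column-major). */
   for (j = 0; j < COLS; j++)
      for (i = 0; i < ROWS; i++)
         y_ref[i] += W[j*ROWS + i]*x[j];
   /* Quantized 8x4 path; the scalar version ignores col_stride. */
   sgemv_accum8x4(y_q, Wq, ROWS, COLS, ROWS, x);
   for (i = 0; i < ROWS; i++)
      printf("%2d: ref %f  q8 %f\n", i, y_ref[i], y_q[i]);  /* small mismatch from 8-bit rounding */
   return 0;
}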