ref: c151fc1853cbd4742943decf3f637b6b39431403
parent: 8e405b44e04dbd86b1349836ad2fcefbb56cdfed
parent: f0ce43389ad630f0594be81ce1716b2b72e8c883
author: Jean-Marc Valin <jmvalin@amazon.com>
date: Wed Jun 30 14:56:04 EDT 2021
Merge branch 'exp_improved_simd2'
--- a/dnn/README.md
+++ b/dnn/README.md
@@ -4,6 +4,7 @@
- J.-M. Valin, J. Skoglund, [A Real-Time Wideband Neural Vocoder at 1.6 kb/s Using LPCNet](https://jmvalin.ca/papers/lpcnet_codec.pdf), *Submitted for INTERSPEECH 2019*.
- J.-M. Valin, J. Skoglund, [LPCNet: Improving Neural Speech Synthesis Through Linear Prediction](https://jmvalin.ca/papers/lpcnet_icassp2019.pdf), *Proc. International Conference on Acoustics, Speech and Signal Processing (ICASSP)*, arXiv:1810.11846, 2019.
+- J. Skoglund, J.-M. Valin, [Improving Opus Low Bit Rate Quality with Neural Speech Synthesis](https://jmvalin.ca/papers/opusnet.pdf), *Proc. INTERSPEECH*, arXiv:1905.04628, 2020.
# Introduction
@@ -23,7 +24,9 @@
make
```
Note that the autogen.sh script is used when building from Git and will automatically download the latest model
-(models are too large to put in Git).
+(models are too large to put in Git). By default, LPCNet will attempt to use 8-bit dot product instructions on AVX*/Neon to
+speed up inference. To disable them (e.g. to avoid quantization effects when retraining), pass --disable-dot-product to the
+configure script.
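For context, the 8-bit path trades a little precision for speed: weights are scaled by 128 and rounded to signed bytes, and the accumulated result is rescaled afterwards. A rough NumPy sketch of the effect (illustrative only; the real kernels use SIMD intrinsics and the block layout dumped by the scripts below):
```
import numpy as np

# Illustrative only: quantize weights to signed 8 bits the way the dumped
# DOT_PROD tables do, then compare against the float dot product.
w = np.random.uniform(-0.5, 0.5, 16)
x = np.random.uniform(-1.0, 1.0, 16)

qw = np.clip(np.round(128.0 * w), -128, 127)  # signed 8-bit weights
exact = np.dot(w, x)                          # float path (--disable-dot-product)
approx = np.dot(qw, x) / 128.0                # quantized path, small rounding error
print(exact, approx)
```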
It is highly recommended to set the CFLAGS environment variable to enable AVX or NEON *prior* to running configure, otherwise
no vectorization will take place and the code will be very slow. On a recent x86 CPU, something like
@@ -69,7 +72,7 @@
and it will generate an lpcnet*.h5 file for each iteration. If it stops with a
"Failed to allocate RNN reserve space" message try reducing the *batch\_size* variable in train_lpcnet.py.
-1. You can synthesise speech with Python and your GPU card:
+1. You can synthesise speech with Python and your GPU card (very slow):
```
./dump_data -test test_input.s16 test_features.f32
./src/test_lpcnet.py test_features.f32 test.s16
@@ -76,7 +79,7 @@
```
Note the .h5 is hard coded in test_lpcnet.py, modify for your .h5 file.
-1. Or with C on a CPU:
+1. Or with C on a CPU (C inference is much faster):
First extract the model files nnet_data.h and nnet_data.c
```
./dump_lpcnet.py lpcnet15_384_10_G16_64.h5
@@ -95,6 +98,6 @@
# Reading Further
1. [LPCNet: DSP-Boosted Neural Speech Synthesis](https://people.xiph.org/~jm/demo/lpcnet/)
-1. Sample model files:
-https://jmvalin.ca/misc_stuff/lpcnet_models/
+1. [A Real-Time Wideband Neural Vocoder at 1.6 kb/s Using LPCNet](https://people.xiph.org/~jm/demo/lpcnet_codec/)
+1. Sample model files (check compatibility): https://media.xiph.org/lpcnet/data/
--- a/dnn/autogen.sh
+++ b/dnn/autogen.sh
@@ -6,7 +6,7 @@
test -n "$srcdir" && cd "$srcdir"
#SHA1 of the first commit compatible with the current model
-commit=90ea887
+commit=cce123e
if [ ! -f lpcnet_data-$commit.tar.gz ]; then
echo "Downloading latest model"
--- a/dnn/causalconv.py
+++ /dev/null
@@ -1,52 +1,0 @@
-from keras import backend as K
-from keras.engine.topology import Layer
-from keras.layers import activations, initializers, regularizers, constraints, InputSpec, Conv1D
-import numpy as np
-
-class CausalConv(Conv1D):
-
- def __init__(self, filters,
- kernel_size,
- dilation_rate=1,
- activation=None,
- use_bias=True,
- kernel_initializer='glorot_uniform',
- bias_initializer='zeros',
- kernel_regularizer=None,
- bias_regularizer=None,
- activity_regularizer=None,
- kernel_constraint=None,
- bias_constraint=None,
- return_memory=False,
- **kwargs):
-
- super(CausalConv, self).__init__(
- filters=filters,
- kernel_size=kernel_size,
- strides=1,
- padding='valid',
- data_format='channels_last',
- dilation_rate=dilation_rate,
- activation=activation,
- use_bias=use_bias,
- kernel_initializer=kernel_initializer,
- bias_initializer=bias_initializer,
- kernel_regularizer=kernel_regularizer,
- bias_regularizer=bias_regularizer,
- activity_regularizer=activity_regularizer,
- kernel_constraint=kernel_constraint,
- bias_constraint=bias_constraint,
- **kwargs)
- self.mem_size = dilation_rate*(kernel_size-1)
- self.return_memory = return_memory
-
- def call(self, inputs, memory=None):
- if memory is None:
- mem = K.zeros((K.shape(inputs)[0], self.mem_size, K.shape(inputs)[-1]))
- else:
- mem = K.variable(K.cast_to_floatx(memory))
- inputs = K.concatenate([mem, inputs], axis=1)
- ret = super(CausalConv, self).call(inputs)
- if self.return_memory:
- ret = ret, inputs[:, :self.mem_size, :]
- return ret
--- a/dnn/configure.ac
+++ b/dnn/configure.ac
@@ -73,6 +73,14 @@
AC_DEFINE([OP_ENABLE_ASSERTIONS], [1], [Enable assertions in code])
])
+AC_ARG_ENABLE([dot-product],
+ AS_HELP_STRING([--disable-dot-product], [Disable dot product implementation]),,
+ enable_dot_product=yes)
+
+AS_IF([test "$enable_dot_product" = "no"], [
+ AC_DEFINE([DISABLE_DOT_PROD], [1], [Disable dot product instructions])
+])
+
AS_CASE(["$ac_cv_search_lrintf"],
["no"],[],
["none required"],[],
@@ -114,8 +122,8 @@
------------------------------------------------------------------------
$PACKAGE_NAME $PACKAGE_VERSION: Automatic configuration OK.
+      Dot product intrinsics ....... ${enable_dot_product}
      Assertions ................... ${enable_assertions}
-
      Hidden visibility ............ ${cc_cv_flag_visibility}
      API documentation ............ ${enable_doc}
--- a/dnn/dump_data.c
+++ b/dnn/dump_data.c
@@ -31,6 +31,7 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
+#include <unistd.h>
#include "kiss_fft.h"
#include "common.h"
#include <math.h>
@@ -141,6 +142,7 @@
int encode = 0;
int decode = 0;
int quantize = 0;
+ srand(getpid());
st = lpcnet_encoder_create();
if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1;
if (argc == 5 && strcmp(argv[1], "-qtrain")==0) {
@@ -231,7 +233,7 @@
}
if (count*FRAME_SIZE_5MS>=10000000 && one_pass_completed) break;
if (training && ++gain_change_count > 2821) {
-      float tmp;
+ float tmp, tmp2;
speech_gain = pow(10., (-20+(rand()%40))/20.);
if (rand()%20==0) speech_gain *= .01;
if (rand()%100==0) speech_gain = 0;
@@ -238,7 +240,8 @@
gain_change_count = 0;
rand_resp(a_sig, b_sig);
tmp = (float)rand()/RAND_MAX;
- noise_std = 10*tmp*tmp;
+ tmp2 = (float)rand()/RAND_MAX;
+ noise_std = -log(tmp)-log(tmp2);
}
biquad(x, mem_hp_x, x, b_hp, a_hp, FRAME_SIZE);
biquad(x, mem_resp_x, x, b_sig, a_sig, FRAME_SIZE);
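The new noise level is the sum of two unit exponentials, i.e. a Gamma(2, 1) variate with mean 2, replacing the old bounded 10·tmp² draw: most draws stay small but the tail is unbounded. A NumPy sketch of the two draws:
```
import numpy as np

u1, u2 = np.random.uniform(1e-9, 1.0, size=2)  # guard against log(0)
noise_std_old = 10.0 * u1 * u1                 # old draw, bounded in [0, 10]
noise_std_new = -np.log(u1) - np.log(u2)       # new draw, Gamma(2, 1), mean 2
```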
--- a/dnn/dump_lpcnet.py
+++ /dev/null
@@ -1,270 +1,0 @@
-#!/usr/bin/python3
-'''Copyright (c) 2017-2018 Mozilla
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
-
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-'''
-
-import lpcnet
-import sys
-import numpy as np
-from keras.optimizers import Adam
-from keras.callbacks import ModelCheckpoint
-from keras.layers import Layer, GRU, CuDNNGRU, Dense, Conv1D, Embedding
-from ulaw import ulaw2lin, lin2ulaw
-from mdense import MDense
-import keras.backend as K
-import h5py
-import re
-
-max_rnn_neurons = 1
-max_conv_inputs = 1
-max_mdense_tmp = 1
-
-def printVector(f, vector, name, dtype='float'):
- v = np.reshape(vector, (-1));
- #print('static const float ', name, '[', len(v), '] = \n', file=f)
- f.write('static const {} {}[{}] = {{\n '.format(dtype, name, len(v)))
- for i in range(0, len(v)):
-     f.write('{}'.format(v[i]))
-     if (i!=len(v)-1):
-         f.write(',')
-     else:
-         break;
-     if (i%8==7):
-         f.write("\n ")
-     else:
-         f.write(" ")
- #print(v, file=f)
- f.write('\n};\n\n')
- return;
-
-def printSparseVector(f, A, name):
- N = A.shape[0]
- W = np.zeros((0,))
- diag = np.concatenate([np.diag(A[:,:N]), np.diag(A[:,N:2*N]), np.diag(A[:,2*N:])])
- A[:,:N] = A[:,:N] - np.diag(np.diag(A[:,:N]))
- A[:,N:2*N] = A[:,N:2*N] - np.diag(np.diag(A[:,N:2*N]))
- A[:,2*N:] = A[:,2*N:] - np.diag(np.diag(A[:,2*N:]))
- printVector(f, diag, name + '_diag')
- idx = np.zeros((0,), dtype='int')
- for i in range(3*N//16):
- pos = idx.shape[0]
- idx = np.append(idx, -1)
- nb_nonzero = 0
- for j in range(N):
- if np.sum(np.abs(A[j, i*16:(i+1)*16])) > 1e-10:
- nb_nonzero = nb_nonzero + 1
- idx = np.append(idx, j)
- W = np.concatenate([W, A[j, i*16:(i+1)*16]])
- idx[pos] = nb_nonzero
- printVector(f, W, name)
- #idx = np.tile(np.concatenate([np.array([N]), np.arange(N)]), 3*N//16)
- printVector(f, idx, name + '_idx', dtype='int')
- return;
-
-def dump_layer_ignore(self, f, hf):
- print("ignoring layer " + self.name + " of type " + self.__class__.__name__)- return False
-Layer.dump_layer = dump_layer_ignore
-
-def dump_sparse_gru(self, f, hf):
- global max_rnn_neurons
- name = 'sparse_' + self.name
- print("printing layer " + name + " of type sparse " + self.__class__.__name__)- weights = self.get_weights()
- printSparseVector(f, weights[1], name + '_recurrent_weights')
- printVector(f, weights[-1], name + '_bias')
- if hasattr(self, 'activation'):
- activation = self.activation.__name__.upper()
- else:
- activation = 'TANH'
- if hasattr(self, 'reset_after') and not self.reset_after:
- reset_after = 0
- else:
- reset_after = 1
- neurons = weights[0].shape[1]//3
- max_rnn_neurons = max(max_rnn_neurons, neurons)
- f.write('const SparseGRULayer {} = {{\n {}_bias,\n {}_recurrent_weights_diag,\n {}_recurrent_weights,\n {}_recurrent_weights_idx,\n {}, ACTIVATION_{}, {}\n}};\n\n'
-         .format(name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
- hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
- hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
- hf.write('extern const SparseGRULayer {};\n\n'.format(name));
- return True
-
-def dump_gru_layer(self, f, hf):
- global max_rnn_neurons
- name = self.name
- print("printing layer " + name + " of type " + self.__class__.__name__)- weights = self.get_weights()
- printVector(f, weights[0], name + '_weights')
- printVector(f, weights[1], name + '_recurrent_weights')
- printVector(f, weights[-1], name + '_bias')
- if hasattr(self, 'activation'):
- activation = self.activation.__name__.upper()
- else:
- activation = 'TANH'
- if hasattr(self, 'reset_after') and not self.reset_after:
- reset_after = 0
- else:
- reset_after = 1
- neurons = weights[0].shape[1]//3
- max_rnn_neurons = max(max_rnn_neurons, neurons)
- f.write('const GRULayer {} = {{\n {}_bias,\n {}_weights,\n {}_recurrent_weights,\n {}, {}, ACTIVATION_{}, {}\n}};\n\n'
-         .format(name, name, name, name, weights[0].shape[0], weights[0].shape[1]//3, activation, reset_after))
- hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
- hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
- hf.write('extern const GRULayer {};\n\n'.format(name));
- return True
-CuDNNGRU.dump_layer = dump_gru_layer
-GRU.dump_layer = dump_gru_layer
-
-def dump_dense_layer_impl(name, weights, bias, activation, f, hf):
- printVector(f, weights, name + '_weights')
- printVector(f, bias, name + '_bias')
- f.write('const DenseLayer {} = {{\n {}_bias,\n {}_weights,\n {}, {}, ACTIVATION_{}\n}};\n\n'
-         .format(name, name, name, weights.shape[0], weights.shape[1], activation))
- hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights.shape[1]))
- hf.write('extern const DenseLayer {};\n\n'.format(name));
-
-def dump_dense_layer(self, f, hf):
- name = self.name
- print("printing layer " + name + " of type " + self.__class__.__name__)- weights = self.get_weights()
- activation = self.activation.__name__.upper()
- dump_dense_layer_impl(name, weights[0], weights[1], activation, f, hf)
- return False
-
-Dense.dump_layer = dump_dense_layer
-
-def dump_mdense_layer(self, f, hf):
- global max_mdense_tmp
- name = self.name
- print("printing layer " + name + " of type " + self.__class__.__name__)- weights = self.get_weights()
- printVector(f, np.transpose(weights[0], (1, 2, 0)), name + '_weights')
- printVector(f, np.transpose(weights[1], (1, 0)), name + '_bias')
- printVector(f, np.transpose(weights[2], (1, 0)), name + '_factor')
- activation = self.activation.__name__.upper()
- max_mdense_tmp = max(max_mdense_tmp, weights[0].shape[0]*weights[0].shape[2])
- f.write('const MDenseLayer {} = {{\n {}_bias,\n {}_weights,\n {}_factor,\n {}, {}, {}, ACTIVATION_{}\n}};\n\n'
-         .format(name, name, name, name, weights[0].shape[1], weights[0].shape[0], weights[0].shape[2], activation))
- hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[0]))
- hf.write('extern const MDenseLayer {};\n\n'.format(name));
- return False
-MDense.dump_layer = dump_mdense_layer
-
-def dump_conv1d_layer(self, f, hf):
- global max_conv_inputs
- name = self.name
- print("printing layer " + name + " of type " + self.__class__.__name__)- weights = self.get_weights()
- printVector(f, weights[0], name + '_weights')
- printVector(f, weights[-1], name + '_bias')
- activation = self.activation.__name__.upper()
- max_conv_inputs = max(max_conv_inputs, weights[0].shape[1]*weights[0].shape[0])
- f.write('const Conv1DLayer {} = {{\n {}_bias,\n {}_weights,\n {}, {}, {}, ACTIVATION_{}\n}};\n\n'
-         .format(name, name, name, weights[0].shape[1], weights[0].shape[0], weights[0].shape[2], activation))
- hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[2]))
- hf.write('#define {}_STATE_SIZE ({}*{})\n'.format(name.upper(), weights[0].shape[1], (weights[0].shape[0]-1)))
- hf.write('#define {}_DELAY {}\n'.format(name.upper(), (weights[0].shape[0]-1)//2))
- hf.write('extern const Conv1DLayer {};\n\n'.format(name));
- return True
-Conv1D.dump_layer = dump_conv1d_layer
-
-
-def dump_embedding_layer_impl(name, weights, f, hf):
- printVector(f, weights, name + '_weights')
- f.write('const EmbeddingLayer {} = {{\n {}_weights,\n {}, {}\n}};\n\n'
-         .format(name, name, weights.shape[0], weights.shape[1]))
- hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights.shape[1]))
- hf.write('extern const EmbeddingLayer {};\n\n'.format(name));
-
-def dump_embedding_layer(self, f, hf):
- name = self.name
- print("printing layer " + name + " of type " + self.__class__.__name__)- weights = self.get_weights()[0]
- dump_embedding_layer_impl(name, weights, f, hf)
- return False
-Embedding.dump_layer = dump_embedding_layer
-
-
-model, _, _ = lpcnet.new_lpcnet_model(rnn_units1=384, use_gpu=False)
-model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
-#model.summary()
-
-model.load_weights(sys.argv[1])
-
-if len(sys.argv) > 2:
- cfile = sys.argv[2];
- hfile = sys.argv[3];
-else:
- cfile = 'nnet_data.c'
- hfile = 'nnet_data.h'
-
-
-f = open(cfile, 'w')
-hf = open(hfile, 'w')
-
-
-f.write('/*This file is automatically generated from a Keras model*/\n\n')
-f.write('#ifdef HAVE_CONFIG_H\n#include "config.h"\n#endif\n\n#include "nnet.h"\n#include "{}"\n\n'.format(hfile))
-
-hf.write('/*This file is automatically generated from a Keras model*/\n\n')
-hf.write('#ifndef RNN_DATA_H\n#define RNN_DATA_H\n\n#include "nnet.h"\n\n')
-
-embed_size = lpcnet.embed_size
-
-E = model.get_layer('embed_sig').get_weights()[0]
-W = model.get_layer('gru_a').get_weights()[0][:embed_size,:]
-dump_embedding_layer_impl('gru_a_embed_sig', np.dot(E, W), f, hf)
-W = model.get_layer('gru_a').get_weights()[0][embed_size:2*embed_size,:]
-dump_embedding_layer_impl('gru_a_embed_pred', np.dot(E, W), f, hf)
-W = model.get_layer('gru_a').get_weights()[0][2*embed_size:3*embed_size,:]
-dump_embedding_layer_impl('gru_a_embed_exc', np.dot(E, W), f, hf)
-W = model.get_layer('gru_a').get_weights()[0][3*embed_size:,:]
-#FIXME: dump only half the biases
-b = model.get_layer('gru_a').get_weights()[2]
-dump_dense_layer_impl('gru_a_dense_feature', W, b, 'LINEAR', f, hf)
-
-layer_list = []
-for i, layer in enumerate(model.layers):
- if layer.dump_layer(f, hf):
- layer_list.append(layer.name)
-
-dump_sparse_gru(model.get_layer('gru_a'), f, hf)
-
-hf.write('#define MAX_RNN_NEURONS {}\n\n'.format(max_rnn_neurons))
-hf.write('#define MAX_CONV_INPUTS {}\n\n'.format(max_conv_inputs))
-hf.write('#define MAX_MDENSE_TMP {}\n\n'.format(max_mdense_tmp))
-
-
-hf.write('typedef struct {\n')
-for i, name in enumerate(layer_list):
-    hf.write('  float {}_state[{}_STATE_SIZE];\n'.format(name, name.upper()))
-hf.write('} NNetState;\n')
-
-hf.write('\n\n#endif\n')
-
-f.close()
-hf.close()
--- a/dnn/gatedconv.py
+++ /dev/null
@@ -1,65 +1,0 @@
-from keras import backend as K
-from keras.engine.topology import Layer
-from keras.layers import activations, initializers, regularizers, constraints, InputSpec, Conv1D, Dense
-import numpy as np
-
-class GatedConv(Conv1D):
-
- def __init__(self, filters,
- kernel_size,
- dilation_rate=1,
- activation='tanh',
- use_bias=True,
- kernel_initializer='glorot_uniform',
- bias_initializer='zeros',
- kernel_regularizer=None,
- bias_regularizer=None,
- activity_regularizer=None,
- kernel_constraint=None,
- bias_constraint=None,
- return_memory=False,
- **kwargs):
-
- super(GatedConv, self).__init__(
- filters=2*filters,
- kernel_size=kernel_size,
- strides=1,
- padding='valid',
- data_format='channels_last',
- dilation_rate=dilation_rate,
- activation='linear',
- use_bias=use_bias,
- kernel_initializer=kernel_initializer,
- bias_initializer=bias_initializer,
- kernel_regularizer=kernel_regularizer,
- bias_regularizer=bias_regularizer,
- activity_regularizer=activity_regularizer,
- kernel_constraint=kernel_constraint,
- bias_constraint=bias_constraint,
- **kwargs)
- self.mem_size = dilation_rate*(kernel_size-1)
- self.return_memory = return_memory
- self.out_dims = filters
- self.nongate_activation = activations.get(activation)
-
- def call(self, inputs, cond=None, memory=None):
- if memory is None:
- mem = K.zeros((K.shape(inputs)[0], self.mem_size, K.shape(inputs)[-1]))
- else:
- mem = K.variable(K.cast_to_floatx(memory))
- inputs = K.concatenate([mem, inputs], axis=1)
- ret = super(GatedConv, self).call(inputs)
- if cond is not None:
- d = Dense(2*self.out_dims, use_bias=False, activation='linear')
- ret = ret + d(cond)
- ret = self.nongate_activation(ret[:, :, :self.out_dims]) * activations.sigmoid(ret[:, :, self.out_dims:])
- if self.return_memory:
- ret = ret, inputs[:, :self.mem_size, :]
- return ret
-
- def compute_output_shape(self, input_shape):
- assert input_shape and len(input_shape) >= 2
- assert input_shape[-1]
- output_shape = list(input_shape)
- output_shape[-1] = self.out_dims
- return tuple(output_shape)
--- a/dnn/lpcnet.py
+++ /dev/null
@@ -1,176 +1,0 @@
-#!/usr/bin/python3
-'''Copyright (c) 2018 Mozilla
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
-
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-'''
-
-import math
-from keras.models import Model
-from keras.layers import Input, GRU, CuDNNGRU, Dense, Embedding, Reshape, Concatenate, Lambda, Conv1D, Multiply, Add, Bidirectional, MaxPooling1D, Activation
-from keras import backend as K
-from keras.initializers import Initializer
-from keras.callbacks import Callback
-from mdense import MDense
-import numpy as np
-import h5py
-import sys
-
-frame_size = 160
-pcm_bits = 8
-embed_size = 128
-pcm_levels = 2**pcm_bits
-
-class Sparsify(Callback):
- def __init__(self, t_start, t_end, interval, density):
- super(Sparsify, self).__init__()
- self.batch = 0
- self.t_start = t_start
- self.t_end = t_end
- self.interval = interval
- self.final_density = density
-
- def on_batch_end(self, batch, logs=None):
- #print("batch number", self.batch)- self.batch += 1
- if self.batch < self.t_start or ((self.batch-self.t_start) % self.interval != 0 and self.batch < self.t_end):
- #print("don't constrain");- pass
- else:
- #print("constrain");- layer = self.model.get_layer('gru_a')- w = layer.get_weights()
- p = w[1]
- nb = p.shape[1]//p.shape[0]
- N = p.shape[0]
- #print("nb = ", nb, ", N = ", N);- #print(p.shape)
- #print ("density = ", density)- for k in range(nb):
- density = self.final_density[k]
- if self.batch < self.t_end:
- r = 1 - (self.batch-self.t_start)/(self.t_end - self.t_start)
- density = 1 - (1-self.final_density[k])*(1 - r*r*r)
- A = p[:, k*N:(k+1)*N]
- A = A - np.diag(np.diag(A))
- A = np.transpose(A, (1, 0))
- L=np.reshape(A, (N, N//16, 16))
- S=np.sum(L*L, axis=-1)
- SS=np.sort(np.reshape(S, (-1,)))
- thresh = SS[round(N*N//16*(1-density))]
- mask = (S>=thresh).astype('float32');
- mask = np.repeat(mask, 16, axis=1)
- mask = np.minimum(1, mask + np.diag(np.ones((N,))))
- mask = np.transpose(mask, (1, 0))
- p[:, k*N:(k+1)*N] = p[:, k*N:(k+1)*N]*mask
- #print(thresh, np.mean(mask))
- w[1] = p
- layer.set_weights(w)
-
-
-class PCMInit(Initializer):
- def __init__(self, gain=.1, seed=None):
- self.gain = gain
- self.seed = seed
-
- def __call__(self, shape, dtype=None):
- num_rows = 1
- for dim in shape[:-1]:
- num_rows *= dim
- num_cols = shape[-1]
- flat_shape = (num_rows, num_cols)
- if self.seed is not None:
- np.random.seed(self.seed)
- a = np.random.uniform(-1.7321, 1.7321, flat_shape)
- #a[:,0] = math.sqrt(12)*np.arange(-.5*num_rows+.5,.5*num_rows-.4)/num_rows
- #a[:,1] = .5*a[:,0]*a[:,0]*a[:,0]
- a = a + np.reshape(math.sqrt(12)*np.arange(-.5*num_rows+.5,.5*num_rows-.4)/num_rows, (num_rows, 1))
- return self.gain * a
-
- def get_config(self):
- return {
-     'gain': self.gain,
- 'seed': self.seed
- }
-
-def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features = 38, training=False, use_gpu=True, adaptation=False):
- pcm = Input(shape=(None, 3))
- feat = Input(shape=(None, nb_used_features))
- pitch = Input(shape=(None, 1))
- dec_feat = Input(shape=(None, 128))
- dec_state1 = Input(shape=(rnn_units1,))
- dec_state2 = Input(shape=(rnn_units2,))
-
- padding = 'valid' if training else 'same'
- fconv1 = Conv1D(128, 3, padding=padding, activation='tanh', name='feature_conv1')
- fconv2 = Conv1D(128, 3, padding=padding, activation='tanh', name='feature_conv2')
-
- embed = Embedding(256, embed_size, embeddings_initializer=PCMInit(), name='embed_sig')
- cpcm = Reshape((-1, embed_size*3))(embed(pcm))
-
- pembed = Embedding(256, 64, name='embed_pitch')
- cat_feat = Concatenate()([feat, Reshape((-1, 64))(pembed(pitch))])
-
- cfeat = fconv2(fconv1(cat_feat))
-
- fdense1 = Dense(128, activation='tanh', name='feature_dense1')
- fdense2 = Dense(128, activation='tanh', name='feature_dense2')
-
- cfeat = fdense2(fdense1(cfeat))
-
- rep = Lambda(lambda x: K.repeat_elements(x, frame_size, 1))
-
- if use_gpu:
- rnn = CuDNNGRU(rnn_units1, return_sequences=True, return_state=True, name='gru_a')
- rnn2 = CuDNNGRU(rnn_units2, return_sequences=True, return_state=True, name='gru_b')
- else:
- rnn = GRU(rnn_units1, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='gru_a')
- rnn2 = GRU(rnn_units2, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='gru_b')
-
- rnn_in = Concatenate()([cpcm, rep(cfeat)])
- md = MDense(pcm_levels, activation='softmax', name='dual_fc')
- gru_out1, _ = rnn(rnn_in)
- gru_out2, _ = rnn2(Concatenate()([gru_out1, rep(cfeat)]))
- ulaw_prob = md(gru_out2)
-
- if adaptation:
- rnn.trainable=False
- rnn2.trainable=False
- md.trainable=False
- embed.Trainable=False
-
- model = Model([pcm, feat, pitch], ulaw_prob)
- model.rnn_units1 = rnn_units1
- model.rnn_units2 = rnn_units2
- model.nb_used_features = nb_used_features
- model.frame_size = frame_size
-
- encoder = Model([feat, pitch], cfeat)
-
- dec_rnn_in = Concatenate()([cpcm, dec_feat])
- dec_gru_out1, state1 = rnn(dec_rnn_in, initial_state=dec_state1)
- dec_gru_out2, state2 = rnn2(Concatenate()([dec_gru_out1, dec_feat]), initial_state=dec_state2)
- dec_ulaw_prob = md(dec_gru_out2)
-
- decoder = Model([pcm, dec_feat, dec_state1, dec_state2], [dec_ulaw_prob, state1, state2])
- return model, encoder, decoder
--- a/dnn/mdense.py
+++ /dev/null
@@ -1,94 +1,0 @@
-from keras import backend as K
-from keras.engine.topology import Layer
-from keras.layers import activations, initializers, regularizers, constraints, InputSpec
-import numpy as np
-import math
-
-class MDense(Layer):
-
- def __init__(self, outputs,
- channels=2,
- activation=None,
- use_bias=True,
- kernel_initializer='glorot_uniform',
- bias_initializer='zeros',
- kernel_regularizer=None,
- bias_regularizer=None,
- activity_regularizer=None,
- kernel_constraint=None,
- bias_constraint=None,
- **kwargs):
- if 'input_shape' not in kwargs and 'input_dim' in kwargs:
- kwargs['input_shape'] = (kwargs.pop('input_dim'),)
- super(MDense, self).__init__(**kwargs)
- self.units = outputs
- self.channels = channels
- self.activation = activations.get(activation)
- self.use_bias = use_bias
- self.kernel_initializer = initializers.get(kernel_initializer)
- self.bias_initializer = initializers.get(bias_initializer)
- self.kernel_regularizer = regularizers.get(kernel_regularizer)
- self.bias_regularizer = regularizers.get(bias_regularizer)
- self.activity_regularizer = regularizers.get(activity_regularizer)
- self.kernel_constraint = constraints.get(kernel_constraint)
- self.bias_constraint = constraints.get(bias_constraint)
- self.input_spec = InputSpec(min_ndim=2)
- self.supports_masking = True
-
- def build(self, input_shape):
- assert len(input_shape) >= 2
- input_dim = input_shape[-1]
-
- self.kernel = self.add_weight(shape=(self.units, input_dim, self.channels),
- initializer=self.kernel_initializer,
- name='kernel',
- regularizer=self.kernel_regularizer,
- constraint=self.kernel_constraint)
- if self.use_bias:
- self.bias = self.add_weight(shape=(self.units, self.channels),
- initializer=self.bias_initializer,
- name='bias',
- regularizer=self.bias_regularizer,
- constraint=self.bias_constraint)
- else:
- self.bias = None
- self.factor = self.add_weight(shape=(self.units, self.channels),
- initializer='ones',
- name='factor',
- regularizer=self.bias_regularizer,
- constraint=self.bias_constraint)
- self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim})
- self.built = True
-
- def call(self, inputs):
- output = K.dot(inputs, self.kernel)
- if self.use_bias:
- output = output + self.bias
- output = K.tanh(output) * self.factor
- output = K.sum(output, axis=-1)
- if self.activation is not None:
- output = self.activation(output)
- return output
-
- def compute_output_shape(self, input_shape):
- assert input_shape and len(input_shape) >= 2
- assert input_shape[-1]
- output_shape = list(input_shape)
- output_shape[-1] = self.units
- return tuple(output_shape)
-
- def get_config(self):
- config = {
-     'units': self.units,
- 'activation': activations.serialize(self.activation),
- 'use_bias': self.use_bias,
- 'kernel_initializer': initializers.serialize(self.kernel_initializer),
- 'bias_initializer': initializers.serialize(self.bias_initializer),
- 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
- 'bias_regularizer': regularizers.serialize(self.bias_regularizer),
- 'activity_regularizer': regularizers.serialize(self.activity_regularizer),
- 'kernel_constraint': constraints.serialize(self.kernel_constraint),
- 'bias_constraint': constraints.serialize(self.bias_constraint)
- }
- base_config = super(MDense, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -39,17 +39,14 @@
#include "nnet.h"
#include "nnet_data.h"
-#define SOFTMAX_HACK
-
-#ifdef __AVX__
-#include "vec_avx.h"
-#elif __ARM_NEON__
-#include "vec_neon.h"
-#else
+#ifdef NO_OPTIMIZATIONS
#warning Compiling without any vectorization. This code will be very slow
-#include "vec.h"
#endif
+
+#define SOFTMAX_HACK
+
+
static OPUS_INLINE float relu(float x)
{
return x < 0 ? 0 : x;
@@ -83,8 +80,9 @@
output[i] = relu(input[i]);
} else if (activation == ACTIVATION_SOFTMAX) {
#ifdef SOFTMAX_HACK
- for (i=0;i<N;i++)
- output[i] = input[i];
+ RNN_COPY(output, input, N);
+ /*for (i=0;i<N;i++)
+ output[i] = input[i];*/
#else
float sum = 0;
softmax(output, input, N);
@@ -143,6 +141,7 @@
compute_activation(output, output, N, layer->activation);
}
+#if 0
void compute_gru(const GRULayer *gru, float *state, const float *input)
{
int i;
@@ -204,6 +203,7 @@
for (i=0;i<N;i++)
state[i] = h[i];
}
+#endif
void compute_gru2(const GRULayer *gru, float *state, const float *input)
{
@@ -225,9 +225,14 @@
celt_assert(gru->reset_after);
stride = 3*N;
/* Compute update gate. */
+#ifdef USE_SU_BIAS
for (i=0;i<3*N;i++)
+ zrh[i] = gru->subias[i];
+#else
+ for (i=0;i<3*N;i++)
zrh[i] = gru->bias[i];
- sgemv_accum(zrh, gru->input_weights, 3*N, M, stride, input);
+#endif
+ sgemv_accum8x4(zrh, gru->input_weights, 3*N, M, stride, input);
for (i=0;i<3*N;i++)
recur[i] = gru->bias[3*N + i];
sgemv_accum(recur, gru->recurrent_weights, 3*N, N, stride, state);
@@ -277,41 +282,42 @@
state[i] = h[i];
}
-void compute_sparse_gru(const SparseGRULayer *gru, float *state, const float *input)
+/* WARNING: for efficiency reasons, this function overwrites the input vector. */
+void compute_sparse_gru(const SparseGRULayer *gru, float *state, float *input)
{
int i, k;
int N;
- float zrh[3*MAX_RNN_NEURONS];
float recur[3*MAX_RNN_NEURONS];
float *z;
float *r;
float *h;
+ const float *bias;
N = gru->nb_neurons;
- z = zrh;
- r = &zrh[N];
- h = &zrh[2*N];
+ z = input;
+ r = &input[N];
+ h = &input[2*N];
celt_assert(gru->nb_neurons <= MAX_RNN_NEURONS);
celt_assert(input != state);
celt_assert(gru->reset_after);
- RNN_COPY(zrh, input, 3*N);
- for (i=0;i<3*N;i++)
- recur[i] = gru->bias[3*N + i];
+#ifdef USE_SU_BIAS
+ bias = &gru->subias[3*N];
+#else
+ bias = &gru->bias[3*N];
+#endif
for (k=0;k<3;k++)
{
for (i=0;i<N;i++)
- recur[k*N + i] += gru->diag_weights[k*N + i]*state[i];
+ recur[k*N + i] = bias[k*N + i] + gru->diag_weights[k*N + i]*state[i];
}
- sparse_sgemv_accum16(recur, gru->recurrent_weights, 3*N, gru->idx, state);
+ sparse_sgemv_accum8x4(recur, gru->recurrent_weights, 3*N, N, gru->idx, state);
for (i=0;i<2*N;i++)
- zrh[i] += recur[i];
- compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
+ input[i] += recur[i];
+ compute_activation(input, input, 2*N, ACTIVATION_SIGMOID);
for (i=0;i<N;i++)
h[i] += recur[2*N+i]*r[i];
compute_activation(h, h, N, gru->activation);
for (i=0;i<N;i++)
- h[i] = z[i]*state[i] + (1-z[i])*h[i];
- for (i=0;i<N;i++)
- state[i] = h[i];
+ state[i] = z[i]*state[i] + (1-z[i])*h[i];
}
void compute_conv1d(const Conv1DLayer *layer, float *output, float *mem, const float *input)
@@ -393,9 +399,22 @@
}
/* Do the sampling (from the cdf). */
r = tmp[N-1] * ((rand()+.5f)/(RAND_MAX+1.f));
+#if 1 /* Bisection search in the CDF (faster than the equivalent linear one below). */
+ {
+ int start=-1;
+ int end = N-1;
+ while (end > start+1) {
+ int mid = (start+end)>>1;
+ if (r <= tmp[mid]) end = mid;
+ else start = mid;
+ }
+ return end;
+ }
+#else
for (i=0;i<N-1;i++)
{
if (r <= tmp[i]) return i;
}
return N-1;
+#endif
}
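The bisection keeps the invariant tmp[start] < r <= tmp[end] (treating tmp[-1] as minus infinity), so it finds the first index whose cumulative probability reaches r in O(log N) steps instead of O(N). A Python rendering of the same search:
```
import numpy as np

def sample_from_cdf(tmp, r):
    # tmp: inclusive cumulative sums, r: uniform in (0, tmp[-1]]
    start, end = -1, len(tmp) - 1
    while end > start + 1:
        mid = (start + end) >> 1
        if r <= tmp[mid]:
            end = mid
        else:
            start = mid
    return end

cdf = np.cumsum([0.1, 0.2, 0.3, 0.4])
print(sample_from_cdf(cdf, 0.35))  # 2, since 0.3 < 0.35 <= 0.6
```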
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -28,6 +28,8 @@
#ifndef _NNET_H_
#define _NNET_H_
+#include "vec.h"
+
#define ACTIVATION_LINEAR 0
#define ACTIVATION_SIGMOID 1
#define ACTIVATION_TANH 2
@@ -54,7 +56,8 @@
typedef struct {
const float *bias;
- const float *input_weights;
+ const float *subias;
+ const qweight *input_weights;
const float *recurrent_weights;
int nb_inputs;
int nb_neurons;
@@ -64,8 +67,9 @@
typedef struct {
const float *bias;
+ const float *subias;
const float *diag_weights;
- const float *recurrent_weights;
+ const qweight *recurrent_weights;
const int *idx;
int nb_neurons;
int activation;
@@ -99,7 +103,7 @@
void compute_gru3(const GRULayer *gru, float *state, const float *input);
-void compute_sparse_gru(const SparseGRULayer *gru, float *state, const float *input);
+void compute_sparse_gru(const SparseGRULayer *gru, float *state, float *input);
void compute_conv1d(const Conv1DLayer *layer, float *output, float *mem, const float *input);
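The new `subias` fields hold biases pre-corrected for the signed-by-unsigned 8-bit products (USE_SU_BIAS): assuming activations are stored with a +1 offset so they fit in an unsigned byte, the extra W·1 term folds into the bias. A small NumPy check of the identity the dump scripts rely on (shapes and names here are illustrative):
```
import numpy as np

W = np.random.randn(8, 4)     # nb_inputs x nb_neurons, as dumped
b = np.random.randn(4)
x = np.random.uniform(-1, 1, 8)

subias = b - W.sum(axis=0)    # what dump_lpcnet.py stores as *_subias
assert np.allclose(W.T @ (x + 1.0) + subias, W.T @ x + b)
```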
--- a/dnn/tansig_table.h
+++ b/dnn/tansig_table.h
@@ -1,5 +1,8 @@
/* This file is auto-generated by gen_tables */
+#ifndef TANSIG_TABLE_H
+#define TANSIG_TABLE_H
+
static const float tansig_table[201] = {
0.000000f, 0.039979f, 0.079830f, 0.119427f, 0.158649f,
0.197375f, 0.235496f, 0.272905f, 0.309507f, 0.345214f,
@@ -43,3 +46,5 @@
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f,
};
+
+#endif /*TANSIG_TABLE_H*/
--- a/dnn/test_lpcnet.py
+++ /dev/null
@@ -1,106 +1,0 @@
-#!/usr/bin/python3
-'''Copyright (c) 2018 Mozilla
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
-
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-'''
-
-import lpcnet
-import sys
-import numpy as np
-from keras.optimizers import Adam
-from keras.callbacks import ModelCheckpoint
-from ulaw import ulaw2lin, lin2ulaw
-import keras.backend as K
-import h5py
-
-import tensorflow as tf
-from keras.backend.tensorflow_backend import set_session
-config = tf.ConfigProto()
-config.gpu_options.per_process_gpu_memory_fraction = 0.2
-set_session(tf.Session(config=config))
-
-model, enc, dec = lpcnet.new_lpcnet_model(use_gpu=False)
-
-model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
-#model.summary()
-
-feature_file = sys.argv[1]
-out_file = sys.argv[2]
-frame_size = model.frame_size
-nb_features = 55
-nb_used_features = model.nb_used_features
-
-features = np.fromfile(feature_file, dtype='float32')
-features = np.resize(features, (-1, nb_features))
-nb_frames = 1
-feature_chunk_size = features.shape[0]
-pcm_chunk_size = frame_size*feature_chunk_size
-
-features = np.reshape(features, (nb_frames, feature_chunk_size, nb_features))
-features[:,:,18:36] = 0
-periods = (.1 + 50*features[:,:,36:37]+100).astype('int16')
-
-
-
-model.load_weights('lpcnet20h_384_10_G16_80.h5')
-
-order = 16
-
-pcm = np.zeros((nb_frames*pcm_chunk_size, ))
-fexc = np.zeros((1, 1, 3), dtype='int16')+128
-state1 = np.zeros((1, model.rnn_units1), dtype='float32')
-state2 = np.zeros((1, model.rnn_units2), dtype='float32')
-
-mem = 0
-coef = 0.85
-
-fout = open(out_file, 'wb')
-
-skip = order + 1
-for c in range(0, nb_frames):
- cfeat = enc.predict([features[c:c+1, :, :nb_used_features], periods[c:c+1, :, :]])
- for fr in range(0, feature_chunk_size):
- f = c*feature_chunk_size + fr
- a = features[c, fr, nb_features-order:]
- for i in range(skip, frame_size):
- pred = -sum(a*pcm[f*frame_size + i - 1:f*frame_size + i - order-1:-1])
- fexc[0, 0, 1] = lin2ulaw(pred)
-
- p, state1, state2 = dec.predict([fexc, cfeat[:, fr:fr+1, :], state1, state2])
- #Lower the temperature for voiced frames to reduce noisiness
- p *= np.power(p, np.maximum(0, 1.5*features[c, fr, 37] - .5))
- p = p/(1e-18 + np.sum(p))
- #Cut off the tail of the remaining distribution
- p = np.maximum(p-0.002, 0).astype('float64')
- p = p/(1e-8 + np.sum(p))
-
- fexc[0, 0, 2] = np.argmax(np.random.multinomial(1, p[0,0,:], 1))
- pcm[f*frame_size + i] = pred + ulaw2lin(fexc[0, 0, 2])
- fexc[0, 0, 0] = lin2ulaw(pcm[f*frame_size + i])
- mem = coef*mem + pcm[f*frame_size + i]
- #print(mem)
- np.array([np.round(mem)], dtype='int16').tofile(fout)
- skip = 0
-
-
--- a/dnn/train_lpcnet.py
+++ /dev/null
@@ -1,125 +1,0 @@
-#!/usr/bin/python3
-'''Copyright (c) 2018 Mozilla
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
-
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-'''
-
-# Train a LPCNet model (note not a Wavenet model)
-
-import lpcnet
-import sys
-import numpy as np
-from keras.optimizers import Adam
-from keras.callbacks import ModelCheckpoint
-from ulaw import ulaw2lin, lin2ulaw
-import keras.backend as K
-import h5py
-
-import tensorflow as tf
-from keras.backend.tensorflow_backend import set_session
-config = tf.ConfigProto()
-
-# use this option to reserve GPU memory, e.g. for running more than
-# one thing at a time. Best to disable for GPUs with small memory
-config.gpu_options.per_process_gpu_memory_fraction = 0.44
-
-set_session(tf.Session(config=config))
-
-nb_epochs = 120
-
-# Try reducing batch_size if you run out of memory on your GPU
-batch_size = 64
-
-model, _, _ = lpcnet.new_lpcnet_model(training=True)
-
-model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
-model.summary()
-
-feature_file = sys.argv[1]
-pcm_file = sys.argv[2] # 16 bit unsigned short PCM samples
-frame_size = model.frame_size
-nb_features = 55
-nb_used_features = model.nb_used_features
-feature_chunk_size = 15
-pcm_chunk_size = frame_size*feature_chunk_size
-
-# u for unquantised, load 16 bit PCM samples and convert to mu-law
-
-data = np.fromfile(pcm_file, dtype='uint8')
-nb_frames = len(data)//(4*pcm_chunk_size)
-
-features = np.fromfile(feature_file, dtype='float32')
-
-# limit to discrete number of frames
-data = data[:nb_frames*4*pcm_chunk_size]
-features = features[:nb_frames*feature_chunk_size*nb_features]
-
-features = np.reshape(features, (nb_frames*feature_chunk_size, nb_features))
-
-sig = np.reshape(data[0::4], (nb_frames, pcm_chunk_size, 1))
-pred = np.reshape(data[1::4], (nb_frames, pcm_chunk_size, 1))
-in_exc = np.reshape(data[2::4], (nb_frames, pcm_chunk_size, 1))
-out_exc = np.reshape(data[3::4], (nb_frames, pcm_chunk_size, 1))
-del data
-
-print("ulaw std = ", np.std(out_exc))-
-features = np.reshape(features, (nb_frames, feature_chunk_size, nb_features))
-features = features[:, :, :nb_used_features]
-features[:,:,18:36] = 0
-
-fpad1 = np.concatenate([features[0:1, 0:2, :], features[:-1, -2:, :]], axis=0)
-fpad2 = np.concatenate([features[1:, :2, :], features[0:1, -2:, :]], axis=0)
-features = np.concatenate([fpad1, features, fpad2], axis=1)
-
-
-periods = (.1 + 50*features[:,:,36:37]+100).astype('int16')
-
-in_data = np.concatenate([sig, pred, in_exc], axis=-1)
-
-del sig
-del pred
-del in_exc
-
-# dump models to disk as we go
-checkpoint = ModelCheckpoint('lpcnet30_384_10_G16_{epoch:02d}.h5')
-
-#Set this to True to adapt an existing model (e.g. on new data)
-adaptation = False
-
-if adaptation:
- #Adapting from an existing model
- model.load_weights('lpcnet24c_384_10_G16_120.h5')
- sparsify = lpcnet.Sparsify(0, 0, 1, (0.05, 0.05, 0.2))
- lr = 0.0001
- decay = 0
-else:
- #Training from scratch
- sparsify = lpcnet.Sparsify(2000, 40000, 400, (0.05, 0.05, 0.2))
- lr = 0.001
- decay = 5e-5
-
-model.compile(optimizer=Adam(lr, amsgrad=True, decay=decay), loss='sparse_categorical_crossentropy')
-model.save_weights('lpcnet30_384_10_G16_00.h5');
-model.fit([in_data, features, periods], out_exc, batch_size=batch_size, epochs=nb_epochs, validation_split=0.0, callbacks=[checkpoint, sparsify])
--- a/dnn/training_tf2/dump_lpcnet.py
+++ b/dnn/training_tf2/dump_lpcnet.py
@@ -39,7 +39,10 @@
max_conv_inputs = 1
max_mdense_tmp = 1
-def printVector(f, vector, name, dtype='float'):
+def printVector(f, vector, name, dtype='float', dotp=False):
+ if dotp:
+ vector = vector.reshape((vector.shape[0]//4, 4, vector.shape[1]//8, 8))
+ vector = vector.transpose((2, 0, 3, 1))
v = np.reshape(vector, (-1));
#print('static const float ', name, '[', len(v), '] = \n', file=f)
f.write('static const {} {}[{}] = {{\n '.format(dtype, name, len(v)))
@@ -59,27 +62,37 @@
def printSparseVector(f, A, name):
N = A.shape[0]
- W = np.zeros((0,))
+ W = np.zeros((0,), dtype='int')
+ W0 = np.zeros((0,))
diag = np.concatenate([np.diag(A[:,:N]), np.diag(A[:,N:2*N]), np.diag(A[:,2*N:])])
A[:,:N] = A[:,:N] - np.diag(np.diag(A[:,:N]))
A[:,N:2*N] = A[:,N:2*N] - np.diag(np.diag(A[:,N:2*N]))
A[:,2*N:] = A[:,2*N:] - np.diag(np.diag(A[:,2*N:]))
+ AQ = np.minimum(127, np.maximum(-128, np.round(A*128))).astype('int')
printVector(f, diag, name + '_diag')
idx = np.zeros((0,), dtype='int')
- for i in range(3*N//16):
+ for i in range(3*N//8):
pos = idx.shape[0]
idx = np.append(idx, -1)
nb_nonzero = 0
- for j in range(N):
- if np.sum(np.abs(A[j, i*16:(i+1)*16])) > 1e-10:
+ for j in range(N//4):
+ block = A[j*4:(j+1)*4, i*8:(i+1)*8]
+ qblock = AQ[j*4:(j+1)*4, i*8:(i+1)*8]
+ if np.sum(np.abs(block)) > 1e-10:
nb_nonzero = nb_nonzero + 1
idx = np.append(idx, j)
- W = np.concatenate([W, A[j, i*16:(i+1)*16]])
+ vblock = qblock.transpose((1,0)).reshape((-1,))
+ W0 = np.concatenate([W0, block.reshape((-1,))])
+ W = np.concatenate([W, vblock])
idx[pos] = nb_nonzero
- printVector(f, W, name)
+ f.write('#ifdef DOT_PROD\n')
+ printVector(f, W, name, dtype='qweight')
+ f.write('#else /*DOT_PROD*/\n')
+ printVector(f, W0, name, dtype='qweight')
+ f.write('#endif /*DOT_PROD*/\n')
#idx = np.tile(np.concatenate([np.array([N]), np.arange(N)]), 3*N//16)
printVector(f, idx, name + '_idx', dtype='int')
- return;
+ return AQ
def dump_layer_ignore(self, f, hf):
print("ignoring layer " + self.name + " of type " + self.__class__.__name__)@@ -91,8 +104,11 @@
name = 'sparse_' + self.name
print("printing layer " + name + " of type sparse " + self.__class__.__name__)weights = self.get_weights()
- printSparseVector(f, weights[1], name + '_recurrent_weights')
+ qweights = printSparseVector(f, weights[1], name + '_recurrent_weights')
printVector(f, weights[-1], name + '_bias')
+ subias = weights[-1].copy()
+ subias[1,:] = subias[1,:] - np.sum(qweights*(1./128),axis=0)
+ printVector(f, subias, name + '_subias')
if hasattr(self, 'activation'):
activation = self.activation.__name__.upper()
else:
@@ -103,8 +119,8 @@
reset_after = 1
neurons = weights[0].shape[1]//3
max_rnn_neurons = max(max_rnn_neurons, neurons)
- f.write('const SparseGRULayer {} = {{\n {}_bias,\n {}_recurrent_weights_diag,\n {}_recurrent_weights,\n {}_recurrent_weights_idx,\n {}, ACTIVATION_{}, {}\n}};\n\n'
-         .format(name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
+ f.write('const SparseGRULayer {} = {{\n {}_bias,\n {}_subias,\n {}_recurrent_weights_diag,\n {}_recurrent_weights,\n {}_recurrent_weights_idx,\n {}, ACTIVATION_{}, {}\n}};\n\n'
+         .format(name, name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
hf.write('extern const SparseGRULayer {};\n\n'.format(name));
@@ -115,9 +131,17 @@
name = self.name
print("printing layer " + name + " of type " + self.__class__.__name__)weights = self.get_weights()
+ f.write('#ifdef DOT_PROD\n')+ qweight = np.clip(np.round(128.*weights[0]).astype('int'), -128, 127)+ printVector(f, qweight, name + '_weights', dotp=True, dtype='qweight')
+ f.write('#else /*DOT_PROD*/\n')printVector(f, weights[0], name + '_weights')
+ f.write('#endif /*DOT_PROD*/\n')printVector(f, weights[1], name + '_recurrent_weights')
printVector(f, weights[-1], name + '_bias')
+ subias = weights[-1].copy()
+ subias[0,:] = subias[0,:] - np.sum(qweight*(1./128.),axis=0)
+ printVector(f, subias, name + '_subias')
if hasattr(self, 'activation'):
activation = self.activation.__name__.upper()
else:
@@ -128,8 +152,8 @@
reset_after = 1
neurons = weights[0].shape[1]//3
max_rnn_neurons = max(max_rnn_neurons, neurons)
- f.write('const GRULayer {} = {{\n {}_bias,\n {}_weights,\n {}_recurrent_weights,\n {}, {}, ACTIVATION_{}, {}\n}};\n\n'
-         .format(name, name, name, name, weights[0].shape[0], weights[0].shape[1]//3, activation, reset_after))
+ f.write('const GRULayer {} = {{\n {}_bias,\n {}_subias,\n {}_weights,\n {}_recurrent_weights,\n {}, {}, ACTIVATION_{}, {}\n}};\n\n'
+         .format(name, name, name, name, name, weights[0].shape[0], weights[0].shape[1]//3, activation, reset_after))
hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
hf.write('extern const GRULayer {};\n\n'.format(name));
@@ -224,7 +248,8 @@
hf = open(hfile, 'w')
-f.write('/*This file is automatically generated from a Keras model*/\n\n')
+f.write('/*This file is automatically generated from a Keras model*/\n')
+f.write('/*based on model {}*/\n\n'.format(sys.argv[1]))
f.write('#ifdef HAVE_CONFIG_H\n#include "config.h"\n#endif\n\n#include "nnet.h"\n#include "{}"\n\n'.format(hfile))
hf.write('/*This file is automatically generated from a Keras model*/\n\n')
--- a/dnn/training_tf2/lpcnet.py
+++ b/dnn/training_tf2/lpcnet.py
@@ -26,9 +26,12 @@
'''
import math
+import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, Dense, Embedding, Reshape, Concatenate, Lambda, Conv1D, Multiply, Add, Bidirectional, MaxPooling1D, Activation
+from tensorflow.compat.v1.keras.layers import CuDNNGRU
from tensorflow.keras import backend as K
+from tensorflow.keras.constraints import Constraint
from tensorflow.keras.initializers import Initializer
from tensorflow.keras.callbacks import Callback
from mdense import MDense
@@ -41,6 +44,12 @@
embed_size = 128
pcm_levels = 2**pcm_bits
+def quant_regularizer(x):
+ Q = 128
+ Q_1 = 1./Q
+ #return .01 * tf.reduce_mean(1 - tf.math.cos(2*3.1415926535897931*(Q*x-tf.round(Q*x))))
+ return .01 * tf.reduce_mean(K.sqrt(K.sqrt(1.0001 - tf.math.cos(2*3.1415926535897931*(Q*x-tf.round(Q*x))))))
+
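The regularizer above is zero exactly on the 1/128 quantization grid and largest halfway between levels; the double square root flattens the penalty well so gradients remain useful close to the grid. Evaluating the same expression in NumPy:
```
import numpy as np

Q = 128
def penalty(x):
    return np.sqrt(np.sqrt(1.0001 - np.cos(2 * np.pi * (Q * x - np.round(Q * x)))))

print(penalty(np.array([1.0 / Q])))  # ~0.1: on a quantization level
print(penalty(np.array([1.5 / Q])))  # ~1.19: halfway between levels
```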
class Sparsify(Callback):
def __init__(self, t_start, t_end, interval, density):
super(Sparsify, self).__init__()
@@ -73,15 +82,19 @@
density = 1 - (1-self.final_density[k])*(1 - r*r*r)
A = p[:, k*N:(k+1)*N]
A = A - np.diag(np.diag(A))
- #A = np.transpose(A, (1, 0))
- L=np.reshape(A, (N, N//16, 16))
+ #This is needed because of the CuDNNGRU strange weight ordering
+ A = np.transpose(A, (1, 0))
+ L=np.reshape(A, (N//4, 4, N//8, 8))
S=np.sum(L*L, axis=-1)
+ S=np.sum(S, axis=1)
SS=np.sort(np.reshape(S, (-1,)))
- thresh = SS[round(N*N//16*(1-density))]
+ thresh = SS[round(N*N//32*(1-density))]
mask = (S>=thresh).astype('float32');
- mask = np.repeat(mask, 16, axis=1)
+ mask = np.repeat(mask, 4, axis=0)
+ mask = np.repeat(mask, 8, axis=1)
mask = np.minimum(1, mask + np.diag(np.ones((N,))))
- #mask = np.transpose(mask, (1, 0))
+ #This is needed because of the CuDNNGRU strange weight ordering
+ mask = np.transpose(mask, (1, 0))
p[:, k*N:(k+1)*N] = p[:, k*N:(k+1)*N]*mask
#print(thresh, np.mean(mask))
w[1] = p
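Pruning now works on 4x8 blocks so the surviving weights line up with the 8x4 dot-product kernels: block energies are ranked, the weakest blocks are zeroed, and the mask is expanded back to individual weights. A simplified sketch of the masking step (omitting the diagonal preservation and the CuDNNGRU weight-ordering transpose, and assuming N is a multiple of 8):
```
import numpy as np

def block_mask(A, density, bh=4, bw=8):
    N = A.shape[0]
    L = A.reshape(N // bh, bh, N // bw, bw)
    S = (L * L).sum(axis=(1, 3))          # energy of each bh x bw block
    thresh = np.sort(S.ravel())[round(S.size * (1 - density))]
    mask = (S >= thresh).astype(A.dtype)  # keep roughly `density` of the blocks
    mask = np.repeat(np.repeat(mask, bh, axis=0), bw, axis=1)
    return A * mask
```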
@@ -113,7 +126,25 @@
'seed': self.seed
}
-def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features = 38, training=False, adaptation=False):
+class WeightClip(Constraint):
+ '''Clips the weights incident to each hidden unit to be inside a range
+ '''
+ def __init__(self, c=2):
+ self.c = c
+
+ def __call__(self, p):
+ # Ensure that abs of adjacent weights don't sum to more than 127. Otherwise there's a risk of
+ # saturation when implementing dot products with SSSE3 or AVX2.
+ return self.c*p/tf.maximum(self.c, tf.repeat(tf.abs(p[:, 1::2])+tf.abs(p[:, 0::2]), 2, axis=1))
+ #return K.clip(p, -self.c, self.c)
+
+ def get_config(self):
+ return {'name': self.__class__.__name__,
+ 'c': self.c}
+
+constraint = WeightClip(0.992)
+
+def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features = 38, training=False, adaptation=False, quantize=False):
pcm = Input(shape=(None, 3))
feat = Input(shape=(None, nb_used_features))
pitch = Input(shape=(None, 1))
@@ -140,8 +171,18 @@
rep = Lambda(lambda x: K.repeat_elements(x, frame_size, 1))
- rnn = GRU(rnn_units1, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='gru_a')
- rnn2 = GRU(rnn_units2, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='gru_b')
+ quant = quant_regularizer if quantize else None
+
+ if training:
+ rnn = CuDNNGRU(rnn_units1, return_sequences=True, return_state=True, name='gru_a',
+ recurrent_constraint = constraint, recurrent_regularizer=quant)
+ rnn2 = CuDNNGRU(rnn_units2, return_sequences=True, return_state=True, name='gru_b',
+ kernel_constraint=constraint, kernel_regularizer=quant)
+ else:
+ rnn = GRU(rnn_units1, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='gru_a',
+ recurrent_constraint = constraint, recurrent_regularizer=quant)
+ rnn2 = GRU(rnn_units2, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='gru_b',
+ kernel_constraint=constraint, kernel_regularizer=quant)
rnn_in = Concatenate()([cpcm, rep(cfeat)])
md = MDense(pcm_levels, activation='softmax', name='dual_fc')
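The WeightClip constraint keeps each pair of adjacent weights inside |w2k| + |w2k+1| <= c; with c = 0.992, the corresponding int8 pairs sum below 127, so the 16-bit accumulators of the SSSE3/AVX2 signed-by-unsigned multiply-adds cannot saturate. The same operation in NumPy, for reference:
```
import numpy as np

def weight_clip(p, c=0.992):
    # Rescale columns pairwise so |p[:, 2k]| + |p[:, 2k+1]| <= c everywhere.
    pair = np.abs(p[:, 1::2]) + np.abs(p[:, 0::2])
    return c * p / np.maximum(c, np.repeat(pair, 2, axis=1))
```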
--- /dev/null
+++ b/dnn/training_tf2/pade.py
@@ -1,0 +1,70 @@
+# Optimizing a rational function to optimize a tanh() approximation
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras.models import Model
+from tensorflow.keras.layers import Input, GRU, Dense, Embedding, Reshape, Concatenate, Lambda, Conv1D, Multiply, Add, Bidirectional, MaxPooling1D, Activation
+import tensorflow.keras.backend as K
+from tensorflow.keras.optimizers import Adam, SGD
+
+def my_loss1(y_true, y_pred):
+ return 1*K.mean(K.square(y_true-y_pred)) + 1*K.max(K.square(y_true-y_pred), axis=1)
+
+def my_loss2(y_true, y_pred):
+ return .1*K.mean(K.square(y_true-y_pred)) + 1*K.max(K.square(y_true-y_pred), axis=1)
+
+def my_loss3(y_true, y_pred):
+ return .01*K.mean(K.square(y_true-y_pred)) + 1*K.max(K.square(y_true-y_pred), axis=1)
+
+# Using these initializers to seed the approximation
+# with a reasonable starting point
+def num_init(shape, dtype=None):
+ rr = tf.constant([[945], [105], [1]], dtype=dtype)
+ #rr = tf.constant([[946.56757], [98.01368], [0.66841]], dtype=dtype)
+ print(rr)
+ return rr
+
+def den_init(shape, dtype=None):
+ rr = tf.constant([[945], [420], [15]], dtype=dtype)
+ #rr = tf.constant([[946.604], [413.342], [12.465]], dtype=dtype)
+ print(rr)
+ return rr
+
+
+x = np.arange(-10, 10, .01)
+N = len(x)
+x = np.reshape(x, (1, -1, 1))
+x2 = x*x
+
+x2in = np.concatenate([x2*0 + 1, x2, x2*x2], axis=2)
+yout = np.tanh(x)
+
+
+model_x = Input(shape=(None, 1,))
+model_x2 = Input(shape=(None, 3,))
+
+num = Dense(1, name='num', use_bias=False, kernel_initializer=num_init)
+den = Dense(1, name='den', use_bias=False, kernel_initializer=den_init)
+
+def ratio(x):
+ return tf.minimum(1., tf.maximum(-1., x[0]*x[1]/x[2]))
+
+out_layer = Lambda(ratio)
+output = out_layer([model_x, num(model_x2), den(model_x2)])
+
+model = Model([model_x, model_x2], output)
+model.summary()
+
+model.compile(Adam(0.05, beta_1=0.9, beta_2=0.9, decay=2e-5), loss='mean_squared_error')
+model.fit([x, x2in], yout, batch_size=1, epochs=500000, validation_split=0.0)
+
+model.compile(Adam(0.001, beta_2=0.9, decay=1e-4), loss=my_loss1)
+model.fit([x, x2in], yout, batch_size=1, epochs=50000, validation_split=0.0)
+
+model.compile(Adam(0.0001, beta_2=0.9, decay=1e-4), loss=my_loss2)
+model.fit([x, x2in], yout, batch_size=1, epochs=50000, validation_split=0.0)
+
+model.compile(Adam(0.00001, beta_2=0.9, decay=1e-4), loss=my_loss3)
+model.fit([x, x2in], yout, batch_size=1, epochs=50000, validation_split=0.0)
+
+model.save_weights('tanh.h5')
--- /dev/null
+++ b/dnn/training_tf2/test_lpcnet.py
@@ -1,0 +1,98 @@
+#!/usr/bin/python3
+'''Copyright (c) 2018 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+import lpcnet
+import sys
+import numpy as np
+from ulaw import ulaw2lin, lin2ulaw
+import h5py
+
+
+model, enc, dec = lpcnet.new_lpcnet_model()
+
+model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
+#model.summary()
+
+feature_file = sys.argv[1]
+out_file = sys.argv[2]
+frame_size = model.frame_size
+nb_features = 55
+nb_used_features = model.nb_used_features
+
+features = np.fromfile(feature_file, dtype='float32')
+features = np.resize(features, (-1, nb_features))
+nb_frames = 1
+feature_chunk_size = features.shape[0]
+pcm_chunk_size = frame_size*feature_chunk_size
+
+features = np.reshape(features, (nb_frames, feature_chunk_size, nb_features))
+features[:,:,18:36] = 0
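+# feature 36 encodes the pitch period as (period - 100)/50; invert that mapping
+# to get the integer periods used by the pitch embedding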
+periods = (.1 + 50*features[:,:,36:37]+100).astype('int16')
+
+
+model.load_weights('lpcnet34bq17_384_01.h5')
+
+order = 16
+
+pcm = np.zeros((nb_frames*pcm_chunk_size, ))
+fexc = np.zeros((1, 1, 3), dtype='int16')+128
+state1 = np.zeros((1, model.rnn_units1), dtype='float32')
+state2 = np.zeros((1, model.rnn_units2), dtype='float32')
+
+mem = 0
+coef = 0.85
+
+fout = open(out_file, 'wb')
+
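+# Synthesis loop: for each sample, compute the LPC prediction from the last
+# 'order' output samples, run the sampling network to get the excitation
+# distribution, draw an excitation sample and add it to the prediction. The
+# first order+1 samples are skipped since the predictor has no history yet.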
+skip = order + 1
+for c in range(0, nb_frames):
+    cfeat = enc.predict([features[c:c+1, :, :nb_used_features], periods[c:c+1, :, :]])
+    for fr in range(0, feature_chunk_size):
+        f = c*feature_chunk_size + fr
+        a = features[c, fr, nb_features-order:]
+        for i in range(skip, frame_size):
+            pred = -sum(a*pcm[f*frame_size + i - 1:f*frame_size + i - order-1:-1])
+            fexc[0, 0, 1] = lin2ulaw(pred)
+
+            p, state1, state2 = dec.predict([fexc, cfeat[:, fr:fr+1, :], state1, state2])
+            #Lower the temperature for voiced frames to reduce noisiness
+            p *= np.power(p, np.maximum(0, 1.5*features[c, fr, 37] - .5))
+            p = p/(1e-18 + np.sum(p))
+            #Cut off the tail of the remaining distribution
+            p = np.maximum(p-0.002, 0).astype('float64')
+            p = p/(1e-8 + np.sum(p))
+
+            fexc[0, 0, 2] = np.argmax(np.random.multinomial(1, p[0,0,:], 1))
+            pcm[f*frame_size + i] = pred + ulaw2lin(fexc[0, 0, 2])
+            fexc[0, 0, 0] = lin2ulaw(pcm[f*frame_size + i])
+            mem = coef*mem + pcm[f*frame_size + i]
+            #print(mem)
+            np.array([np.round(mem)], dtype='int16').tofile(fout)
+        skip = 0
+
+
--- a/dnn/training_tf2/train_lpcnet.py
+++ b/dnn/training_tf2/train_lpcnet.py
@@ -37,23 +37,36 @@
import h5py
import tensorflow as tf
-gpus = tf.config.experimental.list_physical_devices('GPU')
-if gpus:
-    try:
-        tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])
-    except RuntimeError as e:
-        print(e)
+#gpus = tf.config.experimental.list_physical_devices('GPU')
+#if gpus:
+#    try:
+#        tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])
+#    except RuntimeError as e:
+#        print(e)
nb_epochs = 120
# Try reducing batch_size if you run out of memory on your GPU
-batch_size = 64
+batch_size = 128
-model, _, _ = lpcnet.new_lpcnet_model(training=True)
+#Set this to True to adapt an existing model (e.g. on new data)
+adaptation = False
-model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
-model.summary()
+if adaptation:
+ lr = 0.0001
+ decay = 0
+else:
+ lr = 0.001
+ decay = 2.5e-5
+opt = Adam(lr, decay=decay, beta_2=0.99)
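+# Mirror the model across the available workers/GPUs; with a single device this
+# should behave like ordinary single-device training.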
+strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
+
+with strategy.scope():
+ model, _, _ = lpcnet.new_lpcnet_model(training=True)
+ model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
+ model.summary()
+
feature_file = sys.argv[1]
pcm_file = sys.argv[2] # 16 bit unsigned short PCM samples
frame_size = model.frame_size
@@ -65,7 +78,7 @@
# u for unquantised, load 16 bit PCM samples and convert to mu-law
data = np.fromfile(pcm_file, dtype='uint8')
-nb_frames = len(data)//(4*pcm_chunk_size)
+nb_frames = len(data)//(4*pcm_chunk_size)//batch_size*batch_size
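+# (rounded down to a whole number of batches so the distributed strategy always
+# sees full, equal-sized batches)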
features = np.fromfile(feature_file, dtype='float32')
@@ -102,23 +115,15 @@
del in_exc
# dump models to disk as we go
-checkpoint = ModelCheckpoint('lpcnet32c_384_10_G16_{epoch:02d}.h5')
+checkpoint = ModelCheckpoint('lpcnet33e_384_{epoch:02d}.h5')
-#Set this to True to adapt an existing model (e.g. on new data)
-adaptation = False
-
if adaptation:
#Adapting from an existing model
-    model.load_weights('lpcnet24c_384_10_G16_120.h5')
+    model.load_weights('lpcnet33a_384_100.h5')
    sparsify = lpcnet.Sparsify(0, 0, 1, (0.05, 0.05, 0.2))
- lr = 0.0001
- decay = 0
else:
#Training from scratch
sparsify = lpcnet.Sparsify(2000, 40000, 400, (0.05, 0.05, 0.2))
- lr = 0.001
- decay = 5e-5
-model.compile(optimizer=Adam(lr, decay=decay, beta_2=0.99), loss='sparse_categorical_crossentropy')
-model.save_weights('lpcnet32c_384_10_G16_00.h5');
+model.save_weights('lpcnet33e_384_00.h5');
model.fit([in_data, features, periods], out_exc, batch_size=batch_size, epochs=nb_epochs, validation_split=0.0, callbacks=[checkpoint, sparsify])
--- a/dnn/ulaw.py
+++ /dev/null
@@ -1,19 +1,0 @@
-
-import numpy as np
-import math
-
-scale = 255.0/32768.0
-scale_1 = 32768.0/255.0
-def ulaw2lin(u):
- u = u - 128
- s = np.sign(u)
- u = np.abs(u)
- return s*scale_1*(np.exp(u/128.*math.log(256))-1)
-
-
-def lin2ulaw(x):
- s = np.sign(x)
- x = np.abs(x)
- u = (s*(128*np.log(1+scale*x)/math.log(256)))
- u = np.clip(128 + np.round(u), 0, 255)
- return u.astype('int16')
--- a/dnn/vec.h
+++ b/dnn/vec.h
@@ -25,9 +25,41 @@
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+
+#ifndef VEC_H
+#define VEC_H
+
+#include "tansig_table.h"
+#include "opus_types.h"
+#include <math.h>
+#include "arch.h"
+
+
+#ifdef __AVX__
+#include "vec_avx.h"
+#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include "vec_neon.h"
+#else
+
+#define MAX_INPUTS (2048)
+
+#define NO_OPTIMIZATIONS
+
+#ifndef DISABLE_DOT_PROD
+#define DOT_PROD
+//#define USE_SU_BIAS
+#endif
+
+#ifdef DOT_PROD
+typedef signed char qweight;
+#else
+typedef float qweight;
+#endif
+
+
/* No AVX2/FMA support */
#ifndef LPCNET_TEST
-static float celt_exp2(float x)
+static inline float celt_exp2(float x)
{
int integer;
float frac;
@@ -47,7 +79,7 @@
}
#define celt_exp(x) celt_exp2((x)*1.44269504f)
-static float tansig_approx(float x)
+static inline float tansig_approx(float x)
{
int i;
float y, dy;
@@ -66,12 +98,12 @@
return sign*y;
}
-static OPUS_INLINE float sigmoid_approx(float x)
+static inline float sigmoid_approx(float x)
{
return .5f + .5f*tansig_approx(.5f*x);
}
-static void softmax(float *y, const float *x, int N)
+static inline void softmax(float *y, const float *x, int N)
{
int i;
for (i=0;i<N;i++)
@@ -78,7 +110,7 @@
y[i] = celt_exp(x[i]);
}
-static void vec_tanh(float *y, const float *x, int N)
+static inline void vec_tanh(float *y, const float *x, int N)
{
int i;
for (i=0;i<N;i++)
@@ -87,7 +119,7 @@
}
}
-static void vec_sigmoid(float *y, const float *x, int N)
+static inline void vec_sigmoid(float *y, const float *x, int N)
{
int i;
for (i=0;i<N;i++)
@@ -96,7 +128,7 @@
}
}
#endif
-static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+static inline void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
{
int i, j;
for (i=0;i<rows;i+=16)
@@ -129,7 +161,7 @@
}
}
-static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
{
int i, j;
for (i=0;i<rows;i+=16)
@@ -162,3 +194,216 @@
}
}
}
+
+#ifdef DOT_PROD
+
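+/* Activations are quantized to 8 bits with a scale of 127 and the weights are
+   assumed to be quantized with a scale of 128, so accumulated products carry a
+   fixed-point scale of 128*127 that is removed at the end of each GEMV. */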
+#define SCALE (128.f*127.f)
+#define SCALE_1 (1.f/128.f/127.f)
+
+
+#ifdef USE_SU_BIAS
+
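+/* With USE_SU_BIAS the activations are offset into unsigned range
+   (x -> 127 + 127*x) so that signed-by-unsigned 8-bit products can be used;
+   the constant 127 offset is presumably folded into the layer biases when the
+   weights are dumped. */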
+static inline void sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
+{
+ int i, j;
+ unsigned char x[MAX_INPUTS];
+ (void)col_stride;
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = 127+(int)floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ for (j=0;j<cols;j+=4)
+ {
+ float * restrict y;
+ float xj0, xj1, xj2, xj3;
+ xj0 = x[j+0];
+ xj1 = x[j+1];
+ xj2 = x[j+2];
+ xj3 = x[j+3];
+ y = &out[i];
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ w += 32;
+ }
+ }
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
+{
+ int i, j;
+ unsigned char x[MAX_INPUTS];
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ int colblocks;
+ colblocks = *idx++;
+ for (j=0;j<colblocks;j++)
+ {
+ int pos;
+ float * restrict y;
+ int xj0, xj1, xj2, xj3;
+ pos = 4 * (*idx++);
+ xj0 = x[pos+0];
+ xj1 = x[pos+1];
+ xj2 = x[pos+2];
+ xj3 = x[pos+3];
+ y = &out[i];
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ w += 32;
+ }
+ }
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+#else /*USE_SU_BIAS*/
+
+static inline void sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
+{
+ int i, j;
+ signed char x[MAX_INPUTS];
+ (void)col_stride;
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = (int)floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ for (j=0;j<cols;j+=4)
+ {
+ float * restrict y;
+ float xj0, xj1, xj2, xj3;
+ xj0 = x[j+0];
+ xj1 = x[j+1];
+ xj2 = x[j+2];
+ xj3 = x[j+3];
+ y = &out[i];
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ w += 32;
+ }
+ }
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
+{
+ int i, j;
+ signed char x[MAX_INPUTS];
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ int colblocks;
+ colblocks = *idx++;
+ for (j=0;j<colblocks;j++)
+ {
+ int pos;
+ float * restrict y;
+ int xj0, xj1, xj2, xj3;
+ pos = 4 * (*idx++);
+ xj0 = x[pos+0];
+ xj1 = x[pos+1];
+ xj2 = x[pos+2];
+ xj3 = x[pos+3];
+ y = &out[i];
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ w += 32;
+ }
+ }
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+#endif /*USE_SU_BIAS*/
+
+#else /*DOT_PROD*/
+
+#define sgemv_accum8x4 sgemv_accum
+
+
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int ignore, const int *idx, const float *x)
+{
+ int i, j;
+ (void)ignore;
+ for (i=0;i<rows;i+=8)
+ {
+ int cols;
+ cols = *idx++;
+ for (j=0;j<cols;j++)
+ {
+ int pos;
+ float * restrict y;
+ float xj0, xj1, xj2, xj3;
+ pos = 4 * (*idx++);
+ xj0 = x[pos+0];
+ xj1 = x[pos+1];
+ xj2 = x[pos+2];
+ xj3 = x[pos+3];
+ y = &out[i];
+ y[0] += w[0]*xj0;
+ y[1] += w[1]*xj0;
+ y[2] += w[2]*xj0;
+ y[3] += w[3]*xj0;
+ y[4] += w[4]*xj0;
+ y[5] += w[5]*xj0;
+ y[6] += w[6]*xj0;
+ y[7] += w[7]*xj0;
+
+ y[0] += w[8]*xj1;
+ y[1] += w[9]*xj1;
+ y[2] += w[10]*xj1;
+ y[3] += w[11]*xj1;
+ y[4] += w[12]*xj1;
+ y[5] += w[13]*xj1;
+ y[6] += w[14]*xj1;
+ y[7] += w[15]*xj1;
+
+ y[0] += w[16]*xj2;
+ y[1] += w[17]*xj2;
+ y[2] += w[18]*xj2;
+ y[3] += w[19]*xj2;
+ y[4] += w[20]*xj2;
+ y[5] += w[21]*xj2;
+ y[6] += w[22]*xj2;
+ y[7] += w[23]*xj2;
+
+ y[0] += w[24]*xj3;
+ y[1] += w[25]*xj3;
+ y[2] += w[26]*xj3;
+ y[3] += w[27]*xj3;
+ y[4] += w[28]*xj3;
+ y[5] += w[29]*xj3;
+ y[6] += w[30]*xj3;
+ y[7] += w[31]*xj3;
+ w += 32;
+ }
+ }
+}
+#endif /*DOT_PROD*/
+
+
+#endif /*no optimizations*/
+#endif /*VEC_H*/
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -29,10 +29,23 @@
AVX2/FMA implementation of vector operations, compile with -mavx2 -mfma
*/
+#ifndef VEC_AVX_H
+#define VEC_AVX_H
+
#include <immintrin.h>
+#ifndef DISABLE_DOT_PROD
+#define DOT_PROD
+#define USE_SU_BIAS
+#endif
+
+#ifndef __FMA__
+#define _mm256_fmadd_ps(a,b,c) _mm256_add_ps(_mm256_mul_ps(a, b), c)
+#define _mm_fmadd_ps(a,b,c) _mm_add_ps(_mm_mul_ps(a, b), c)
+#endif
+
#ifdef __AVX2__
-static __m256 exp8_approx(__m256 X)
+static inline __m256 exp8_approx(__m256 X)
{
const __m256 K0 = _mm256_set1_ps(0.99992522f);
const __m256 K1 = _mm256_set1_ps(0.69583354f);
@@ -41,7 +54,6 @@
const __m256 log2_E = _mm256_set1_ps(1.44269504);
const __m256 max_in = _mm256_set1_ps(50.f);
const __m256 min_in = _mm256_set1_ps(-50.f);
- const __m256i mask = _mm256_set1_epi32(0x7fffffff);
__m256 XF, Y;
__m256i I;
X = _mm256_mul_ps(X, log2_E);
@@ -51,13 +63,70 @@
X = _mm256_sub_ps(X, XF);
Y = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(K3, X, K2), X, K1), X, K0);
I = _mm256_slli_epi32(I, 23);
- Y = _mm256_castsi256_ps(_mm256_and_si256(mask, _mm256_add_epi32(I, _mm256_castps_si256(Y))));
+ Y = _mm256_castsi256_ps(_mm256_add_epi32(I, _mm256_castps_si256(Y)));
return Y;
}
+
+/* Approximating tanh() using a Padé-like rational function:
+ tanh(x) ~= x * (N0 + N1*x^2 + N2*x^4)/(D0 + D1*x^2 + D2*x^4)
+ subject to the +/- 1 bounds.
+ The coefficients were determined by gradient descent trying to minimize
+ the maximum deviation over the whole range (this is only possible because
+ of the bounds). The max error is around 3e-4 and is dominated by the
+ reciprocal approximation (the max error of the rational function is
+ around 6e-5).
+ */
+static inline __m256 tanh8_approx(__m256 X)
+{
+ const __m256 N0 = _mm256_set1_ps(952.52801514f);
+ const __m256 N1 = _mm256_set1_ps(96.39235687f);
+ const __m256 N2 = _mm256_set1_ps(0.60863042f);
+ const __m256 D0 = _mm256_set1_ps(952.72399902f);
+ const __m256 D1 = _mm256_set1_ps(413.36801147f);
+ const __m256 D2 = _mm256_set1_ps(11.88600922f);
+ const __m256 max_out = _mm256_set1_ps(1.f);
+ const __m256 min_out = _mm256_set1_ps(-1.f);
+ __m256 X2, num, den;
+ X2 = _mm256_mul_ps(X, X);
+ num = _mm256_fmadd_ps(_mm256_fmadd_ps(N2, X2, N1), X2, N0);
+ den = _mm256_fmadd_ps(_mm256_fmadd_ps(D2, X2, D1), X2, D0);
+ num = _mm256_mul_ps(num, X);
+ den = _mm256_rcp_ps(den);
+ num = _mm256_mul_ps(num, den);
+ return _mm256_max_ps(min_out, _mm256_min_ps(max_out, num));
+}
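+/* Sanity check: at x=1 the rational part gives 1049.529/1377.978 ~= 0.76164,
+   vs tanh(1) ~= 0.76159, consistent with the stated ~6e-5 bound. */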
+
+/* Sigmoid approximation using a Padé-like rational function:
+ 1/(1+exp(-x)) ~= 0.5 + x * (N0 + N1*x^2 + N2*x^4)/(D0 + D1*x^2 + D2*x^4)
+ subject to the [0, 1] bounds.
+ The coefficients are directly derived by dividing the tanh() coefficients
+ by powers of two to get the correct scaling. The max error is around 1.5e-4
+ and is dominated by the reciprocal approximation (the max error of the
+ rational function is around 3e-5).
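+   Concretely, sigmoid(x) = 0.5 + 0.5*tanh(x/2), so substituting x/2 into the
+   tanh() polynomials scales N0, N1, N2 by 1/4, 1/16, 1/64 and D1, D2 by
+   1/4, 1/16, leaving D0 unchanged; this matches the constants below.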
+ */
+static inline __m256 sigmoid8_approx(__m256 X)
+{
+ const __m256 N0 = _mm256_set1_ps(238.13200378f);
+ const __m256 N1 = _mm256_set1_ps(6.02452230f);
+ const __m256 N2 = _mm256_set1_ps(0.00950985f);
+ const __m256 D0 = _mm256_set1_ps(952.72399902f);
+ const __m256 D1 = _mm256_set1_ps(103.34200287f);
+ const __m256 D2 = _mm256_set1_ps(0.74287558f);
+ const __m256 half = _mm256_set1_ps(0.5);
+ const __m256 max_out = _mm256_set1_ps(1.f);
+ const __m256 min_out = _mm256_set1_ps(0.f);
+ __m256 X2, num, den;
+ X2 = _mm256_mul_ps(X, X);
+ num = _mm256_fmadd_ps(_mm256_fmadd_ps(N2, X2, N1), X2, N0);
+ den = _mm256_fmadd_ps(_mm256_fmadd_ps(D2, X2, D1), X2, D0);
+ num = _mm256_mul_ps(num, X);
+ den = _mm256_rcp_ps(den);
+ num = _mm256_fmadd_ps(num, den, half);
+ return _mm256_max_ps(min_out, _mm256_min_ps(max_out, num));
+}
+
#else
-#define _mm256_fmadd_ps(a,b,c) _mm256_add_ps(_mm256_mul_ps(a, b), c)
-#define _mm_fmadd_ps(a,b,c) _mm_add_ps(_mm_mul_ps(a, b), c)
-static __m128 exp4_approx(__m128 X)
+static inline __m128 exp4_approx(__m128 X)
{
const __m128 K0 = _mm_set1_ps(0.99992522f);
const __m128 K1 = _mm_set1_ps(0.69583354f);
@@ -79,7 +148,7 @@
Y = _mm_castsi128_ps(_mm_and_si128(mask, _mm_add_epi32(I, _mm_castps_si128(Y))));
return Y;
}
-static __m256 exp8_approx(__m256 X)
+static inline __m256 exp8_approx(__m256 X)
{
__m256 Y;
__m128 Xhi, Xlo, Yhi, Ylo;
@@ -91,9 +160,51 @@
Y = _mm256_insertf128_ps(Y, Ylo, 0);
return Y;
}
+
+static inline __m128 tanh4_approx(__m128 X)
+{
+ const __m128 N0 = _mm_set1_ps(952.52801514f);
+ const __m128 N1 = _mm_set1_ps(96.39235687f);
+ const __m128 N2 = _mm_set1_ps(0.60863042f);
+ const __m128 D0 = _mm_set1_ps(952.72399902f);
+ const __m128 D1 = _mm_set1_ps(413.36801147f);
+ const __m128 D2 = _mm_set1_ps(11.88600922f);
+ const __m128 max_out = _mm_set1_ps(1.f);
+ const __m128 min_out = _mm_set1_ps(-1.f);
+ __m128 X2, num, den;
+ X2 = _mm_mul_ps(X, X);
+ num = _mm_fmadd_ps(_mm_fmadd_ps(N2, X2, N1), X2, N0);
+ den = _mm_fmadd_ps(_mm_fmadd_ps(D2, X2, D1), X2, D0);
+ num = _mm_mul_ps(num, X);
+ den = _mm_rcp_ps(den);
+ num = _mm_mul_ps(num, den);
+ return _mm_max_ps(min_out, _mm_min_ps(max_out, num));
+}
+
+static inline __m128 sigmoid4_approx(__m128 X)
+{
+ const __m128 N0 = _mm_set1_ps(238.13200378f);
+ const __m128 N1 = _mm_set1_ps(6.02452230f);
+ const __m128 N2 = _mm_set1_ps(0.00950985f);
+ const __m128 D0 = _mm_set1_ps(952.72399902f);
+ const __m128 D1 = _mm_set1_ps(103.34200287f);
+ const __m128 D2 = _mm_set1_ps(0.74287558f);
+ const __m128 half = _mm_set1_ps(0.5);
+ const __m128 max_out = _mm_set1_ps(1.f);
+ const __m128 min_out = _mm_set1_ps(0.f);
+ __m128 X2, num, den;
+ X2 = _mm_mul_ps(X, X);
+ num = _mm_fmadd_ps(_mm_fmadd_ps(N2, X2, N1), X2, N0);
+ den = _mm_fmadd_ps(_mm_fmadd_ps(D2, X2, D1), X2, D0);
+ num = _mm_mul_ps(num, X);
+ den = _mm_rcp_ps(den);
+ num = _mm_fmadd_ps(num, den, half);
+ return _mm_max_ps(min_out, _mm_min_ps(max_out, num));
+}
+
#endif
-static float celt_exp(float x)
+static inline float celt_exp(float x)
{
float out[8];
__m256 X, Y;
@@ -103,7 +214,7 @@
return out[0];
}
-static void softmax(float *y, const float *x, int N)
+static inline void softmax(float *y, const float *x, int N)
{
int i;
for (i=0;i<N-7;i+=8)
@@ -117,18 +228,15 @@
y[i] = celt_exp(x[i]);
}
-static void vec_tanh(float *y, const float *x, int N)
+#ifdef __AVX2__
+static inline void vec_tanh(float *y, const float *x, int N)
{
int i;
for (i=0;i<N-7;i+=8)
{
- const __m256 two = _mm256_set1_ps(2.f);
- const __m256 one = _mm256_set1_ps(1.f);
__m256 X, Y;
X = _mm256_loadu_ps(&x[i]);
- X = _mm256_mul_ps(X, two);
- Y = exp8_approx(X);
- Y = _mm256_mul_ps(_mm256_sub_ps(Y, one), _mm256_rcp_ps(_mm256_add_ps(Y, one)));
+ Y = tanh8_approx(X);
_mm256_storeu_ps(&y[i], Y);
}
for (;i<N;i++)
@@ -139,17 +247,14 @@
}
}
-static void vec_sigmoid(float *y, const float *x, int N)
+static inline void vec_sigmoid(float *y, const float *x, int N)
{
int i;
for (i=0;i<N-7;i+=8)
{
- const __m256 one = _mm256_set1_ps(1.f);
__m256 X, Y;
X = _mm256_loadu_ps(&x[i]);
- Y = exp8_approx(X);
- /* Compute as 1-1/(1+e^x) to avoid >1 values caused by the reciprocal approximation. */
- Y = _mm256_sub_ps(one, _mm256_mul_ps(one, _mm256_rcp_ps(_mm256_add_ps(Y, one))));
+ Y = sigmoid8_approx(X);
_mm256_storeu_ps(&y[i], Y);
}
for (;i<N;i++)
@@ -159,9 +264,47 @@
y[i] = (ex)/(ex+1);
}
}
+#else
+static inline void vec_tanh(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N-3;i+=4)
+ {
+ __m128 X, Y;
+ X = _mm_loadu_ps(&x[i]);
+ Y = tanh4_approx(X);
+ _mm_storeu_ps(&y[i], Y);
+ }
+ for (;i<N;i++)
+ {
+ float ex2;
+ ex2 = celt_exp(2*x[i]);
+ y[i] = (ex2-1)/(ex2+1);
+ }
+}
-static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+static inline void vec_sigmoid(float *y, const float *x, int N)
{
+ int i;
+ for (i=0;i<N-3;i+=4)
+ {+ __m128 X, Y;
+ X = _mm_loadu_ps(&x[i]);
+ Y = sigmoid4_approx(X);
+ _mm_storeu_ps(&y[i], Y);
+ }
+ for (;i<N;i++)
+ {+ float ex;
+ ex = celt_exp(x[i]);
+ y[i] = (ex)/(ex+1);
+ }
+}
+
+#endif
+
+static inline void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+{
int i, j;
for (i=0;i<rows;i+=16)
{
@@ -186,7 +329,7 @@
_mm256_storeu_ps (&y[8], vy8);
}
}
-static void sparse_sgemv_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
+static inline void sparse_sgemv_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
{
int i, j;
for (i=0;i<rows;i+=16)
@@ -218,3 +361,198 @@
}
}
+#ifdef DOT_PROD
+#define USE_SU_BIAS
+
+typedef signed char qweight;
+
+
+#define MAX_INPUTS (2048)
+#define MAX_OUTPUTS (8192)
+
+
+#define SCALE (128.f*127.f)
+#define SCALE_1 (1.f/128.f/127.f)
+
+#if 1
+static inline void sgemv_accum8x4(float *_out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
+{
+ __m256i ones;
+ int i, j;
+ unsigned char x[MAX_INPUTS];
+ int out[MAX_OUTPUTS];
+ (void)col_stride;
+ ones = _mm256_set1_epi16(1);
+ for (i=0;i<rows;i++) out[i] = SCALE*_out[i];
+ //for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);
+ __m256 const127 = _mm256_set1_ps(127.f);
+ for (i=0;i<cols;i+=8) {
+ __m256 xf;
+ __m256i xi;
+ xf = _mm256_loadu_ps(&_x[i]);
+ //xf = _mm256_mul_ps(xf, const127);
+ //xf = _mm256_add_ps(xf, const127);
+ xf = _mm256_fmadd_ps(xf, const127, const127);
+ xi = _mm256_cvtps_epi32(xf);
+ xi = _mm256_packus_epi32(xi, _mm256_setzero_si256());
+ xi = _mm256_permute4x64_epi64(xi, 0xD8);
+ xi = _mm256_packus_epi16(xi, _mm256_setzero_si256());
+ xi = _mm256_permutevar8x32_epi32(xi, _mm256_setr_epi32(0,1, 0,0, 0,0, 0,0));
+ //xi = _mm256_permute4x64_epi64(xi, 0x);
+ _mm256_storeu_si256 ((__m256i *)&x[i], xi);
+ }
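+ /* Each 32-byte weight block covers 8 output rows by 4 input columns:
+    maddubs multiplies unsigned activation bytes by signed weight bytes and
+    sums adjacent pairs into 16-bit terms (with saturation), then madd with
+    ones widens and reduces them into the 32-bit accumulators. */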
+ for (i=0;i<rows;i+=8)
+ {
+ int * restrict y;
+ __m256i vy0;
+ y = &out[i];
+ vy0 = _mm256_loadu_si256((const __m256i *)&y[0]);
+ for (j=0;j<cols;j+=4)
+ {
+ __m256i tmp;
+ __m256i vxj;
+ __m256i vw;
+ vxj = _mm256_set1_epi32(*(int*)&x[j]);
+ vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
+ tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ }
+ _mm256_storeu_si256 ((__m256i *)&y[0], vy0);
+ }
+ for (i=0;i<rows;i++) _out[i] = SCALE_1*out[i];
+}
+#else
+static inline void sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
+{
+ int i, j;
+ unsigned char x[MAX_INPUTS];
+ (void)col_stride;
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = 127+(int)floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ for (j=0;j<cols;j+=4)
+ {
+ float * restrict y;
+ float xj0, xj1, xj2, xj3;
+ xj0 = x[j+0];
+ xj1 = x[j+1];
+ xj2 = x[j+2];
+ xj3 = x[j+3];
+ y = &out[i];
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ w += 32;
+ }
+ }
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+#endif
+
+static inline void sparse_sgemv_accum8x4(float *_out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
+{
+ __m256i ones;
+ int i, j;
+ unsigned char x[MAX_INPUTS];
+ int out[MAX_OUTPUTS];
+ ones = _mm256_set1_epi16(1);
+ for (i=0;i<rows;i++) out[i] = SCALE*_out[i];
+ //for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);
+ __m256 const127 = _mm256_set1_ps(127.f);
+ for (i=0;i<cols;i+=8) {
+ __m256 xf;
+ __m256i xi;
+ xf = _mm256_loadu_ps(&_x[i]);
+ //xf = _mm256_mul_ps(xf, const127);
+ //xf = _mm256_add_ps(xf, const127);
+ xf = _mm256_fmadd_ps(xf, const127, const127);
+ xi = _mm256_cvtps_epi32(xf);
+ xi = _mm256_packus_epi32(xi, _mm256_setzero_si256());
+ xi = _mm256_permute4x64_epi64(xi, 0xD8);
+ xi = _mm256_packus_epi16(xi, _mm256_setzero_si256());
+ xi = _mm256_permutevar8x32_epi32(xi, _mm256_setr_epi32(0,1, 0,0, 0,0, 0,0));
+ //xi = _mm256_permute4x64_epi64(xi, 0x);
+ _mm256_storeu_si256 ((__m256i *)&x[i], xi);
+ }
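+ /* Sparse layout: for each block of 8 output rows, *idx gives the number of
+    non-zero 4-column blocks, followed by the position of each such block. */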
+ for (i=0;i<rows;i+=8)
+ {
+ int * restrict y;
+ int colblocks;
+ __m256i vy0;
+ colblocks = *idx++;
+ y = &out[i];
+ vy0 = _mm256_loadu_si256((const __m256i *)&y[0]);
+ for (j=0;j<colblocks;j++)
+ {
+ __m256i tmp;
+ __m256i vxj;
+ __m256i vw;
+ int pos;
+ pos = 4 * (*idx++);
+ vxj = _mm256_set1_epi32(*(int*)&x[pos]);
+ vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
+ tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ }
+ _mm256_storeu_si256 ((__m256i *)&y[0], vy0);
+ }
+ for (i=0;i<rows;i++) _out[i] = SCALE_1*out[i];
+}
+
+
+#else /*DOT_PROD*/
+typedef float qweight;
+#define sgemv_accum8x4 sgemv_accum
+
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, int ignore, const int *idx, const float *x)
+{
+ int i, j;
+ (void)ignore;
+ for (i=0;i<rows;i+=8)
+ {
+ float * restrict y;
+ int cols;
+ __m256 vy0;
+ y = &out[i];
+ vy0 = _mm256_loadu_ps(&y[0]);
+ cols = *idx++;
+ for (j=0;j<cols;j++)
+ {
+ int id;
+ __m256 vxj;
+ __m256 vw;
+ id = *idx++;
+ vxj = _mm256_broadcast_ss(&x[4*id]);
+ vw = _mm256_loadu_ps(&weights[0]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ vxj = _mm256_broadcast_ss(&x[4*id+1]);
+ vw = _mm256_loadu_ps(&weights[8]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ vxj = _mm256_broadcast_ss(&x[4*id+2]);
+ vw = _mm256_loadu_ps(&weights[16]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ vxj = _mm256_broadcast_ss(&x[4*id+3]);
+ vw = _mm256_loadu_ps(&weights[24]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ weights += 32;
+ }
+ _mm256_storeu_ps (&y[0], vy0);
+ }
+}
+#endif /*DOT_PROD*/
+
+#endif /*VEC_AVX_H*/
--- a/dnn/vec_neon.h
+++ b/dnn/vec_neon.h
@@ -29,8 +29,15 @@
/* NEON support for ARM machines */
#include <arm_neon.h>
+
+#ifndef DISABLE_DOT_PROD
+#define DOT_PROD
+#endif
+typedef signed char qweight;
+
+
#ifndef LPCNET_TEST
-static OPUS_INLINE float32x4_t exp4_approx(float32x4_t x) {
+static inline float32x4_t exp4_approx(float32x4_t x) {
int32x4_t i;
float32x4_t xf;
@@ -57,7 +64,7 @@
return Y;
}
-static OPUS_INLINE float celt_exp(float x)
+static inline float celt_exp(float x)
{
float out[4];
float32x4_t X, Y;
@@ -67,7 +74,7 @@
return out[0];
}
-static void softmax(float *y, const float *x, int N)
+static inline void softmax(float *y, const float *x, int N)
{
int i;
for (i=0;i<N-3;i+=4)
@@ -81,7 +88,7 @@
y[i] = celt_exp(x[i]);
}
-static void vec_tanh(float *y, const float *x, int N)
+static inline void vec_tanh(float *y, const float *x, int N)
{
int i;
for (i=0;i<N-3;i+=4)
@@ -103,7 +110,7 @@
}
}
-static void vec_sigmoid(float *y, const float *x, int N)
+static inline void vec_sigmoid(float *y, const float *x, int N)
{
int i;
for (i=0;i<N-3;i+=4)
@@ -124,7 +131,7 @@
}
#endif
-static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+static inline void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
{
int i, j;
for (i=0;i<rows;i+=16)
@@ -168,7 +175,7 @@
}
}
-static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
{
int i, j;
for (i=0;i<rows;i+=16)
@@ -206,4 +213,76 @@
vst1q_f32(&y[12], y12_15);
}
+}
+
+#define SCALE (128.f*127.f)
+#define SCALE_1 (1.f/128.f/127.f)
+
+#define MAX_INPUTS 2048
+#define MAX_OUTPUTS 8192
+
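+/* Emulated 8-bit dot product (no SDOT instruction assumed): vmull_s8 widens the
+   byte products to 16 bits, then vpaddq_s16 and vpadalq_s16 reduce each group
+   of four products into a 32-bit accumulator lane. Note that vmull_high_s8 and
+   vpaddq_s16 are AArch64-only intrinsics. */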
+static inline int32x4_t vdotprod(int32x4_t acc, int8x16_t a, int8x16_t b)
+{
+ return vpadalq_s16(acc, vpaddq_s16(vmull_s8(vget_low_s8(a), vget_low_s8(b)), vmull_high_s8(a, b)));
+}
+
+static inline void sgemv_accum8x4(float *_out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
+{
+ int i, j;
+ signed char x[MAX_INPUTS];
+ int out[MAX_OUTPUTS];
+ (void)col_stride;
+ for (i=0;i<rows;i++) out[i] = (int)floor(.5+SCALE*_out[i]);
+ for (i=0;i<cols;i++) x[i] = (int)floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ int32x4_t acc0, acc1;
+ acc0 = vld1q_s32(&out[i]);
+ acc1 = vld1q_s32(&out[i+4]);
+ for (j=0;j<cols;j+=4)
+ {
+ int8x16_t vw0, vw1, vx;
+ vx = (int8x16_t)vld1q_dup_s32((int*)&x[j]);
+ vw0 = vld1q_s8(w);
+ vw1 = vld1q_s8(&w[16]);
+ acc0 = vdotprod(acc0, vw0, vx);
+ acc1 = vdotprod(acc1, vw1, vx);
+ w += 32;
+ }
+ vst1q_s32(&out[i], acc0);
+ vst1q_s32(&out[i+4], acc1);
+ }
+ for (i=0;i<rows;i++) _out[i] = SCALE_1*out[i];
+}
+
+static inline void sparse_sgemv_accum8x4(float *_out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
+{
+ int i, j;
+ signed char x[MAX_INPUTS];
+ int out[MAX_OUTPUTS];
+ for (i=0;i<rows;i++) out[i] = (int)floor(.5+SCALE*_out[i]);
+ for (i=0;i<cols;i++) x[i] = floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ int colblocks;
+ int32x4_t acc0, acc1;
+ acc0 = vld1q_s32(&out[i]);
+ acc1 = vld1q_s32(&out[i+4]);
+ colblocks = *idx++;
+ for (j=0;j<colblocks;j++)
+ {
+ int pos;
+ pos = 4 * (*idx++);
+ int8x16_t vw0, vw1, vx;
+ vx = (int8x16_t)vld1q_dup_s32((int*)&x[pos]);
+ vw0 = vld1q_s8(w);
+ vw1 = vld1q_s8(&w[16]);
+ acc0 = vdotprod(acc0, vw0, vx);
+ acc1 = vdotprod(acc1, vw1, vx);
+ w += 32;
+ }
+ vst1q_s32(&out[i], acc0);
+ vst1q_s32(&out[i+4], acc1);
+ }
+ for (i=0;i<rows;i++) _out[i] = SCALE_1*out[i];
}
--