ref: c151fc1853cbd4742943decf3f637b6b39431403
parent: 8e405b44e04dbd86b1349836ad2fcefbb56cdfed
parent: f0ce43389ad630f0594be81ce1716b2b72e8c883
author: Jean-Marc Valin <jmvalin@amazon.com>
date: Wed Jun 30 14:56:04 EDT 2021
Merge branch 'exp_improved_simd2'
--- a/dnn/README.md
+++ b/dnn/README.md
@@ -4,6 +4,7 @@
- J.-M. Valin, J. Skoglund, [A Real-Time Wideband Neural Vocoder at 1.6 kb/s Using LPCNet](https://jmvalin.ca/papers/lpcnet_codec.pdf), *Submitted for INTERSPEECH 2019*.
- J.-M. Valin, J. Skoglund, [LPCNet: Improving Neural Speech Synthesis Through Linear Prediction](https://jmvalin.ca/papers/lpcnet_icassp2019.pdf), *Proc. International Conference on Acoustics, Speech and Signal Processing (ICASSP)*, arXiv:1810.11846, 2019.
+- J. Skoglund, J.-M. Valin, [Improving Opus Low Bit Rate Quality with Neural Speech Synthesis](https://jmvalin.ca/papers/opusnet.pdf), *Proc. INTERSPEECH*, arXiv:1905.04628, 2020.
# Introduction
@@ -23,7 +24,9 @@
make
```
Note that the autogen.sh script is used when building from Git and will automatically download the latest model
-(models are too large to put in Git).
+(models are too large to put in Git). By default, LPCNet will attempt to use 8-bit dot product instructions on AVX*/Neon to
+speed up inference. To disable them (e.g. to avoid quantization effects when retraining), pass --disable-dot-product to the
+configure script.
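For context, the 8-bit path trades a little precision for speed: weights are scaled by 128 and rounded to signed bytes, and the accumulated result is rescaled afterwards. A rough NumPy sketch of the effect (illustrative only; the real kernels use SIMD intrinsics and the block layout dumped by the scripts below):
```
import numpy as np

# Illustrative only: quantize weights to signed 8 bits the way the dumped
# DOT_PROD tables do, then compare against the float dot product.
w = np.random.uniform(-0.5, 0.5, 16)
x = np.random.uniform(-1.0, 1.0, 16)

qw = np.clip(np.round(128.0 * w), -128, 127)  # signed 8-bit weights
exact = np.dot(w, x)                          # float path (--disable-dot-product)
approx = np.dot(qw, x) / 128.0                # quantized path, small rounding error
print(exact, approx)
```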
It is highly recommended to set the CFLAGS environment variable to enable AVX or NEON *prior* to running configure, otherwise
no vectorization will take place and the code will be very slow. On a recent x86 CPU, something like
@@ -69,7 +72,7 @@
and it will generate an lpcnet*.h5 file for each iteration. If it stops with a
"Failed to allocate RNN reserve space" message try reducing the *batch\_size* variable in train_lpcnet.py.
-1. You can synthesise speech with Python and your GPU card:
+1. You can synthesise speech with Python and your GPU card (very slow):
```
./dump_data -test test_input.s16 test_features.f32
./src/test_lpcnet.py test_features.f32 test.s16
@@ -76,7 +79,7 @@
```
Note the .h5 is hard coded in test_lpcnet.py, modify for your .h5 file.
-1. Or with C on a CPU:
+1. Or with C on a CPU (C inference is much faster):
First extract the model files nnet_data.h and nnet_data.c
```
./dump_lpcnet.py lpcnet15_384_10_G16_64.h5
@@ -95,6 +98,6 @@
# Reading Further
1. [LPCNet: DSP-Boosted Neural Speech Synthesis](https://people.xiph.org/~jm/demo/lpcnet/)
-1. Sample model files:
-https://jmvalin.ca/misc_stuff/lpcnet_models/
+1. [A Real-Time Wideband Neural Vocoder at 1.6 kb/s Using LPCNet](https://people.xiph.org/~jm/demo/lpcnet_codec/)
+1. Sample model files (check compatibility): https://media.xiph.org/lpcnet/data/
--- a/dnn/autogen.sh
+++ b/dnn/autogen.sh
@@ -6,7 +6,7 @@
test -n "$srcdir" && cd "$srcdir"
#SHA1 of the first commit compatible with the current model
-commit=90ea887
+commit=cce123e
if [ ! -f lpcnet_data-$commit.tar.gz ]; then
echo "Downloading latest model"
--- a/dnn/causalconv.py
+++ /dev/null
@@ -1,52 +1,0 @@
-from keras import backend as K
-from keras.engine.topology import Layer
-from keras.layers import activations, initializers, regularizers, constraints, InputSpec, Conv1D
-import numpy as np
-
-class CausalConv(Conv1D):
-
- def __init__(self, filters,
- kernel_size,
- dilation_rate=1,
- activation=None,
- use_bias=True,
- kernel_initializer='glorot_uniform',
- bias_initializer='zeros',
- kernel_regularizer=None,
- bias_regularizer=None,
- activity_regularizer=None,
- kernel_constraint=None,
- bias_constraint=None,
- return_memory=False,
- **kwargs):
-
- super(CausalConv, self).__init__(
- filters=filters,
- kernel_size=kernel_size,
- strides=1,
- padding='valid',
- data_format='channels_last',
- dilation_rate=dilation_rate,
- activation=activation,
- use_bias=use_bias,
- kernel_initializer=kernel_initializer,
- bias_initializer=bias_initializer,
- kernel_regularizer=kernel_regularizer,
- bias_regularizer=bias_regularizer,
- activity_regularizer=activity_regularizer,
- kernel_constraint=kernel_constraint,
- bias_constraint=bias_constraint,
- **kwargs)
- self.mem_size = dilation_rate*(kernel_size-1)
- self.return_memory = return_memory
-
- def call(self, inputs, memory=None):
- if memory is None:
- mem = K.zeros((K.shape(inputs)[0], self.mem_size, K.shape(inputs)[-1]))
- else:
- mem = K.variable(K.cast_to_floatx(memory))
- inputs = K.concatenate([mem, inputs], axis=1)
- ret = super(CausalConv, self).call(inputs)
- if self.return_memory:
- ret = ret, inputs[:, :self.mem_size, :]
- return ret
--- a/dnn/configure.ac
+++ b/dnn/configure.ac
@@ -73,6 +73,14 @@
AC_DEFINE([OP_ENABLE_ASSERTIONS], [1], [Enable assertions in code])
])
+AC_ARG_ENABLE([dot-product],
+ AS_HELP_STRING([--disable-dot-product], [Disable dot product implementation]),,
+ enable_dot_product=yes)
+
+AS_IF([test "$enable_dot_product" = "no"], [
+ AC_DEFINE([DISABLE_DOT_PROD], [1], [Disable dot product instructions])
+])
+
AS_CASE(["$ac_cv_search_lrintf"],
["no"],[],
["none required"],[],
@@ -114,8 +122,8 @@
------------------------------------------------------------------------
$PACKAGE_NAME $PACKAGE_VERSION: Automatic configuration OK.
+      Dot product intrinsics ....... ${enable_dot_product}
      Assertions ................... ${enable_assertions}
-
      Hidden visibility ............ ${cc_cv_flag_visibility}
      API documentation ............ ${enable_doc}
--- a/dnn/dump_data.c
+++ b/dnn/dump_data.c
@@ -31,6 +31,7 @@
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
+#include <unistd.h>
#include "kiss_fft.h"
#include "common.h"
#include <math.h>
@@ -141,6 +142,7 @@
int encode = 0;
int decode = 0;
int quantize = 0;
+ srand(getpid());
st = lpcnet_encoder_create();
if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1;
if (argc == 5 && strcmp(argv[1], "-qtrain")==0) {
@@ -231,7 +233,7 @@
}
if (count*FRAME_SIZE_5MS>=10000000 && one_pass_completed) break;
if (training && ++gain_change_count > 2821) {
-      float tmp;
+ float tmp, tmp2;
speech_gain = pow(10., (-20+(rand()%40))/20.);
if (rand()%20==0) speech_gain *= .01;
if (rand()%100==0) speech_gain = 0;
@@ -238,7 +240,8 @@
gain_change_count = 0;
rand_resp(a_sig, b_sig);
tmp = (float)rand()/RAND_MAX;
- noise_std = 10*tmp*tmp;
+ tmp2 = (float)rand()/RAND_MAX;
+ noise_std = -log(tmp)-log(tmp2);
}
biquad(x, mem_hp_x, x, b_hp, a_hp, FRAME_SIZE);
biquad(x, mem_resp_x, x, b_sig, a_sig, FRAME_SIZE);
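The new noise level is the sum of two unit exponentials, i.e. a Gamma(2, 1) variate with mean 2, replacing the old bounded 10·tmp² draw: most draws stay small but the tail is unbounded. A NumPy sketch of the two draws:
```
import numpy as np

u1, u2 = np.random.uniform(1e-9, 1.0, size=2)  # guard against log(0)
noise_std_old = 10.0 * u1 * u1                 # old draw, bounded in [0, 10]
noise_std_new = -np.log(u1) - np.log(u2)       # new draw, Gamma(2, 1), mean 2
```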
--- a/dnn/dump_lpcnet.py
+++ /dev/null
@@ -1,270 +1,0 @@
-#!/usr/bin/python3
-'''Copyright (c) 2017-2018 Mozilla
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
-
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-'''
-
-import lpcnet
-import sys
-import numpy as np
-from keras.optimizers import Adam
-from keras.callbacks import ModelCheckpoint
-from keras.layers import Layer, GRU, CuDNNGRU, Dense, Conv1D, Embedding
-from ulaw import ulaw2lin, lin2ulaw
-from mdense import MDense
-import keras.backend as K
-import h5py
-import re
-
-max_rnn_neurons = 1
-max_conv_inputs = 1
-max_mdense_tmp = 1
-
-def printVector(f, vector, name, dtype='float'):
- v = np.reshape(vector, (-1));
- #print('static const float ', name, '[', len(v), '] = \n', file=f)
- f.write('static const {} {}[{}] = {{\n '.format(dtype, name, len(v)))
- for i in range(0, len(v)):
-     f.write('{}'.format(v[i]))
-     if (i!=len(v)-1):
-         f.write(',')
-     else:
-         break;
-     if (i%8==7):
-         f.write("\n ")
-     else:
-         f.write(" ")
- #print(v, file=f)
- f.write('\n};\n\n')
- return;
-
-def printSparseVector(f, A, name):
- N = A.shape[0]
- W = np.zeros((0,))
- diag = np.concatenate([np.diag(A[:,:N]), np.diag(A[:,N:2*N]), np.diag(A[:,2*N:])])
- A[:,:N] = A[:,:N] - np.diag(np.diag(A[:,:N]))
- A[:,N:2*N] = A[:,N:2*N] - np.diag(np.diag(A[:,N:2*N]))
- A[:,2*N:] = A[:,2*N:] - np.diag(np.diag(A[:,2*N:]))
- printVector(f, diag, name + '_diag')
- idx = np.zeros((0,), dtype='int')
- for i in range(3*N//16):
- pos = idx.shape[0]
- idx = np.append(idx, -1)
- nb_nonzero = 0
- for j in range(N):
- if np.sum(np.abs(A[j, i*16:(i+1)*16])) > 1e-10:
- nb_nonzero = nb_nonzero + 1
- idx = np.append(idx, j)
- W = np.concatenate([W, A[j, i*16:(i+1)*16]])
- idx[pos] = nb_nonzero
- printVector(f, W, name)
- #idx = np.tile(np.concatenate([np.array([N]), np.arange(N)]), 3*N//16)
- printVector(f, idx, name + '_idx', dtype='int')
- return;
-
-def dump_layer_ignore(self, f, hf):
- print("ignoring layer " + self.name + " of type " + self.__class__.__name__)- return False
-Layer.dump_layer = dump_layer_ignore
-
-def dump_sparse_gru(self, f, hf):
- global max_rnn_neurons
- name = 'sparse_' + self.name
- print("printing layer " + name + " of type sparse " + self.__class__.__name__)- weights = self.get_weights()
- printSparseVector(f, weights[1], name + '_recurrent_weights')
- printVector(f, weights[-1], name + '_bias')
- if hasattr(self, 'activation'):
- activation = self.activation.__name__.upper()
- else:
- activation = 'TANH'
- if hasattr(self, 'reset_after') and not self.reset_after:
- reset_after = 0
- else:
- reset_after = 1
- neurons = weights[0].shape[1]//3
- max_rnn_neurons = max(max_rnn_neurons, neurons)
- f.write('const SparseGRULayer {} = {{\n {}_bias,\n {}_recurrent_weights_diag,\n {}_recurrent_weights,\n {}_recurrent_weights_idx,\n {}, ACTIVATION_{}, {}\n}};\n\n'
-         .format(name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
- hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
- hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
- hf.write('extern const SparseGRULayer {};\n\n'.format(name));
- return True
-
-def dump_gru_layer(self, f, hf):
- global max_rnn_neurons
- name = self.name
- print("printing layer " + name + " of type " + self.__class__.__name__)- weights = self.get_weights()
- printVector(f, weights[0], name + '_weights')
- printVector(f, weights[1], name + '_recurrent_weights')
- printVector(f, weights[-1], name + '_bias')
- if hasattr(self, 'activation'):
- activation = self.activation.__name__.upper()
- else:
- activation = 'TANH'
- if hasattr(self, 'reset_after') and not self.reset_after:
- reset_after = 0
- else:
- reset_after = 1
- neurons = weights[0].shape[1]//3
- max_rnn_neurons = max(max_rnn_neurons, neurons)
- f.write('const GRULayer {} = {{\n {}_bias,\n {}_weights,\n {}_recurrent_weights,\n {}, {}, ACTIVATION_{}, {}\n}};\n\n'
-         .format(name, name, name, name, weights[0].shape[0], weights[0].shape[1]//3, activation, reset_after))
- hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
- hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
- hf.write('extern const GRULayer {};\n\n'.format(name));
- return True
-CuDNNGRU.dump_layer = dump_gru_layer
-GRU.dump_layer = dump_gru_layer
-
-def dump_dense_layer_impl(name, weights, bias, activation, f, hf):
- printVector(f, weights, name + '_weights')
- printVector(f, bias, name + '_bias')
- f.write('const DenseLayer {} = {{\n {}_bias,\n {}_weights,\n {}, {}, ACTIVATION_{}\n}};\n\n'
-         .format(name, name, name, weights.shape[0], weights.shape[1], activation))
- hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights.shape[1]))
- hf.write('extern const DenseLayer {};\n\n'.format(name));
-
-def dump_dense_layer(self, f, hf):
- name = self.name
- print("printing layer " + name + " of type " + self.__class__.__name__)- weights = self.get_weights()
- activation = self.activation.__name__.upper()
- dump_dense_layer_impl(name, weights[0], weights[1], activation, f, hf)
- return False
-
-Dense.dump_layer = dump_dense_layer
-
-def dump_mdense_layer(self, f, hf):
- global max_mdense_tmp
- name = self.name
- print("printing layer " + name + " of type " + self.__class__.__name__)- weights = self.get_weights()
- printVector(f, np.transpose(weights[0], (1, 2, 0)), name + '_weights')
- printVector(f, np.transpose(weights[1], (1, 0)), name + '_bias')
- printVector(f, np.transpose(weights[2], (1, 0)), name + '_factor')
- activation = self.activation.__name__.upper()
- max_mdense_tmp = max(max_mdense_tmp, weights[0].shape[0]*weights[0].shape[2])
- f.write('const MDenseLayer {} = {{\n {}_bias,\n {}_weights,\n {}_factor,\n {}, {}, {}, ACTIVATION_{}\n}};\n\n'
-         .format(name, name, name, name, weights[0].shape[1], weights[0].shape[0], weights[0].shape[2], activation))
- hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[0]))
- hf.write('extern const MDenseLayer {};\n\n'.format(name));
- return False
-MDense.dump_layer = dump_mdense_layer
-
-def dump_conv1d_layer(self, f, hf):
- global max_conv_inputs
- name = self.name
- print("printing layer " + name + " of type " + self.__class__.__name__)- weights = self.get_weights()
- printVector(f, weights[0], name + '_weights')
- printVector(f, weights[-1], name + '_bias')
- activation = self.activation.__name__.upper()
- max_conv_inputs = max(max_conv_inputs, weights[0].shape[1]*weights[0].shape[0])
- f.write('const Conv1DLayer {} = {{\n {}_bias,\n {}_weights,\n {}, {}, {}, ACTIVATION_{}\n}};\n\n'
-         .format(name, name, name, weights[0].shape[1], weights[0].shape[0], weights[0].shape[2], activation))
- hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[2]))
- hf.write('#define {}_STATE_SIZE ({}*{})\n'.format(name.upper(), weights[0].shape[1], (weights[0].shape[0]-1)))
- hf.write('#define {}_DELAY {}\n'.format(name.upper(), (weights[0].shape[0]-1)//2))
- hf.write('extern const Conv1DLayer {};\n\n'.format(name));
- return True
-Conv1D.dump_layer = dump_conv1d_layer
-
-
-def dump_embedding_layer_impl(name, weights, f, hf):
- printVector(f, weights, name + '_weights')
- f.write('const EmbeddingLayer {} = {{\n {}_weights,\n {}, {}\n}};\n\n'
-         .format(name, name, weights.shape[0], weights.shape[1]))
- hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights.shape[1]))
- hf.write('extern const EmbeddingLayer {};\n\n'.format(name));
-
-def dump_embedding_layer(self, f, hf):
- name = self.name
- print("printing layer " + name + " of type " + self.__class__.__name__)- weights = self.get_weights()[0]
- dump_embedding_layer_impl(name, weights, f, hf)
- return False
-Embedding.dump_layer = dump_embedding_layer
-
-
-model, _, _ = lpcnet.new_lpcnet_model(rnn_units1=384, use_gpu=False)
-model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
-#model.summary()
-
-model.load_weights(sys.argv[1])
-
-if len(sys.argv) > 2:
- cfile = sys.argv[2];
- hfile = sys.argv[3];
-else:
- cfile = 'nnet_data.c'
- hfile = 'nnet_data.h'
-
-
-f = open(cfile, 'w')
-hf = open(hfile, 'w')
-
-
-f.write('/*This file is automatically generated from a Keras model*/\n\n')
-f.write('#ifdef HAVE_CONFIG_H\n#include "config.h"\n#endif\n\n#include "nnet.h"\n#include "{}"\n\n'.format(hfile))
-
-hf.write('/*This file is automatically generated from a Keras model*/\n\n')
-hf.write('#ifndef RNN_DATA_H\n#define RNN_DATA_H\n\n#include "nnet.h"\n\n')
-
-embed_size = lpcnet.embed_size
-
-E = model.get_layer('embed_sig').get_weights()[0]
-W = model.get_layer('gru_a').get_weights()[0][:embed_size,:]
-dump_embedding_layer_impl('gru_a_embed_sig', np.dot(E, W), f, hf)
-W = model.get_layer('gru_a').get_weights()[0][embed_size:2*embed_size,:]
-dump_embedding_layer_impl('gru_a_embed_pred', np.dot(E, W), f, hf)
-W = model.get_layer('gru_a').get_weights()[0][2*embed_size:3*embed_size,:]
-dump_embedding_layer_impl('gru_a_embed_exc', np.dot(E, W), f, hf)
-W = model.get_layer('gru_a').get_weights()[0][3*embed_size:,:]
-#FIXME: dump only half the biases
-b = model.get_layer('gru_a').get_weights()[2]
-dump_dense_layer_impl('gru_a_dense_feature', W, b, 'LINEAR', f, hf)
-
-layer_list = []
-for i, layer in enumerate(model.layers):
- if layer.dump_layer(f, hf):
- layer_list.append(layer.name)
-
-dump_sparse_gru(model.get_layer('gru_a'), f, hf)
-
-hf.write('#define MAX_RNN_NEURONS {}\n\n'.format(max_rnn_neurons))
-hf.write('#define MAX_CONV_INPUTS {}\n\n'.format(max_conv_inputs))
-hf.write('#define MAX_MDENSE_TMP {}\n\n'.format(max_mdense_tmp))
-
-
-hf.write('typedef struct {\n')
-for i, name in enumerate(layer_list):
-    hf.write('  float {}_state[{}_STATE_SIZE];\n'.format(name, name.upper()))
-hf.write('} NNetState;\n')
-
-hf.write('\n\n#endif\n')
-
-f.close()
-hf.close()
--- a/dnn/gatedconv.py
+++ /dev/null
@@ -1,65 +1,0 @@
-from keras import backend as K
-from keras.engine.topology import Layer
-from keras.layers import activations, initializers, regularizers, constraints, InputSpec, Conv1D, Dense
-import numpy as np
-
-class GatedConv(Conv1D):
-
- def __init__(self, filters,
- kernel_size,
- dilation_rate=1,
- activation='tanh',
- use_bias=True,
- kernel_initializer='glorot_uniform',
- bias_initializer='zeros',
- kernel_regularizer=None,
- bias_regularizer=None,
- activity_regularizer=None,
- kernel_constraint=None,
- bias_constraint=None,
- return_memory=False,
- **kwargs):
-
- super(GatedConv, self).__init__(
- filters=2*filters,
- kernel_size=kernel_size,
- strides=1,
- padding='valid',
- data_format='channels_last',
- dilation_rate=dilation_rate,
- activation='linear',
- use_bias=use_bias,
- kernel_initializer=kernel_initializer,
- bias_initializer=bias_initializer,
- kernel_regularizer=kernel_regularizer,
- bias_regularizer=bias_regularizer,
- activity_regularizer=activity_regularizer,
- kernel_constraint=kernel_constraint,
- bias_constraint=bias_constraint,
- **kwargs)
- self.mem_size = dilation_rate*(kernel_size-1)
- self.return_memory = return_memory
- self.out_dims = filters
- self.nongate_activation = activations.get(activation)
-
- def call(self, inputs, cond=None, memory=None):
- if memory is None:
- mem = K.zeros((K.shape(inputs)[0], self.mem_size, K.shape(inputs)[-1]))
- else:
- mem = K.variable(K.cast_to_floatx(memory))
- inputs = K.concatenate([mem, inputs], axis=1)
- ret = super(GatedConv, self).call(inputs)
- if cond is not None:
- d = Dense(2*self.out_dims, use_bias=False, activation='linear')
- ret = ret + d(cond)
- ret = self.nongate_activation(ret[:, :, :self.out_dims]) * activations.sigmoid(ret[:, :, self.out_dims:])
- if self.return_memory:
- ret = ret, inputs[:, :self.mem_size, :]
- return ret
-
- def compute_output_shape(self, input_shape):
- assert input_shape and len(input_shape) >= 2
- assert input_shape[-1]
- output_shape = list(input_shape)
- output_shape[-1] = self.out_dims
- return tuple(output_shape)
--- a/dnn/lpcnet.py
+++ /dev/null
@@ -1,176 +1,0 @@
-#!/usr/bin/python3
-'''Copyright (c) 2018 Mozilla
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
-
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-'''
-
-import math
-from keras.models import Model
-from keras.layers import Input, GRU, CuDNNGRU, Dense, Embedding, Reshape, Concatenate, Lambda, Conv1D, Multiply, Add, Bidirectional, MaxPooling1D, Activation
-from keras import backend as K
-from keras.initializers import Initializer
-from keras.callbacks import Callback
-from mdense import MDense
-import numpy as np
-import h5py
-import sys
-
-frame_size = 160
-pcm_bits = 8
-embed_size = 128
-pcm_levels = 2**pcm_bits
-
-class Sparsify(Callback):
- def __init__(self, t_start, t_end, interval, density):
- super(Sparsify, self).__init__()
- self.batch = 0
- self.t_start = t_start
- self.t_end = t_end
- self.interval = interval
- self.final_density = density
-
- def on_batch_end(self, batch, logs=None):
- #print("batch number", self.batch)- self.batch += 1
- if self.batch < self.t_start or ((self.batch-self.t_start) % self.interval != 0 and self.batch < self.t_end):
- #print("don't constrain");- pass
- else:
- #print("constrain");- layer = self.model.get_layer('gru_a')- w = layer.get_weights()
- p = w[1]
- nb = p.shape[1]//p.shape[0]
- N = p.shape[0]
- #print("nb = ", nb, ", N = ", N);- #print(p.shape)
- #print ("density = ", density)- for k in range(nb):
- density = self.final_density[k]
- if self.batch < self.t_end:
- r = 1 - (self.batch-self.t_start)/(self.t_end - self.t_start)
- density = 1 - (1-self.final_density[k])*(1 - r*r*r)
- A = p[:, k*N:(k+1)*N]
- A = A - np.diag(np.diag(A))
- A = np.transpose(A, (1, 0))
- L=np.reshape(A, (N, N//16, 16))
- S=np.sum(L*L, axis=-1)
- SS=np.sort(np.reshape(S, (-1,)))
- thresh = SS[round(N*N//16*(1-density))]
- mask = (S>=thresh).astype('float32');
- mask = np.repeat(mask, 16, axis=1)
- mask = np.minimum(1, mask + np.diag(np.ones((N,))))
- mask = np.transpose(mask, (1, 0))
- p[:, k*N:(k+1)*N] = p[:, k*N:(k+1)*N]*mask
- #print(thresh, np.mean(mask))
- w[1] = p
- layer.set_weights(w)
-
-
-class PCMInit(Initializer):
- def __init__(self, gain=.1, seed=None):
- self.gain = gain
- self.seed = seed
-
- def __call__(self, shape, dtype=None):
- num_rows = 1
- for dim in shape[:-1]:
- num_rows *= dim
- num_cols = shape[-1]
- flat_shape = (num_rows, num_cols)
- if self.seed is not None:
- np.random.seed(self.seed)
- a = np.random.uniform(-1.7321, 1.7321, flat_shape)
- #a[:,0] = math.sqrt(12)*np.arange(-.5*num_rows+.5,.5*num_rows-.4)/num_rows
- #a[:,1] = .5*a[:,0]*a[:,0]*a[:,0]
- a = a + np.reshape(math.sqrt(12)*np.arange(-.5*num_rows+.5,.5*num_rows-.4)/num_rows, (num_rows, 1))
- return self.gain * a
-
- def get_config(self):
- return {
-     'gain': self.gain,
- 'seed': self.seed
- }
-
-def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features = 38, training=False, use_gpu=True, adaptation=False):
- pcm = Input(shape=(None, 3))
- feat = Input(shape=(None, nb_used_features))
- pitch = Input(shape=(None, 1))
- dec_feat = Input(shape=(None, 128))
- dec_state1 = Input(shape=(rnn_units1,))
- dec_state2 = Input(shape=(rnn_units2,))
-
- padding = 'valid' if training else 'same'
- fconv1 = Conv1D(128, 3, padding=padding, activation='tanh', name='feature_conv1')
- fconv2 = Conv1D(128, 3, padding=padding, activation='tanh', name='feature_conv2')
-
- embed = Embedding(256, embed_size, embeddings_initializer=PCMInit(), name='embed_sig')
- cpcm = Reshape((-1, embed_size*3))(embed(pcm))
-
- pembed = Embedding(256, 64, name='embed_pitch')
- cat_feat = Concatenate()([feat, Reshape((-1, 64))(pembed(pitch))])
-
- cfeat = fconv2(fconv1(cat_feat))
-
- fdense1 = Dense(128, activation='tanh', name='feature_dense1')
- fdense2 = Dense(128, activation='tanh', name='feature_dense2')
-
- cfeat = fdense2(fdense1(cfeat))
-
- rep = Lambda(lambda x: K.repeat_elements(x, frame_size, 1))
-
- if use_gpu:
- rnn = CuDNNGRU(rnn_units1, return_sequences=True, return_state=True, name='gru_a')
- rnn2 = CuDNNGRU(rnn_units2, return_sequences=True, return_state=True, name='gru_b')
- else:
- rnn = GRU(rnn_units1, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='gru_a')
- rnn2 = GRU(rnn_units2, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='gru_b')
-
- rnn_in = Concatenate()([cpcm, rep(cfeat)])
- md = MDense(pcm_levels, activation='softmax', name='dual_fc')
- gru_out1, _ = rnn(rnn_in)
- gru_out2, _ = rnn2(Concatenate()([gru_out1, rep(cfeat)]))
- ulaw_prob = md(gru_out2)
-
- if adaptation:
- rnn.trainable=False
- rnn2.trainable=False
- md.trainable=False
- embed.Trainable=False
-
- model = Model([pcm, feat, pitch], ulaw_prob)
- model.rnn_units1 = rnn_units1
- model.rnn_units2 = rnn_units2
- model.nb_used_features = nb_used_features
- model.frame_size = frame_size
-
- encoder = Model([feat, pitch], cfeat)
-
- dec_rnn_in = Concatenate()([cpcm, dec_feat])
- dec_gru_out1, state1 = rnn(dec_rnn_in, initial_state=dec_state1)
- dec_gru_out2, state2 = rnn2(Concatenate()([dec_gru_out1, dec_feat]), initial_state=dec_state2)
- dec_ulaw_prob = md(dec_gru_out2)
-
- decoder = Model([pcm, dec_feat, dec_state1, dec_state2], [dec_ulaw_prob, state1, state2])
- return model, encoder, decoder
--- a/dnn/mdense.py
+++ /dev/null
@@ -1,94 +1,0 @@
-from keras import backend as K
-from keras.engine.topology import Layer
-from keras.layers import activations, initializers, regularizers, constraints, InputSpec
-import numpy as np
-import math
-
-class MDense(Layer):
-
- def __init__(self, outputs,
- channels=2,
- activation=None,
- use_bias=True,
- kernel_initializer='glorot_uniform',
- bias_initializer='zeros',
- kernel_regularizer=None,
- bias_regularizer=None,
- activity_regularizer=None,
- kernel_constraint=None,
- bias_constraint=None,
- **kwargs):
- if 'input_shape' not in kwargs and 'input_dim' in kwargs:
- kwargs['input_shape'] = (kwargs.pop('input_dim'),)
- super(MDense, self).__init__(**kwargs)
- self.units = outputs
- self.channels = channels
- self.activation = activations.get(activation)
- self.use_bias = use_bias
- self.kernel_initializer = initializers.get(kernel_initializer)
- self.bias_initializer = initializers.get(bias_initializer)
- self.kernel_regularizer = regularizers.get(kernel_regularizer)
- self.bias_regularizer = regularizers.get(bias_regularizer)
- self.activity_regularizer = regularizers.get(activity_regularizer)
- self.kernel_constraint = constraints.get(kernel_constraint)
- self.bias_constraint = constraints.get(bias_constraint)
- self.input_spec = InputSpec(min_ndim=2)
- self.supports_masking = True
-
- def build(self, input_shape):
- assert len(input_shape) >= 2
- input_dim = input_shape[-1]
-
- self.kernel = self.add_weight(shape=(self.units, input_dim, self.channels),
- initializer=self.kernel_initializer,
- name='kernel',
- regularizer=self.kernel_regularizer,
- constraint=self.kernel_constraint)
- if self.use_bias:
- self.bias = self.add_weight(shape=(self.units, self.channels),
- initializer=self.bias_initializer,
- name='bias',
- regularizer=self.bias_regularizer,
- constraint=self.bias_constraint)
- else:
- self.bias = None
- self.factor = self.add_weight(shape=(self.units, self.channels),
- initializer='ones',
- name='factor',
- regularizer=self.bias_regularizer,
- constraint=self.bias_constraint)
- self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim})
- self.built = True
-
- def call(self, inputs):
- output = K.dot(inputs, self.kernel)
- if self.use_bias:
- output = output + self.bias
- output = K.tanh(output) * self.factor
- output = K.sum(output, axis=-1)
- if self.activation is not None:
- output = self.activation(output)
- return output
-
- def compute_output_shape(self, input_shape):
- assert input_shape and len(input_shape) >= 2
- assert input_shape[-1]
- output_shape = list(input_shape)
- output_shape[-1] = self.units
- return tuple(output_shape)
-
- def get_config(self):
- config = {
-     'units': self.units,
- 'activation': activations.serialize(self.activation),
- 'use_bias': self.use_bias,
- 'kernel_initializer': initializers.serialize(self.kernel_initializer),
- 'bias_initializer': initializers.serialize(self.bias_initializer),
- 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
- 'bias_regularizer': regularizers.serialize(self.bias_regularizer),
- 'activity_regularizer': regularizers.serialize(self.activity_regularizer),
- 'kernel_constraint': constraints.serialize(self.kernel_constraint),
- 'bias_constraint': constraints.serialize(self.bias_constraint)
- }
- base_config = super(MDense, self).get_config()
- return dict(list(base_config.items()) + list(config.items()))
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -39,17 +39,14 @@
#include "nnet.h"
#include "nnet_data.h"
-#define SOFTMAX_HACK
-
-#ifdef __AVX__
-#include "vec_avx.h"
-#elif __ARM_NEON__
-#include "vec_neon.h"
-#else
+#ifdef NO_OPTIMIZATIONS
#warning Compiling without any vectorization. This code will be very slow
-#include "vec.h"
#endif
+
+#define SOFTMAX_HACK
+
+
static OPUS_INLINE float relu(float x)
{
return x < 0 ? 0 : x;
@@ -83,8 +80,9 @@
output[i] = relu(input[i]);
} else if (activation == ACTIVATION_SOFTMAX) {
#ifdef SOFTMAX_HACK
- for (i=0;i<N;i++)
- output[i] = input[i];
+ RNN_COPY(output, input, N);
+ /*for (i=0;i<N;i++)
+ output[i] = input[i];*/
#else
float sum = 0;
softmax(output, input, N);
@@ -143,6 +141,7 @@
compute_activation(output, output, N, layer->activation);
}
+#if 0
void compute_gru(const GRULayer *gru, float *state, const float *input)
{
int i;
@@ -204,6 +203,7 @@
for (i=0;i<N;i++)
state[i] = h[i];
}
+#endif
void compute_gru2(const GRULayer *gru, float *state, const float *input)
{
@@ -225,9 +225,14 @@
celt_assert(gru->reset_after);
stride = 3*N;
/* Compute update gate. */
+#ifdef USE_SU_BIAS
for (i=0;i<3*N;i++)
+ zrh[i] = gru->subias[i];
+#else
+ for (i=0;i<3*N;i++)
zrh[i] = gru->bias[i];
- sgemv_accum(zrh, gru->input_weights, 3*N, M, stride, input);
+#endif
+ sgemv_accum8x4(zrh, gru->input_weights, 3*N, M, stride, input);
for (i=0;i<3*N;i++)
recur[i] = gru->bias[3*N + i];
sgemv_accum(recur, gru->recurrent_weights, 3*N, N, stride, state);
@@ -277,41 +282,42 @@
state[i] = h[i];
}
-void compute_sparse_gru(const SparseGRULayer *gru, float *state, const float *input)
+/* WARNING: for efficiency reasons, this function overwrites the input vector. */
+void compute_sparse_gru(const SparseGRULayer *gru, float *state, float *input)
{
int i, k;
int N;
- float zrh[3*MAX_RNN_NEURONS];
float recur[3*MAX_RNN_NEURONS];
float *z;
float *r;
float *h;
+ const float *bias;
N = gru->nb_neurons;
- z = zrh;
- r = &zrh[N];
- h = &zrh[2*N];
+ z = input;
+ r = &input[N];
+ h = &input[2*N];
celt_assert(gru->nb_neurons <= MAX_RNN_NEURONS);
celt_assert(input != state);
celt_assert(gru->reset_after);
- RNN_COPY(zrh, input, 3*N);
- for (i=0;i<3*N;i++)
- recur[i] = gru->bias[3*N + i];
+#ifdef USE_SU_BIAS
+ bias = &gru->subias[3*N];
+#else
+ bias = &gru->bias[3*N];
+#endif
for (k=0;k<3;k++)
{
for (i=0;i<N;i++)
- recur[k*N + i] += gru->diag_weights[k*N + i]*state[i];
+ recur[k*N + i] = bias[k*N + i] + gru->diag_weights[k*N + i]*state[i];
}
- sparse_sgemv_accum16(recur, gru->recurrent_weights, 3*N, gru->idx, state);
+ sparse_sgemv_accum8x4(recur, gru->recurrent_weights, 3*N, N, gru->idx, state);
for (i=0;i<2*N;i++)
- zrh[i] += recur[i];
- compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
+ input[i] += recur[i];
+ compute_activation(input, input, 2*N, ACTIVATION_SIGMOID);
for (i=0;i<N;i++)
h[i] += recur[2*N+i]*r[i];
compute_activation(h, h, N, gru->activation);
for (i=0;i<N;i++)
- h[i] = z[i]*state[i] + (1-z[i])*h[i];
- for (i=0;i<N;i++)
- state[i] = h[i];
+ state[i] = z[i]*state[i] + (1-z[i])*h[i];
}
void compute_conv1d(const Conv1DLayer *layer, float *output, float *mem, const float *input)
@@ -393,9 +399,22 @@
}
/* Do the sampling (from the cdf). */
r = tmp[N-1] * ((rand()+.5f)/(RAND_MAX+1.f));
+#if 1 /* Bisection search in the CDF (faster than the equivalent linear one below). */
+ {
+ int start=-1;
+ int end = N-1;
+ while (end > start+1) {
+ int mid = (start+end)>>1;
+ if (r <= tmp[mid]) end = mid;
+ else start = mid;
+ }
+ return end;
+ }
+#else
for (i=0;i<N-1;i++)
{
if (r <= tmp[i]) return i;
}
return N-1;
+#endif
}
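The bisection keeps the invariant tmp[start] < r <= tmp[end] (treating tmp[-1] as minus infinity), so it finds the first index whose cumulative probability reaches r in O(log N) steps instead of O(N). A Python rendering of the same search:
```
import numpy as np

def sample_from_cdf(tmp, r):
    # tmp: inclusive cumulative sums, r: uniform in (0, tmp[-1]]
    start, end = -1, len(tmp) - 1
    while end > start + 1:
        mid = (start + end) >> 1
        if r <= tmp[mid]:
            end = mid
        else:
            start = mid
    return end

cdf = np.cumsum([0.1, 0.2, 0.3, 0.4])
print(sample_from_cdf(cdf, 0.35))  # 2, since 0.3 < 0.35 <= 0.6
```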
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -28,6 +28,8 @@
#ifndef _NNET_H_
#define _NNET_H_
+#include "vec.h"
+
#define ACTIVATION_LINEAR 0
#define ACTIVATION_SIGMOID 1
#define ACTIVATION_TANH 2
@@ -54,7 +56,8 @@
typedef struct {
const float *bias;
- const float *input_weights;
+ const float *subias;
+ const qweight *input_weights;
const float *recurrent_weights;
int nb_inputs;
int nb_neurons;
@@ -64,8 +67,9 @@
typedef struct {
const float *bias;
+ const float *subias;
const float *diag_weights;
- const float *recurrent_weights;
+ const qweight *recurrent_weights;
const int *idx;
int nb_neurons;
int activation;
@@ -99,7 +103,7 @@
void compute_gru3(const GRULayer *gru, float *state, const float *input);
-void compute_sparse_gru(const SparseGRULayer *gru, float *state, const float *input);
+void compute_sparse_gru(const SparseGRULayer *gru, float *state, float *input);
void compute_conv1d(const Conv1DLayer *layer, float *output, float *mem, const float *input);
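The new `subias` fields hold biases pre-corrected for the signed-by-unsigned 8-bit products (USE_SU_BIAS): assuming activations are stored with a +1 offset so they fit in an unsigned byte, the extra W·1 term folds into the bias. A small NumPy check of the identity the dump scripts rely on (shapes and names here are illustrative):
```
import numpy as np

W = np.random.randn(8, 4)     # nb_inputs x nb_neurons, as dumped
b = np.random.randn(4)
x = np.random.uniform(-1, 1, 8)

subias = b - W.sum(axis=0)    # what dump_lpcnet.py stores as *_subias
assert np.allclose(W.T @ (x + 1.0) + subias, W.T @ x + b)
```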
--- a/dnn/tansig_table.h
+++ b/dnn/tansig_table.h
@@ -1,5 +1,8 @@
/* This file is auto-generated by gen_tables */
+#ifndef TANSIG_TABLE_H
+#define TANSIG_TABLE_H
+
static const float tansig_table[201] = {
0.000000f, 0.039979f, 0.079830f, 0.119427f, 0.158649f,
0.197375f, 0.235496f, 0.272905f, 0.309507f, 0.345214f,
@@ -43,3 +46,5 @@
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f,
};
+
+#endif /*TANSIG_TABLE_H*/
--- a/dnn/test_lpcnet.py
+++ /dev/null
@@ -1,106 +1,0 @@
-#!/usr/bin/python3
-'''Copyright (c) 2018 Mozilla
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
-
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-'''
-
-import lpcnet
-import sys
-import numpy as np
-from keras.optimizers import Adam
-from keras.callbacks import ModelCheckpoint
-from ulaw import ulaw2lin, lin2ulaw
-import keras.backend as K
-import h5py
-
-import tensorflow as tf
-from keras.backend.tensorflow_backend import set_session
-config = tf.ConfigProto()
-config.gpu_options.per_process_gpu_memory_fraction = 0.2
-set_session(tf.Session(config=config))
-
-model, enc, dec = lpcnet.new_lpcnet_model(use_gpu=False)
-
-model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
-#model.summary()
-
-feature_file = sys.argv[1]
-out_file = sys.argv[2]
-frame_size = model.frame_size
-nb_features = 55
-nb_used_features = model.nb_used_features
-
-features = np.fromfile(feature_file, dtype='float32')
-features = np.resize(features, (-1, nb_features))
-nb_frames = 1
-feature_chunk_size = features.shape[0]
-pcm_chunk_size = frame_size*feature_chunk_size
-
-features = np.reshape(features, (nb_frames, feature_chunk_size, nb_features))
-features[:,:,18:36] = 0
-periods = (.1 + 50*features[:,:,36:37]+100).astype('int16')
-
-
-
-model.load_weights('lpcnet20h_384_10_G16_80.h5')
-
-order = 16
-
-pcm = np.zeros((nb_frames*pcm_chunk_size, ))
-fexc = np.zeros((1, 1, 3), dtype='int16')+128
-state1 = np.zeros((1, model.rnn_units1), dtype='float32')
-state2 = np.zeros((1, model.rnn_units2), dtype='float32')
-
-mem = 0
-coef = 0.85
-
-fout = open(out_file, 'wb')
-
-skip = order + 1
-for c in range(0, nb_frames):
- cfeat = enc.predict([features[c:c+1, :, :nb_used_features], periods[c:c+1, :, :]])
- for fr in range(0, feature_chunk_size):
- f = c*feature_chunk_size + fr
- a = features[c, fr, nb_features-order:]
- for i in range(skip, frame_size):
- pred = -sum(a*pcm[f*frame_size + i - 1:f*frame_size + i - order-1:-1])
- fexc[0, 0, 1] = lin2ulaw(pred)
-
- p, state1, state2 = dec.predict([fexc, cfeat[:, fr:fr+1, :], state1, state2])
- #Lower the temperature for voiced frames to reduce noisiness
- p *= np.power(p, np.maximum(0, 1.5*features[c, fr, 37] - .5))
- p = p/(1e-18 + np.sum(p))
- #Cut off the tail of the remaining distribution
- p = np.maximum(p-0.002, 0).astype('float64')
- p = p/(1e-8 + np.sum(p))
-
- fexc[0, 0, 2] = np.argmax(np.random.multinomial(1, p[0,0,:], 1))
- pcm[f*frame_size + i] = pred + ulaw2lin(fexc[0, 0, 2])
- fexc[0, 0, 0] = lin2ulaw(pcm[f*frame_size + i])
- mem = coef*mem + pcm[f*frame_size + i]
- #print(mem)
- np.array([np.round(mem)], dtype='int16').tofile(fout)
- skip = 0
-
-
--- a/dnn/train_lpcnet.py
+++ /dev/null
@@ -1,125 +1,0 @@
-#!/usr/bin/python3
-'''Copyright (c) 2018 Mozilla
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
-
- - Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- - Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
- CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-'''
-
-# Train a LPCNet model (note not a Wavenet model)
-
-import lpcnet
-import sys
-import numpy as np
-from keras.optimizers import Adam
-from keras.callbacks import ModelCheckpoint
-from ulaw import ulaw2lin, lin2ulaw
-import keras.backend as K
-import h5py
-
-import tensorflow as tf
-from keras.backend.tensorflow_backend import set_session
-config = tf.ConfigProto()
-
-# use this option to reserve GPU memory, e.g. for running more than
-# one thing at a time. Best to disable for GPUs with small memory
-config.gpu_options.per_process_gpu_memory_fraction = 0.44
-
-set_session(tf.Session(config=config))
-
-nb_epochs = 120
-
-# Try reducing batch_size if you run out of memory on your GPU
-batch_size = 64
-
-model, _, _ = lpcnet.new_lpcnet_model(training=True)
-
-model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
-model.summary()
-
-feature_file = sys.argv[1]
-pcm_file = sys.argv[2] # 16 bit unsigned short PCM samples
-frame_size = model.frame_size
-nb_features = 55
-nb_used_features = model.nb_used_features
-feature_chunk_size = 15
-pcm_chunk_size = frame_size*feature_chunk_size
-
-# u for unquantised, load 16 bit PCM samples and convert to mu-law
-
-data = np.fromfile(pcm_file, dtype='uint8')
-nb_frames = len(data)//(4*pcm_chunk_size)
-
-features = np.fromfile(feature_file, dtype='float32')
-
-# limit to discrete number of frames
-data = data[:nb_frames*4*pcm_chunk_size]
-features = features[:nb_frames*feature_chunk_size*nb_features]
-
-features = np.reshape(features, (nb_frames*feature_chunk_size, nb_features))
-
-sig = np.reshape(data[0::4], (nb_frames, pcm_chunk_size, 1))
-pred = np.reshape(data[1::4], (nb_frames, pcm_chunk_size, 1))
-in_exc = np.reshape(data[2::4], (nb_frames, pcm_chunk_size, 1))
-out_exc = np.reshape(data[3::4], (nb_frames, pcm_chunk_size, 1))
-del data
-
-print("ulaw std = ", np.std(out_exc))-
-features = np.reshape(features, (nb_frames, feature_chunk_size, nb_features))
-features = features[:, :, :nb_used_features]
-features[:,:,18:36] = 0
-
-fpad1 = np.concatenate([features[0:1, 0:2, :], features[:-1, -2:, :]], axis=0)
-fpad2 = np.concatenate([features[1:, :2, :], features[0:1, -2:, :]], axis=0)
-features = np.concatenate([fpad1, features, fpad2], axis=1)
-
-
-periods = (.1 + 50*features[:,:,36:37]+100).astype('int16')
-
-in_data = np.concatenate([sig, pred, in_exc], axis=-1)
-
-del sig
-del pred
-del in_exc
-
-# dump models to disk as we go
-checkpoint = ModelCheckpoint('lpcnet30_384_10_G16_{epoch:02d}.h5')
-
-#Set this to True to adapt an existing model (e.g. on new data)
-adaptation = False
-
-if adaptation:
- #Adapting from an existing model
- model.load_weights('lpcnet24c_384_10_G16_120.h5')
- sparsify = lpcnet.Sparsify(0, 0, 1, (0.05, 0.05, 0.2))
- lr = 0.0001
- decay = 0
-else:
- #Training from scratch
- sparsify = lpcnet.Sparsify(2000, 40000, 400, (0.05, 0.05, 0.2))
- lr = 0.001
- decay = 5e-5
-
-model.compile(optimizer=Adam(lr, amsgrad=True, decay=decay), loss='sparse_categorical_crossentropy')
-model.save_weights('lpcnet30_384_10_G16_00.h5');
-model.fit([in_data, features, periods], out_exc, batch_size=batch_size, epochs=nb_epochs, validation_split=0.0, callbacks=[checkpoint, sparsify])
--- a/dnn/training_tf2/dump_lpcnet.py
+++ b/dnn/training_tf2/dump_lpcnet.py
@@ -39,7 +39,10 @@
max_conv_inputs = 1
max_mdense_tmp = 1
-def printVector(f, vector, name, dtype='float'):
+def printVector(f, vector, name, dtype='float', dotp=False):
+ if dotp:
+ vector = vector.reshape((vector.shape[0]//4, 4, vector.shape[1]//8, 8))
+ vector = vector.transpose((2, 0, 3, 1))
v = np.reshape(vector, (-1));
#print('static const float ', name, '[', len(v), '] = \n', file=f)
f.write('static const {} {}[{}] = {{\n '.format(dtype, name, len(v)))
@@ -59,27 +62,37 @@
def printSparseVector(f, A, name):
N = A.shape[0]
- W = np.zeros((0,))
+ W = np.zeros((0,), dtype='int')
+ W0 = np.zeros((0,))
diag = np.concatenate([np.diag(A[:,:N]), np.diag(A[:,N:2*N]), np.diag(A[:,2*N:])])
A[:,:N] = A[:,:N] - np.diag(np.diag(A[:,:N]))
A[:,N:2*N] = A[:,N:2*N] - np.diag(np.diag(A[:,N:2*N]))
A[:,2*N:] = A[:,2*N:] - np.diag(np.diag(A[:,2*N:]))
+ AQ = np.minimum(127, np.maximum(-128, np.round(A*128))).astype('int')
printVector(f, diag, name + '_diag')
idx = np.zeros((0,), dtype='int')
- for i in range(3*N//16):
+ for i in range(3*N//8):
pos = idx.shape[0]
idx = np.append(idx, -1)
nb_nonzero = 0
- for j in range(N):
- if np.sum(np.abs(A[j, i*16:(i+1)*16])) > 1e-10:
+ for j in range(N//4):
+ block = A[j*4:(j+1)*4, i*8:(i+1)*8]
+ qblock = AQ[j*4:(j+1)*4, i*8:(i+1)*8]
+ if np.sum(np.abs(block)) > 1e-10:
nb_nonzero = nb_nonzero + 1
idx = np.append(idx, j)
- W = np.concatenate([W, A[j, i*16:(i+1)*16]])
+ vblock = qblock.transpose((1,0)).reshape((-1,))
+ W0 = np.concatenate([W0, block.reshape((-1,))])
+ W = np.concatenate([W, vblock])
idx[pos] = nb_nonzero
- printVector(f, W, name)
+ f.write('#ifdef DOT_PROD\n')
+ printVector(f, W, name, dtype='qweight')
+ f.write('#else /*DOT_PROD*/\n')
+ printVector(f, W0, name, dtype='qweight')
+ f.write('#endif /*DOT_PROD*/\n')
#idx = np.tile(np.concatenate([np.array([N]), np.arange(N)]), 3*N//16)
printVector(f, idx, name + '_idx', dtype='int')
- return;
+ return AQ
def dump_layer_ignore(self, f, hf):
print("ignoring layer " + self.name + " of type " + self.__class__.__name__)@@ -91,8 +104,11 @@
name = 'sparse_' + self.name
print("printing layer " + name + " of type sparse " + self.__class__.__name__)weights = self.get_weights()
- printSparseVector(f, weights[1], name + '_recurrent_weights')
+ qweights = printSparseVector(f, weights[1], name + '_recurrent_weights')
printVector(f, weights[-1], name + '_bias')
+ subias = weights[-1].copy()
+ subias[1,:] = subias[1,:] - np.sum(qweights*(1./128),axis=0)
+ printVector(f, subias, name + '_subias')
if hasattr(self, 'activation'):
activation = self.activation.__name__.upper()
else:
@@ -103,8 +119,8 @@
reset_after = 1
neurons = weights[0].shape[1]//3
max_rnn_neurons = max(max_rnn_neurons, neurons)
- f.write('const SparseGRULayer {} = {{\n {}_bias,\n {}_recurrent_weights_diag,\n {}_recurrent_weights,\n {}_recurrent_weights_idx,\n {}, ACTIVATION_{}, {}\n}};\n\n'
-         .format(name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
+ f.write('const SparseGRULayer {} = {{\n {}_bias,\n {}_subias,\n {}_recurrent_weights_diag,\n {}_recurrent_weights,\n {}_recurrent_weights_idx,\n {}, ACTIVATION_{}, {}\n}};\n\n'
+         .format(name, name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
hf.write('extern const SparseGRULayer {};\n\n'.format(name));
@@ -115,9 +131,17 @@
name = self.name
print("printing layer " + name + " of type " + self.__class__.__name__)weights = self.get_weights()
+ f.write('#ifdef DOT_PROD\n')+ qweight = np.clip(np.round(128.*weights[0]).astype('int'), -128, 127)+ printVector(f, qweight, name + '_weights', dotp=True, dtype='qweight')
+ f.write('#else /*DOT_PROD*/\n')printVector(f, weights[0], name + '_weights')
+ f.write('#endif /*DOT_PROD*/\n')printVector(f, weights[1], name + '_recurrent_weights')
printVector(f, weights[-1], name + '_bias')
+ subias = weights[-1].copy()
+ subias[0,:] = subias[0,:] - np.sum(qweight*(1./128.),axis=0)
+ printVector(f, subias, name + '_subias')
if hasattr(self, 'activation'):
activation = self.activation.__name__.upper()
else:
@@ -128,8 +152,8 @@
reset_after = 1
neurons = weights[0].shape[1]//3
max_rnn_neurons = max(max_rnn_neurons, neurons)
- f.write('const GRULayer {} = {{\n {}_bias,\n {}_weights,\n {}_recurrent_weights,\n {}, {}, ACTIVATION_{}, {}\n}};\n\n'
-         .format(name, name, name, name, weights[0].shape[0], weights[0].shape[1]//3, activation, reset_after))
+ f.write('const GRULayer {} = {{\n {}_bias,\n {}_subias,\n {}_weights,\n {}_recurrent_weights,\n {}, {}, ACTIVATION_{}, {}\n}};\n\n'
+         .format(name, name, name, name, name, weights[0].shape[0], weights[0].shape[1]//3, activation, reset_after))
hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
hf.write('extern const GRULayer {};\n\n'.format(name));
@@ -224,7 +248,8 @@
hf = open(hfile, 'w')
-f.write('/*This file is automatically generated from a Keras model*/\n\n')
+f.write('/*This file is automatically generated from a Keras model*/\n')
+f.write('/*based on model {}*/\n\n'.format(sys.argv[1]))
f.write('#ifdef HAVE_CONFIG_H\n#include "config.h"\n#endif\n\n#include "nnet.h"\n#include "{}"\n\n'.format(hfile))
hf.write('/*This file is automatically generated from a Keras model*/\n\n')
--- a/dnn/training_tf2/lpcnet.py
+++ b/dnn/training_tf2/lpcnet.py
@@ -26,9 +26,12 @@
'''
import math
+import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, GRU, Dense, Embedding, Reshape, Concatenate, Lambda, Conv1D, Multiply, Add, Bidirectional, MaxPooling1D, Activation
+from tensorflow.compat.v1.keras.layers import CuDNNGRU
from tensorflow.keras import backend as K
+from tensorflow.keras.constraints import Constraint
from tensorflow.keras.initializers import Initializer
from tensorflow.keras.callbacks import Callback
from mdense import MDense
@@ -41,6 +44,12 @@
embed_size = 128
pcm_levels = 2**pcm_bits
+def quant_regularizer(x):
+ Q = 128
+ Q_1 = 1./Q
+ #return .01 * tf.reduce_mean(1 - tf.math.cos(2*3.1415926535897931*(Q*x-tf.round(Q*x))))
+ return .01 * tf.reduce_mean(K.sqrt(K.sqrt(1.0001 - tf.math.cos(2*3.1415926535897931*(Q*x-tf.round(Q*x))))))
+
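The regularizer above is zero exactly on the 1/128 quantization grid and largest halfway between levels; the double square root flattens the penalty well so gradients remain useful close to the grid. Evaluating the same expression in NumPy:
```
import numpy as np

Q = 128
def penalty(x):
    return np.sqrt(np.sqrt(1.0001 - np.cos(2 * np.pi * (Q * x - np.round(Q * x)))))

print(penalty(np.array([1.0 / Q])))  # ~0.1: on a quantization level
print(penalty(np.array([1.5 / Q])))  # ~1.19: halfway between levels
```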
class Sparsify(Callback):
def __init__(self, t_start, t_end, interval, density):
super(Sparsify, self).__init__()
@@ -73,15 +82,19 @@
density = 1 - (1-self.final_density[k])*(1 - r*r*r)
A = p[:, k*N:(k+1)*N]
A = A - np.diag(np.diag(A))
- #A = np.transpose(A, (1, 0))
- L=np.reshape(A, (N, N//16, 16))
+ #This is needed because of the CuDNNGRU strange weight ordering
+ A = np.transpose(A, (1, 0))
+ L=np.reshape(A, (N//4, 4, N//8, 8))
S=np.sum(L*L, axis=-1)
+ S=np.sum(S, axis=1)
SS=np.sort(np.reshape(S, (-1,)))
- thresh = SS[round(N*N//16*(1-density))]
+ thresh = SS[round(N*N//32*(1-density))]
mask = (S>=thresh).astype('float32');
- mask = np.repeat(mask, 16, axis=1)
+ mask = np.repeat(mask, 4, axis=0)
+ mask = np.repeat(mask, 8, axis=1)
mask = np.minimum(1, mask + np.diag(np.ones((N,))))
- #mask = np.transpose(mask, (1, 0))
+ #This is needed because of the CuDNNGRU strange weight ordering
+ mask = np.transpose(mask, (1, 0))
p[:, k*N:(k+1)*N] = p[:, k*N:(k+1)*N]*mask
#print(thresh, np.mean(mask))
w[1] = p
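Pruning now works on 4x8 blocks so the surviving weights line up with the 8x4 dot-product kernels: block energies are ranked, the weakest blocks are zeroed, and the mask is expanded back to individual weights. A simplified sketch of the masking step (omitting the diagonal preservation and the CuDNNGRU weight-ordering transpose, and assuming N is a multiple of 8):
```
import numpy as np

def block_mask(A, density, bh=4, bw=8):
    N = A.shape[0]
    L = A.reshape(N // bh, bh, N // bw, bw)
    S = (L * L).sum(axis=(1, 3))          # energy of each bh x bw block
    thresh = np.sort(S.ravel())[round(S.size * (1 - density))]
    mask = (S >= thresh).astype(A.dtype)  # keep roughly `density` of the blocks
    mask = np.repeat(np.repeat(mask, bh, axis=0), bw, axis=1)
    return A * mask
```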
@@ -113,7 +126,25 @@
'seed': self.seed
}
-def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features = 38, training=False, adaptation=False):
+class WeightClip(Constraint):
+ '''Clips the weights incident to each hidden unit to be inside a range
+ '''
+ def __init__(self, c=2):
+ self.c = c
+
+ def __call__(self, p):
+ # Ensure that abs of adjacent weights don't sum to more than 127. Otherwise there's a risk of
+ # saturation when implementing dot products with SSSE3 or AVX2.
+ return self.c*p/tf.maximum(self.c, tf.repeat(tf.abs(p[:, 1::2])+tf.abs(p[:, 0::2]), 2, axis=1))
+ #return K.clip(p, -self.c, self.c)
+
+ def get_config(self):
+ return {'name': self.__class__.__name__,
+ 'c': self.c}
+
+constraint = WeightClip(0.992)
+
+def new_lpcnet_model(rnn_units1=384, rnn_units2=16, nb_used_features = 38, training=False, adaptation=False, quantize=False):
pcm = Input(shape=(None, 3))
feat = Input(shape=(None, nb_used_features))
pitch = Input(shape=(None, 1))
@@ -140,8 +171,18 @@
rep = Lambda(lambda x: K.repeat_elements(x, frame_size, 1))
- rnn = GRU(rnn_units1, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='gru_a')
- rnn2 = GRU(rnn_units2, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='gru_b')
+ quant = quant_regularizer if quantize else None
+
+ if training:
+ rnn = CuDNNGRU(rnn_units1, return_sequences=True, return_state=True, name='gru_a',
+ recurrent_constraint = constraint, recurrent_regularizer=quant)
+ rnn2 = CuDNNGRU(rnn_units2, return_sequences=True, return_state=True, name='gru_b',
+ kernel_constraint=constraint, kernel_regularizer=quant)
+ else:
+ rnn = GRU(rnn_units1, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='gru_a',
+ recurrent_constraint = constraint, recurrent_regularizer=quant)
+ rnn2 = GRU(rnn_units2, return_sequences=True, return_state=True, recurrent_activation="sigmoid", reset_after='true', name='gru_b',
+ kernel_constraint=constraint, kernel_regularizer=quant)
rnn_in = Concatenate()([cpcm, rep(cfeat)])
md = MDense(pcm_levels, activation='softmax', name='dual_fc')
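The WeightClip constraint keeps each pair of adjacent weights inside |w2k| + |w2k+1| <= c; with c = 0.992, the corresponding int8 pairs sum below 127, so the 16-bit accumulators of the SSSE3/AVX2 signed-by-unsigned multiply-adds cannot saturate. The same operation in NumPy, for reference:
```
import numpy as np

def weight_clip(p, c=0.992):
    # Rescale columns pairwise so |p[:, 2k]| + |p[:, 2k+1]| <= c everywhere.
    pair = np.abs(p[:, 1::2]) + np.abs(p[:, 0::2])
    return c * p / np.maximum(c, np.repeat(pair, 2, axis=1))
```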
--- /dev/null
+++ b/dnn/training_tf2/pade.py
@@ -1,0 +1,70 @@
+# Optimizing a rational function to optimize a tanh() approximation
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras.models import Model
+from tensorflow.keras.layers import Input, GRU, Dense, Embedding, Reshape, Concatenate, Lambda, Conv1D, Multiply, Add, Bidirectional, MaxPooling1D, Activation
+import tensorflow.keras.backend as K
+from tensorflow.keras.optimizers import Adam, SGD
+
+def my_loss1(y_true, y_pred):
+ return 1*K.mean(K.square(y_true-y_pred)) + 1*K.max(K.square(y_true-y_pred), axis=1)
+
+def my_loss2(y_true, y_pred):
+ return .1*K.mean(K.square(y_true-y_pred)) + 1*K.max(K.square(y_true-y_pred), axis=1)
+
+def my_loss3(y_true, y_pred):
+ return .01*K.mean(K.square(y_true-y_pred)) + 1*K.max(K.square(y_true-y_pred), axis=1)
+
+# Using these initializers to seed the approximation
+# with a reasonable starting point
+def num_init(shape, dtype=None):
+ rr = tf.constant([[945], [105], [1]], dtype=dtype)
+ #rr = tf.constant([[946.56757], [98.01368], [0.66841]], dtype=dtype)
+ print(rr)
+ return rr
+
+def den_init(shape, dtype=None):
+ rr = tf.constant([[945], [420], [15]], dtype=dtype)
+ #rr = tf.constant([[946.604], [413.342], [12.465]], dtype=dtype)
+ print(rr)
+ return rr
+
+
+x = np.arange(-10, 10, .01)
+N = len(x)
+x = np.reshape(x, (1, -1, 1))
+x2 = x*x
+
+x2in = np.concatenate([x2*0 + 1, x2, x2*x2], axis=2)
+yout = np.tanh(x)
+
+
+model_x = Input(shape=(None, 1,))
+model_x2 = Input(shape=(None, 3,))
+
+num = Dense(1, name='num', use_bias=False, kernel_initializer=num_init)
+den = Dense(1, name='den', use_bias=False, kernel_initializer=den_init)
+
+def ratio(x):
+ return tf.minimum(1., tf.maximum(-1., x[0]*x[1]/x[2]))
+
+out_layer = Lambda(ratio)
+output = out_layer([model_x, num(model_x2), den(model_x2)])
+
+model = Model([model_x, model_x2], output)
+model.summary()
+
+model.compile(Adam(0.05, beta_1=0.9, beta_2=0.9, decay=2e-5), loss='mean_squared_error')
+model.fit([x, x2in], yout, batch_size=1, epochs=500000, validation_split=0.0)
+
+model.compile(Adam(0.001, beta_2=0.9, decay=1e-4), loss=my_loss1)
+model.fit([x, x2in], yout, batch_size=1, epochs=50000, validation_split=0.0)
+
+model.compile(Adam(0.0001, beta_2=0.9, decay=1e-4), loss=my_loss2)
+model.fit([x, x2in], yout, batch_size=1, epochs=50000, validation_split=0.0)
+
+model.compile(Adam(0.00001, beta_2=0.9, decay=1e-4), loss=my_loss3)
+model.fit([x, x2in], yout, batch_size=1, epochs=50000, validation_split=0.0)
+
+model.save_weights('tanh.h5')
--- /dev/null
+++ b/dnn/training_tf2/test_lpcnet.py
@@ -1,0 +1,98 @@
+#!/usr/bin/python3
+'''Copyright (c) 2018 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+import lpcnet
+import sys
+import numpy as np
+from ulaw import ulaw2lin, lin2ulaw
+import h5py
+
+
+model, enc, dec = lpcnet.new_lpcnet_model()
+
+model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
+#model.summary()
+
+feature_file = sys.argv[1]
+out_file = sys.argv[2]
+frame_size = model.frame_size
+nb_features = 55
+nb_used_features = model.nb_used_features
+
+features = np.fromfile(feature_file, dtype='float32')
+features = np.resize(features, (-1, nb_features))
+nb_frames = 1
+feature_chunk_size = features.shape[0]
+pcm_chunk_size = frame_size*feature_chunk_size
+
+features = np.reshape(features, (nb_frames, feature_chunk_size, nb_features))
+features[:,:,18:36] = 0
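+# feature 36 encodes the pitch period as (period - 100)/50; invert that mapping
+# to get the integer periods used by the pitch embedding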
+periods = (.1 + 50*features[:,:,36:37]+100).astype('int16')
+
+
+model.load_weights('lpcnet34bq17_384_01.h5')
+
+order = 16
+
+pcm = np.zeros((nb_frames*pcm_chunk_size, ))
+fexc = np.zeros((1, 1, 3), dtype='int16')+128
+state1 = np.zeros((1, model.rnn_units1), dtype='float32')
+state2 = np.zeros((1, model.rnn_units2), dtype='float32')
+
+mem = 0
+coef = 0.85
+
+fout = open(out_file, 'wb')
+
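+# Synthesis loop: for each sample, compute the LPC prediction from the last
+# 'order' output samples, run the sampling network to get the excitation
+# distribution, draw an excitation sample and add it to the prediction. The
+# first order+1 samples are skipped since the predictor has no history yet.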
+skip = order + 1
+for c in range(0, nb_frames):
+    cfeat = enc.predict([features[c:c+1, :, :nb_used_features], periods[c:c+1, :, :]])
+    for fr in range(0, feature_chunk_size):
+        f = c*feature_chunk_size + fr
+        a = features[c, fr, nb_features-order:]
+        for i in range(skip, frame_size):
+            pred = -sum(a*pcm[f*frame_size + i - 1:f*frame_size + i - order-1:-1])
+            fexc[0, 0, 1] = lin2ulaw(pred)
+
+            p, state1, state2 = dec.predict([fexc, cfeat[:, fr:fr+1, :], state1, state2])
+            #Lower the temperature for voiced frames to reduce noisiness
+            p *= np.power(p, np.maximum(0, 1.5*features[c, fr, 37] - .5))
+            p = p/(1e-18 + np.sum(p))
+            #Cut off the tail of the remaining distribution
+            p = np.maximum(p-0.002, 0).astype('float64')
+            p = p/(1e-8 + np.sum(p))
+
+            fexc[0, 0, 2] = np.argmax(np.random.multinomial(1, p[0,0,:], 1))
+            pcm[f*frame_size + i] = pred + ulaw2lin(fexc[0, 0, 2])
+            fexc[0, 0, 0] = lin2ulaw(pcm[f*frame_size + i])
+            mem = coef*mem + pcm[f*frame_size + i]
+            #print(mem)
+            np.array([np.round(mem)], dtype='int16').tofile(fout)
+        skip = 0
+
+
--- a/dnn/training_tf2/train_lpcnet.py
+++ b/dnn/training_tf2/train_lpcnet.py
@@ -37,23 +37,36 @@
import h5py
import tensorflow as tf
-gpus = tf.config.experimental.list_physical_devices('GPU')
-if gpus:
-    try:
-        tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])
-    except RuntimeError as e:
-        print(e)
+#gpus = tf.config.experimental.list_physical_devices('GPU')
+#if gpus:
+#    try:
+#        tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])
+#    except RuntimeError as e:
+#        print(e)
nb_epochs = 120
# Try reducing batch_size if you run out of memory on your GPU
-batch_size = 64
+batch_size = 128
-model, _, _ = lpcnet.new_lpcnet_model(training=True)
+#Set this to True to adapt an existing model (e.g. on new data)
+adaptation = False
-model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
-model.summary()
+if adaptation:
+ lr = 0.0001
+ decay = 0
+else:
+ lr = 0.001
+ decay = 2.5e-5
+opt = Adam(lr, decay=decay, beta_2=0.99)
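+# Mirror the model across the available workers/GPUs; with a single device this
+# should behave like ordinary single-device training.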
+strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
+
+with strategy.scope():
+ model, _, _ = lpcnet.new_lpcnet_model(training=True)
+ model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
+ model.summary()
+
feature_file = sys.argv[1]
pcm_file = sys.argv[2] # 16 bit unsigned short PCM samples
frame_size = model.frame_size
@@ -65,7 +78,7 @@
# u for unquantised, load 16 bit PCM samples and convert to mu-law
data = np.fromfile(pcm_file, dtype='uint8')
-nb_frames = len(data)//(4*pcm_chunk_size)
+nb_frames = len(data)//(4*pcm_chunk_size)//batch_size*batch_size
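+# (rounded down to a whole number of batches so the distributed strategy always
+# sees full, equal-sized batches)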
features = np.fromfile(feature_file, dtype='float32')
@@ -102,23 +115,15 @@
del in_exc
# dump models to disk as we go
-checkpoint = ModelCheckpoint('lpcnet32c_384_10_G16_{epoch:02d}.h5')
+checkpoint = ModelCheckpoint('lpcnet33e_384_{epoch:02d}.h5')
-#Set this to True to adapt an existing model (e.g. on new data)
-adaptation = False
-
if adaptation:
#Adapting from an existing model
-    model.load_weights('lpcnet24c_384_10_G16_120.h5')
+    model.load_weights('lpcnet33a_384_100.h5')
    sparsify = lpcnet.Sparsify(0, 0, 1, (0.05, 0.05, 0.2))
- lr = 0.0001
- decay = 0
else:
#Training from scratch
sparsify = lpcnet.Sparsify(2000, 40000, 400, (0.05, 0.05, 0.2))
- lr = 0.001
- decay = 5e-5
-model.compile(optimizer=Adam(lr, decay=decay, beta_2=0.99), loss='sparse_categorical_crossentropy')
-model.save_weights('lpcnet32c_384_10_G16_00.h5');
+model.save_weights('lpcnet33e_384_00.h5');
model.fit([in_data, features, periods], out_exc, batch_size=batch_size, epochs=nb_epochs, validation_split=0.0, callbacks=[checkpoint, sparsify])
--- a/dnn/ulaw.py
+++ /dev/null
@@ -1,19 +1,0 @@
-
-import numpy as np
-import math
-
-scale = 255.0/32768.0
-scale_1 = 32768.0/255.0
-def ulaw2lin(u):
- u = u - 128
- s = np.sign(u)
- u = np.abs(u)
- return s*scale_1*(np.exp(u/128.*math.log(256))-1)
-
-
-def lin2ulaw(x):
- s = np.sign(x)
- x = np.abs(x)
- u = (s*(128*np.log(1+scale*x)/math.log(256)))
- u = np.clip(128 + np.round(u), 0, 255)
- return u.astype('int16')
--- a/dnn/vec.h
+++ b/dnn/vec.h
@@ -25,9 +25,41 @@
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+
+#ifndef VEC_H
+#define VEC_H
+
+#include "tansig_table.h"
+#include "opus_types.h"
+#include <math.h>
+#include "arch.h"
+
+
+#ifdef __AVX__
+#include "vec_avx.h"
+#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include "vec_neon.h"
+#else
+
+#define MAX_INPUTS (2048)
+
+#define NO_OPTIMIZATIONS
+
+#ifndef DISABLE_DOT_PROD
+#define DOT_PROD
+//#define USE_SU_BIAS
+#endif
+
+#ifdef DOT_PROD
+typedef signed char qweight;
+#else
+typedef float qweight;
+#endif
+
+
/* No AVX2/FMA support */
#ifndef LPCNET_TEST
-static float celt_exp2(float x)
+static inline float celt_exp2(float x)
{
int integer;
float frac;
@@ -47,7 +79,7 @@
}
#define celt_exp(x) celt_exp2((x)*1.44269504f)
-static float tansig_approx(float x)
+static inline float tansig_approx(float x)
{
int i;
float y, dy;
@@ -66,12 +98,12 @@
return sign*y;
}
-static OPUS_INLINE float sigmoid_approx(float x)
+static inline float sigmoid_approx(float x)
{
return .5f + .5f*tansig_approx(.5f*x);
}
-static void softmax(float *y, const float *x, int N)
+static inline void softmax(float *y, const float *x, int N)
{
int i;
for (i=0;i<N;i++)
@@ -78,7 +110,7 @@
y[i] = celt_exp(x[i]);
}
-static void vec_tanh(float *y, const float *x, int N)
+static inline void vec_tanh(float *y, const float *x, int N)
{
int i;
for (i=0;i<N;i++)
@@ -87,7 +119,7 @@
}
}
-static void vec_sigmoid(float *y, const float *x, int N)
+static inline void vec_sigmoid(float *y, const float *x, int N)
{
int i;
for (i=0;i<N;i++)
@@ -96,7 +128,7 @@
}
}
#endif
-static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+static inline void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
{
int i, j;
for (i=0;i<rows;i+=16)
@@ -129,7 +161,7 @@
}
}
-static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
{
int i, j;
for (i=0;i<rows;i+=16)
@@ -162,3 +194,216 @@
}
}
}
+
+#ifdef DOT_PROD
+
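+/* Activations are quantized to 8 bits with a scale of 127 and the weights are
+   assumed to be quantized with a scale of 128, so accumulated products carry a
+   fixed-point scale of 128*127 that is removed at the end of each GEMV. */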
+#define SCALE (128.f*127.f)
+#define SCALE_1 (1.f/128.f/127.f)
+
+
+#ifdef USE_SU_BIAS
+
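+/* With USE_SU_BIAS the activations are offset into unsigned range
+   (x -> 127 + 127*x) so that signed-by-unsigned 8-bit products can be used;
+   the constant 127 offset is presumably folded into the layer biases when the
+   weights are dumped. */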
+static inline void sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
+{
+ int i, j;
+ unsigned char x[MAX_INPUTS];
+ (void)col_stride;
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = 127+(int)floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ for (j=0;j<cols;j+=4)
+ {
+ float * restrict y;
+ float xj0, xj1, xj2, xj3;
+ xj0 = x[j+0];
+ xj1 = x[j+1];
+ xj2 = x[j+2];
+ xj3 = x[j+3];
+ y = &out[i];
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ w += 32;
+ }
+ }
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
+{
+ int i, j;
+ unsigned char x[MAX_INPUTS];
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ int colblocks;
+ colblocks = *idx++;
+ for (j=0;j<colblocks;j++)
+ {
+ int pos;
+ float * restrict y;
+ int xj0, xj1, xj2, xj3;
+ pos = 4 * (*idx++);
+ xj0 = x[pos+0];
+ xj1 = x[pos+1];
+ xj2 = x[pos+2];
+ xj3 = x[pos+3];
+ y = &out[i];
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ w += 32;
+ }
+ }
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+#else /*USE_SU_BIAS*/
+
+static inline void sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
+{
+ int i, j;
+ signed char x[MAX_INPUTS];
+ (void)col_stride;
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = (int)floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ for (j=0;j<cols;j+=4)
+ {
+ float * restrict y;
+ float xj0, xj1, xj2, xj3;
+ xj0 = x[j+0];
+ xj1 = x[j+1];
+ xj2 = x[j+2];
+ xj3 = x[j+3];
+ y = &out[i];
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ w += 32;
+ }
+ }
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
+{
+ int i, j;
+ signed char x[MAX_INPUTS];
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ int colblocks;
+ colblocks = *idx++;
+ for (j=0;j<colblocks;j++)
+ {
+ int pos;
+ float * restrict y;
+ int xj0, xj1, xj2, xj3;
+ pos = 4 * (*idx++);
+ xj0 = x[pos+0];
+ xj1 = x[pos+1];
+ xj2 = x[pos+2];
+ xj3 = x[pos+3];
+ y = &out[i];
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ w += 32;
+ }
+ }
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+#endif /*USE_SU_BIAS*/
+
+#else /*DOT_PROD*/
+
+#define sgemv_accum8x4 sgemv_accum
+
+
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int ignore, const int *idx, const float *x)
+{
+ int i, j;
+ (void)ignore;
+ for (i=0;i<rows;i+=8)
+ {
+ int cols;
+ cols = *idx++;
+ for (j=0;j<cols;j++)
+ {
+ int pos;
+ float * restrict y;
+ float xj0, xj1, xj2, xj3;
+ pos = 4 * (*idx++);
+ xj0 = x[pos+0];
+ xj1 = x[pos+1];
+ xj2 = x[pos+2];
+ xj3 = x[pos+3];
+ y = &out[i];
+ y[0] += w[0]*xj0;
+ y[1] += w[1]*xj0;
+ y[2] += w[2]*xj0;
+ y[3] += w[3]*xj0;
+ y[4] += w[4]*xj0;
+ y[5] += w[5]*xj0;
+ y[6] += w[6]*xj0;
+ y[7] += w[7]*xj0;
+
+ y[0] += w[8]*xj1;
+ y[1] += w[9]*xj1;
+ y[2] += w[10]*xj1;
+ y[3] += w[11]*xj1;
+ y[4] += w[12]*xj1;
+ y[5] += w[13]*xj1;
+ y[6] += w[14]*xj1;
+ y[7] += w[15]*xj1;
+
+ y[0] += w[16]*xj2;
+ y[1] += w[17]*xj2;
+ y[2] += w[18]*xj2;
+ y[3] += w[19]*xj2;
+ y[4] += w[20]*xj2;
+ y[5] += w[21]*xj2;
+ y[6] += w[22]*xj2;
+ y[7] += w[23]*xj2;
+
+ y[0] += w[24]*xj3;
+ y[1] += w[25]*xj3;
+ y[2] += w[26]*xj3;
+ y[3] += w[27]*xj3;
+ y[4] += w[28]*xj3;
+ y[5] += w[29]*xj3;
+ y[6] += w[30]*xj3;
+ y[7] += w[31]*xj3;
+ w += 32;
+ }
+ }
+}
+#endif /*DOT_PROD*/
+
+
+#endif /*no optimizations*/
+#endif /*VEC_H*/
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -29,10 +29,23 @@
AVX2/FMA implementation of vector operations, compile with -mavx2 -mfma
*/
+#ifndef VEC_AVX_H
+#define VEC_AVX_H
+
#include <immintrin.h>
+#ifndef DISABLE_DOT_PROD
+#define DOT_PROD
+#define USE_SU_BIAS
+#endif
+
+#ifndef __FMA__
+#define _mm256_fmadd_ps(a,b,c) _mm256_add_ps(_mm256_mul_ps(a, b), c)
+#define _mm_fmadd_ps(a,b,c) _mm_add_ps(_mm_mul_ps(a, b), c)
+#endif
+
#ifdef __AVX2__
-static __m256 exp8_approx(__m256 X)
+static inline __m256 exp8_approx(__m256 X)
{
const __m256 K0 = _mm256_set1_ps(0.99992522f);
const __m256 K1 = _mm256_set1_ps(0.69583354f);
@@ -41,7 +54,6 @@
const __m256 log2_E = _mm256_set1_ps(1.44269504);
const __m256 max_in = _mm256_set1_ps(50.f);
const __m256 min_in = _mm256_set1_ps(-50.f);
- const __m256i mask = _mm256_set1_epi32(0x7fffffff);
__m256 XF, Y;
__m256i I;
X = _mm256_mul_ps(X, log2_E);
@@ -51,13 +63,70 @@
X = _mm256_sub_ps(X, XF);
Y = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(K3, X, K2), X, K1), X, K0);
I = _mm256_slli_epi32(I, 23);
- Y = _mm256_castsi256_ps(_mm256_and_si256(mask, _mm256_add_epi32(I, _mm256_castps_si256(Y))));
+ Y = _mm256_castsi256_ps(_mm256_add_epi32(I, _mm256_castps_si256(Y)));
return Y;
}
+
+/* Approximating tanh() using a Padé-like rational function:
+ tanh(x) ~= x * (N0 + N1*x^2 + N2*x^4)/(D0 + D1*x^2 + D2*x^4)
+ subject to the +/- 1 bounds.
+ The coefficients were determined by gradient descent trying to minimize
+ the maximum deviation over the whole range (this is only possible because
+ of the bounds). The max error is around 3e-4 and is dominated by the
+ reciprocal approximation (the max error of the rational function is
+ around 6e-5).
+ */
+static inline __m256 tanh8_approx(__m256 X)
+{
+ const __m256 N0 = _mm256_set1_ps(952.52801514f);
+ const __m256 N1 = _mm256_set1_ps(96.39235687f);
+ const __m256 N2 = _mm256_set1_ps(0.60863042f);
+ const __m256 D0 = _mm256_set1_ps(952.72399902f);
+ const __m256 D1 = _mm256_set1_ps(413.36801147f);
+ const __m256 D2 = _mm256_set1_ps(11.88600922f);
+ const __m256 max_out = _mm256_set1_ps(1.f);
+ const __m256 min_out = _mm256_set1_ps(-1.f);
+ __m256 X2, num, den;
+ X2 = _mm256_mul_ps(X, X);
+ num = _mm256_fmadd_ps(_mm256_fmadd_ps(N2, X2, N1), X2, N0);
+ den = _mm256_fmadd_ps(_mm256_fmadd_ps(D2, X2, D1), X2, D0);
+ num = _mm256_mul_ps(num, X);
+ den = _mm256_rcp_ps(den);
+ num = _mm256_mul_ps(num, den);
+ return _mm256_max_ps(min_out, _mm256_min_ps(max_out, num));
+}
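+/* Sanity check: at x=1 the rational part gives 1049.529/1377.978 ~= 0.76164,
+   vs tanh(1) ~= 0.76159, consistent with the stated ~6e-5 bound. */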
+
+/* Sigmoid approximation using a Padé-like rational function:
+ 1/(1+exp(-x)) ~= 0.5 + x * (N0 + N1*x^2 + N2*x^4)/(D0 + D1*x^2 + D2*x^4)
+ subject to the [0, 1] bounds.
+ The coefficients are directly derived by dividing the tanh() coefficients
+ by powers of two to get the correct scaling. The max error is around 1.5e-4
+ and is dominated by the reciprocal approximation (the max error of the
+ rational function is around 3e-5).
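+   Concretely, sigmoid(x) = 0.5 + 0.5*tanh(x/2), so substituting x/2 into the
+   tanh() polynomials scales N0, N1, N2 by 1/4, 1/16, 1/64 and D1, D2 by
+   1/4, 1/16, leaving D0 unchanged; this matches the constants below.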
+ */
+static inline __m256 sigmoid8_approx(__m256 X)
+{
+ const __m256 N0 = _mm256_set1_ps(238.13200378f);
+ const __m256 N1 = _mm256_set1_ps(6.02452230f);
+ const __m256 N2 = _mm256_set1_ps(0.00950985f);
+ const __m256 D0 = _mm256_set1_ps(952.72399902f);
+ const __m256 D1 = _mm256_set1_ps(103.34200287f);
+ const __m256 D2 = _mm256_set1_ps(0.74287558f);
+ const __m256 half = _mm256_set1_ps(0.5);
+ const __m256 max_out = _mm256_set1_ps(1.f);
+ const __m256 min_out = _mm256_set1_ps(0.f);
+ __m256 X2, num, den;
+ X2 = _mm256_mul_ps(X, X);
+ num = _mm256_fmadd_ps(_mm256_fmadd_ps(N2, X2, N1), X2, N0);
+ den = _mm256_fmadd_ps(_mm256_fmadd_ps(D2, X2, D1), X2, D0);
+ num = _mm256_mul_ps(num, X);
+ den = _mm256_rcp_ps(den);
+ num = _mm256_fmadd_ps(num, den, half);
+ return _mm256_max_ps(min_out, _mm256_min_ps(max_out, num));
+}
+
#else
-#define _mm256_fmadd_ps(a,b,c) _mm256_add_ps(_mm256_mul_ps(a, b), c)
-#define _mm_fmadd_ps(a,b,c) _mm_add_ps(_mm_mul_ps(a, b), c)
-static __m128 exp4_approx(__m128 X)
+static inline __m128 exp4_approx(__m128 X)
{
const __m128 K0 = _mm_set1_ps(0.99992522f);
const __m128 K1 = _mm_set1_ps(0.69583354f);
@@ -79,7 +148,7 @@
Y = _mm_castsi128_ps(_mm_and_si128(mask, _mm_add_epi32(I, _mm_castps_si128(Y))));
return Y;
}
-static __m256 exp8_approx(__m256 X)
+static inline __m256 exp8_approx(__m256 X)
{
__m256 Y;
__m128 Xhi, Xlo, Yhi, Ylo;
@@ -91,9 +160,51 @@
Y = _mm256_insertf128_ps(Y, Ylo, 0);
return Y;
}
+
+static inline __m128 tanh4_approx(__m128 X)
+{
+ const __m128 N0 = _mm_set1_ps(952.52801514f);
+ const __m128 N1 = _mm_set1_ps(96.39235687f);
+ const __m128 N2 = _mm_set1_ps(0.60863042f);
+ const __m128 D0 = _mm_set1_ps(952.72399902f);
+ const __m128 D1 = _mm_set1_ps(413.36801147f);
+ const __m128 D2 = _mm_set1_ps(11.88600922f);
+ const __m128 max_out = _mm_set1_ps(1.f);
+ const __m128 min_out = _mm_set1_ps(-1.f);
+ __m128 X2, num, den;
+ X2 = _mm_mul_ps(X, X);
+ num = _mm_fmadd_ps(_mm_fmadd_ps(N2, X2, N1), X2, N0);
+ den = _mm_fmadd_ps(_mm_fmadd_ps(D2, X2, D1), X2, D0);
+ num = _mm_mul_ps(num, X);
+ den = _mm_rcp_ps(den);
+ num = _mm_mul_ps(num, den);
+ return _mm_max_ps(min_out, _mm_min_ps(max_out, num));
+}
+
+static inline __m128 sigmoid4_approx(__m128 X)
+{
+ const __m128 N0 = _mm_set1_ps(238.13200378f);
+ const __m128 N1 = _mm_set1_ps(6.02452230f);
+ const __m128 N2 = _mm_set1_ps(0.00950985f);
+ const __m128 D0 = _mm_set1_ps(952.72399902f);
+ const __m128 D1 = _mm_set1_ps(103.34200287f);
+ const __m128 D2 = _mm_set1_ps(0.74287558f);
+ const __m128 half = _mm_set1_ps(0.5);
+ const __m128 max_out = _mm_set1_ps(1.f);
+ const __m128 min_out = _mm_set1_ps(0.f);
+ __m128 X2, num, den;
+ X2 = _mm_mul_ps(X, X);
+ num = _mm_fmadd_ps(_mm_fmadd_ps(N2, X2, N1), X2, N0);
+ den = _mm_fmadd_ps(_mm_fmadd_ps(D2, X2, D1), X2, D0);
+ num = _mm_mul_ps(num, X);
+ den = _mm_rcp_ps(den);
+ num = _mm_fmadd_ps(num, den, half);
+ return _mm_max_ps(min_out, _mm_min_ps(max_out, num));
+}
+
#endif
-static float celt_exp(float x)
+static inline float celt_exp(float x)
{
float out[8];
__m256 X, Y;
@@ -103,7 +214,7 @@
return out[0];
}
-static void softmax(float *y, const float *x, int N)
+static inline void softmax(float *y, const float *x, int N)
{
int i;
for (i=0;i<N-7;i+=8)
@@ -117,18 +228,15 @@
y[i] = celt_exp(x[i]);
}
-static void vec_tanh(float *y, const float *x, int N)
+#ifdef __AVX2__
+static inline void vec_tanh(float *y, const float *x, int N)
{
int i;
for (i=0;i<N-7;i+=8)
{
- const __m256 two = _mm256_set1_ps(2.f);
- const __m256 one = _mm256_set1_ps(1.f);
__m256 X, Y;
X = _mm256_loadu_ps(&x[i]);
- X = _mm256_mul_ps(X, two);
- Y = exp8_approx(X);
- Y = _mm256_mul_ps(_mm256_sub_ps(Y, one), _mm256_rcp_ps(_mm256_add_ps(Y, one)));
+ Y = tanh8_approx(X);
_mm256_storeu_ps(&y[i], Y);
}
for (;i<N;i++)
@@ -139,17 +247,14 @@
}
}
-static void vec_sigmoid(float *y, const float *x, int N)
+static inline void vec_sigmoid(float *y, const float *x, int N)
{
int i;
for (i=0;i<N-7;i+=8)
{
- const __m256 one = _mm256_set1_ps(1.f);
__m256 X, Y;
X = _mm256_loadu_ps(&x[i]);
- Y = exp8_approx(X);
- /* Compute as 1-1/(1+e^x) to avoid >1 values caused by the reciprocal approximation. */
- Y = _mm256_sub_ps(one, _mm256_mul_ps(one, _mm256_rcp_ps(_mm256_add_ps(Y, one))));
+ Y = sigmoid8_approx(X);
_mm256_storeu_ps(&y[i], Y);
}
for (;i<N;i++)
@@ -159,9 +264,47 @@
y[i] = (ex)/(ex+1);
}
}
+#else
+static inline void vec_tanh(float *y, const float *x, int N)
+{
+ int i;
+ for (i=0;i<N-3;i+=4)
+ {
+ __m128 X, Y;
+ X = _mm_loadu_ps(&x[i]);
+ Y = tanh4_approx(X);
+ _mm_storeu_ps(&y[i], Y);
+ }
+ for (;i<N;i++)
+ {
+ float ex2;
+ ex2 = celt_exp(2*x[i]);
+ y[i] = (ex2-1)/(ex2+1);
+ }
+}
-static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+static inline void vec_sigmoid(float *y, const float *x, int N)
{
+ int i;
+ for (i=0;i<N-3;i+=4)
+ {+ __m128 X, Y;
+ X = _mm_loadu_ps(&x[i]);
+ Y = sigmoid4_approx(X);
+ _mm_storeu_ps(&y[i], Y);
+ }
+ for (;i<N;i++)
+ {+ float ex;
+ ex = celt_exp(x[i]);
+ y[i] = (ex)/(ex+1);
+ }
+}
+
+#endif
+
+static inline void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+{
int i, j;
for (i=0;i<rows;i+=16)
{
@@ -186,7 +329,7 @@
_mm256_storeu_ps (&y[8], vy8);
}
}
-static void sparse_sgemv_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
+static inline void sparse_sgemv_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
{
int i, j;
for (i=0;i<rows;i+=16)
@@ -218,3 +361,198 @@
}
}
+#ifdef DOT_PROD
+#define USE_SU_BIAS
+
+typedef signed char qweight;
+
+
+#define MAX_INPUTS (2048)
+#define MAX_OUTPUTS (8192)
+
+
+#define SCALE (128.f*127.f)
+#define SCALE_1 (1.f/128.f/127.f)
+
+#if 1
+static inline void sgemv_accum8x4(float *_out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
+{
+ __m256i ones;
+ int i, j;
+ unsigned char x[MAX_INPUTS];
+ int out[MAX_OUTPUTS];
+ (void)col_stride;
+ ones = _mm256_set1_epi16(1);
+ for (i=0;i<rows;i++) out[i] = SCALE*_out[i];
+ //for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);
+ __m256 const127 = _mm256_set1_ps(127.f);
+ for (i=0;i<cols;i+=8) {
+ __m256 xf;
+ __m256i xi;
+ xf = _mm256_loadu_ps(&_x[i]);
+ //xf = _mm256_mul_ps(xf, const127);
+ //xf = _mm256_add_ps(xf, const127);
+ xf = _mm256_fmadd_ps(xf, const127, const127);
+ xi = _mm256_cvtps_epi32(xf);
+ xi = _mm256_packus_epi32(xi, _mm256_setzero_si256());
+ xi = _mm256_permute4x64_epi64(xi, 0xD8);
+ xi = _mm256_packus_epi16(xi, _mm256_setzero_si256());
+ xi = _mm256_permutevar8x32_epi32(xi, _mm256_setr_epi32(0,1, 0,0, 0,0, 0,0));
+ //xi = _mm256_permute4x64_epi64(xi, 0x);
+ _mm256_storeu_si256 ((__m256i *)&x[i], xi);
+ }
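+ /* Each 32-byte weight block covers 8 output rows by 4 input columns:
+    maddubs multiplies unsigned activation bytes by signed weight bytes and
+    sums adjacent pairs into 16-bit terms (with saturation), then madd with
+    ones widens and reduces them into the 32-bit accumulators. */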
+ for (i=0;i<rows;i+=8)
+ {
+ int * restrict y;
+ __m256i vy0;
+ y = &out[i];
+ vy0 = _mm256_loadu_si256((const __m256i *)&y[0]);
+ for (j=0;j<cols;j+=4)
+ {
+ __m256i tmp;
+ __m256i vxj;
+ __m256i vw;
+ vxj = _mm256_set1_epi32(*(int*)&x[j]);
+ vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
+ tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ }
+ _mm256_storeu_si256 ((__m256i *)&y[0], vy0);
+ }
+ for (i=0;i<rows;i++) _out[i] = SCALE_1*out[i];
+}
+#else
+static inline void sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
+{
+ int i, j;
+ unsigned char x[MAX_INPUTS];
+ (void)col_stride;
+ for (i=0;i<rows;i++) out[i] *= SCALE;
+ for (i=0;i<cols;i++) x[i] = 127+(int)floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ for (j=0;j<cols;j+=4)
+ {
+ float * restrict y;
+ float xj0, xj1, xj2, xj3;
+ xj0 = x[j+0];
+ xj1 = x[j+1];
+ xj2 = x[j+2];
+ xj3 = x[j+3];
+ y = &out[i];
+ y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+ y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+ y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+ y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+ y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+ y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+ y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+ y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+ w += 32;
+ }
+ }
+ for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+#endif
+
+static inline void sparse_sgemv_accum8x4(float *_out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
+{
+ __m256i ones;
+ int i, j;
+ unsigned char x[MAX_INPUTS];
+ int out[MAX_OUTPUTS];
+ ones = _mm256_set1_epi16(1);
+ for (i=0;i<rows;i++) out[i] = SCALE*_out[i];
+ //for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);
+ __m256 const127 = _mm256_set1_ps(127.f);
+ for (i=0;i<cols;i+=8) {
+ __m256 xf;
+ __m256i xi;
+ xf = _mm256_loadu_ps(&_x[i]);
+ //xf = _mm256_mul_ps(xf, const127);
+ //xf = _mm256_add_ps(xf, const127);
+ xf = _mm256_fmadd_ps(xf, const127, const127);
+ xi = _mm256_cvtps_epi32(xf);
+ xi = _mm256_packus_epi32(xi, _mm256_setzero_si256());
+ xi = _mm256_permute4x64_epi64(xi, 0xD8);
+ xi = _mm256_packus_epi16(xi, _mm256_setzero_si256());
+ xi = _mm256_permutevar8x32_epi32(xi, _mm256_setr_epi32(0,1, 0,0, 0,0, 0,0));
+ //xi = _mm256_permute4x64_epi64(xi, 0x);
+ _mm256_storeu_si256 ((__m256i *)&x[i], xi);
+ }
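+ /* Sparse layout: for each block of 8 output rows, *idx gives the number of
+    non-zero 4-column blocks, followed by the position of each such block. */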
+ for (i=0;i<rows;i+=8)
+ {
+ int * restrict y;
+ int colblocks;
+ __m256i vy0;
+ colblocks = *idx++;
+ y = &out[i];
+ vy0 = _mm256_loadu_si256((const __m256i *)&y[0]);
+ for (j=0;j<colblocks;j++)
+ {
+ __m256i tmp;
+ __m256i vxj;
+ __m256i vw;
+ int pos;
+ pos = 4 * (*idx++);
+ vxj = _mm256_set1_epi32(*(int*)&x[pos]);
+ vw = _mm256_loadu_si256((const __m256i *)w); //_mm256_lddqu_si256?
+ tmp = _mm256_maddubs_epi16(vxj, vw); //swap?
+ tmp = _mm256_madd_epi16(tmp, ones);
+ vy0 = _mm256_add_epi32(vy0, tmp);
+ w += 32;
+ }
+ _mm256_storeu_si256 ((__m256i *)&y[0], vy0);
+ }
+ for (i=0;i<rows;i++) _out[i] = SCALE_1*out[i];
+}
+
+
+#else /*DOT_PROD*/
+typedef float qweight;
+#define sgemv_accum8x4 sgemv_accum
+
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, int ignore, const int *idx, const float *x)
+{
+ int i, j;
+ (void)ignore;
+ for (i=0;i<rows;i+=8)
+ {
+ float * restrict y;
+ int cols;
+ __m256 vy0;
+ y = &out[i];
+ vy0 = _mm256_loadu_ps(&y[0]);
+ cols = *idx++;
+ for (j=0;j<cols;j++)
+ {
+ int id;
+ __m256 vxj;
+ __m256 vw;
+ id = *idx++;
+ vxj = _mm256_broadcast_ss(&x[4*id]);
+ vw = _mm256_loadu_ps(&weights[0]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ vxj = _mm256_broadcast_ss(&x[4*id+1]);
+ vw = _mm256_loadu_ps(&weights[8]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ vxj = _mm256_broadcast_ss(&x[4*id+2]);
+ vw = _mm256_loadu_ps(&weights[16]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ vxj = _mm256_broadcast_ss(&x[4*id+3]);
+ vw = _mm256_loadu_ps(&weights[24]);
+ vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+ weights += 32;
+ }
+ _mm256_storeu_ps (&y[0], vy0);
+ }
+}
+#endif /*DOT_PROD*/
+
+#endif /*VEC_AVX_H*/
--- a/dnn/vec_neon.h
+++ b/dnn/vec_neon.h
@@ -29,8 +29,15 @@
/* NEON support for ARM machines */
#include <arm_neon.h>
+
+#ifndef DISABLE_DOT_PROD
+#define DOT_PROD
+#endif
+typedef signed char qweight;
+
+
#ifndef LPCNET_TEST
-static OPUS_INLINE float32x4_t exp4_approx(float32x4_t x) {
+static inline float32x4_t exp4_approx(float32x4_t x) {
int32x4_t i;
float32x4_t xf;
@@ -57,7 +64,7 @@
return Y;
}
-static OPUS_INLINE float celt_exp(float x)
+static inline float celt_exp(float x)
{
float out[4];
float32x4_t X, Y;
@@ -67,7 +74,7 @@
return out[0];
}
-static void softmax(float *y, const float *x, int N)
+static inline void softmax(float *y, const float *x, int N)
{
int i;
for (i=0;i<N-3;i+=4)
@@ -81,7 +88,7 @@
y[i] = celt_exp(x[i]);
}
-static void vec_tanh(float *y, const float *x, int N)
+static inline void vec_tanh(float *y, const float *x, int N)
{
int i;
for (i=0;i<N-3;i+=4)
@@ -103,7 +110,7 @@
}
}
-static void vec_sigmoid(float *y, const float *x, int N)
+static inline void vec_sigmoid(float *y, const float *x, int N)
{
int i;
for (i=0;i<N-3;i+=4)
@@ -124,7 +131,7 @@
}
#endif
-static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+static inline void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
{
int i, j;
for (i=0;i<rows;i+=16)
@@ -168,7 +175,7 @@
}
}
-static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
{
int i, j;
for (i=0;i<rows;i+=16)
@@ -206,4 +213,76 @@
vst1q_f32(&y[12], y12_15);
}
+}
+
+#define SCALE (128.f*127.f)
+#define SCALE_1 (1.f/128.f/127.f)
+
+#define MAX_INPUTS 2048
+#define MAX_OUTPUTS 8192
+
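+/* Emulated 8-bit dot product (no SDOT instruction assumed): vmull_s8 widens the
+   byte products to 16 bits, then vpaddq_s16 and vpadalq_s16 reduce each group
+   of four products into a 32-bit accumulator lane. Note that vmull_high_s8 and
+   vpaddq_s16 are AArch64-only intrinsics. */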
+static inline int32x4_t vdotprod(int32x4_t acc, int8x16_t a, int8x16_t b)
+{
+ return vpadalq_s16(acc, vpaddq_s16(vmull_s8(vget_low_s8(a), vget_low_s8(b)), vmull_high_s8(a, b)));
+}
+
+static inline void sgemv_accum8x4(float *_out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
+{
+ int i, j;
+ signed char x[MAX_INPUTS];
+ int out[MAX_OUTPUTS];
+ (void)col_stride;
+ for (i=0;i<rows;i++) out[i] = (int)floor(.5+SCALE*_out[i]);
+ for (i=0;i<cols;i++) x[i] = (int)floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ int32x4_t acc0, acc1;
+ acc0 = vld1q_s32(&out[i]);
+ acc1 = vld1q_s32(&out[i+4]);
+ for (j=0;j<cols;j+=4)
+ {
+ int8x16_t vw0, vw1, vx;
+ vx = (int8x16_t)vld1q_dup_s32((int*)&x[j]);
+ vw0 = vld1q_s8(w);
+ vw1 = vld1q_s8(&w[16]);
+ acc0 = vdotprod(acc0, vw0, vx);
+ acc1 = vdotprod(acc1, vw1, vx);
+ w += 32;
+ }
+ vst1q_s32(&out[i], acc0);
+ vst1q_s32(&out[i+4], acc1);
+ }
+ for (i=0;i<rows;i++) _out[i] = SCALE_1*out[i];
+}
+
+static inline void sparse_sgemv_accum8x4(float *_out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
+{
+ int i, j;
+ signed char x[MAX_INPUTS];
+ int out[MAX_OUTPUTS];
+ for (i=0;i<rows;i++) out[i] = (int)floor(.5+SCALE*_out[i]);
+ for (i=0;i<cols;i++) x[i] = floor(.5+127*_x[i]);
+ for (i=0;i<rows;i+=8)
+ {
+ int colblocks;
+ int32x4_t acc0, acc1;
+ acc0 = vld1q_s32(&out[i]);
+ acc1 = vld1q_s32(&out[i+4]);
+ colblocks = *idx++;
+ for (j=0;j<colblocks;j++)
+ {
+ int pos;
+ pos = 4 * (*idx++);
+ int8x16_t vw0, vw1, vx;
+ vx = (int8x16_t)vld1q_dup_s32((int*)&x[pos]);
+ vw0 = vld1q_s8(w);
+ vw1 = vld1q_s8(&w[16]);
+ acc0 = vdotprod(acc0, vw0, vx);
+ acc1 = vdotprod(acc1, vw1, vx);
+ w += 32;
+ }
+ vst1q_s32(&out[i], acc0);
+ vst1q_s32(&out[i+4], acc1);
+ }
+ for (i=0;i<rows;i++) _out[i] = SCALE_1*out[i];
}
--