ref: 93bc94ba793810e075b403f88c17536cca309414
parent: b6ac1c78bb1c6376a818c7a68ab034a68624b19a
parent: 886d647bb18cec8fb11955a27a9b48d5e7aba84a
author: Jean-Marc Valin <jmvalin@amazon.com>
date: Tue May 23 20:57:20 EDT 2023
Merge branch 'exp_rdovae6'
--- a/dnn/Makefile.am
+++ b/dnn/Makefile.am
@@ -32,11 +32,13 @@
lpcnet.c \
lpcnet_dec.c \
lpcnet_enc.c \
+ lpcnet_tables.c \
nnet.c \
nnet_data.c \
plc_data.c \
ceps_codebooks.c \
pitch.c \
+ parse_lpcnet_weights.c \
freq.c \
kiss_fft.c \
lpcnet_plc.c
@@ -45,7 +47,7 @@
liblpcnet_la_LDFLAGS = -no-undefined \
-version-info @OP_LT_CURRENT@:@OP_LT_REVISION@:@OP_LT_AGE@
-noinst_PROGRAMS = lpcnet_demo dump_data
+noinst_PROGRAMS = lpcnet_demo dump_data dump_weights_blob
lpcnet_demo_SOURCES = lpcnet_demo.c
lpcnet_demo_LDADD = liblpcnet.la
@@ -56,9 +58,13 @@
#dump_data_SOURCES = dump_data.c
#dump_data_LDADD = $(DUMP_OBJ) $(LIBM)
-dump_data_SOURCES = common.c dump_data.c burg.c freq.c kiss_fft.c pitch.c lpcnet_dec.c lpcnet_enc.c ceps_codebooks.c
+dump_data_SOURCES = common.c dump_data.c burg.c freq.c kiss_fft.c pitch.c lpcnet_dec.c lpcnet_enc.c lpcnet_tables.c ceps_codebooks.c
dump_data_LDADD = $(LIBM)
dump_data_CFLAGS = $(AM_CFLAGS)
+
+dump_weights_blob_SOURCES = nnet_data.c plc_data.c write_lpcnet_weights.c
+dump_weights_blob_LDADD = $(LIBM)
+dump_weights_blob_CFLAGS = $(AM_CFLAGS) -DDUMP_BINARY_WEIGHTS
pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = lpcnet.pc
--- a/dnn/README.md
+++ b/dnn/README.md
@@ -11,6 +11,7 @@
- J.-M. Valin, J. Skoglund, [A Real-Time Wideband Neural Vocoder at 1.6 kb/s Using LPCNet](https://jmvalin.ca/papers/lpcnet_codec.pdf), *Proc. INTERSPEECH*, arxiv:1903.12087, 2019.
- J. Skoglund, J.-M. Valin, [Improving Opus Low Bit Rate Quality with Neural Speech Synthesis](https://jmvalin.ca/papers/opusnet.pdf), *Proc. INTERSPEECH*, arxiv:1905.04628, 2020.
- J.-M. Valin, A. Mustafa, C. Montgomery, T.B. Terriberry, M. Klingbeil, P. Smaragdis, A. Krishnaswamy, [Real-Time Packet Loss Concealment With Mixed Generative and Predictive Model](https://jmvalin.ca/papers/lpcnet_plc.pdf), *Proc. INTERSPEECH*, arxiv:2205.05785, 2022.
+- J.-M. Valin, J. Büthe, A. Mustafa, [Low-Bitrate Redundancy Coding of Speech Using a Rate-Distortion-Optimized Variational Autoencoder](https://jmvalin.ca/papers/valin_dred.pdf), *Proc. ICASSP*, arXiv:2212.04453, 2023. ([blog post](https://www.amazon.science/blog/neural-encoding-enables-more-efficient-recovery-of-lost-audio-packets))
# Introduction
--- a/dnn/autogen.sh
+++ b/dnn/autogen.sh
@@ -6,7 +6,7 @@
test -n "$srcdir" && cd "$srcdir"
#SHA1 of the first commit compatible with the current model
-commit=97e64b3
+commit=399be7c
./download_model.sh $commit
echo "Updating build configuration files for lpcnet, please wait...."
--- a/dnn/common.c
+++ b/dnn/common.c
@@ -40,7 +40,7 @@
float ref[NB_BANDS];
float pred[3*NB_BANDS];
RNN_COPY(ref, x, NB_BANDS);
- for (i=0;i<NB_BANDS;i++) pred[i] = .5*(left[i] + right[i]);
+ for (i=0;i<NB_BANDS;i++) pred[i] = .5f*(left[i] + right[i]);
for (i=0;i<NB_BANDS;i++) pred[NB_BANDS+i] = left[i];
for (i=0;i<NB_BANDS;i++) pred[2*NB_BANDS+i] = right[i];
for (i=0;i<NB_BANDS;i++) {
--- a/dnn/common.h
+++ b/dnn/common.h
@@ -36,8 +36,8 @@
{
float s;
float scale_1 = 32768.f/255.f;
- u = u - 128;
- s = u >= 0 ? 1 : -1;
+ u = u - 128.f;
+ s = u >= 0.f ? 1.f : -1.f;
u = fabs(u);
return s*scale_1*(exp(u/128.*LOG256)-1);
}
--- /dev/null
+++ b/dnn/download_model.bat
@@ -1,0 +1,10 @@
+@echo off
+set model=lpcnet_data-%1.tar.gz
+
+if not exist %model% (
+ echo Downloading latest model
+ powershell -Command "(New-Object System.Net.WebClient).DownloadFile('https://media.xiph.org/lpcnet/data/%model%', '%model%')"
+)
+
+tar -xvzf %model%
+
--- a/dnn/download_model.sh
+++ b/dnn/download_model.sh
@@ -7,7 +7,7 @@
echo "Downloading latest model"
wget https://media.xiph.org/lpcnet/data/$model
fi
-tar xvf $model
+tar xvof $model
touch src/nnet_data.[ch]
touch src/plc_data.[ch]
mv src/*.[ch] .
--- /dev/null
+++ b/dnn/dred_rdovae.c
@@ -1,0 +1,135 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "dred_rdovae.h"
+#include "dred_rdovae_enc.h"
+#include "dred_rdovae_dec.h"
+#include "dred_rdovae_stats_data.h"
+
+void DRED_rdovae_decode_all(float *features, const float *state, const float *latents, int nb_latents)
+{
+ int i;
+ RDOVAEDec dec;
+ memset(&dec, 0, sizeof(dec));
+ DRED_rdovae_dec_init_states(&dec, state);
+ for (i = 0; i < 2*nb_latents; i += 2)
+ {
+ DRED_rdovae_decode_qframe(
+ &dec,
+ &features[2*i*DRED_NUM_FEATURES],
+ &latents[(i/2)*DRED_LATENT_DIM]);
+ }
+}
+
+size_t DRED_rdovae_get_enc_size()
+{
+ return sizeof(RDOVAEEnc);
+}
+
+size_t DRED_rdovae_get_dec_size()
+{
+ return sizeof(RDOVAEDec);
+}
+
+void DRED_rdovae_init_encoder(RDOVAEEnc *enc_state)
+{
+ memset(enc_state, 0, sizeof(*enc_state));
+
+}
+
+void DRED_rdovae_init_decoder(RDOVAEDec *dec_state)
+{
+ memset(dec_state, 0, sizeof(*dec_state));
+}
+
+
+RDOVAEEnc * DRED_rdovae_create_encoder()
+{
+ RDOVAEEnc *enc;
+ enc = (RDOVAEEnc*) calloc(sizeof(*enc), 1);
+ DRED_rdovae_init_encoder(enc);
+ return enc;
+}
+
+RDOVAEDec * DRED_rdovae_create_decoder()
+{
+ RDOVAEDec *dec;
+ dec = (RDOVAEDec*) calloc(sizeof(*dec), 1);
+ DRED_rdovae_init_decoder(dec);
+ return dec;
+}
+
+void DRED_rdovae_destroy_decoder(RDOVAEDec* dec)
+{
+ free(dec);
+}
+
+void DRED_rdovae_destroy_encoder(RDOVAEEnc* enc)
+{
+ free(enc);
+}
+
+void DRED_rdovae_encode_dframe(RDOVAEEnc *enc_state, float *latents, float *initial_state, const float *input)
+{
+ dred_rdovae_encode_dframe(enc_state, latents, initial_state, input);
+}
+
+void DRED_rdovae_dec_init_states(RDOVAEDec *h, const float * initial_state)
+{
+ dred_rdovae_dec_init_states(h, initial_state);
+}
+
+void DRED_rdovae_decode_qframe(RDOVAEDec *h, float *qframe, const float *z)
+{
+ dred_rdovae_decode_qframe(h, qframe, z);
+}
+
+
+const opus_uint16 * DRED_rdovae_get_p0_pointer(void)
+{
+ return &dred_p0_q15[0];
+}
+
+const opus_uint16 * DRED_rdovae_get_dead_zone_pointer(void)
+{
+ return &dred_dead_zone_q10[0];
+}
+
+const opus_uint16 * DRED_rdovae_get_r_pointer(void)
+{
+ return &dred_r_q15[0];
+}
+
+const opus_uint16 * DRED_rdovae_get_quant_scales_pointer(void)
+{
+ return &dred_quant_scales_q8[0];
+}
--- /dev/null
+++ b/dnn/dred_rdovae_dec.c
@@ -1,0 +1,96 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "dred_rdovae_dec.h"
+#include "dred_rdovae_constants.h"
+
+
+void dred_rdovae_dec_init_states(
+ RDOVAEDec *h, /* io: state buffer handle */
+ const float *initial_state /* i: initial state */
+ )
+{
+ /* initialize GRU states from initial state */
+ _lpcnet_compute_dense(&state1, h->dense2_state, initial_state);
+ _lpcnet_compute_dense(&state2, h->dense4_state, initial_state);
+ _lpcnet_compute_dense(&state3, h->dense6_state, initial_state);
+}
+
+
+void dred_rdovae_decode_qframe(
+ RDOVAEDec *dec_state, /* io: state buffer handle */
+ float *qframe, /* o: quadruple feature frame (four concatenated frames in reverse order) */
+ const float *input /* i: latent vector */
+ )
+{
+ float buffer[DEC_DENSE1_OUT_SIZE + DEC_DENSE2_OUT_SIZE + DEC_DENSE3_OUT_SIZE + DEC_DENSE4_OUT_SIZE + DEC_DENSE5_OUT_SIZE + DEC_DENSE6_OUT_SIZE + DEC_DENSE7_OUT_SIZE + DEC_DENSE8_OUT_SIZE];
+ int output_index = 0;
+ int input_index = 0;
+ float zero_vector[1024] = {0};
+
+ /* run encoder stack and concatenate output in buffer*/
+ _lpcnet_compute_dense(&dec_dense1, &buffer[output_index], input);
+ input_index = output_index;
+ output_index += DEC_DENSE1_OUT_SIZE;
+
+ compute_gruB(&dec_dense2, zero_vector, dec_state->dense2_state, &buffer[input_index]);
+ memcpy(&buffer[output_index], dec_state->dense2_state, DEC_DENSE2_OUT_SIZE * sizeof(float));
+ input_index = output_index;
+ output_index += DEC_DENSE2_OUT_SIZE;
+
+ _lpcnet_compute_dense(&dec_dense3, &buffer[output_index], &buffer[input_index]);
+ input_index = output_index;
+ output_index += DEC_DENSE3_OUT_SIZE;
+
+ compute_gruB(&dec_dense4, zero_vector, dec_state->dense4_state, &buffer[input_index]);
+ memcpy(&buffer[output_index], dec_state->dense4_state, DEC_DENSE4_OUT_SIZE * sizeof(float));
+ input_index = output_index;
+ output_index += DEC_DENSE4_OUT_SIZE;
+
+ _lpcnet_compute_dense(&dec_dense5, &buffer[output_index], &buffer[input_index]);
+ input_index = output_index;
+ output_index += DEC_DENSE5_OUT_SIZE;
+
+ compute_gruB(&dec_dense6, zero_vector, dec_state->dense6_state, &buffer[input_index]);
+ memcpy(&buffer[output_index], dec_state->dense6_state, DEC_DENSE6_OUT_SIZE * sizeof(float));
+ input_index = output_index;
+ output_index += DEC_DENSE6_OUT_SIZE;
+
+ _lpcnet_compute_dense(&dec_dense7, &buffer[output_index], &buffer[input_index]);
+ input_index = output_index;
+ output_index += DEC_DENSE7_OUT_SIZE;
+
+ _lpcnet_compute_dense(&dec_dense8, &buffer[output_index], &buffer[input_index]);
+ output_index += DEC_DENSE8_OUT_SIZE;
+
+ _lpcnet_compute_dense(&dec_final, qframe, buffer);
+}
\ No newline at end of file
--- /dev/null
+++ b/dnn/dred_rdovae_dec.h
@@ -1,0 +1,44 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _DRED_RDOVAE_DEC_H
+#define _DRED_RDOVAE_DEC_H
+
+#include "dred_rdovae.h"
+#include "dred_rdovae_dec_data.h"
+#include "dred_rdovae_stats_data.h"
+
+struct RDOVAEDecStruct {
+ float dense2_state[DEC_DENSE2_STATE_SIZE];
+ float dense4_state[DEC_DENSE2_STATE_SIZE];
+ float dense6_state[DEC_DENSE2_STATE_SIZE];
+};
+
+void dred_rdovae_dec_init_states(RDOVAEDec *h, const float * initial_state);
+void dred_rdovae_decode_qframe(RDOVAEDec *h, float *qframe, const float * z);
+
+#endif
\ No newline at end of file
--- /dev/null
+++ b/dnn/dred_rdovae_enc.c
@@ -1,0 +1,94 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <math.h>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#include "dred_rdovae_enc.h"
+
+
+void dred_rdovae_encode_dframe(
+ RDOVAEEnc *enc_state, /* io: encoder state */
+ float *latents, /* o: latent vector */
+ float *initial_state, /* o: initial state */
+ const float *input /* i: double feature frame (concatenated) */
+ )
+{
+ float buffer[ENC_DENSE1_OUT_SIZE + ENC_DENSE2_OUT_SIZE + ENC_DENSE3_OUT_SIZE + ENC_DENSE4_OUT_SIZE + ENC_DENSE5_OUT_SIZE + ENC_DENSE6_OUT_SIZE + ENC_DENSE7_OUT_SIZE + ENC_DENSE8_OUT_SIZE + GDENSE1_OUT_SIZE];
+ int output_index = 0;
+ int input_index = 0;
+ float zero_vector[1024] = {0};
+
+ /* run encoder stack and concatenate output in buffer*/
+ _lpcnet_compute_dense(&enc_dense1, &buffer[output_index], input);
+ input_index = output_index;
+ output_index += ENC_DENSE1_OUT_SIZE;
+
+ compute_gruB(&enc_dense2, zero_vector, enc_state->dense2_state, &buffer[input_index]);
+ memcpy(&buffer[output_index], enc_state->dense2_state, ENC_DENSE2_OUT_SIZE * sizeof(float));
+ input_index = output_index;
+ output_index += ENC_DENSE2_OUT_SIZE;
+
+ _lpcnet_compute_dense(&enc_dense3, &buffer[output_index], &buffer[input_index]);
+ input_index = output_index;
+ output_index += ENC_DENSE3_OUT_SIZE;
+
+ compute_gruB(&enc_dense4, zero_vector, enc_state->dense4_state, &buffer[input_index]);
+ memcpy(&buffer[output_index], enc_state->dense4_state, ENC_DENSE4_OUT_SIZE * sizeof(float));
+ input_index = output_index;
+ output_index += ENC_DENSE4_OUT_SIZE;
+
+ _lpcnet_compute_dense(&enc_dense5, &buffer[output_index], &buffer[input_index]);
+ input_index = output_index;
+ output_index += ENC_DENSE5_OUT_SIZE;
+
+ compute_gruB(&enc_dense6, zero_vector, enc_state->dense6_state, &buffer[input_index]);
+ memcpy(&buffer[output_index], enc_state->dense6_state, ENC_DENSE6_OUT_SIZE * sizeof(float));
+ input_index = output_index;
+ output_index += ENC_DENSE6_OUT_SIZE;
+
+ _lpcnet_compute_dense(&enc_dense7, &buffer[output_index], &buffer[input_index]);
+ input_index = output_index;
+ output_index += ENC_DENSE7_OUT_SIZE;
+
+ _lpcnet_compute_dense(&enc_dense8, &buffer[output_index], &buffer[input_index]);
+ output_index += ENC_DENSE8_OUT_SIZE;
+
+ /* compute latents from concatenated input buffer */
+ compute_conv1d(&bits_dense, latents, enc_state->bits_dense_state, buffer);
+
+
+ /* next, calculate initial state */
+ _lpcnet_compute_dense(&gdense1, &buffer[output_index], buffer);
+ input_index = output_index;
+ _lpcnet_compute_dense(&gdense2, initial_state, &buffer[input_index]);
+
+}
--- /dev/null
+++ b/dnn/dred_rdovae_enc.h
@@ -1,0 +1,45 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _DRED_RDOVAE_ENC_H
+#define _DRED_RDOVAE_ENC_H
+
+#include "dred_rdovae.h"
+
+#include "dred_rdovae_enc_data.h"
+
+struct RDOVAEEncStruct {
+ float dense2_state[3 * ENC_DENSE2_STATE_SIZE];
+ float dense4_state[3 * ENC_DENSE4_STATE_SIZE];
+ float dense6_state[3 * ENC_DENSE6_STATE_SIZE];
+ float bits_dense_state[BITS_DENSE_STATE_SIZE];
+};
+
+void dred_rdovae_encode_dframe(RDOVAEEnc *enc_state, float *latents, float *initial_state, const float *input);
+
+
+#endif
\ No newline at end of file
--- /dev/null
+++ b/dnn/dump_lpcnet_tables.c
@@ -1,0 +1,104 @@
+/* Copyright (c) 2017-2018 Mozilla
+ Copyright (c) 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <math.h>
+#include <stdio.h>
+#include "freq.h"
+#include "kiss_fft.h"
+
+
+int main(void) {
+ int i;
+ FILE *file;
+ kiss_fft_state *kfft;
+ float half_window[OVERLAP_SIZE];
+ float dct_table[NB_BANDS*NB_BANDS];
+
+ file=fopen("lpcnet_tables.c", "wb");
+ fprintf(file, "/* The contents of this file was automatically generated by dump_lpcnet_tables.c*/\n\n");
+ fprintf(file, "#ifdef HAVE_CONFIG_H\n");
+ fprintf(file, "#include \"config.h\"\n");
+ fprintf(file, "#endif\n");
+
+ fprintf(file, "#include \"kiss_fft.h\"\n\n");
+
+ kfft = opus_fft_alloc_twiddles(WINDOW_SIZE, NULL, NULL, NULL, 0);
+
+ fprintf(file, "static const arch_fft_state arch_fft = {0, NULL};\n\n");
+
+ fprintf (file, "static const opus_int16 fft_bitrev[%d] = {\n", kfft->nfft);
+ for (i=0;i<kfft->nfft;i++)
+ fprintf (file, "%d,%c", kfft->bitrev[i],(i+16)%15==0?'\n':' ');
+ fprintf (file, "};\n\n");
+
+ fprintf (file, "static const kiss_twiddle_cpx fft_twiddles[%d] = {\n", kfft->nfft);
+ for (i=0;i<kfft->nfft;i++)
+ fprintf (file, "{%#0.9gf, %#0.9gf},%c", kfft->twiddles[i].r, kfft->twiddles[i].i,(i+3)%2==0?'\n':' ');
+ fprintf (file, "};\n\n");
+
+
+ fprintf(file, "const kiss_fft_state kfft = {\n");
+ fprintf(file, "%d, /* nfft */\n", kfft->nfft);
+ fprintf(file, "%#0.8gf, /* scale */\n", kfft->scale);
+ fprintf(file, "%d, /* shift */\n", kfft->shift);
+ fprintf(file, "{");
+ for (i=0;i<2*MAXFACTORS;i++) {
+ fprintf(file, "%d, ", kfft->factors[i]);
+ }
+ fprintf(file, "}, /* factors */\n");
+ fprintf(file, "fft_bitrev, /* bitrev*/\n");
+ fprintf(file, "fft_twiddles, /* twiddles*/\n");
+ fprintf(file, "(arch_fft_state *)&arch_fft, /* arch_fft*/\n");
+
+ fprintf(file, "};\n\n");
+
+ for (i=0;i<OVERLAP_SIZE;i++)
+ half_window[i] = sin(.5*M_PI*sin(.5*M_PI*(i+.5)/OVERLAP_SIZE) * sin(.5*M_PI*(i+.5)/OVERLAP_SIZE));
+ fprintf(file, "const float half_window[] = {\n");
+ for (i=0;i<OVERLAP_SIZE;i++)
+ fprintf (file, "%#0.9gf,%c", half_window[i],(i+6)%5==0?'\n':' ');
+ fprintf(file, "};\n\n");
+
+ for (i=0;i<NB_BANDS;i++) {
+ int j;
+ for (j=0;j<NB_BANDS;j++) {
+ dct_table[i*NB_BANDS + j] = cos((i+.5)*j*M_PI/NB_BANDS);
+ if (j==0) dct_table[i*NB_BANDS + j] *= sqrt(.5);
+ }
+ }
+ fprintf(file, "const float dct_table[] = {\n");
+ for (i=0;i<NB_BANDS*NB_BANDS;i++)
+ fprintf (file, "%#0.9gf,%c", dct_table[i],(i+6)%5==0?'\n':' ');
+ fprintf(file, "};\n");
+
+ fclose(file);
+ return 0;
+}
--- a/dnn/freq.c
+++ b/dnn/freq.c
@@ -51,14 +51,12 @@
0.8f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 0.666667f, 0.5f, 0.5f, 0.5f, 0.333333f, 0.25f, 0.25f, 0.2f, 0.166667f, 0.173913f
};
-typedef struct {
- int init;
- kiss_fft_state *kfft;
- float half_window[OVERLAP_SIZE];
- float dct_table[NB_BANDS*NB_BANDS];
-} CommonState;
+extern const kiss_fft_state kfft;
+extern const float half_window[OVERLAP_SIZE];
+extern const float dct_table[NB_BANDS*NB_BANDS];
+
void compute_band_energy_inverse(float *bandE, const kiss_fft_cpx *X) {
int i;
float sum[NB_BANDS] = {0};
@@ -162,15 +160,15 @@
float x[WINDOW_SIZE];
float Eburg[NB_BANDS];
float g;
- float E;
kiss_fft_cpx LPC[FREQ_SIZE];
float Ly[NB_BANDS];
+ float logMax = -2;
+ float follow = -2;
assert(order <= LPC_ORDER);
assert(len <= FRAME_SIZE);
for (i=0;i<len-1;i++) burg_in[i] = pcm[i+1] - PREEMPHASIS*pcm[i];
g = silk_burg_analysis(burg_lpc, burg_in, 1e-3, len-1, 1, order);
g /= len - 2*(order-1);
- //printf("%g\n", g);
RNN_CLEAR(x, WINDOW_SIZE);
x[0] = 1;
for (i=0;i<order;i++) x[i+1] = -burg_lpc[i]*pow(.995, i+1);
@@ -177,14 +175,11 @@
forward_transform(LPC, x);
compute_band_energy_inverse(Eburg, LPC);
for (i=0;i<NB_BANDS;i++) Eburg[i] *= .45*g*(1.f/((float)WINDOW_SIZE*WINDOW_SIZE*WINDOW_SIZE));
- float logMax = -2;
- float follow = -2;
for (i=0;i<NB_BANDS;i++) {
Ly[i] = log10(1e-2+Eburg[i]);
Ly[i] = MAX16(logMax-8, MAX16(follow-2.5, Ly[i]));
logMax = MAX16(logMax, Ly[i]);
follow = MAX16(follow-2.5, Ly[i]);
- E += Eburg[i];
}
dct(burg_cepstrum, Ly);
burg_cepstrum[0] += - 4;
@@ -243,32 +238,14 @@
}
}
-CommonState common;
-static void check_init(void) {
- int i;
- if (common.init) return;
- common.kfft = opus_fft_alloc_twiddles(WINDOW_SIZE, NULL, NULL, NULL, 0);
- for (i=0;i<OVERLAP_SIZE;i++)
- common.half_window[i] = sin(.5*M_PI*sin(.5*M_PI*(i+.5)/OVERLAP_SIZE) * sin(.5*M_PI*(i+.5)/OVERLAP_SIZE));
- for (i=0;i<NB_BANDS;i++) {
- int j;
- for (j=0;j<NB_BANDS;j++) {
- common.dct_table[i*NB_BANDS + j] = cos((i+.5)*j*M_PI/NB_BANDS);
- if (j==0) common.dct_table[i*NB_BANDS + j] *= sqrt(.5);
- }
- }
- common.init = 1;
-}
-
void dct(float *out, const float *in) {
int i;
- check_init();
for (i=0;i<NB_BANDS;i++) {
int j;
float sum = 0;
for (j=0;j<NB_BANDS;j++) {
- sum += in[j] * common.dct_table[j*NB_BANDS + i];
+ sum += in[j] * dct_table[j*NB_BANDS + i];
}
out[i] = sum*sqrt(2./NB_BANDS);
}
@@ -276,12 +253,11 @@
void idct(float *out, const float *in) {
int i;
- check_init();
for (i=0;i<NB_BANDS;i++) {
int j;
float sum = 0;
for (j=0;j<NB_BANDS;j++) {
- sum += in[j] * common.dct_table[i*NB_BANDS + j];
+ sum += in[j] * dct_table[i*NB_BANDS + j];
}
out[i] = sum*sqrt(2./NB_BANDS);
}
@@ -291,12 +267,11 @@
int i;
kiss_fft_cpx x[WINDOW_SIZE];
kiss_fft_cpx y[WINDOW_SIZE];
- check_init();
for (i=0;i<WINDOW_SIZE;i++) {
x[i].r = in[i];
x[i].i = 0;
}
- opus_fft(common.kfft, x, y, 0);
+ opus_fft(&kfft, x, y, 0);
for (i=0;i<FREQ_SIZE;i++) {
out[i] = y[i];
}
@@ -306,7 +281,6 @@
int i;
kiss_fft_cpx x[WINDOW_SIZE];
kiss_fft_cpx y[WINDOW_SIZE];
- check_init();
for (i=0;i<FREQ_SIZE;i++) {
x[i] = in[i];
}
@@ -314,7 +288,7 @@
x[i].r = x[WINDOW_SIZE - i].r;
x[i].i = -x[WINDOW_SIZE - i].i;
}
- opus_fft(common.kfft, x, y, 0);
+ opus_fft(&kfft, x, y, 0);
/* output in reverse order for IFFT. */
out[0] = WINDOW_SIZE*y[0].r;
for (i=1;i<WINDOW_SIZE;i++) {
@@ -371,10 +345,9 @@
void apply_window(float *x) {
int i;
- check_init();
for (i=0;i<OVERLAP_SIZE;i++) {
- x[i] *= common.half_window[i];
- x[WINDOW_SIZE - 1 - i] *= common.half_window[i];
+ x[i] *= half_window[i];
+ x[WINDOW_SIZE - 1 - i] *= half_window[i];
}
}
--- /dev/null
+++ b/dnn/include/dred_rdovae.h
@@ -1,0 +1,60 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+#include <stdlib.h>
+
+#include "opus_types.h"
+
+typedef struct RDOVAEDecStruct RDOVAEDec;
+typedef struct RDOVAEEncStruct RDOVAEEnc;
+
+void DRED_rdovae_decode_all(float *features, const float *state, const float *latents, int nb_latents);
+
+
+size_t DRED_rdovae_get_enc_size(void);
+
+size_t DRED_rdovae_get_dec_size(void);
+
+RDOVAEDec * DRED_rdovae_create_decoder(void);
+RDOVAEEnc * DRED_rdovae_create_encoder(void);
+void DRED_rdovae_destroy_decoder(RDOVAEDec* h);
+void DRED_rdovae_destroy_encoder(RDOVAEEnc* h);
+
+
+void DRED_rdovae_init_encoder(RDOVAEEnc *enc_state);
+
+void DRED_rdovae_encode_dframe(RDOVAEEnc *enc_state, float *latents, float *initial_state, const float *input);
+
+void DRED_rdovae_dec_init_states(RDOVAEDec *h, const float * initial_state);
+
+void DRED_rdovae_decode_qframe(RDOVAEDec *h, float *qframe, const float * z);
+
+const opus_uint16 * DRED_rdovae_get_p0_pointer(void);
+const opus_uint16 * DRED_rdovae_get_dead_zone_pointer(void);
+const opus_uint16 * DRED_rdovae_get_r_pointer(void);
+const opus_uint16 * DRED_rdovae_get_quant_scales_pointer(void);
--- a/dnn/include/lpcnet.h
+++ b/dnn/include/lpcnet.h
@@ -197,4 +197,6 @@
LPCNET_EXPORT void lpcnet_plc_fec_add(LPCNetPLCState *st, const float *features);
+LPCNET_EXPORT void lpcnet_plc_fec_clear(LPCNetPLCState *st);
+
#endif
--- a/dnn/kiss_fft.c
+++ b/dnn/kiss_fft.c
@@ -506,10 +506,10 @@
if (cfg)
{
opus_fft_free_arch((kiss_fft_state *)cfg, arch);
- opus_free((opus_int16*)cfg->bitrev);
+ free((opus_int16*)cfg->bitrev);
if (cfg->shift < 0)
- opus_free((kiss_twiddle_cpx*)cfg->twiddles);
- opus_free((kiss_fft_state*)cfg);
+ free((kiss_twiddle_cpx*)cfg->twiddles);
+ free((kiss_fft_state*)cfg);
}
}
--- a/dnn/kiss_fft.h
+++ b/dnn/kiss_fft.h
@@ -34,8 +34,7 @@
#include "arch.h"
#include <stdlib.h>
-#define opus_alloc(x) malloc(x)
-#define opus_free(x) free(x)
+#define lpcnet_alloc(x) malloc(x)
#ifdef __cplusplus
extern "C" {
@@ -46,7 +45,7 @@
# define kiss_fft_scalar __m128
#define KISS_FFT_MALLOC(nbytes) memalign(16,nbytes)
#else
-#define KISS_FFT_MALLOC opus_alloc
+#define KISS_FFT_MALLOC lpcnet_alloc
#endif
#ifdef FIXED_POINT
--- a/dnn/lpcnet.c
+++ b/dnn/lpcnet.c
@@ -89,23 +89,21 @@
float dense1_out[FEATURE_DENSE1_OUT_SIZE];
int pitch;
float rc[LPC_ORDER];
- //static float features[NB_FEATURES];
- //RNN_COPY(features, lpcnet->last_features, NB_FEATURES);
/* Matches the Python code -- the 0.1 avoids rounding issues. */
pitch = (int)floor(.1 + 50*features[NB_BANDS]+100);
pitch = IMIN(255, IMAX(33, pitch));
net = &lpcnet->nnet;
RNN_COPY(in, features, NB_FEATURES);
- compute_embedding(&embed_pitch, &in[NB_FEATURES], pitch);
- compute_conv1d(&feature_conv1, conv1_out, net->feature_conv1_state, in);
+ compute_embedding(&lpcnet->model.embed_pitch, &in[NB_FEATURES], pitch);
+ compute_conv1d(&lpcnet->model.feature_conv1, conv1_out, net->feature_conv1_state, in);
if (lpcnet->frame_count < FEATURE_CONV1_DELAY) RNN_CLEAR(conv1_out, FEATURE_CONV1_OUT_SIZE);
- compute_conv1d(&feature_conv2, conv2_out, net->feature_conv2_state, conv1_out);
+ compute_conv1d(&lpcnet->model.feature_conv2, conv2_out, net->feature_conv2_state, conv1_out);
if (lpcnet->frame_count < FEATURES_DELAY) RNN_CLEAR(conv2_out, FEATURE_CONV2_OUT_SIZE);
- _lpcnet_compute_dense(&feature_dense1, dense1_out, conv2_out);
- _lpcnet_compute_dense(&feature_dense2, condition, dense1_out);
+ _lpcnet_compute_dense(&lpcnet->model.feature_dense1, dense1_out, conv2_out);
+ _lpcnet_compute_dense(&lpcnet->model.feature_dense2, condition, dense1_out);
RNN_COPY(rc, condition, LPC_ORDER);
- _lpcnet_compute_dense(&gru_a_dense_feature, gru_a_condition, condition);
- _lpcnet_compute_dense(&gru_b_dense_feature, gru_b_condition, condition);
+ _lpcnet_compute_dense(&lpcnet->model.gru_a_dense_feature, gru_a_condition, condition);
+ _lpcnet_compute_dense(&lpcnet->model.gru_b_dense_feature, gru_b_condition, condition);
#ifdef END2END
rc2lpc(lpc, rc);
#elif FEATURES_DELAY>0
@@ -118,29 +116,54 @@
#ifdef LPC_GAMMA
lpc_weighting(lpc, LPC_GAMMA);
#endif
- //RNN_COPY(lpcnet->last_features, _features, NB_FEATURES);
if (lpcnet->frame_count < 1000) lpcnet->frame_count++;
}
-int run_sample_network(NNetState *net, const float *gru_a_condition, const float *gru_b_condition, int last_exc, int last_sig, int pred, const float *sampling_logit_table, kiss99_ctx *rng)
+void run_frame_network_deferred(LPCNetState *lpcnet, const float *features)
+{
+ int max_buffer_size = lpcnet->model.feature_conv1.kernel_size + lpcnet->model.feature_conv2.kernel_size - 2;
+ celt_assert(max_buffer_size <= MAX_FEATURE_BUFFER_SIZE);
+ if (lpcnet->feature_buffer_fill == max_buffer_size) {
+ RNN_MOVE(lpcnet->feature_buffer, &lpcnet->feature_buffer[NB_FEATURES], (max_buffer_size-1)*NB_FEATURES);
+ } else {
+ lpcnet->feature_buffer_fill++;
+ }
+ RNN_COPY(&lpcnet->feature_buffer[(lpcnet->feature_buffer_fill-1)*NB_FEATURES], features, NB_FEATURES);
+}
+
+void run_frame_network_flush(LPCNetState *lpcnet)
+{
+ int i;
+ for (i=0;i<lpcnet->feature_buffer_fill;i++) {
+ float lpc[LPC_ORDER];
+ float gru_a_condition[3*GRU_A_STATE_SIZE];
+ float gru_b_condition[3*GRU_B_STATE_SIZE];
+ run_frame_network(lpcnet, gru_a_condition, gru_b_condition, lpc, &lpcnet->feature_buffer[i*NB_FEATURES]);
+ }
+ lpcnet->feature_buffer_fill = 0;
+}
+
+int run_sample_network(LPCNetState *lpcnet, const float *gru_a_condition, const float *gru_b_condition, int last_exc, int last_sig, int pred, const float *sampling_logit_table, kiss99_ctx *rng)
+{
+ NNetState *net;
float gru_a_input[3*GRU_A_STATE_SIZE];
float in_b[GRU_A_STATE_SIZE+FEATURE_DENSE2_OUT_SIZE];
float gru_b_input[3*GRU_B_STATE_SIZE];
+ net = &lpcnet->nnet;
#if 1
- compute_gru_a_input(gru_a_input, gru_a_condition, GRU_A_STATE_SIZE, &gru_a_embed_sig, last_sig, &gru_a_embed_pred, pred, &gru_a_embed_exc, last_exc);
+ compute_gru_a_input(gru_a_input, gru_a_condition, GRU_A_STATE_SIZE, &lpcnet->model.gru_a_embed_sig, last_sig, &lpcnet->model.gru_a_embed_pred, pred, &lpcnet->model.gru_a_embed_exc, last_exc);
#else
RNN_COPY(gru_a_input, gru_a_condition, 3*GRU_A_STATE_SIZE);
- accum_embedding(&gru_a_embed_sig, gru_a_input, last_sig);
- accum_embedding(&gru_a_embed_pred, gru_a_input, pred);
- accum_embedding(&gru_a_embed_exc, gru_a_input, last_exc);
+ accum_embedding(&lpcnet->model.gru_a_embed_sig, gru_a_input, last_sig);
+ accum_embedding(&lpcnet->model.gru_a_embed_pred, gru_a_input, pred);
+ accum_embedding(&lpcnet->model.gru_a_embed_exc, gru_a_input, last_exc);
#endif
/*compute_gru3(&gru_a, net->gru_a_state, gru_a_input);*/
- compute_sparse_gru(&sparse_gru_a, net->gru_a_state, gru_a_input);
+ compute_sparse_gru(&lpcnet->model.sparse_gru_a, net->gru_a_state, gru_a_input);
RNN_COPY(in_b, net->gru_a_state, GRU_A_STATE_SIZE);
RNN_COPY(gru_b_input, gru_b_condition, 3*GRU_B_STATE_SIZE);
- compute_gruB(&gru_b, gru_b_input, net->gru_b_state, in_b);
- return sample_mdense(&dual_fc, net->gru_b_state, sampling_logit_table, rng);
+ compute_gruB(&lpcnet->model.gru_b, gru_b_input, net->gru_b_state, in_b);
+ return sample_mdense(&lpcnet->model.dual_fc, net->gru_b_state, sampling_logit_table, rng);
}
LPCNET_EXPORT int lpcnet_get_size()
@@ -151,15 +174,18 @@
LPCNET_EXPORT int lpcnet_init(LPCNetState *lpcnet)
{
int i;
+ int ret;
const char* rng_string="LPCNet";
memset(lpcnet, 0, lpcnet_get_size());
lpcnet->last_exc = lin2ulaw(0.f);
for (i=0;i<256;i++) {
- float prob = .025+.95*i/255.;
+ float prob = .025f+.95f*i/255.f;
lpcnet->sampling_logit_table[i] = -log((1-prob)/prob);
}
kiss99_srand(&lpcnet->rng, (const unsigned char *)rng_string, strlen(rng_string));
- return 0;
+ ret = init_lpcnet_model(&lpcnet->model, lpcnet_arrays);
+ celt_assert(ret == 0);
+ return ret;
}
@@ -176,6 +202,14 @@
free(lpcnet);
}
+void lpcnet_reset_signal(LPCNetState *lpcnet)
+{
+ lpcnet->deemph_mem = 0;
+ lpcnet->last_exc = lin2ulaw(0.f);
+ RNN_CLEAR(lpcnet->last_sig, LPC_ORDER);
+ RNN_CLEAR(lpcnet->nnet.gru_a_state, GRU_A_STATE_SIZE);
+ RNN_CLEAR(lpcnet->nnet.gru_b_state, GRU_B_STATE_SIZE);
+}
void lpcnet_synthesize_tail_impl(LPCNetState *lpcnet, short *output, int N, int preload)
{
@@ -197,7 +231,7 @@
for (j=0;j<LPC_ORDER;j++) pred -= lpcnet->last_sig[j]*lpcnet->lpc[j];
last_sig_ulaw = lin2ulaw(lpcnet->last_sig[0]);
pred_ulaw = lin2ulaw(pred);
- exc = run_sample_network(&lpcnet->nnet, lpcnet->gru_a_condition, lpcnet->gru_b_condition, lpcnet->last_exc, last_sig_ulaw, pred_ulaw, lpcnet->sampling_logit_table, &lpcnet->rng);
+ exc = run_sample_network(lpcnet, lpcnet->gru_a_condition, lpcnet->gru_b_condition, lpcnet->last_exc, last_sig_ulaw, pred_ulaw, lpcnet->sampling_logit_table, &lpcnet->rng);
if (i < preload) {
exc = lin2ulaw(output[i]-PREEMPH*lpcnet->deemph_mem - pred);
pcm = output[i]-PREEMPH*lpcnet->deemph_mem;
--- a/dnn/lpcnet_dec.c
+++ b/dnn/lpcnet_dec.c
@@ -121,13 +121,13 @@
}
for (sub=0;sub<4;sub++) {
float p = pow(2.f, main_pitch/21.)*PITCH_MIN_PERIOD;
- p *= 1 + modulation/16./7.*(2*sub-3);
+ p *= 1.f + modulation/16.f/7.f*(2*sub-3);
p = MIN16(255, MAX16(33, p));
- features[sub][NB_BANDS] = .02*(p-100);
- features[sub][NB_BANDS + 1] = frame_corr-.5;
+ features[sub][NB_BANDS] = .02f*(p-100.f);
+ features[sub][NB_BANDS + 1] = frame_corr-.5f;
}
- features[3][0] = (c0_id-64)/4.;
+ features[3][0] = (c0_id-64)/4.f;
for (i=0;i<NB_BANDS_1;i++) {
features[3][i+1] = ceps_codebook1[vq_end[0]*NB_BANDS_1 + i] + ceps_codebook2[vq_end[1]*NB_BANDS_1 + i] + ceps_codebook3[vq_end[2]*NB_BANDS_1 + i];
}
@@ -141,7 +141,7 @@
features[1][i] = sign*ceps_codebook_diff4[vq_mid*NB_BANDS + i];
}
if ((vq_mid&MULTI_MASK) < 2) {
- for (i=0;i<NB_BANDS;i++) features[1][i] += .5*(vq_mem[i] + features[3][i]);
+ for (i=0;i<NB_BANDS;i++) features[1][i] += .5f*(vq_mem[i] + features[3][i]);
} else if ((vq_mid&MULTI_MASK) == 2) {
for (i=0;i<NB_BANDS;i++) features[1][i] += vq_mem[i];
} else {
--- a/dnn/lpcnet_demo.c
+++ b/dnn/lpcnet_demo.c
@@ -39,6 +39,7 @@
#define MODE_FEATURES 2
#define MODE_SYNTHESIS 3
#define MODE_PLC 4
+#define MODE_ADDLPC 5
void usage(void) {
fprintf(stderr, "usage: lpcnet_demo -encode <input.pcm> <compressed.lpcnet>\n");
@@ -46,7 +47,8 @@
fprintf(stderr, " lpcnet_demo -features <input.pcm> <features.f32>\n");
fprintf(stderr, " lpcnet_demo -synthesis <features.f32> <output.pcm>\n");
fprintf(stderr, " lpcnet_demo -plc <plc_options> <percent> <input.pcm> <output.pcm>\n");
- fprintf(stderr, " lpcnet_demo -plc_file <plc_options> <percent> <input.pcm> <output.pcm>\n\n");
+ fprintf(stderr, " lpcnet_demo -plc_file <plc_options> <percent> <input.pcm> <output.pcm>\n");
+ fprintf(stderr, " lpcnet_demo -addlpc <features_without_lpc.f32> <features_with_lpc.lpc>\n\n");
fprintf(stderr, " plc_options:\n");
fprintf(stderr, " causal: normal (causal) PLC\n");
fprintf(stderr, " causal_dc: normal (causal) PLC with DC offset compensation\n");
@@ -83,6 +85,8 @@
}
argv+=2;
argc-=2;
+ } else if (strcmp(argv[1], "-addlpc") == 0){
+ mode=MODE_ADDLPC;
} else {
usage();
}
@@ -165,8 +169,8 @@
int count=0;
int loss=0;
int skip=0, extra=0;
- if ((plc_flags&0x3) == LPCNET_PLC_NONCAUSAL) skip=extra=80;
LPCNetPLCState *net;
+ if ((plc_flags&0x3) == LPCNET_PLC_NONCAUSAL) skip=extra=80;
net = lpcnet_plc_create(plc_flags);
while (1) {
size_t ret;
@@ -187,6 +191,17 @@
fwrite(pcm, sizeof(pcm[0]), extra, fout);
}
lpcnet_plc_destroy(net);
+ } else if (mode == MODE_ADDLPC) {
+ float features[36];
+ size_t ret;
+
+ while (1) {
+ ret = fread(features, sizeof(features[0]), 36, fin);
+ if (ret != 36 || feof(fin)) break;
+ lpc_from_cepstrum(&features[20], &features[0]);
+ fwrite(features, sizeof(features[0]), 36, fout);
+ }
+
} else {
fprintf(stderr, "unknown action\n");
}
--- a/dnn/lpcnet_enc.c
+++ b/dnn/lpcnet_enc.c
@@ -52,7 +52,7 @@
void vq_quantize_mbest(const float *codebook, int nb_entries, const float *x, int ndim, int mbest, float *dist, int *index)
{
int i, j;
- for (i=0;i<mbest;i++) dist[i] = 1e15;
+ for (i=0;i<mbest;i++) dist[i] = 1e15f;
for (i=0;i<nb_entries;i++)
{
@@ -80,7 +80,7 @@
int vq_quantize(const float *codebook, int nb_entries, const float *x, int ndim, float *dist_out)
{
int i, j;
- float min_dist = 1e15;
+ float min_dist = 1e15f;
int nearest = 0;
for (i=0;i<nb_entries;i++)
@@ -242,7 +242,7 @@
static int find_nearest_multi(const float *codebook, int nb_entries, const float *x, int ndim, float *dist_out, int sign)
{
int i, j;
- float min_dist = 1e15;
+ float min_dist = 1e15f;
int nearest = 0;
for (i=0;i<nb_entries;i++)
@@ -290,7 +290,7 @@
float s = 1;
nb_entries = 1<<bits;
RNN_COPY(ref, x, NB_BANDS);
- for (i=0;i<NB_BANDS;i++) pred[i] = pred[NB_BANDS+i] = .5*(left[i] + right[i]);
+ for (i=0;i<NB_BANDS;i++) pred[i] = pred[NB_BANDS+i] = .5f*(left[i] + right[i]);
for (i=0;i<NB_BANDS;i++) pred[2*NB_BANDS+i] = left[i];
for (i=0;i<NB_BANDS;i++) pred[3*NB_BANDS+i] = right[i];
for (i=0;i<4*NB_BANDS;i++) target[i] = x[i%NB_BANDS] - pred[i];
@@ -319,10 +319,10 @@
int interp_search(const float *x, const float *left, const float *right, float *dist_out)
{
int i, k;
- float min_dist = 1e15;
+ float min_dist = 1e15f;
int best_pred = 0;
float pred[4*NB_BANDS];
- for (i=0;i<NB_BANDS;i++) pred[i] = pred[NB_BANDS+i] = .5*(left[i] + right[i]);
+ for (i=0;i<NB_BANDS;i++) pred[i] = pred[NB_BANDS+i] = .5f*(left[i] + right[i]);
for (i=0;i<NB_BANDS;i++) pred[2*NB_BANDS+i] = left[i];
for (i=0;i<NB_BANDS;i++) pred[3*NB_BANDS+i] = right[i];
@@ -342,7 +342,7 @@
void interp_diff(float *x, float *left, float *right, float *codebook, int bits, int sign)
{
int i, k;
- float min_dist = 1e15;
+ float min_dist = 1e15f;
int best_pred = 0;
float ref[NB_BANDS];
float pred[4*NB_BANDS];
@@ -350,7 +350,7 @@
(void)codebook;
(void)bits;
RNN_COPY(ref, x, NB_BANDS);
- for (i=0;i<NB_BANDS;i++) pred[i] = pred[NB_BANDS+i] = .5*(left[i] + right[i]);
+ for (i=0;i<NB_BANDS;i++) pred[i] = pred[NB_BANDS+i] = .5f*(left[i] + right[i]);
for (i=0;i<NB_BANDS;i++) pred[2*NB_BANDS+i] = left[i];
for (i=0;i<NB_BANDS;i++) pred[3*NB_BANDS+i] = right[i];
@@ -378,7 +378,7 @@
int double_interp_search(float features[4][NB_TOTAL_FEATURES], const float *mem) {
int i, j;
int best_id=0;
- float min_dist = 1e15;
+ float min_dist = 1e15f;
float dist[2][3];
interp_search(features[0], mem, features[1], dist[0]);
interp_search(features[2], features[1], features[3], dist[1]);
@@ -410,12 +410,12 @@
id1 = best_id % 3;
count = 1;
if (id0 != 1) {
- float t = (id0==0) ? .5 : 1.;
+ float t = (id0==0) ? .5f : 1.f;
for (i=0;i<NB_BANDS;i++) features[1][i] += t*features[0][i];
count += t;
}
if (id1 != 2) {
- float t = (id1==0) ? .5 : 1.;
+ float t = (id1==0) ? .5f : 1.f;
for (i=0;i<NB_BANDS;i++) features[1][i] += t*features[2][i];
count += t;
}
@@ -511,9 +511,9 @@
follow = -2;
for (i=0;i<NB_BANDS;i++) {
Ly[i] = log10(1e-2+Ex[i]);
- Ly[i] = MAX16(logMax-8, MAX16(follow-2.5, Ly[i]));
+ Ly[i] = MAX16(logMax-8, MAX16(follow-2.5f, Ly[i]));
logMax = MAX16(logMax, Ly[i]);
- follow = MAX16(follow-2.5, Ly[i]);
+ follow = MAX16(follow-2.5f, Ly[i]);
E += Ex[i];
}
dct(st->features[st->pcount], Ly);
@@ -529,7 +529,7 @@
sum += st->lpc[j]*st->pitch_mem[j];
RNN_MOVE(st->pitch_mem+1, st->pitch_mem, LPC_ORDER-1);
st->pitch_mem[0] = aligned_in[i];
- st->exc_buf[PITCH_MAX_PERIOD+i] = sum + .7*st->pitch_filt;
+ st->exc_buf[PITCH_MAX_PERIOD+i] = sum + .7f*st->pitch_filt;
st->pitch_filt = sum;
/*printf("%f\n", st->exc_buf[PITCH_MAX_PERIOD+i]);*/
}
@@ -548,7 +548,7 @@
/* Upsample correlation by 3x and keep the max. */
float interpolated[PITCH_MAX_PERIOD]={0};
/* interp=sinc([-3:3]+1/3).*(.5+.5*cos(pi*[-3:3]/4.5)); interp=interp/sum(interp); */
- static const float interp[7] = {0.026184, -0.098339, 0.369938, 0.837891, -0.184969, 0.070242, -0.020947};
+ static const float interp[7] = {0.026184f, -0.098339f, 0.369938f, 0.837891f, -0.184969f, 0.070242f, -0.020947f};
for (i=4;i<PITCH_MAX_PERIOD-4;i++) {
float val1=0, val2=0;
int j;
@@ -582,7 +582,7 @@
float sx=0, sxx=0, sxy=0, sy=0, sw=0;
float frame_corr;
int voiced;
- float frame_weight_sum = 1e-15;
+ float frame_weight_sum = 1e-15f;
float center_pitch;
int main_pitch;
int modulation;
@@ -594,11 +594,11 @@
for(sub=0;sub<8;sub++) frame_weight_sum += st->frame_weight[2+sub];
for(sub=0;sub<8;sub++) st->frame_weight[2+sub] *= (8.f/frame_weight_sum);
for(sub=0;sub<8;sub++) {
- float max_path_all = -1e15;
+ float max_path_all = -1e15f;
best_i = 0;
for (i=0;i<PITCH_MAX_PERIOD-2*PITCH_MIN_PERIOD;i++) {
float xc_half = MAX16(MAX16(st->xc[2+sub][(PITCH_MAX_PERIOD+i)/2], st->xc[2+sub][(PITCH_MAX_PERIOD+i+2)/2]), st->xc[2+sub][(PITCH_MAX_PERIOD+i-1)/2]);
- if (st->xc[2+sub][i] < xc_half*1.1) st->xc[2+sub][i] *= .8;
+ if (st->xc[2+sub][i] < xc_half*1.1f) st->xc[2+sub][i] *= .8f;
}
for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) {
int j;
@@ -666,7 +666,7 @@
/*best_b = (sxx*sy - sx*sxy)/(sw*sxx - sx*sx);*/
best_b = (sy - best_a*sx)/sw;
/* Quantizing the pitch as "main" pitch + slope. */
- center_pitch = best_b+5.5*best_a;
+ center_pitch = best_b+5.5f*best_a;
main_pitch = (int)floor(.5 + 21.*log2(center_pitch/PITCH_MIN_PERIOD));
main_pitch = IMAX(0, IMIN(63, main_pitch));
modulation = (int)floor(.5 + 16*7*best_a/center_pitch);
@@ -677,13 +677,13 @@
for (sub=0;sub<4;sub++) {
if (quantize) {
float p = pow(2.f, main_pitch/21.)*PITCH_MIN_PERIOD;
- p *= 1 + modulation/16./7.*(2*sub-3);
+ p *= 1.f + modulation/16.f/7.f*(2*sub-3);
p = MIN16(255, MAX16(33, p));
- st->features[sub][NB_BANDS] = .02*(p-100);
- st->features[sub][NB_BANDS + 1] = frame_corr-.5;
+ st->features[sub][NB_BANDS] = .02f*(p-100);
+ st->features[sub][NB_BANDS + 1] = frame_corr-.5f;
} else {
- st->features[sub][NB_BANDS] = .01*(IMAX(66, IMIN(510, best[2+2*sub]+best[2+2*sub+1]))-200);
- st->features[sub][NB_BANDS + 1] = frame_corr-.5;
+ st->features[sub][NB_BANDS] = .01f*(IMAX(66, IMIN(510, best[2+2*sub]+best[2+2*sub+1]))-200);
+ st->features[sub][NB_BANDS + 1] = frame_corr-.5f;
}
/*printf("%f %d %f\n", st->features[sub][NB_BANDS], best[2+2*sub], frame_corr);*/
}
@@ -694,7 +694,7 @@
/*printf("%f\n", st->features[3][0]);*/
c0_id = (int)floor(.5 + st->features[3][0]*4);
c0_id = IMAX(-64, IMIN(63, c0_id));
- st->features[3][0] = c0_id/4.;
+ st->features[3][0] = c0_id/4.f;
quantize_3stage_mbest(&st->features[3][1], vq_end);
/*perform_interp_relaxation(st->features, st->vq_mem);*/
quantize_diff(&st->features[1][0], st->vq_mem, &st->features[3][0], ceps_codebook_diff4, 12, 1, &vq_mid);
@@ -736,15 +736,15 @@
int best[10];
int pitch_prev[8][PITCH_MAX_PERIOD];
float frame_corr;
- float frame_weight_sum = 1e-15;
+ float frame_weight_sum = 1e-15f;
for(sub=0;sub<8;sub++) frame_weight_sum += st->frame_weight[2+sub];
for(sub=0;sub<8;sub++) st->frame_weight[2+sub] *= (8.f/frame_weight_sum);
for(sub=0;sub<8;sub++) {
- float max_path_all = -1e15;
+ float max_path_all = -1e15f;
best_i = 0;
for (i=0;i<PITCH_MAX_PERIOD-2*PITCH_MIN_PERIOD;i++) {
float xc_half = MAX16(MAX16(st->xc[2+sub][(PITCH_MAX_PERIOD+i)/2], st->xc[2+sub][(PITCH_MAX_PERIOD+i+2)/2]), st->xc[2+sub][(PITCH_MAX_PERIOD+i-1)/2]);
- if (st->xc[2+sub][i] < xc_half*1.1) st->xc[2+sub][i] *= .8;
+ if (st->xc[2+sub][i] < xc_half*1.1) st->xc[2+sub][i] *= .8f;
}
for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) {
int j;
@@ -781,8 +781,8 @@
}
frame_corr /= 8;
for (sub=0;sub<4;sub++) {
- st->features[sub][NB_BANDS] = .01*(IMAX(66, IMIN(510, best[2+2*sub]+best[2+2*sub+1]))-200);
- st->features[sub][NB_BANDS + 1] = frame_corr-.5;
+ st->features[sub][NB_BANDS] = .01f*(IMAX(66, IMIN(510, best[2+2*sub]+best[2+2*sub+1]))-200);
+ st->features[sub][NB_BANDS + 1] = frame_corr-.5f;
/*printf("%f %d %f\n", st->features[sub][NB_BANDS], best[2+2*sub], frame_corr);*/
}
/*printf("%d %f %f %f\n", best_period, best_a, best_b, best_corr);*/
@@ -804,15 +804,15 @@
int best[4];
int pitch_prev[2][PITCH_MAX_PERIOD];
float frame_corr;
- float frame_weight_sum = 1e-15;
+ float frame_weight_sum = 1e-15f;
for(sub=0;sub<2;sub++) frame_weight_sum += st->frame_weight[2+2*st->pcount+sub];
for(sub=0;sub<2;sub++) st->frame_weight[2+2*st->pcount+sub] *= (2.f/frame_weight_sum);
for(sub=0;sub<2;sub++) {
- float max_path_all = -1e15;
+ float max_path_all = -1e15f;
best_i = 0;
for (i=0;i<PITCH_MAX_PERIOD-2*PITCH_MIN_PERIOD;i++) {
float xc_half = MAX16(MAX16(st->xc[2+2*st->pcount+sub][(PITCH_MAX_PERIOD+i)/2], st->xc[2+2*st->pcount+sub][(PITCH_MAX_PERIOD+i+2)/2]), st->xc[2+2*st->pcount+sub][(PITCH_MAX_PERIOD+i-1)/2]);
- if (st->xc[2+2*st->pcount+sub][i] < xc_half*1.1) st->xc[2+2*st->pcount+sub][i] *= .8;
+ if (st->xc[2+2*st->pcount+sub][i] < xc_half*1.1f) st->xc[2+2*st->pcount+sub][i] *= .8f;
}
for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) {
int j;
@@ -848,8 +848,8 @@
best_i = pitch_prev[sub][best_i];
}
frame_corr /= 2;
- st->features[st->pcount][NB_BANDS] = .01*(IMAX(66, IMIN(510, best[2]+best[3]))-200);
- st->features[st->pcount][NB_BANDS + 1] = frame_corr-.5;
+ st->features[st->pcount][NB_BANDS] = .01f*(IMAX(66, IMIN(510, best[2]+best[3]))-200);
+ st->features[st->pcount][NB_BANDS + 1] = frame_corr-.5f;
if (ffeat) {
fwrite(st->features[st->pcount], sizeof(float), NB_TOTAL_FEATURES, ffeat);
}
--- a/dnn/lpcnet_plc.c
+++ b/dnn/lpcnet_plc.c
@@ -32,11 +32,19 @@
#include "lpcnet.h"
#include "plc_data.h"
+#ifndef M_PI
+#define M_PI 3.141592653
+#endif
+
+/* Comment this out to have LPCNet update its state on every good packet (slow). */
+#define PLC_SKIP_UPDATES
+
LPCNET_EXPORT int lpcnet_plc_get_size() {
return sizeof(LPCNetPLCState);
}
LPCNET_EXPORT int lpcnet_plc_init(LPCNetPLCState *st, int options) {
+ int ret;
RNN_CLEAR(st, 1);
lpcnet_init(&st->lpcnet);
lpcnet_encoder_init(&st->enc);
@@ -60,7 +68,9 @@
return -1;
}
st->remove_dc = !!(options&LPCNET_PLC_DC_FILTER);
- return 0;
+ ret = init_plc_model(&st->model, lpcnet_plc_arrays);
+ celt_assert(ret == 0);
+ return ret;
}
LPCNET_EXPORT LPCNetPLCState *lpcnet_plc_create(int options) {
@@ -75,6 +85,10 @@
}
void lpcnet_plc_fec_add(LPCNetPLCState *st, const float *features) {
+ if (features == NULL) {
+ st->fec_skip++;
+ return;
+ }
if (st->fec_fill_pos == PLC_MAX_FEC) {
if (st->fec_keep_pos == 0) {
fprintf(stderr, "FEC buffer full\n");
@@ -89,28 +103,40 @@
st->fec_fill_pos++;
}
-static void compute_plc_pred(PLCNetState *net, float *out, const float *in) {
+void lpcnet_plc_fec_clear(LPCNetPLCState *st) {
+ st->fec_keep_pos = st->fec_read_pos = st->fec_fill_pos = st->fec_skip = 0;
+}
+
+
+static void compute_plc_pred(LPCNetPLCState *st, float *out, const float *in) {
float zeros[3*PLC_MAX_RNN_NEURONS] = {0};
float dense_out[PLC_DENSE1_OUT_SIZE];
- _lpcnet_compute_dense(&plc_dense1, dense_out, in);
- compute_gruB(&plc_gru1, zeros, net->plc_gru1_state, dense_out);
- compute_gruB(&plc_gru2, zeros, net->plc_gru2_state, net->plc_gru1_state);
- _lpcnet_compute_dense(&plc_out, out, net->plc_gru2_state);
+ PLCNetState *net = &st->plc_net;
+ _lpcnet_compute_dense(&st->model.plc_dense1, dense_out, in);
+ compute_gruB(&st->model.plc_gru1, zeros, net->plc_gru1_state, dense_out);
+ compute_gruB(&st->model.plc_gru2, zeros, net->plc_gru2_state, net->plc_gru1_state);
+ _lpcnet_compute_dense(&st->model.plc_out, out, net->plc_gru2_state);
/* Artificially boost the correlation to make harmonics cleaner. */
out[19] = MIN16(.5f, out[19]+.1f);
}
static int get_fec_or_pred(LPCNetPLCState *st, float *out) {
- if (st->fec_read_pos != st->fec_fill_pos) {
+ if (st->fec_read_pos != st->fec_fill_pos && st->fec_skip==0) {
+ float plc_features[2*NB_BANDS+NB_FEATURES+1] = {0};
+ float discard[NB_FEATURES];
RNN_COPY(out, &st->fec[st->fec_read_pos][0], NB_FEATURES);
st->fec_read_pos++;
/* Make sure we can rewind a few frames back at resync time. */
st->fec_keep_pos = IMAX(0, IMAX(st->fec_keep_pos, st->fec_read_pos-FEATURES_DELAY-1));
- /* FIXME: Figure out how to update compute_plc_pred() without Burg features. */
+ /* Update PLC state using FEC, so without Burg features. */
+ RNN_COPY(&plc_features[2*NB_BANDS], out, NB_FEATURES);
+ plc_features[2*NB_BANDS+NB_FEATURES] = -1;
+ compute_plc_pred(st, discard, plc_features);
return 1;
} else {
float zeros[2*NB_BANDS+NB_FEATURES+1] = {0};
- compute_plc_pred(&st->plc_net, out, zeros);
+ compute_plc_pred(st, out, zeros);
+ if (st->fec_skip > 0) st->fec_skip--;
return 0;
}
}
@@ -119,13 +145,12 @@
st->fec_read_pos -= offset;
if (st->fec_read_pos < st->fec_keep_pos) {
st->fec_read_pos = st->fec_keep_pos;
- fprintf(stderr, "cannot rewind\n");
}
}
void clear_state(LPCNetPLCState *st) {
RNN_CLEAR(st->lpcnet.last_sig, LPC_ORDER);
- st->lpcnet.last_exc = lin2ulaw(0.f);;
+ st->lpcnet.last_exc = lin2ulaw(0.f);
st->lpcnet.deemph_mem = 0;
RNN_CLEAR(st->lpcnet.nnet.gru_a_state, GRU_A_STATE_SIZE);
RNN_CLEAR(st->lpcnet.nnet.gru_b_state, GRU_B_STATE_SIZE);
@@ -163,22 +188,14 @@
float zeros[2*NB_BANDS+NB_FEATURES+1] = {0};
RNN_COPY(zeros, plc_features, 2*NB_BANDS);
zeros[2*NB_BANDS+NB_FEATURES] = 1;
- if (st->fec_active) {
- if (FEATURES_DELAY > 0) st->plc_net = st->plc_copy[FEATURES_DELAY-1];
- fec_rewind(st, FEATURES_DELAY);
- } else {
+ if (st->enable_blending) {
+ LPCNetState copy;
st->plc_net = st->plc_copy[FEATURES_DELAY];
- compute_plc_pred(&st->plc_net, st->features, zeros);
+ compute_plc_pred(st, st->features, zeros);
for (i=0;i<FEATURES_DELAY;i++) {
- float lpc[LPC_ORDER];
- float gru_a_condition[3*GRU_A_STATE_SIZE];
- float gru_b_condition[3*GRU_B_STATE_SIZE];
/* FIXME: backtrack state, replace features. */
- run_frame_network(&st->lpcnet, gru_a_condition, gru_b_condition, lpc, st->features);
+ run_frame_network_deferred(&st->lpcnet, st->features);
}
- }
- if (st->enable_blending) {
- LPCNetState copy;
copy = st->lpcnet;
lpcnet_synthesize_impl(&st->lpcnet, &st->features[0], tmp, FRAME_SIZE-TRAINING_OFFSET, 0);
for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) {
@@ -189,8 +206,14 @@
st->lpcnet = copy;
lpcnet_synthesize_impl(&st->lpcnet, &st->features[0], pcm, FRAME_SIZE-TRAINING_OFFSET, FRAME_SIZE-TRAINING_OFFSET);
} else {+ if (FEATURES_DELAY > 0) st->plc_net = st->plc_copy[FEATURES_DELAY-1];
+ fec_rewind(st, FEATURES_DELAY);
+#ifdef PLC_SKIP_UPDATES
+ lpcnet_reset_signal(&st->lpcnet);
+#else
RNN_COPY(tmp, pcm, FRAME_SIZE-TRAINING_OFFSET);
lpcnet_synthesize_tail_impl(&st->lpcnet, tmp, FRAME_SIZE-TRAINING_OFFSET, FRAME_SIZE-TRAINING_OFFSET);
+#endif
}
RNN_COPY(st->pcm, &pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET);
st->pcm_fill = TRAINING_OFFSET;
@@ -208,24 +231,28 @@
if (!st->blend) {
RNN_COPY(&plc_features[2*NB_BANDS], st->enc.features[0], NB_FEATURES);
plc_features[2*NB_BANDS+NB_FEATURES] = 1;
- compute_plc_pred(&st->plc_net, st->features, plc_features);
+ compute_plc_pred(st, st->features, plc_features);
/* Discard an FEC frame that we know we will no longer need. */
- if (st->fec_read_pos < st->fec_fill_pos) st->fec_read_pos++;
+ if (st->fec_skip) st->fec_skip--;
+ else if (st->fec_read_pos < st->fec_fill_pos) st->fec_read_pos++;
st->fec_keep_pos = IMAX(0, IMAX(st->fec_keep_pos, st->fec_read_pos-FEATURES_DELAY-1));
}
if (st->skip_analysis) {
- if (!st->fec_active) {
- float lpc[LPC_ORDER];
- float gru_a_condition[3*GRU_A_STATE_SIZE];
- float gru_b_condition[3*GRU_B_STATE_SIZE];
+ if (st->enable_blending) {
/* FIXME: backtrack state, replace features. */
- run_frame_network(&st->lpcnet, gru_a_condition, gru_b_condition, lpc, st->enc.features[0]);
+ run_frame_network_deferred(&st->lpcnet, st->enc.features[0]);
}
st->skip_analysis--;
} else {
for (i=0;i<FRAME_SIZE;i++) st->pcm[PLC_BUF_SIZE+i] = pcm[i];
RNN_COPY(output, &st->pcm[0], FRAME_SIZE);
+#ifdef PLC_SKIP_UPDATES
+ {
+ run_frame_network_deferred(&st->lpcnet, st->enc.features[0]);
+ }
+#else
lpcnet_synthesize_impl(&st->lpcnet, st->enc.features[0], output, FRAME_SIZE, FRAME_SIZE);
+#endif
RNN_MOVE(st->pcm, &st->pcm[FRAME_SIZE], PLC_BUF_SIZE);
}
st->loss_count = 0;
@@ -235,7 +262,6 @@
}
}
st->blend = 0;
- st->fec_active = 0;
return 0;
}
@@ -243,6 +269,7 @@
static int lpcnet_plc_conceal_causal(LPCNetPLCState *st, short *pcm) {
int i;
short output[FRAME_SIZE];
+ run_frame_network_flush(&st->lpcnet);
st->enc.pcount = 0;
/* If we concealed the previous frame, finish synthesizing the rest of the samples. */
/* FIXME: Copy/predict features. */
@@ -253,7 +280,7 @@
RNN_COPY(output, &st->pcm[0], update_count);
RNN_MOVE(&st->plc_copy[1], &st->plc_copy[0], FEATURES_DELAY);
st->plc_copy[0] = st->plc_net;
- st->fec_active = get_fec_or_pred(st, st->features);
+ get_fec_or_pred(st, st->features);
lpcnet_synthesize_impl(&st->lpcnet, &st->features[0], output, update_count, update_count);
RNN_MOVE(st->pcm, &st->pcm[FRAME_SIZE], PLC_BUF_SIZE);
st->pcm_fill -= update_count;
@@ -262,10 +289,10 @@
RNN_MOVE(&st->plc_copy[1], &st->plc_copy[0], FEATURES_DELAY);
st->plc_copy[0] = st->plc_net;
lpcnet_synthesize_tail_impl(&st->lpcnet, pcm, FRAME_SIZE-TRAINING_OFFSET, 0);
- st->fec_active = get_fec_or_pred(st, st->features);
+ if (get_fec_or_pred(st, st->features)) st->loss_count = 0;
+ else st->loss_count++;
if (st->loss_count >= 10) st->features[0] = MAX16(-10, st->features[0]+att_table[9] - 2*(st->loss_count-9));
else st->features[0] = MAX16(-10, st->features[0]+att_table[st->loss_count]);
- //if (st->loss_count > 4) st->features[NB_FEATURES-1] = MAX16(-.5, st->features[NB_FEATURES-1]-.1*(st->loss_count-4));
lpcnet_synthesize_impl(&st->lpcnet, &st->features[0], &pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET, 0);
{
float x[FRAME_SIZE];
@@ -275,7 +302,6 @@
compute_frame_features(&st->enc, x);
process_single_frame(&st->enc, NULL);
}
- if (!st->fec_active) st->loss_count++;
st->blend = 1;
if (st->remove_dc) {
for (i=0;i<FRAME_SIZE;i++) {
@@ -330,7 +356,7 @@
float zeros[2*NB_BANDS+NB_FEATURES+1] = {0};
RNN_COPY(zeros, plc_features, 2*NB_BANDS);
zeros[2*NB_BANDS+NB_FEATURES] = 1;
- compute_plc_pred(&st->plc_net, st->features, zeros);
+ compute_plc_pred(st, st->features, zeros);
copy = st->lpcnet;
lpcnet_synthesize_impl(&st->lpcnet, st->features, &st->pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET, 0);
/* Undo initial DC offset removal so that we can take into account the last 5ms of synthesis. */
@@ -383,7 +409,7 @@
if (st->loss_count == 0) {
RNN_COPY(&plc_features[2*NB_BANDS], st->enc.features[0], NB_FEATURES);
plc_features[2*NB_BANDS+NB_FEATURES] = 1;
- compute_plc_pred(&st->plc_net, st->features, plc_features);
+ compute_plc_pred(st, st->features, plc_features);
lpcnet_synthesize_impl(&st->lpcnet, st->enc.features[0], &st->pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET, TRAINING_OFFSET);
lpcnet_synthesize_tail_impl(&st->lpcnet, pcm, FRAME_SIZE-TRAINING_OFFSET, FRAME_SIZE-TRAINING_OFFSET);
}
@@ -406,10 +432,9 @@
process_queued_update(st);
st->enc.pcount = 0;
- compute_plc_pred(&st->plc_net, st->features, zeros);
+ compute_plc_pred(st, st->features, zeros);
if (st->loss_count >= 10) st->features[0] = MAX16(-10, st->features[0]+att_table[9] - 2*(st->loss_count-9));
else st->features[0] = MAX16(-10, st->features[0]+att_table[st->loss_count]);
- //if (st->loss_count > 4) st->features[NB_FEATURES-1] = MAX16(-.5, st->features[NB_FEATURES-1]-.1*(st->loss_count-4));
if (st->loss_count == 0) {
RNN_COPY(pcm, &st->pcm[FRAME_SIZE-TRAINING_OFFSET], TRAINING_OFFSET);
--- a/dnn/lpcnet_private.h
+++ b/dnn/lpcnet_private.h
@@ -23,11 +23,14 @@
#define FORBIDDEN_INTERP 7
#define PLC_MAX_FEC 100
+#define MAX_FEATURE_BUFFER_SIZE 4
struct LPCNetState {
NNetState nnet;
int last_exc;
float last_sig[LPC_ORDER];
+ float feature_buffer[NB_FEATURES*MAX_FEATURE_BUFFER_SIZE];
+ int feature_buffer_fill;
float last_features[NB_FEATURES];
#if FEATURES_DELAY>0
float old_lpc[FEATURES_DELAY][LPC_ORDER];
@@ -39,6 +42,7 @@
float deemph_mem;
float lpc[LPC_ORDER];
kiss99_ctx rng;
+ LPCNetModel model;
};
struct LPCNetDecState {
@@ -76,7 +80,7 @@
int fec_keep_pos;
int fec_read_pos;
int fec_fill_pos;
- int fec_active;
+ int fec_skip;
short pcm[PLC_BUF_SIZE+FRAME_SIZE];
int pcm_fill;
int skip_analysis;
@@ -94,6 +98,7 @@
short dc_buf[TRAINING_OFFSET];
int queued_update;
short queued_samples[FRAME_SIZE];
+ PLCModel model;
};
extern float ceps_codebook1[];
@@ -111,7 +116,12 @@
void decode_packet(float features[4][NB_TOTAL_FEATURES], float *vq_mem, const unsigned char buf[8]);
+void lpcnet_reset_signal(LPCNetState *lpcnet);
void run_frame_network(LPCNetState *lpcnet, float *gru_a_condition, float *gru_b_condition, float *lpc, const float *features);
+void run_frame_network_deferred(LPCNetState *lpcnet, const float *features);
+void run_frame_network_flush(LPCNetState *lpcnet);
+
+
void lpcnet_synthesize_tail_impl(LPCNetState *lpcnet, short *output, int N, int preload);
void lpcnet_synthesize_impl(LPCNetState *lpcnet, const float *features, short *output, int N, int preload);
void lpcnet_synthesize_blend_impl(LPCNetState *lpcnet, const short *pcm_in, short *output, int N);
--- /dev/null
+++ b/dnn/lpcnet_tables.c
@@ -1,0 +1,307 @@
+/* The contents of this file was automatically generated by dump_lpcnet_tables.c*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#include "kiss_fft.h"
+
+static const arch_fft_state arch_fft = {0, NULL};
+
+static const opus_int16 fft_bitrev[320] = {
+0, 64, 128, 192, 256, 16, 80, 144, 208, 272, 32, 96, 160, 224, 288,
+48, 112, 176, 240, 304, 4, 68, 132, 196, 260, 20, 84, 148, 212, 276,
+36, 100, 164, 228, 292, 52, 116, 180, 244, 308, 8, 72, 136, 200, 264,
+24, 88, 152, 216, 280, 40, 104, 168, 232, 296, 56, 120, 184, 248, 312,
+12, 76, 140, 204, 268, 28, 92, 156, 220, 284, 44, 108, 172, 236, 300,
+60, 124, 188, 252, 316, 1, 65, 129, 193, 257, 17, 81, 145, 209, 273,
+33, 97, 161, 225, 289, 49, 113, 177, 241, 305, 5, 69, 133, 197, 261,
+21, 85, 149, 213, 277, 37, 101, 165, 229, 293, 53, 117, 181, 245, 309,
+9, 73, 137, 201, 265, 25, 89, 153, 217, 281, 41, 105, 169, 233, 297,
+57, 121, 185, 249, 313, 13, 77, 141, 205, 269, 29, 93, 157, 221, 285,
+45, 109, 173, 237, 301, 61, 125, 189, 253, 317, 2, 66, 130, 194, 258,
+18, 82, 146, 210, 274, 34, 98, 162, 226, 290, 50, 114, 178, 242, 306,
+6, 70, 134, 198, 262, 22, 86, 150, 214, 278, 38, 102, 166, 230, 294,
+54, 118, 182, 246, 310, 10, 74, 138, 202, 266, 26, 90, 154, 218, 282,
+42, 106, 170, 234, 298, 58, 122, 186, 250, 314, 14, 78, 142, 206, 270,
+30, 94, 158, 222, 286, 46, 110, 174, 238, 302, 62, 126, 190, 254, 318,
+3, 67, 131, 195, 259, 19, 83, 147, 211, 275, 35, 99, 163, 227, 291,
+51, 115, 179, 243, 307, 7, 71, 135, 199, 263, 23, 87, 151, 215, 279,
+39, 103, 167, 231, 295, 55, 119, 183, 247, 311, 11, 75, 139, 203, 267,
+27, 91, 155, 219, 283, 43, 107, 171, 235, 299, 59, 123, 187, 251, 315,
+15, 79, 143, 207, 271, 31, 95, 159, 223, 287, 47, 111, 175, 239, 303,
+63, 127, 191, 255, 319, };
+
+static const kiss_twiddle_cpx fft_twiddles[320] = {+{1.00000000f, -0.00000000f}, {0.999807239f, -0.0196336918f},+{0.999229014f, -0.0392598175f}, {0.998265624f, -0.0588708036f},+{0.996917307f, -0.0784590989f}, {0.995184720f, -0.0980171412f},+{0.993068457f, -0.117537394f}, {0.990569353f, -0.137012348f},+{0.987688363f, -0.156434461f}, {0.984426558f, -0.175796285f},+{0.980785251f, -0.195090324f}, {0.976765871f, -0.214309156f},+{0.972369909f, -0.233445361f}, {0.967599094f, -0.252491564f},+{0.962455213f, -0.271440446f}, {0.956940353f, -0.290284663f},+{0.951056540f, -0.309017003f}, {0.944806039f, -0.327630192f},+{0.938191354f, -0.346117049f}, {0.931214929f, -0.364470512f},+{0.923879504f, -0.382683426f}, {0.916187942f, -0.400748819f},+{0.908143163f, -0.418659747f}, {0.899748266f, -0.436409235f},+{0.891006529f, -0.453990489f}, {0.881921291f, -0.471396744f},+{0.872496009f, -0.488621235f}, {0.862734377f, -0.505657375f},+{0.852640152f, -0.522498548f}, {0.842217207f, -0.539138317f},+{0.831469595f, -0.555570245f}, {0.820401430f, -0.571787953f},+{0.809017003f, -0.587785244f}, {0.797320664f, -0.603555918f},+{0.785316944f, -0.619093955f}, {0.773010433f, -0.634393275f},+{0.760405958f, -0.649448037f}, {0.747508347f, -0.664252460f},+{0.734322488f, -0.678800762f}, {0.720853567f, -0.693087339f},+{0.707106769f, -0.707106769f}, {0.693087339f, -0.720853567f},+{0.678800762f, -0.734322488f}, {0.664252460f, -0.747508347f},+{0.649448037f, -0.760405958f}, {0.634393275f, -0.773010433f},+{0.619093955f, -0.785316944f}, {0.603555918f, -0.797320664f},+{0.587785244f, -0.809017003f}, {0.571787953f, -0.820401430f},+{0.555570245f, -0.831469595f}, {0.539138317f, -0.842217207f},+{0.522498548f, -0.852640152f}, {0.505657375f, -0.862734377f},+{0.488621235f, -0.872496009f}, {0.471396744f, -0.881921291f},+{0.453990489f, -0.891006529f}, {0.436409235f, -0.899748266f},+{0.418659747f, -0.908143163f}, {0.400748819f, -0.916187942f},+{0.382683426f, -0.923879504f}, {0.364470512f, -0.931214929f},+{0.346117049f, -0.938191354f}, {0.327630192f, -0.944806039f},+{0.309017003f, -0.951056540f}, {0.290284663f, -0.956940353f},+{0.271440446f, -0.962455213f}, {0.252491564f, -0.967599094f},+{0.233445361f, -0.972369909f}, {0.214309156f, -0.976765871f},+{0.195090324f, -0.980785251f}, {0.175796285f, -0.984426558f},+{0.156434461f, -0.987688363f}, {0.137012348f, -0.990569353f},+{0.117537394f, -0.993068457f}, {0.0980171412f, -0.995184720f},+{0.0784590989f, -0.996917307f}, {0.0588708036f, -0.998265624f},+{0.0392598175f, -0.999229014f}, {0.0196336918f, -0.999807239f},+{6.12323426e-17f, -1.00000000f}, {-0.0196336918f, -0.999807239f},+{-0.0392598175f, -0.999229014f}, {-0.0588708036f, -0.998265624f},+{-0.0784590989f, -0.996917307f}, {-0.0980171412f, -0.995184720f},+{-0.117537394f, -0.993068457f}, {-0.137012348f, -0.990569353f},+{-0.156434461f, -0.987688363f}, {-0.175796285f, -0.984426558f},+{-0.195090324f, -0.980785251f}, {-0.214309156f, -0.976765871f},+{-0.233445361f, -0.972369909f}, {-0.252491564f, -0.967599094f},+{-0.271440446f, -0.962455213f}, {-0.290284663f, -0.956940353f},+{-0.309017003f, -0.951056540f}, {-0.327630192f, -0.944806039f},+{-0.346117049f, -0.938191354f}, {-0.364470512f, -0.931214929f},+{-0.382683426f, -0.923879504f}, {-0.400748819f, -0.916187942f},+{-0.418659747f, -0.908143163f}, {-0.436409235f, -0.899748266f},+{-0.453990489f, -0.891006529f}, {-0.471396744f, -0.881921291f},+{-0.488621235f, -0.872496009f}, {-0.505657375f, -0.862734377f},+{-0.522498548f, -0.852640152f}, {-0.539138317f, -0.842217207f},+{-0.555570245f, -0.831469595f}, {-0.571787953f, 
-0.820401430f},+{-0.587785244f, -0.809017003f}, {-0.603555918f, -0.797320664f},+{-0.619093955f, -0.785316944f}, {-0.634393275f, -0.773010433f},+{-0.649448037f, -0.760405958f}, {-0.664252460f, -0.747508347f},+{-0.678800762f, -0.734322488f}, {-0.693087339f, -0.720853567f},+{-0.707106769f, -0.707106769f}, {-0.720853567f, -0.693087339f},+{-0.734322488f, -0.678800762f}, {-0.747508347f, -0.664252460f},+{-0.760405958f, -0.649448037f}, {-0.773010433f, -0.634393275f},+{-0.785316944f, -0.619093955f}, {-0.797320664f, -0.603555918f},+{-0.809017003f, -0.587785244f}, {-0.820401430f, -0.571787953f},+{-0.831469595f, -0.555570245f}, {-0.842217207f, -0.539138317f},+{-0.852640152f, -0.522498548f}, {-0.862734377f, -0.505657375f},+{-0.872496009f, -0.488621235f}, {-0.881921291f, -0.471396744f},+{-0.891006529f, -0.453990489f}, {-0.899748266f, -0.436409235f},+{-0.908143163f, -0.418659747f}, {-0.916187942f, -0.400748819f},+{-0.923879504f, -0.382683426f}, {-0.931214929f, -0.364470512f},+{-0.938191354f, -0.346117049f}, {-0.944806039f, -0.327630192f},+{-0.951056540f, -0.309017003f}, {-0.956940353f, -0.290284663f},+{-0.962455213f, -0.271440446f}, {-0.967599094f, -0.252491564f},+{-0.972369909f, -0.233445361f}, {-0.976765871f, -0.214309156f},+{-0.980785251f, -0.195090324f}, {-0.984426558f, -0.175796285f},+{-0.987688363f, -0.156434461f}, {-0.990569353f, -0.137012348f},+{-0.993068457f, -0.117537394f}, {-0.995184720f, -0.0980171412f},+{-0.996917307f, -0.0784590989f}, {-0.998265624f, -0.0588708036f},+{-0.999229014f, -0.0392598175f}, {-0.999807239f, -0.0196336918f},+{-1.00000000f, -1.22464685e-16f}, {-0.999807239f, 0.0196336918f},+{-0.999229014f, 0.0392598175f}, {-0.998265624f, 0.0588708036f},+{-0.996917307f, 0.0784590989f}, {-0.995184720f, 0.0980171412f},+{-0.993068457f, 0.117537394f}, {-0.990569353f, 0.137012348f},+{-0.987688363f, 0.156434461f}, {-0.984426558f, 0.175796285f},+{-0.980785251f, 0.195090324f}, {-0.976765871f, 0.214309156f},+{-0.972369909f, 0.233445361f}, {-0.967599094f, 0.252491564f},+{-0.962455213f, 0.271440446f}, {-0.956940353f, 0.290284663f},+{-0.951056540f, 0.309017003f}, {-0.944806039f, 0.327630192f},+{-0.938191354f, 0.346117049f}, {-0.931214929f, 0.364470512f},+{-0.923879504f, 0.382683426f}, {-0.916187942f, 0.400748819f},+{-0.908143163f, 0.418659747f}, {-0.899748266f, 0.436409235f},+{-0.891006529f, 0.453990489f}, {-0.881921291f, 0.471396744f},+{-0.872496009f, 0.488621235f}, {-0.862734377f, 0.505657375f},+{-0.852640152f, 0.522498548f}, {-0.842217207f, 0.539138317f},+{-0.831469595f, 0.555570245f}, {-0.820401430f, 0.571787953f},+{-0.809017003f, 0.587785244f}, {-0.797320664f, 0.603555918f},+{-0.785316944f, 0.619093955f}, {-0.773010433f, 0.634393275f},+{-0.760405958f, 0.649448037f}, {-0.747508347f, 0.664252460f},+{-0.734322488f, 0.678800762f}, {-0.720853567f, 0.693087339f},+{-0.707106769f, 0.707106769f}, {-0.693087339f, 0.720853567f},+{-0.678800762f, 0.734322488f}, {-0.664252460f, 0.747508347f},+{-0.649448037f, 0.760405958f}, {-0.634393275f, 0.773010433f},+{-0.619093955f, 0.785316944f}, {-0.603555918f, 0.797320664f},+{-0.587785244f, 0.809017003f}, {-0.571787953f, 0.820401430f},+{-0.555570245f, 0.831469595f}, {-0.539138317f, 0.842217207f},+{-0.522498548f, 0.852640152f}, {-0.505657375f, 0.862734377f},+{-0.488621235f, 0.872496009f}, {-0.471396744f, 0.881921291f},+{-0.453990489f, 0.891006529f}, {-0.436409235f, 0.899748266f},+{-0.418659747f, 0.908143163f}, {-0.400748819f, 0.916187942f},+{-0.382683426f, 0.923879504f}, {-0.364470512f, 0.931214929f},+{-0.346117049f, 0.938191354f}, {-0.327630192f, 
0.944806039f},+{-0.309017003f, 0.951056540f}, {-0.290284663f, 0.956940353f},+{-0.271440446f, 0.962455213f}, {-0.252491564f, 0.967599094f},+{-0.233445361f, 0.972369909f}, {-0.214309156f, 0.976765871f},+{-0.195090324f, 0.980785251f}, {-0.175796285f, 0.984426558f},+{-0.156434461f, 0.987688363f}, {-0.137012348f, 0.990569353f},+{-0.117537394f, 0.993068457f}, {-0.0980171412f, 0.995184720f},+{-0.0784590989f, 0.996917307f}, {-0.0588708036f, 0.998265624f},+{-0.0392598175f, 0.999229014f}, {-0.0196336918f, 0.999807239f},+{-1.83697015e-16f, 1.00000000f}, {0.0196336918f, 0.999807239f},+{0.0392598175f, 0.999229014f}, {0.0588708036f, 0.998265624f},+{0.0784590989f, 0.996917307f}, {0.0980171412f, 0.995184720f},+{0.117537394f, 0.993068457f}, {0.137012348f, 0.990569353f},+{0.156434461f, 0.987688363f}, {0.175796285f, 0.984426558f},+{0.195090324f, 0.980785251f}, {0.214309156f, 0.976765871f},+{0.233445361f, 0.972369909f}, {0.252491564f, 0.967599094f},+{0.271440446f, 0.962455213f}, {0.290284663f, 0.956940353f},+{0.309017003f, 0.951056540f}, {0.327630192f, 0.944806039f},+{0.346117049f, 0.938191354f}, {0.364470512f, 0.931214929f},+{0.382683426f, 0.923879504f}, {0.400748819f, 0.916187942f},+{0.418659747f, 0.908143163f}, {0.436409235f, 0.899748266f},+{0.453990489f, 0.891006529f}, {0.471396744f, 0.881921291f},+{0.488621235f, 0.872496009f}, {0.505657375f, 0.862734377f},+{0.522498548f, 0.852640152f}, {0.539138317f, 0.842217207f},+{0.555570245f, 0.831469595f}, {0.571787953f, 0.820401430f},+{0.587785244f, 0.809017003f}, {0.603555918f, 0.797320664f},+{0.619093955f, 0.785316944f}, {0.634393275f, 0.773010433f},+{0.649448037f, 0.760405958f}, {0.664252460f, 0.747508347f},+{0.678800762f, 0.734322488f}, {0.693087339f, 0.720853567f},+{0.707106769f, 0.707106769f}, {0.720853567f, 0.693087339f},+{0.734322488f, 0.678800762f}, {0.747508347f, 0.664252460f},+{0.760405958f, 0.649448037f}, {0.773010433f, 0.634393275f},+{0.785316944f, 0.619093955f}, {0.797320664f, 0.603555918f},+{0.809017003f, 0.587785244f}, {0.820401430f, 0.571787953f},+{0.831469595f, 0.555570245f}, {0.842217207f, 0.539138317f},+{0.852640152f, 0.522498548f}, {0.862734377f, 0.505657375f},+{0.872496009f, 0.488621235f}, {0.881921291f, 0.471396744f},+{0.891006529f, 0.453990489f}, {0.899748266f, 0.436409235f},+{0.908143163f, 0.418659747f}, {0.916187942f, 0.400748819f},+{0.923879504f, 0.382683426f}, {0.931214929f, 0.364470512f},+{0.938191354f, 0.346117049f}, {0.944806039f, 0.327630192f},+{0.951056540f, 0.309017003f}, {0.956940353f, 0.290284663f},+{0.962455213f, 0.271440446f}, {0.967599094f, 0.252491564f},+{0.972369909f, 0.233445361f}, {0.976765871f, 0.214309156f},+{0.980785251f, 0.195090324f}, {0.984426558f, 0.175796285f},+{0.987688363f, 0.156434461f}, {0.990569353f, 0.137012348f},+{0.993068457f, 0.117537394f}, {0.995184720f, 0.0980171412f},+{0.996917307f, 0.0784590989f}, {0.998265624f, 0.0588708036f},+{0.999229014f, 0.0392598175f}, {0.999807239f, 0.0196336918f},+};
+
+const kiss_fft_state kfft = {
+320, /* nfft */
+0.0031250000f, /* scale */
+-1, /* shift */
+{5, 64, 4, 16, 4, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, }, /* factors */
+fft_bitrev, /* bitrev*/
+fft_twiddles, /* twiddles*/
+(arch_fft_state *)&arch_fft, /* arch_fft*/
+};
+
+const float half_window[] = {
+3.78491532e-05f, 0.000340620492f, 0.000946046319f, 0.00185389258f, 0.00306380726f,
+0.00457531959f, 0.00638783723f, 0.00850064680f, 0.0109129101f, 0.0136236614f,
+0.0166318044f, 0.0199361145f, 0.0235352255f, 0.0274276342f, 0.0316116922f,
+0.0360856056f, 0.0408474281f, 0.0458950549f, 0.0512262285f, 0.0568385124f,
+0.0627293140f, 0.0688958541f, 0.0753351897f, 0.0820441842f, 0.0890194997f,
+0.0962576419f, 0.103754878f, 0.111507311f, 0.119510807f, 0.127761051f,
+0.136253506f, 0.144983411f, 0.153945804f, 0.163135484f, 0.172547072f,
+0.182174906f, 0.192013159f, 0.202055752f, 0.212296382f, 0.222728521f,
+0.233345464f, 0.244140238f, 0.255105674f, 0.266234398f, 0.277518868f,
+0.288951218f, 0.300523549f, 0.312227666f, 0.324055225f, 0.335997701f,
+0.348046392f, 0.360192508f, 0.372427016f, 0.384740859f, 0.397124738f,
+0.409569323f, 0.422065198f, 0.434602767f, 0.447172493f, 0.459764689f,
+0.472369671f, 0.484977663f, 0.497579008f, 0.510163903f, 0.522722721f,
+0.535245717f, 0.547723293f, 0.560145974f, 0.572504222f, 0.584788740f,
+0.596990347f, 0.609099925f, 0.621108532f, 0.633007407f, 0.644788086f,
+0.656442165f, 0.667961538f, 0.679338276f, 0.690564752f, 0.701633692f,
+0.712537885f, 0.723270535f, 0.733825266f, 0.744195819f, 0.754376352f,
+0.764361382f, 0.774145722f, 0.783724606f, 0.793093503f, 0.802248418f,
+0.811185598f, 0.819901764f, 0.828393936f, 0.836659551f, 0.844696403f,
+0.852502763f, 0.860077202f, 0.867418647f, 0.874526560f, 0.881400526f,
+0.888040781f, 0.894447744f, 0.900622249f, 0.906565487f, 0.912279010f,
+0.917764664f, 0.923024654f, 0.928061485f, 0.932878017f, 0.937477291f,
+0.941862822f, 0.946038187f, 0.950007319f, 0.953774393f, 0.957343817f,
+0.960720181f, 0.963908315f, 0.966913164f, 0.969739914f, 0.972393870f,
+0.974880517f, 0.977205336f, 0.979374051f, 0.981392324f, 0.983266115f,
+0.985001266f, 0.986603677f, 0.988079309f, 0.989434063f, 0.990674019f,
+0.991804957f, 0.992832899f, 0.993763626f, 0.994602919f, 0.995356441f,
+0.996029854f, 0.996628702f, 0.997158289f, 0.997623861f, 0.998030603f,
+0.998383403f, 0.998687088f, 0.998946249f, 0.999165416f, 0.999348700f,
+0.999500215f, 0.999623775f, 0.999723017f, 0.999801278f, 0.999861658f,
+0.999907196f, 0.999940455f, 0.999963880f, 0.999979615f, 0.999989510f,
+0.999995291f, 0.999998271f, 0.999999523f, 0.999999940f, 1.00000000f,
+};
+
+const float dct_table[] = {
+0.707106769f, 0.996194720f, 0.984807730f, 0.965925813f, 0.939692616f,
+0.906307817f, 0.866025388f, 0.819152057f, 0.766044438f, 0.707106769f,
+0.642787635f, 0.573576450f, 0.500000000f, 0.422618270f, 0.342020154f,
+0.258819044f, 0.173648179f, 0.0871557444f, 0.707106769f, 0.965925813f,
+0.866025388f, 0.707106769f, 0.500000000f, 0.258819044f, 6.12323426e-17f,
+-0.258819044f, -0.500000000f, -0.707106769f, -0.866025388f, -0.965925813f,
+-1.00000000f, -0.965925813f, -0.866025388f, -0.707106769f, -0.500000000f,
+-0.258819044f, 0.707106769f, 0.906307817f, 0.642787635f, 0.258819044f,
+-0.173648179f, -0.573576450f, -0.866025388f, -0.996194720f, -0.939692616f,
+-0.707106769f, -0.342020154f, 0.0871557444f, 0.500000000f, 0.819152057f,
+0.984807730f, 0.965925813f, 0.766044438f, 0.422618270f, 0.707106769f,
+0.819152057f, 0.342020154f, -0.258819044f, -0.766044438f, -0.996194720f,
+-0.866025388f, -0.422618270f, 0.173648179f, 0.707106769f, 0.984807730f,
+0.906307817f, 0.500000000f, -0.0871557444f, -0.642787635f, -0.965925813f,
+-0.939692616f, -0.573576450f, 0.707106769f, 0.707106769f, 6.12323426e-17f,
+-0.707106769f, -1.00000000f, -0.707106769f, -1.83697015e-16f, 0.707106769f,
+1.00000000f, 0.707106769f, 3.06161700e-16f, -0.707106769f, -1.00000000f,
+-0.707106769f, -4.28626385e-16f, 0.707106769f, 1.00000000f, 0.707106769f,
+0.707106769f, 0.573576450f, -0.342020154f, -0.965925813f, -0.766044438f,
+0.0871557444f, 0.866025388f, 0.906307817f, 0.173648179f, -0.707106769f,
+-0.984807730f, -0.422618270f, 0.500000000f, 0.996194720f, 0.642787635f,
+-0.258819044f, -0.939692616f, -0.819152057f, 0.707106769f, 0.422618270f,
+-0.642787635f, -0.965925813f, -0.173648179f, 0.819152057f, 0.866025388f,
+-0.0871557444f, -0.939692616f, -0.707106769f, 0.342020154f, 0.996194720f,
+0.500000000f, -0.573576450f, -0.984807730f, -0.258819044f, 0.766044438f,
+0.906307817f, 0.707106769f, 0.258819044f, -0.866025388f, -0.707106769f,
+0.500000000f, 0.965925813f, 3.06161700e-16f, -0.965925813f, -0.500000000f,
+0.707106769f, 0.866025388f, -0.258819044f, -1.00000000f, -0.258819044f,
+0.866025388f, 0.707106769f, -0.500000000f, -0.965925813f, 0.707106769f,
+0.0871557444f, -0.984807730f, -0.258819044f, 0.939692616f, 0.422618270f,
+-0.866025388f, -0.573576450f, 0.766044438f, 0.707106769f, -0.642787635f,
+-0.819152057f, 0.500000000f, 0.906307817f, -0.342020154f, -0.965925813f,
+0.173648179f, 0.996194720f, 0.707106769f, -0.0871557444f, -0.984807730f,
+0.258819044f, 0.939692616f, -0.422618270f, -0.866025388f, 0.573576450f,
+0.766044438f, -0.707106769f, -0.642787635f, 0.819152057f, 0.500000000f,
+-0.906307817f, -0.342020154f, 0.965925813f, 0.173648179f, -0.996194720f,
+0.707106769f, -0.258819044f, -0.866025388f, 0.707106769f, 0.500000000f,
+-0.965925813f, -4.28626385e-16f, 0.965925813f, -0.500000000f, -0.707106769f,
+0.866025388f, 0.258819044f, -1.00000000f, 0.258819044f, 0.866025388f,
+-0.707106769f, -0.500000000f, 0.965925813f, 0.707106769f, -0.422618270f,
+-0.642787635f, 0.965925813f, -0.173648179f, -0.819152057f, 0.866025388f,
+0.0871557444f, -0.939692616f, 0.707106769f, 0.342020154f, -0.996194720f,
+0.500000000f, 0.573576450f, -0.984807730f, 0.258819044f, 0.766044438f,
+-0.906307817f, 0.707106769f, -0.573576450f, -0.342020154f, 0.965925813f,
+-0.766044438f, -0.0871557444f, 0.866025388f, -0.906307817f, 0.173648179f,
+0.707106769f, -0.984807730f, 0.422618270f, 0.500000000f, -0.996194720f,
+0.642787635f, 0.258819044f, -0.939692616f, 0.819152057f, 0.707106769f,
+-0.707106769f, -1.83697015e-16f, 0.707106769f, -1.00000000f, 0.707106769f,
+5.51091070e-16f, -0.707106769f, 1.00000000f, -0.707106769f, -2.69484189e-15f,
+0.707106769f, -1.00000000f, 0.707106769f, -4.90477710e-16f, -0.707106769f,
+1.00000000f, -0.707106769f, 0.707106769f, -0.819152057f, 0.342020154f,
+0.258819044f, -0.766044438f, 0.996194720f, -0.866025388f, 0.422618270f,
+0.173648179f, -0.707106769f, 0.984807730f, -0.906307817f, 0.500000000f,
+0.0871557444f, -0.642787635f, 0.965925813f, -0.939692616f, 0.573576450f,
+0.707106769f, -0.906307817f, 0.642787635f, -0.258819044f, -0.173648179f,
+0.573576450f, -0.866025388f, 0.996194720f, -0.939692616f, 0.707106769f,
+-0.342020154f, -0.0871557444f, 0.500000000f, -0.819152057f, 0.984807730f,
+-0.965925813f, 0.766044438f, -0.422618270f, 0.707106769f, -0.965925813f,
+0.866025388f, -0.707106769f, 0.500000000f, -0.258819044f, 1.10280111e-15f,
+0.258819044f, -0.500000000f, 0.707106769f, -0.866025388f, 0.965925813f,
+-1.00000000f, 0.965925813f, -0.866025388f, 0.707106769f, -0.500000000f,
+0.258819044f, 0.707106769f, -0.996194720f, 0.984807730f, -0.965925813f,
+0.939692616f, -0.906307817f, 0.866025388f, -0.819152057f, 0.766044438f,
+-0.707106769f, 0.642787635f, -0.573576450f, 0.500000000f, -0.422618270f,
+0.342020154f, -0.258819044f, 0.173648179f, -0.0871557444f, };
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -38,11 +38,16 @@
#include "tansig_table.h"
#include "nnet.h"
#include "nnet_data.h"
+#include "dred_rdovae_constants.h"
#include "plc_data.h"
#ifdef NO_OPTIMIZATIONS
+#if defined(_MSC_VER)
+#pragma message ("Compiling without any vectorization. This code will be very slow")
+#else
#warning Compiling without any vectorization. This code will be very slow
#endif
+#endif
#define SOFTMAX_HACK
@@ -316,7 +321,7 @@
state[i] = h[i];
}
-#define MAX_RNN_NEURONS_ALL IMAX(MAX_RNN_NEURONS, PLC_MAX_RNN_NEURONS)
+#define MAX_RNN_NEURONS_ALL IMAX(IMAX(MAX_RNN_NEURONS, PLC_MAX_RNN_NEURONS), DRED_MAX_RNN_NEURONS)
void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input)
{
@@ -372,8 +377,8 @@
int i;
int N;
int stride;
- float zrh[3*MAX_RNN_NEURONS];
- float recur[3*MAX_RNN_NEURONS];
+ float zrh[3*MAX_RNN_NEURONS_ALL];
+ float recur[3*MAX_RNN_NEURONS_ALL];
float *z;
float *r;
float *h;
@@ -381,7 +386,7 @@
z = zrh;
r = &zrh[N];
h = &zrh[2*N];
- celt_assert(gru->nb_neurons <= MAX_RNN_NEURONS);
+ celt_assert(gru->nb_neurons <= MAX_RNN_NEURONS_ALL);
celt_assert(input != state);
celt_assert(gru->reset_after);
stride = 3*N;
@@ -406,7 +411,7 @@
{
int i, k;
int N;
- float recur[3*MAX_RNN_NEURONS];
+ float recur[3*MAX_RNN_NEURONS_ALL];
float *z;
float *r;
float *h;
@@ -415,7 +420,7 @@
z = recur;
r = &recur[N];
h = &recur[2*N];
- celt_assert(gru->nb_neurons <= MAX_RNN_NEURONS);
+ celt_assert(gru->nb_neurons <= MAX_RNN_NEURONS_ALL);
celt_assert(input != state);
celt_assert(gru->reset_after);
#ifdef USE_SU_BIAS
@@ -442,14 +447,16 @@
state[i] = z[i]*state[i] + (1-z[i])*h[i];
}
+#define MAX_CONV_INPUTS_ALL IMAX(MAX_CONV_INPUTS, DRED_MAX_CONV_INPUTS)
+
void compute_conv1d(const Conv1DLayer *layer, float *output, float *mem, const float *input)
{
int i;
int N, M;
int stride;
- float tmp[MAX_CONV_INPUTS];
+ float tmp[MAX_CONV_INPUTS_ALL];
celt_assert(input != output);
- celt_assert(layer->nb_inputs*layer->kernel_size <= MAX_CONV_INPUTS);
+ celt_assert(layer->nb_inputs*layer->kernel_size <= MAX_CONV_INPUTS_ALL);
RNN_COPY(tmp, mem, layer->nb_inputs*(layer->kernel_size-1));
RNN_COPY(&tmp[layer->nb_inputs*(layer->kernel_size-1)], input, layer->nb_inputs);
M = layer->nb_inputs*layer->kernel_size;
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -38,7 +38,30 @@
#define ACTIVATION_SOFTMAX 4
#define ACTIVATION_SWISH 5
+#define WEIGHT_BLOB_VERSION 0
+#define WEIGHT_BLOCK_SIZE 64
+typedef struct {
+ const char *name;
+ int type;
+ int size;
+ const void *data;
+} WeightArray;
+
+#define WEIGHT_TYPE_float 0
+#define WEIGHT_TYPE_int 1
+#define WEIGHT_TYPE_qweight 2
+
+typedef struct {
+ char head[4];
+ int version;
+ int type;
+ int size;
+ int block_size;
+ char name[44];
+} WeightHead;
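+/* Layout note: each record in a weight blob is one WEIGHT_BLOCK_SIZE-byte
+   WeightHead followed by block_size payload bytes (block_size >= size);
+   see parse_record() in parse_lpcnet_weights.c. */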
+
+
typedef struct {
const float *bias;
const float *input_weights;
int nb_inputs;
@@ -121,5 +144,60 @@
void compute_gru_a_input(float *output, const float *input, int N, const EmbeddingLayer *layer1, int val1, const EmbeddingLayer *layer2, int val2, const EmbeddingLayer *layer3, int val3);
int sample_from_pdf(const float *pdf, int N, float exp_boost, float pdf_floor);
+
+
+extern const WeightArray lpcnet_arrays[];
+extern const WeightArray lpcnet_plc_arrays[];
+
+int mdense_init(MDenseLayer *layer, const WeightArray *arrays,
+ const char *bias,
+ const char *input_weights,
+ const char *factor,
+ int nb_inputs,
+ int nb_neurons,
+ int nb_channels,
+ int activation);
+
+int dense_init(DenseLayer *layer, const WeightArray *arrays,
+ const char *bias,
+ const char *input_weights,
+ int nb_inputs,
+ int nb_neurons,
+ int activation);
+
+int gru_init(GRULayer *layer, const WeightArray *arrays,
+ const char *bias,
+ const char *subias,
+ const char *input_weights,
+ const char *input_weights_idx,
+ const char *recurrent_weights,
+ int nb_inputs,
+ int nb_neurons,
+ int activation,
+ int reset_after);
+
+int sparse_gru_init(SparseGRULayer *layer, const WeightArray *arrays,
+ const char *bias,
+ const char *subias,
+ const char *diag_weights,
+ const char *recurrent_weights,
+ const char *idx,
+ int nb_neurons,
+ int activation,
+ int reset_after);
+
+int conv1d_init(Conv1DLayer *layer, const WeightArray *arrays,
+ const char *bias,
+ const char *input_weights,
+ int nb_inputs,
+ int kernel_size,
+ int nb_neurons,
+ int activation);
+
+int embedding_init(EmbeddingLayer *layer, const WeightArray *arrays,
+ const char *embedding_weights,
+ int nb_inputs,
+ int dim);
+
#endif /* _MLP_H_ */
--- /dev/null
+++ b/dnn/parse_lpcnet_weights.c
@@ -1,0 +1,254 @@
+/* Copyright (c) 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "nnet.h"
+
+#define SPARSE_BLOCK_SIZE 32
+
+extern const WeightArray lpcnet_arrays[];
+
+int parse_record(const unsigned char **data, int *len, WeightArray *array) {
+ WeightHead *h = (WeightHead *)*data;
+ if (*len < WEIGHT_BLOCK_SIZE) return -1;
+ if (h->block_size < h->size) return -1;
+ if (h->block_size > *len-WEIGHT_BLOCK_SIZE) return -1;
+ if (h->name[sizeof(h->name)-1] != 0) return -1;
+ if (h->size < 0) return -1;
+ array->name = h->name;
+ array->type = h->type;
+ array->size = h->size;
+ array->data = (*data)+WEIGHT_BLOCK_SIZE;
+
+ *data += h->block_size+WEIGHT_BLOCK_SIZE;
+ *len -= h->block_size+WEIGHT_BLOCK_SIZE;
+ return array->size;
+}
+
+int parse_weights(WeightArray **list, const unsigned char *data, int len)
+{
+ int nb_arrays=0;
+ int capacity=20;
+ *list = malloc(capacity*sizeof(WeightArray));
+ while (len > 0) {
+ int ret;
+ WeightArray array = {NULL, 0, 0, 0};
+ ret = parse_record(&data, &len, &array);
+ if (ret > 0) {
+ if (nb_arrays+1 >= capacity) {
+ /* Make sure there's room for the ending NULL element too. */
+ capacity = capacity*3/2;
+ *list = realloc(*list, capacity*sizeof(WeightArray));
+ }
+ (*list)[nb_arrays++] = array;
+ }
+ }
+ (*list)[nb_arrays].name=NULL;
+ return nb_arrays;
+}
+
+static const void *find_array_entry(const WeightArray *arrays, const char *name) {
+ while (arrays->name && strcmp(arrays->name, name) != 0) arrays++;
+ return arrays;
+}
+
+static const void *find_array_check(const WeightArray *arrays, const char *name, int size) {
+ const WeightArray *a = find_array_entry(arrays, name);
+ if (a && a->size == size) return a->data;
+ else return NULL;
+}
+
+static const void *find_idx_check(const WeightArray *arrays, const char *name, int nb_in, int nb_out, int *total_blocks) {
+ int remain;
+ const int *idx;
+ const WeightArray *a = find_array_entry(arrays, name);
+ *total_blocks = 0;
+ if (a == NULL) return NULL;
+ idx = a->data;
+ remain = a->size/sizeof(int);
+ while (remain > 0) {
+ int nb_blocks;
+ int i;
+ nb_blocks = *idx++;
+ if (remain < nb_blocks+1) return NULL;
+ for (i=0;i<nb_blocks;i++) {
+ int pos = *idx++;
+ if (pos+3 >= nb_in || (pos&0x3)) return NULL;
+ }
+ nb_out -= 8;
+ remain -= nb_blocks+1;
+ *total_blocks += nb_blocks;
+ }
+ if (nb_out != 0) return NULL;
+ return a->data;
+}
+
+int mdense_init(MDenseLayer *layer, const WeightArray *arrays,
+ const char *bias,
+ const char *input_weights,
+ const char *factor,
+ int nb_inputs,
+ int nb_neurons,
+ int nb_channels,
+ int activation)
+{
+ if ((layer->bias = find_array_check(arrays, bias, nb_neurons*nb_channels*sizeof(layer->bias[0]))) == NULL) return 1;
+ if ((layer->input_weights = find_array_check(arrays, input_weights, nb_inputs*nb_channels*nb_neurons*sizeof(layer->input_weights[0]))) == NULL) return 1;
+ if ((layer->factor = find_array_check(arrays, factor, nb_channels*nb_neurons*sizeof(layer->factor[0]))) == NULL) return 1;
+ layer->nb_inputs = nb_inputs;
+ layer->nb_neurons = nb_neurons;
+ layer->nb_channels = nb_channels;
+ layer->activation = activation;
+ return 0;
+}
+
+int dense_init(DenseLayer *layer, const WeightArray *arrays,
+ const char *bias,
+ const char *input_weights,
+ int nb_inputs,
+ int nb_neurons,
+ int activation)
+{
+ if ((layer->bias = find_array_check(arrays, bias, nb_neurons*sizeof(layer->bias[0]))) == NULL) return 1;
+ if ((layer->input_weights = find_array_check(arrays, input_weights, nb_inputs*nb_neurons*sizeof(layer->input_weights[0]))) == NULL) return 1;
+ layer->nb_inputs = nb_inputs;
+ layer->nb_neurons = nb_neurons;
+ layer->activation = activation;
+ return 0;
+}
+
+int gru_init(GRULayer *layer, const WeightArray *arrays,
+ const char *bias,
+ const char *subias,
+ const char *input_weights,
+ const char *input_weights_idx,
+ const char *recurrent_weights,
+ int nb_inputs,
+ int nb_neurons,
+ int activation,
+ int reset_after)
+{
+ int total_blocks;
+ if ((layer->bias = find_array_check(arrays, bias, 6*nb_neurons*sizeof(layer->bias[0]))) == NULL) return 1;
+ if ((layer->subias = find_array_check(arrays, subias, 6*nb_neurons*sizeof(layer->subias[0]))) == NULL) return 1;
+ if ((layer->input_weights_idx = find_idx_check(arrays, input_weights_idx, nb_inputs, 3*nb_neurons, &total_blocks)) == NULL) return 1;
+ if ((layer->input_weights = find_array_check(arrays, input_weights, SPARSE_BLOCK_SIZE*total_blocks*sizeof(layer->input_weights[0]))) == NULL) return 1;
+ if ((layer->recurrent_weights = find_array_check(arrays, recurrent_weights, 3*nb_neurons*nb_neurons*sizeof(layer->recurrent_weights[0]))) == NULL) return 1;
+ layer->nb_inputs = nb_inputs;
+ layer->nb_neurons = nb_neurons;
+ layer->activation = activation;
+ layer->reset_after = reset_after;
+ return 0;
+}
+
+int sparse_gru_init(SparseGRULayer *layer, const WeightArray *arrays,
+ const char *bias,
+ const char *subias,
+ const char *diag_weights,
+ const char *recurrent_weights,
+ const char *idx,
+ int nb_neurons,
+ int activation,
+ int reset_after)
+{
+ int total_blocks;
+ if ((layer->bias = find_array_check(arrays, bias, 6*nb_neurons*sizeof(layer->bias[0]))) == NULL) return 1;
+ if ((layer->subias = find_array_check(arrays, subias, 6*nb_neurons*sizeof(layer->subias[0]))) == NULL) return 1;
+ if ((layer->diag_weights = find_array_check(arrays, diag_weights, 3*nb_neurons*sizeof(layer->diag_weights[0]))) == NULL) return 1;
+ if ((layer->idx = find_idx_check(arrays, idx, nb_neurons, 3*nb_neurons, &total_blocks)) == NULL) return 1;
+ if ((layer->recurrent_weights = find_array_check(arrays, recurrent_weights, SPARSE_BLOCK_SIZE*total_blocks*sizeof(layer->recurrent_weights[0]))) == NULL) return 1;
+ layer->nb_neurons = nb_neurons;
+ layer->activation = activation;
+ layer->reset_after = reset_after;
+ return 0;
+}
+
+int conv1d_init(Conv1DLayer *layer, const WeightArray *arrays,
+ const char *bias,
+ const char *input_weights,
+ int nb_inputs,
+ int kernel_size,
+ int nb_neurons,
+ int activation)
+{
+ if ((layer->bias = find_array_check(arrays, bias, nb_neurons*sizeof(layer->bias[0]))) == NULL) return 1;
+ if ((layer->input_weights = find_array_check(arrays, input_weights, kernel_size*nb_inputs*nb_neurons*sizeof(layer->input_weights[0]))) == NULL) return 1;
+ layer->nb_inputs = nb_inputs;
+ layer->kernel_size = kernel_size;
+ layer->nb_neurons = nb_neurons;
+ layer->activation = activation;
+ return 0;
+}
+
+int embedding_init(EmbeddingLayer *layer, const WeightArray *arrays,
+ const char *embedding_weights,
+ int nb_inputs,
+ int dim)
+{
+ if ((layer->embedding_weights = find_array_check(arrays, embedding_weights, nb_inputs*dim*sizeof(layer->embedding_weights[0]))) == NULL) return 1;
+ layer->nb_inputs = nb_inputs;
+ layer->dim = dim;
+ return 0;
+}
+
+
+
+#if 0
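+/* Standalone sanity check: mmap "weights_blob.bin" and print the name and
+   size of every array that parse_weights() finds in it. */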
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <stdio.h>
+
+int main()
+{
+ int fd;
+ unsigned char *data;
+ int len;
+ int nb_arrays;
+ int i;
+ WeightArray *list;
+ struct stat st;
+ const char *filename = "weights_blob.bin";
+ stat(filename, &st);
+ len = st.st_size;
+ fd = open(filename, O_RDONLY);
+ data = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+ printf("size is %d\n", len);+ nb_arrays = parse_weights(&list, data, len);
+ for (i=0;i<nb_arrays;i++) {
+ printf("found %s: size %d\n", list[i].name, list[i].size);
+ }
+ printf("%p\n", list[i].name);+ free(list);
+ munmap(data, len);
+ close(fd);
+ return 0;
+}
+#endif
--- a/dnn/test_vec.c
+++ b/dnn/test_vec.c
@@ -10,7 +10,7 @@
// we need to call two versions of each functions that have the same
// name, so use #defines to temp rename them
-#define celt_exp2 celt_exp2_fast
+#define lpcnet_exp2 lpcnet_exp2_fast
#define tansig_approx tansig_approx_fast
#define sigmoid_approx sigmoid_approx_fast
#define softmax softmax_fast
@@ -34,7 +34,7 @@
#endif
-#undef celt_exp2
+#undef lpcnet_exp2
#undef tansig_approx
#undef sigmoid_approx
#undef softmax
--- /dev/null
+++ b/dnn/torch/rdovae/README.md
@@ -1,0 +1,24 @@
+# Rate-Distortion-Optimized Variational Auto-Encoder
+
+## Setup
+The Python code requires Python >= 3.6 and has been tested with Python 3.6 and Python 3.10. To install the requirements, run
+```
+python -m pip install -r requirements.txt
+```
+
+## Training
+To generate training data, use `dump_data` from the main LPCNet repo:
+```
+./dump_data -train 16khz_speech_input.s16 features.f32 data.s16
+```
+
+To train the model, simply run
+```
+python train_rdovae.py features.f32 output_folder
+```
+
+To train on a CUDA device, add `--cuda-visible-devices idx`.
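+
+For example, to train on the first GPU (the device index `0` below is only an illustrative value):
+```
+python train_rdovae.py features.f32 output_folder --cuda-visible-devices 0
+```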
+
+
+## ToDo
+- Upload checkpoints and add URLs
--- /dev/null
+++ b/dnn/torch/rdovae/export_rdovae_weights.py
@@ -1,0 +1,258 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+import os
+import argparse
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('checkpoint', type=str, help='rdovae model checkpoint')
+parser.add_argument('output_dir', type=str, help='output folder')
+parser.add_argument('--format', choices=['C', 'numpy'], help='output format, default: C', default='C')
+
+args = parser.parse_args()
+
+import torch
+import numpy as np
+
+from rdovae import RDOVAE
+from wexchange.torch import dump_torch_weights
+from wexchange.c_export import CWriter, print_vector
+
+
+def dump_statistical_model(writer, qembedding):
+ w = qembedding.weight.detach()
+ levels, dim = w.shape
+ N = dim // 6
+
+ print("printing statistical model")+ quant_scales = torch.nn.functional.softplus(w[:, : N]).numpy()
+ dead_zone = 0.05 * torch.nn.functional.softplus(w[:, N : 2 * N]).numpy()
+ r = torch.sigmoid(w[:, 5 * N : 6 * N]).numpy()
+ p0 = torch.sigmoid(w[:, 4 * N : 5 * N]).numpy()
+ p0 = 1 - r ** (0.5 + 0.5 * p0)
+
+ quant_scales_q8 = np.round(quant_scales * 2**8).astype(np.uint16)
+ dead_zone_q10 = np.round(dead_zone * 2**10).astype(np.uint16)
+ r_q15 = np.round(r * 2**15).astype(np.uint16)
+ p0_q15 = np.round(p0 * 2**15).astype(np.uint16)
+
+ print_vector(writer.source, quant_scales_q8, 'dred_quant_scales_q8', dtype='opus_uint16', static=False)
+ print_vector(writer.source, dead_zone_q10, 'dred_dead_zone_q10', dtype='opus_uint16', static=False)
+ print_vector(writer.source, r_q15, 'dred_r_q15', dtype='opus_uint16', static=False)
+ print_vector(writer.source, p0_q15, 'dred_p0_q15', dtype='opus_uint16', static=False)
+
+ writer.header.write(
+f"""
+extern const opus_uint16 dred_quant_scales_q8[{levels * N}];
+extern const opus_uint16 dred_dead_zone_q10[{levels * N}];
+extern const opus_uint16 dred_r_q15[{levels * N}];
+extern const opus_uint16 dred_p0_q15[{levels * N}];
+
+"""
+ )
+
+
+def c_export(args, model):
+
+ message = f"Auto generated from checkpoint {os.path.basename(args.checkpoint)}"+
+ enc_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_enc_data"), message=message)
+ dec_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_dec_data"), message=message)
+ stats_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_stats_data"), message=message)
+ constants_writer = CWriter(os.path.join(args.output_dir, "dred_rdovae_constants"), message=message, header_only=True)
+
+ # some custom includes
+ for writer in [enc_writer, dec_writer, stats_writer]:
+ writer.header.write(
+f"""
+#include "opus_types.h"
+
+#include "dred_rdovae_constants.h"
+
+#include "nnet.h"
+"""
+ )
+
+ # encoder
+ encoder_dense_layers = [
+ ('core_encoder.module.dense_1' , 'enc_dense1', 'TANH'),
+ ('core_encoder.module.dense_2' , 'enc_dense3', 'TANH'),
+ ('core_encoder.module.dense_3' , 'enc_dense5', 'TANH'),
+ ('core_encoder.module.dense_4' , 'enc_dense7', 'TANH'),
+ ('core_encoder.module.dense_5' , 'enc_dense8', 'TANH'),
+ ('core_encoder.module.state_dense_1' , 'gdense1' , 'TANH'),
+ ('core_encoder.module.state_dense_2' , 'gdense2' , 'TANH')
+ ]
+
+ for name, export_name, activation in encoder_dense_layers:
+ layer = model.get_submodule(name)
+ dump_torch_weights(enc_writer, layer, name=export_name, activation=activation, verbose=True)
+
+
+ encoder_gru_layers = [
+ ('core_encoder.module.gru_1' , 'enc_dense2', 'TANH'),
+ ('core_encoder.module.gru_2' , 'enc_dense4', 'TANH'),
+ ('core_encoder.module.gru_3' , 'enc_dense6', 'TANH')
+ ]
+
+ enc_max_rnn_units = max([dump_torch_weights(enc_writer, model.get_submodule(name), export_name, activation, verbose=True, input_sparse=True, dotp=True)
+ for name, export_name, activation in encoder_gru_layers])
+
+
+ encoder_conv_layers = [
+ ('core_encoder.module.conv1' , 'bits_dense' , 'LINEAR')
+ ]
+
+ enc_max_conv_inputs = max([dump_torch_weights(enc_writer, model.get_submodule(name), export_name, activation, verbose=True) for name, export_name, activation in encoder_conv_layers])
+
+
+ del enc_writer
+
+ # decoder
+ decoder_dense_layers = [
+ ('core_decoder.module.gru_1_init' , 'state1', 'TANH'),
+ ('core_decoder.module.gru_2_init' , 'state2', 'TANH'),
+ ('core_decoder.module.gru_3_init' , 'state3', 'TANH'),
+ ('core_decoder.module.dense_1' , 'dec_dense1', 'TANH'),
+ ('core_decoder.module.dense_2' , 'dec_dense3', 'TANH'),
+ ('core_decoder.module.dense_3' , 'dec_dense5', 'TANH'),
+ ('core_decoder.module.dense_4' , 'dec_dense7', 'TANH'),
+ ('core_decoder.module.dense_5' , 'dec_dense8', 'TANH'),
+ ('core_decoder.module.output' , 'dec_final', 'LINEAR')
+ ]
+
+ for name, export_name, activation in decoder_dense_layers:
+ layer = model.get_submodule(name)
+ dump_torch_weights(dec_writer, layer, name=export_name, activation=activation, verbose=True)
+
+
+ decoder_gru_layers = [
+ ('core_decoder.module.gru_1' , 'dec_dense2', 'TANH'),
+ ('core_decoder.module.gru_2' , 'dec_dense4', 'TANH'),
+ ('core_decoder.module.gru_3' , 'dec_dense6', 'TANH')
+ ]
+
+ dec_max_rnn_units = max([dump_torch_weights(dec_writer, model.get_submodule(name), export_name, activation, verbose=True, input_sparse=True, dotp=True)
+ for name, export_name, activation in decoder_gru_layers])
+
+ del dec_writer
+
+ # statistical model
+ qembedding = model.statistical_model.quant_embedding
+ dump_statistical_model(stats_writer, qembedding)
+
+ del stats_writer
+
+ # constants
+ constants_writer.header.write(
+f"""
+#define DRED_NUM_FEATURES {model.feature_dim}
+
+#define DRED_LATENT_DIM {model.latent_dim}
+
+#define DRED_STATE_DIME {model.state_dim}
+
+#define DRED_NUM_QUANTIZATION_LEVELS {model.quant_levels}
+
+#define DRED_MAX_RNN_NEURONS {max(enc_max_rnn_units, dec_max_rnn_units)}
+
+#define DRED_MAX_CONV_INPUTS {enc_max_conv_inputs}
+
+#define DRED_ENC_MAX_RNN_NEURONS {enc_max_conv_inputs}
+
+#define DRED_ENC_MAX_CONV_INPUTS {enc_max_conv_inputs}
+
+#define DRED_DEC_MAX_RNN_NEURONS {dec_max_rnn_units}
+
+"""
+ )
+
+ del constants_writer
+
+
+def numpy_export(args, model):
+
+ exchange_name_to_name = {
+ 'encoder_stack_layer1_dense' : 'core_encoder.module.dense_1',
+ 'encoder_stack_layer3_dense' : 'core_encoder.module.dense_2',
+ 'encoder_stack_layer5_dense' : 'core_encoder.module.dense_3',
+ 'encoder_stack_layer7_dense' : 'core_encoder.module.dense_4',
+ 'encoder_stack_layer8_dense' : 'core_encoder.module.dense_5',
+ 'encoder_state_layer1_dense' : 'core_encoder.module.state_dense_1',
+ 'encoder_state_layer2_dense' : 'core_encoder.module.state_dense_2',
+ 'encoder_stack_layer2_gru' : 'core_encoder.module.gru_1',
+ 'encoder_stack_layer4_gru' : 'core_encoder.module.gru_2',
+ 'encoder_stack_layer6_gru' : 'core_encoder.module.gru_3',
+ 'encoder_stack_layer9_conv' : 'core_encoder.module.conv1',
+ 'statistical_model_embedding' : 'statistical_model.quant_embedding',
+ 'decoder_state1_dense' : 'core_decoder.module.gru_1_init',
+ 'decoder_state2_dense' : 'core_decoder.module.gru_2_init',
+ 'decoder_state3_dense' : 'core_decoder.module.gru_3_init',
+ 'decoder_stack_layer1_dense' : 'core_decoder.module.dense_1',
+ 'decoder_stack_layer3_dense' : 'core_decoder.module.dense_2',
+ 'decoder_stack_layer5_dense' : 'core_decoder.module.dense_3',
+ 'decoder_stack_layer7_dense' : 'core_decoder.module.dense_4',
+ 'decoder_stack_layer8_dense' : 'core_decoder.module.dense_5',
+ 'decoder_stack_layer9_dense' : 'core_decoder.module.output',
+ 'decoder_stack_layer2_gru' : 'core_decoder.module.gru_1',
+ 'decoder_stack_layer4_gru' : 'core_decoder.module.gru_2',
+ 'decoder_stack_layer6_gru' : 'core_decoder.module.gru_3'
+ }
+
+ name_to_exchange_name = {value : key for key, value in exchange_name_to_name.items()}
+
+ for name, exchange_name in name_to_exchange_name.items():
+ print(f"printing layer {name}...")+ dump_torch_weights(os.path.join(args.output_dir, exchange_name), model.get_submodule(name))
+
+
+if __name__ == "__main__":
+
+
+ os.makedirs(args.output_dir, exist_ok=True)
+
+
+ # load model from checkpoint
+ checkpoint = torch.load(args.checkpoint, map_location='cpu')
+ model = RDOVAE(*checkpoint['model_args'], **checkpoint['model_kwargs'])
+ missing_keys, unmatched_keys = model.load_state_dict(checkpoint['state_dict'], strict=False)
+
+ if len(missing_keys) > 0:
+ raise ValueError(f"error: missing keys in state dict")
+
+ if len(unmatched_keys) > 0:
+ print(f"warning: the following keys were unmatched {unmatched_keys}")+
+ if args.format == 'C':
+ c_export(args, model)
+ elif args.format == 'numpy':
+ numpy_export(args, model)
+ else:
+ raise ValueError(f'error: unknown export format {args.format}')
\ No newline at end of file
--- /dev/null
+++ b/dnn/torch/rdovae/fec_encoder.py
@@ -1,0 +1,213 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe and Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+import os
+import subprocess
+import argparse
+
+os.environ['CUDA_VISIBLE_DEVICES'] = ""
+
+parser = argparse.ArgumentParser(description='Encode redundancy for Opus neural FEC. Designed for use with voip application and 20ms frames')
+
+parser.add_argument('input', metavar='<input signal>', help='audio input (.wav or .raw or .pcm as int16)')
+parser.add_argument('checkpoint', metavar='<weights>', help='model checkpoint')
+parser.add_argument('q0', metavar='<quant level 0>', type=int, help='quantization level for most recent frame')
+parser.add_argument('q1', metavar='<quant level 1>', type=int, help='quantization level for oldest frame')
+parser.add_argument('output', type=str, help='output file (will be extended with .fec)')
+
+parser.add_argument('--dump-data', type=str, default='./dump_data', help='path to dump data executable (default ./dump_data)')
+parser.add_argument('--num-redundancy-frames', default=52, type=int, help='number of redundancy frames per packet (default 52)')
+parser.add_argument('--extra-delay', default=0, type=int, help="last features in packet are calculated with the decoder aligned samples, use this option to add extra delay (in samples at 16kHz)")
+parser.add_argument('--lossfile', type=str, help='file containing loss trace (0 for frame received, 1 for lost)')
+parser.add_argument('--debug-output', action='store_true', help='if set, differently assembled features are written to disk')
+
+args = parser.parse_args()
+
+import numpy as np
+from scipy.io import wavfile
+import torch
+
+from rdovae import RDOVAE
+from packets import write_fec_packets
+
+torch.set_num_threads(4)
+
+checkpoint = torch.load(args.checkpoint, map_location="cpu")
+model = RDOVAE(*checkpoint['model_args'], **checkpoint['model_kwargs'])
+model.load_state_dict(checkpoint['state_dict'], strict=False)
+model.to("cpu")+
+lpc_order = 16
+
+## prepare input signal
+# SILK frame size is 20ms and LPCNet subframes are 10ms
+subframe_size = 160
+frame_size = 2 * subframe_size
+
+# 91 samples delay to align with SILK decoded frames
+silk_delay = 91
+
+# prepend zeros to have enough history to produce the first package
+zero_history = (args.num_redundancy_frames - 1) * frame_size
+
+# dump data has a (feature) delay of 10ms
+dump_data_delay = 160
+
+total_delay = silk_delay + zero_history + args.extra_delay - dump_data_delay
+
+# load signal
+if args.input.endswith('.raw') or args.input.endswith('.pcm'):
+ signal = np.fromfile(args.input, dtype='int16')
+
+elif args.input.endswith('.wav'):
+ fs, signal = wavfile.read(args.input)
+else:
+ raise ValueError(f'unknown input signal format: {args.input}')
+
+# fill up last frame with zeros
+padded_signal_length = len(signal) + total_delay
+tail = padded_signal_length % frame_size
+right_padding = (frame_size - tail) % frame_size
+
+signal = np.concatenate((np.zeros(total_delay, dtype=np.int16), signal, np.zeros(right_padding, dtype=np.int16)))
+
+padded_signal_file = os.path.splitext(args.input)[0] + '_padded.raw'
+signal.tofile(padded_signal_file)
+
+# write signal and call dump_data to create features
+
+feature_file = os.path.splitext(args.input)[0] + '_features.f32'
+command = f"{args.dump_data} -test {padded_signal_file} {feature_file}"+r = subprocess.run(command, shell=True)
+if r.returncode != 0:
+ raise RuntimeError(f"command '{command}' failed with exit code {r.returncode}")
+
+# load features
+nb_features = model.feature_dim + lpc_order
+nb_used_features = model.feature_dim
+
+# load features
+features = np.fromfile(feature_file, dtype='float32')
+num_subframes = len(features) // nb_features
+num_subframes = 2 * (num_subframes // 2)
+num_frames = num_subframes // 2
+
+features = np.reshape(features, (1, -1, nb_features))
+features = features[:, :, :nb_used_features]
+features = features[:, :num_subframes, :]
+
+# quant_ids in reverse decoding order
+quant_ids = torch.round((args.q1 + (args.q0 - args.q1) * torch.arange(args.num_redundancy_frames // 2) / (args.num_redundancy_frames // 2 - 1))).long()
+
+print(f"using quantization levels {quant_ids}...")+
+# convert input to torch tensors
+features = torch.from_numpy(features)
+
+
+# run encoder
+print("running fec encoder...")+with torch.no_grad():
+
+ # encoding
+ z, states, state_size = model.encode(features)
+
+
+ # decoder on packet chunks
+ input_length = args.num_redundancy_frames // 2
+ offset = args.num_redundancy_frames - 1
+
+ packets = []
+ packet_sizes = []
+
+ for i in range(offset, num_frames):
+ print(f"processing frame {i - offset}...")+ # quantize / unquantize latent vectors
+ zi = torch.clone(z[:, i - 2 * input_length + 2: i + 1 : 2, :])
+ zi, rates = model.quantize(zi, quant_ids)
+ zi = model.unquantize(zi, quant_ids)
+
+ features = model.decode(zi, states[:, i : i + 1, :])
+ packets.append(features.squeeze(0).numpy())
+ packet_size = 8 * int((torch.sum(rates) + 7 + state_size) / 8)
+ packet_sizes.append(packet_size)
+
+
+# write packets
+packet_file = args.output + '.fec' if not args.output.endswith('.fec') else args.output
+write_fec_packets(packet_file, packets, packet_sizes)
+
+
+print(f"average redundancy rate: {int(round(sum(packet_sizes) / len(packet_sizes) * 50 / 1000))} kbps")+
+# assemble features according to loss file
+if args.lossfile != None:
+ num_packets = len(packets)
+ loss = np.loadtxt(args.lossfile, dtype='int16')
+ fec_out = np.zeros((num_packets * 2, packets[0].shape[-1]), dtype='float32')
+ foffset = -2
+ ptr = 0
+ count = 2
+ for i in range(num_packets):
+ if (loss[i] == 0) or (i == num_packets - 1):
+
+ fec_out[ptr:ptr+count,:] = packets[i][foffset:, :]
+
+ ptr += count
+ foffset = -2
+ count = 2
+ else:
+ count += 2
+ foffset -= 2
+
+ fec_out_full = np.zeros((fec_out.shape[0], 36), dtype=np.float32)
+ fec_out_full[:, : fec_out.shape[-1]] = fec_out
+
+ fec_out_full.tofile(packet_file[:-4] + f'_fec.f32')
+
+
+if args.debug_output:
+ import itertools
+
+ batches = [4]
+ offsets = [0, 2 * args.num_redundancy_frames - 4]
+
+ # sanity checks
+ # 1. concatenate features at offset 0
+ for batch, offset in itertools.product(batches, offsets):
+
+ stop = packets[0].shape[1] - offset
+ test_features = np.concatenate([packet[stop - batch: stop, :] for packet in packets[::batch//2]], axis=0)
+
+ test_features_full = np.zeros((test_features.shape[0], nb_features), dtype=np.float32)
+ test_features_full[:, :nb_used_features] = test_features[:, :]
+
+ print(f"writing debug output {packet_file[:-4] + f'_torch_batch{batch}_offset{offset}.f32'}")+ test_features_full.tofile(packet_file[:-4] + f'_torch_batch{batch}_offset{offset}.f32')+
--- /dev/null
+++ b/dnn/torch/rdovae/import_rdovae_weights.py
@@ -1,0 +1,143 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = ""
+
+import argparse
+
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('exchange_folder', type=str, help='exchange folder path')
+parser.add_argument('output', type=str, help='path to output model checkpoint')
+
+model_group = parser.add_argument_group(title="model parameters")
+model_group.add_argument('--num-features', type=int, help="number of features, default: 20", default=20)
+model_group.add_argument('--latent-dim', type=int, help="number of symbols produced by encoder, default: 80", default=80)
+model_group.add_argument('--cond-size', type=int, help="first conditioning size, default: 256", default=256)
+model_group.add_argument('--cond-size2', type=int, help="second conditioning size, default: 256", default=256)
+model_group.add_argument('--state-dim', type=int, help="dimensionality of transferred state, default: 24", default=24)
+model_group.add_argument('--quant-levels', type=int, help="number of quantization levels, default: 40", default=40)
+
+args = parser.parse_args()
+
+import torch
+from rdovae import RDOVAE
+from wexchange.torch import load_torch_weights
+
+exchange_name_to_name = {
+    'encoder_stack_layer1_dense' : 'core_encoder.module.dense_1',
+ 'encoder_stack_layer3_dense' : 'core_encoder.module.dense_2',
+ 'encoder_stack_layer5_dense' : 'core_encoder.module.dense_3',
+ 'encoder_stack_layer7_dense' : 'core_encoder.module.dense_4',
+ 'encoder_stack_layer8_dense' : 'core_encoder.module.dense_5',
+ 'encoder_state_layer1_dense' : 'core_encoder.module.state_dense_1',
+ 'encoder_state_layer2_dense' : 'core_encoder.module.state_dense_2',
+ 'encoder_stack_layer2_gru' : 'core_encoder.module.gru_1',
+ 'encoder_stack_layer4_gru' : 'core_encoder.module.gru_2',
+ 'encoder_stack_layer6_gru' : 'core_encoder.module.gru_3',
+ 'encoder_stack_layer9_conv' : 'core_encoder.module.conv1',
+ 'statistical_model_embedding' : 'statistical_model.quant_embedding',
+ 'decoder_state1_dense' : 'core_decoder.module.gru_1_init',
+ 'decoder_state2_dense' : 'core_decoder.module.gru_2_init',
+ 'decoder_state3_dense' : 'core_decoder.module.gru_3_init',
+ 'decoder_stack_layer1_dense' : 'core_decoder.module.dense_1',
+ 'decoder_stack_layer3_dense' : 'core_decoder.module.dense_2',
+ 'decoder_stack_layer5_dense' : 'core_decoder.module.dense_3',
+ 'decoder_stack_layer7_dense' : 'core_decoder.module.dense_4',
+ 'decoder_stack_layer8_dense' : 'core_decoder.module.dense_5',
+ 'decoder_stack_layer9_dense' : 'core_decoder.module.output',
+ 'decoder_stack_layer2_gru' : 'core_decoder.module.gru_1',
+ 'decoder_stack_layer4_gru' : 'core_decoder.module.gru_2',
+ 'decoder_stack_layer6_gru' : 'core_decoder.module.gru_3'
+}
+
+if __name__ == "__main__":
+ checkpoint = dict()
+
+ # parameters
+ num_features = args.num_features
+ latent_dim = args.latent_dim
+ quant_levels = args.quant_levels
+ cond_size = args.cond_size
+ cond_size2 = args.cond_size2
+ state_dim = args.state_dim
+
+
+ # model
+ checkpoint['model_args'] = (num_features, latent_dim, quant_levels, cond_size, cond_size2)
+    checkpoint['model_kwargs'] = {'state_dim': state_dim}
+    model = RDOVAE(*checkpoint['model_args'], **checkpoint['model_kwargs'])
+
+ dense_layer_names = [
+ 'encoder_stack_layer1_dense',
+ 'encoder_stack_layer3_dense',
+ 'encoder_stack_layer5_dense',
+ 'encoder_stack_layer7_dense',
+ 'encoder_stack_layer8_dense',
+ 'encoder_state_layer1_dense',
+ 'encoder_state_layer2_dense',
+ 'decoder_state1_dense',
+ 'decoder_state2_dense',
+ 'decoder_state3_dense',
+ 'decoder_stack_layer1_dense',
+ 'decoder_stack_layer3_dense',
+ 'decoder_stack_layer5_dense',
+ 'decoder_stack_layer7_dense',
+ 'decoder_stack_layer8_dense',
+ 'decoder_stack_layer9_dense'
+ ]
+
+ gru_layer_names = [
+ 'encoder_stack_layer2_gru',
+ 'encoder_stack_layer4_gru',
+ 'encoder_stack_layer6_gru',
+ 'decoder_stack_layer2_gru',
+ 'decoder_stack_layer4_gru',
+ 'decoder_stack_layer6_gru'
+ ]
+
+ conv1d_layer_names = [
+ 'encoder_stack_layer9_conv'
+ ]
+
+ embedding_layer_names = [
+ 'statistical_model_embedding'
+ ]
+
+ for name in dense_layer_names + gru_layer_names + conv1d_layer_names + embedding_layer_names:
+        print(f"loading weights for layer {exchange_name_to_name[name]}")
+        layer = model.get_submodule(exchange_name_to_name[name])
+ load_torch_weights(os.path.join(args.exchange_folder, name), layer)
+
+ checkpoint['state_dict'] = model.state_dict()
+
+ torch.save(checkpoint, args.output)
\ No newline at end of file
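The checkpoint written by this script follows the same layout that train_rdovae.py produces ('model_args', 'model_kwargs', 'state_dict'), so reloading it is straightforward. A brief sketch (the path is a hypothetical example):

    import torch
    from rdovae import RDOVAE

    ckpt = torch.load('imported_rdovae.pth', map_location='cpu')   # hypothetical output path
    model = RDOVAE(*ckpt['model_args'], **ckpt['model_kwargs'])
    model.load_state_dict(ckpt['state_dict'])
    model.eval()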
Binary files /dev/null and b/dnn/torch/rdovae/libs/wexchange-1.0-py3-none-any.whl differ
Binary files /dev/null and b/dnn/torch/rdovae/libs/wexchange-1.2-py3-none-any.whl differ
--- /dev/null
+++ b/dnn/torch/rdovae/packets/__init__.py
@@ -1,0 +1,1 @@
+from .fec_packets import write_fec_packets, read_fec_packets
\ No newline at end of file
--- /dev/null
+++ b/dnn/torch/rdovae/packets/fec_packets.c
@@ -1,0 +1,142 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "fec_packets.h"
+
+int get_fec_frame(const char * const filename, float *features, int packet_index, int subframe_index)
+{
+
+ int16_t version;
+ int16_t header_size;
+ int16_t num_packets;
+ int16_t packet_size;
+ int16_t subframe_size;
+ int16_t subframes_per_packet;
+ int16_t num_features;
+ long offset;
+
+ FILE *fid = fopen(filename, "rb");
+
+ /* read header */
+ if (fread(&version, sizeof(version), 1, fid) != 1) goto error;
+ if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error;
+ if (fread(&num_packets, sizeof(num_packets), 1, fid) != 1) goto error;
+ if (fread(&packet_size, sizeof(packet_size), 1, fid) != 1) goto error;
+ if (fread(&subframe_size, sizeof(subframe_size), 1, fid) != 1) goto error;
+ if (fread(&subframes_per_packet, sizeof(subframes_per_packet), 1, fid) != 1) goto error;
+ if (fread(&num_features, sizeof(num_features), 1, fid) != 1) goto error;
+
+ /* check if indices are valid */
+ if (packet_index >= num_packets || subframe_index >= subframes_per_packet)
+    {
+        fprintf(stderr, "get_fec_frame: index out of bounds\n");
+ goto error;
+ }
+
+ /* calculate offset in file (+ 2 is for rate) */
+ offset = header_size + packet_index * packet_size + 2 + subframe_index * subframe_size;
+ fseek(fid, offset, SEEK_SET);
+
+ /* read features */
+ if (fread(features, sizeof(*features), num_features, fid) != num_features) goto error;
+
+ fclose(fid);
+ return 0;
+
+error:
+ fclose(fid);
+ return 1;
+}
+
+int get_fec_rate(const char * const filename, int packet_index)
+{
+    int16_t version;
+ int16_t header_size;
+ int16_t num_packets;
+ int16_t packet_size;
+ int16_t subframe_size;
+ int16_t subframes_per_packet;
+ int16_t num_features;
+ long offset;
+ int16_t rate;
+
+ FILE *fid = fopen(filename, "rb");
+
+ /* read header */
+ if (fread(&version, sizeof(version), 1, fid) != 1) goto error;
+ if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error;
+ if (fread(&num_packets, sizeof(num_packets), 1, fid) != 1) goto error;
+ if (fread(&packet_size, sizeof(packet_size), 1, fid) != 1) goto error;
+ if (fread(&subframe_size, sizeof(subframe_size), 1, fid) != 1) goto error;
+ if (fread(&subframes_per_packet, sizeof(subframes_per_packet), 1, fid) != 1) goto error;
+ if (fread(&num_features, sizeof(num_features), 1, fid) != 1) goto error;
+
+ /* check if indices are valid */
+ if (packet_index >= num_packets)
+    {
+        fprintf(stderr, "get_fec_rate: index out of bounds\n");
+ goto error;
+ }
+
+ /* calculate offset in file (+ 2 is for rate) */
+ offset = header_size + packet_index * packet_size;
+ fseek(fid, offset, SEEK_SET);
+
+ /* read rate */
+ if (fread(&rate, sizeof(rate), 1, fid) != 1) goto error;
+
+ fclose(fid);
+ return (int) rate;
+
+error:
+ fclose(fid);
+ return -1;
+}
+
+#if 0
+int main()
+{
+    float features[20];
+ int i;
+
+    if (get_fec_frame("../test.fec", &features[0], 0, 127))
+    {
+        return 1;
+ }
+
+ for (i = 0; i < 20; i ++)
+    {
+        printf("%d %f\n", i, features[i]);
+    }
+
+    printf("rate: %d\n", get_fec_rate("../test.fec", 0));
+
+}
+#endif
\ No newline at end of file
--- /dev/null
+++ b/dnn/torch/rdovae/packets/fec_packets.h
@@ -1,0 +1,34 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _FEC_PACKETS_H
+#define _FEC_PACKETS_H
+
+int get_fec_frame(const char * const filename, float *features, int packet_index, int subframe_index);
+int get_fec_rate(const char * const filename, int packet_index);
+
+#endif
\ No newline at end of file
--- /dev/null
+++ b/dnn/torch/rdovae/packets/fec_packets.py
@@ -1,0 +1,108 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+import numpy as np
+
+
+
+def write_fec_packets(filename, packets, rates=None):
+ """ writes packets in binary format """
+
+ assert np.dtype(np.float32).itemsize == 4
+ assert np.dtype(np.int16).itemsize == 2
+
+ # derive some sizes
+ num_packets = len(packets)
+ subframes_per_packet = packets[0].shape[-2]
+ num_features = packets[0].shape[-1]
+
+ # size of float is 4
+ subframe_size = num_features * 4
+ packet_size = subframe_size * subframes_per_packet + 2 # two bytes for rate
+
+ version = 1
+    # header size (version, header_size, num_packets, packet_size, subframe_size, subframes_per_packet, num_features)
+ header_size = 14
+
+ with open(filename, 'wb') as f:
+
+ # header
+ f.write(np.int16(version).tobytes())
+ f.write(np.int16(header_size).tobytes())
+ f.write(np.int16(num_packets).tobytes())
+ f.write(np.int16(packet_size).tobytes())
+ f.write(np.int16(subframe_size).tobytes())
+ f.write(np.int16(subframes_per_packet).tobytes())
+ f.write(np.int16(num_features).tobytes())
+
+ # packets
+ for i, packet in enumerate(packets):
+            if rates is None:
+ rate = 0
+ else:
+ rate = rates[i]
+
+ f.write(np.int16(rate).tobytes())
+
+ features = np.flip(packet, axis=-2)
+ f.write(features.astype(np.float32).tobytes())
+
+
+def read_fec_packets(filename):
+ """ reads packets from binary format """
+
+ assert np.dtype(np.float32).itemsize == 4
+ assert np.dtype(np.int16).itemsize == 2
+
+ with open(filename, 'rb') as f:
+
+ # header
+ version = np.frombuffer(f.read(2), dtype=np.int16).item()
+ header_size = np.frombuffer(f.read(2), dtype=np.int16).item()
+ num_packets = np.frombuffer(f.read(2), dtype=np.int16).item()
+ packet_size = np.frombuffer(f.read(2), dtype=np.int16).item()
+ subframe_size = np.frombuffer(f.read(2), dtype=np.int16).item()
+ subframes_per_packet = np.frombuffer(f.read(2), dtype=np.int16).item()
+ num_features = np.frombuffer(f.read(2), dtype=np.int16).item()
+
+ dummy_features = np.zeros((subframes_per_packet, num_features), dtype=np.float32)
+
+ # packets
+ rates = []
+ packets = []
+ for i in range(num_packets):
+
+            rate = np.frombuffer(f.read(2), dtype=np.int16).item()
+ rates.append(rate)
+
+ features = np.reshape(np.frombuffer(f.read(subframe_size * subframes_per_packet), dtype=np.float32), dummy_features.shape)
+ packet = np.flip(features, axis=-2)
+ packets.append(packet)
+
+ return packets
\ No newline at end of file
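The on-disk layout above is a 14-byte header (version, header_size, num_packets, packet_size, subframe_size, subframes_per_packet, num_features, all int16) followed, per packet, by one int16 rate and the time-reversed float32 features. A hedged round-trip sketch (shapes, rates and file name are arbitrary):

    import numpy as np
    from fec_packets import write_fec_packets, read_fec_packets

    # three packets, each carrying 4 subframes of 20 features
    packets = [np.random.randn(4, 20).astype(np.float32) for _ in range(3)]
    write_fec_packets('test.fec', packets, rates=[64, 64, 32])

    decoded = read_fec_packets('test.fec')
    assert all(np.allclose(p, d) for p, d in zip(packets, decoded))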
--- /dev/null
+++ b/dnn/torch/rdovae/rdovae/__init__.py
@@ -1,0 +1,2 @@
+from .rdovae import RDOVAE, distortion_loss, hard_rate_estimate, soft_rate_estimate
+from .dataset import RDOVAEDataset
--- /dev/null
+++ b/dnn/torch/rdovae/rdovae/dataset.py
@@ -1,0 +1,68 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+import torch
+import numpy as np
+
+class RDOVAEDataset(torch.utils.data.Dataset):
+ def __init__(self,
+ feature_file,
+ sequence_length,
+ num_used_features=20,
+ num_features=36,
+ lambda_min=0.0002,
+ lambda_max=0.0135,
+ quant_levels=16,
+ enc_stride=2):
+
+ self.sequence_length = sequence_length
+ self.lambda_min = lambda_min
+ self.lambda_max = lambda_max
+ self.enc_stride = enc_stride
+ self.quant_levels = quant_levels
+ self.denominator = (quant_levels - 1) / np.log(lambda_max / lambda_min)
+
+ if sequence_length % enc_stride:
+            raise ValueError(f"RDOVAEDataset.__init__: enc_stride {enc_stride} does not divide sequence length {sequence_length}")
+
+ self.features = np.reshape(np.fromfile(feature_file, dtype=np.float32), (-1, num_features))
+ self.features = self.features[:, :num_used_features]
+ self.num_sequences = self.features.shape[0] // sequence_length
+
+ def __len__(self):
+ return self.num_sequences
+
+ def __getitem__(self, index):
+ features = self.features[index * self.sequence_length: (index + 1) * self.sequence_length, :]
+ q_ids = np.random.randint(0, self.quant_levels, (1)).astype(np.int64)
+ q_ids = np.repeat(q_ids, self.sequence_length // self.enc_stride, axis=0)
+ rate_lambda = self.lambda_min * np.exp(q_ids.astype(np.float32) / self.denominator).astype(np.float32)
+
+ return features, rate_lambda, q_ids
+
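A minimal usage sketch for the dataset above (the feature file path is a placeholder): each item pairs a 256-frame feature sequence with one randomly drawn quantizer id, repeated once per encoder step, and the matching rate lambda.

    import torch
    from rdovae import RDOVAEDataset

    dataset = RDOVAEDataset('features.f32', sequence_length=256)       # hypothetical feature dump
    loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True, drop_last=True)

    features, rate_lambda, q_ids = next(iter(loader))
    print(features.shape, rate_lambda.shape, q_ids.shape)              # (32, 256, 20) (32, 128) (32, 128)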
--- /dev/null
+++ b/dnn/torch/rdovae/rdovae/rdovae.py
@@ -1,0 +1,614 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+""" Pytorch implementations of rate distortion optimized variational autoencoder """
+
+import math as m
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+# Quantization and rate related utility functions
+
+def soft_pvq(x, k):
+ """ soft pyramid vector quantizer """
+
+ # L2 normalization
+ x_norm2 = x / (1e-15 + torch.norm(x, dim=-1, keepdim=True))
+
+
+ with torch.no_grad():
+ # quantization loop, no need to track gradients here
+ x_norm1 = x / torch.sum(torch.abs(x), dim=-1, keepdim=True)
+
+ # set initial scaling factor to k
+ scale_factor = k
+ x_scaled = scale_factor * x_norm1
+ x_quant = torch.round(x_scaled)
+
+ # we aim for ||x_quant||_L1 = k
+ for _ in range(10):
+ # remove signs and calculate L1 norm
+ abs_x_quant = torch.abs(x_quant)
+ abs_x_scaled = torch.abs(x_scaled)
+ l1_x_quant = torch.sum(abs_x_quant, axis=-1)
+
+            # increase the scale where the quantized L1 norm is too small and decrease it where it is too large
+ plus = 1.0001 * torch.min((abs_x_quant + 0.5) / (abs_x_scaled + 1e-15), dim=-1).values
+ minus = 0.9999 * torch.max((abs_x_quant - 0.5) / (abs_x_scaled + 1e-15), dim=-1).values
+ factor = torch.where(l1_x_quant > k, minus, plus)
+ factor = torch.where(l1_x_quant == k, torch.ones_like(factor), factor)
+ scale_factor = scale_factor * factor.unsqueeze(-1)
+
+ # update x
+ x_scaled = scale_factor * x_norm1
+            x_quant = torch.round(x_scaled)
+
+ # L2 normalization of quantized x
+ x_quant_norm2 = x_quant / (1e-15 + torch.norm(x_quant, dim=-1, keepdim=True))
+ quantization_error = x_quant_norm2 - x_norm2
+
+ return x_norm2 + quantization_error.detach()
+
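A quick way to see what soft_pvq returns (sketch only, assuming the function above is in scope): the forward value is the L2-normalized integer pulse vector, while the straight-through trick keeps gradients flowing to x.

    import torch

    x = torch.randn(3, 24, requires_grad=True)
    y = soft_pvq(x, 82)

    print(torch.norm(y, dim=-1))        # ~1: unit-norm quantized codewords
    y.sum().backward()
    print(x.grad is not None)           # True: gradient passes through the quantizer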
+def cache_parameters(func):
+ cache = dict()
+ def cached_func(*args):
+ if args in cache:
+ return cache[args]
+ else:
+ cache[args] = func(*args)
+
+ return cache[args]
+ return cached_func
+
+@cache_parameters
+def pvq_codebook_size(n, k):
+
+ if k == 0:
+ return 1
+
+ if n == 0:
+ return 0
+
+ return pvq_codebook_size(n - 1, k) + pvq_codebook_size(n, k - 1) + pvq_codebook_size(n - 1, k - 1)
+
+
+def soft_rate_estimate(z, r, reduce=True):
+ """ rate approximation with dependent theta Eq. (7)"""
+
+ rate = torch.sum(
+ - torch.log2((1 - r)/(1 + r) * r ** torch.abs(z) + 1e-6),
+ dim=-1
+ )
+
+ if reduce:
+ rate = torch.mean(rate)
+
+ return rate
+
+
+def hard_rate_estimate(z, r, theta, reduce=True):
+ """ hard rate approximation """
+
+ z_q = torch.round(z)
+ p0 = 1 - r ** (0.5 + 0.5 * theta)
+ alpha = torch.relu(1 - torch.abs(z_q)) ** 2
+ rate = - torch.sum(
+ (alpha * torch.log2(p0 * r ** torch.abs(z_q) + 1e-6)
+ + (1 - alpha) * torch.log2(0.5 * (1 - p0) * (1 - r) * r ** (torch.abs(z_q) - 1) + 1e-6)),
+ dim=-1
+ )
+
+ if reduce:
+ rate = torch.mean(rate)
+
+ return rate
+
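To relate the two estimates above, a toy comparison (sketch, assuming both functions are in scope): on integer-valued latents the hard estimate approximates the bits actually spent, while the soft estimate is the smooth surrogate used for backpropagation.

    import torch

    z = torch.round(3 * torch.randn(1, 100))        # pretend these are quantized latents
    r = 0.8 * torch.ones_like(z)
    theta = 0.5 * torch.ones_like(z)

    print(float(soft_rate_estimate(z, r)))          # differentiable surrogate, in bits
    print(float(hard_rate_estimate(z, r, theta)))   # piecewise estimate used for the rate metric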
+
+
+def soft_dead_zone(x, dead_zone):
+ """ approximates application of a dead zone to x """
+ d = dead_zone * 0.05
+ return x - d * torch.tanh(x / (0.1 + d))
+
+
+def hard_quantize(x):
+ """ round with copy gradient trick """
+ return x + (torch.round(x) - x).detach()
+
+
+def noise_quantize(x):
+ """ simulates quantization with addition of random uniform noise """
+ return x + (torch.rand_like(x) - 0.5)
+
+
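A brief illustration (not generated by this code, values made up) of the helpers above: soft_dead_zone pulls small values toward zero, hard_quantize rounds in the forward pass while passing gradients straight through, and noise_quantize replaces rounding with uniform noise during training.

    import torch

    x = torch.tensor([0.04, 0.4, 1.6], requires_grad=True)

    print(soft_dead_zone(x.detach(), torch.ones_like(x)))   # small entries shrink toward 0
    xq = hard_quantize(x)                                    # forward value: torch.round(x)
    xq.sum().backward()
    print(xq.detach(), x.grad)                               # gradients are all ones despite rounding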
+# loss functions
+
+
+def distortion_loss(y_true, y_pred, rate_lambda=None):
+ """ custom distortion loss for LPCNet features """
+
+ if y_true.size(-1) != 20:
+        raise ValueError('distortion loss is designed to work with 20 features')
+
+ ceps_error = y_pred[..., :18] - y_true[..., :18]
+ pitch_error = 2 * (y_pred[..., 18:19] - y_true[..., 18:19]) / (2 + y_true[..., 18:19])
+ corr_error = y_pred[..., 19:] - y_true[..., 19:]
+ pitch_weight = torch.relu(y_true[..., 19:] + 0.5) ** 2
+
+ loss = torch.mean(ceps_error ** 2 + (10/18) * torch.abs(pitch_error) * pitch_weight + (1/18) * corr_error ** 2, dim=-1)
+
+    if rate_lambda is not None:
+ loss = loss / torch.sqrt(rate_lambda)
+
+ loss = torch.mean(loss)
+
+ return loss
+
+
+# sampling functions
+
+import random
+
+
+def random_split(start, stop, num_splits=3, min_len=3):
+ get_min_len = lambda x : min([x[i+1] - x[i] for i in range(len(x) - 1)])
+ candidate = [start] + sorted([random.randint(start, stop-1) for i in range(num_splits)]) + [stop]
+
+ while get_min_len(candidate) < min_len:
+ candidate = [start] + sorted([random.randint(start, stop-1) for i in range(num_splits)]) + [stop]
+
+ return candidate
+
+
+
+# weight initialization and clipping
+def init_weights(module):
+
+ if isinstance(module, nn.GRU):
+ for p in module.named_parameters():
+            if p[0].startswith('weight_hh_'):
+                nn.init.orthogonal_(p[1])
+
+
+def weight_clip_factory(max_value):
+    """ weight clipping function concerning sum of abs values of adjacent weights """
+ def clip_weight_(w):
+ stop = w.size(1)
+ # omit last column if stop is odd
+ if stop % 2:
+ stop -= 1
+ max_values = max_value * torch.ones_like(w[:, :stop])
+ factor = max_value / torch.maximum(max_values,
+ torch.repeat_interleave(
+ torch.abs(w[:, :stop:2]) + torch.abs(w[:, 1:stop:2]),
+ 2,
+ 1))
+ with torch.no_grad():
+ w[:, :stop] *= factor
+
+ def clip_weights(module):
+ if isinstance(module, nn.GRU) or isinstance(module, nn.Linear):
+ for name, w in module.named_parameters():
+                if name.startswith('weight'):
+                    clip_weight_(w)
+
+ return clip_weights
+
+# RDOVAE module and submodules
+
+
+class CoreEncoder(nn.Module):
+ STATE_HIDDEN = 128
+ FRAMES_PER_STEP = 2
+ CONV_KERNEL_SIZE = 4
+
+ def __init__(self, feature_dim, output_dim, cond_size, cond_size2, state_size=24):
+ """ core encoder for RDOVAE
+
+ Computes latents, initial states, and rate estimates from features and lambda parameter
+
+ """
+
+ super(CoreEncoder, self).__init__()
+
+ # hyper parameters
+ self.feature_dim = feature_dim
+ self.output_dim = output_dim
+ self.cond_size = cond_size
+ self.cond_size2 = cond_size2
+ self.state_size = state_size
+
+ # derived parameters
+ self.input_dim = self.FRAMES_PER_STEP * self.feature_dim
+ self.conv_input_channels = 5 * cond_size + 3 * cond_size2
+
+ # layers
+ self.dense_1 = nn.Linear(self.input_dim, self.cond_size2)
+ self.gru_1 = nn.GRU(self.cond_size2, self.cond_size, batch_first=True)
+ self.dense_2 = nn.Linear(self.cond_size, self.cond_size2)
+ self.gru_2 = nn.GRU(self.cond_size2, self.cond_size, batch_first=True)
+ self.dense_3 = nn.Linear(self.cond_size, self.cond_size2)
+ self.gru_3 = nn.GRU(self.cond_size2, self.cond_size, batch_first=True)
+ self.dense_4 = nn.Linear(self.cond_size, self.cond_size)
+ self.dense_5 = nn.Linear(self.cond_size, self.cond_size)
+ self.conv1 = nn.Conv1d(self.conv_input_channels, self.output_dim, kernel_size=self.CONV_KERNEL_SIZE, padding='valid')
+
+ self.state_dense_1 = nn.Linear(self.conv_input_channels, self.STATE_HIDDEN)
+
+ self.state_dense_2 = nn.Linear(self.STATE_HIDDEN, self.state_size)
+
+ # initialize weights
+ self.apply(init_weights)
+
+
+ def forward(self, features):
+
+ # reshape features
+ x = torch.reshape(features, (features.size(0), features.size(1) // self.FRAMES_PER_STEP, self.FRAMES_PER_STEP * features.size(2)))
+
+ batch = x.size(0)
+ device = x.device
+
+ # run encoding layer stack
+ x1 = torch.tanh(self.dense_1(x))
+ x2, _ = self.gru_1(x1, torch.zeros((1, batch, self.cond_size)).to(device))
+ x3 = torch.tanh(self.dense_2(x2))
+ x4, _ = self.gru_2(x3, torch.zeros((1, batch, self.cond_size)).to(device))
+ x5 = torch.tanh(self.dense_3(x4))
+ x6, _ = self.gru_3(x5, torch.zeros((1, batch, self.cond_size)).to(device))
+ x7 = torch.tanh(self.dense_4(x6))
+ x8 = torch.tanh(self.dense_5(x7))
+
+ # concatenation of all hidden layer outputs
+ x9 = torch.cat((x1, x2, x3, x4, x5, x6, x7, x8), dim=-1)
+
+ # init state for decoder
+ states = torch.tanh(self.state_dense_1(x9))
+ states = torch.tanh(self.state_dense_2(states))
+
+ # latent representation via convolution
+ x9 = F.pad(x9.permute(0, 2, 1), [self.CONV_KERNEL_SIZE - 1, 0])
+ z = self.conv1(x9).permute(0, 2, 1)
+
+ return z, states
+
+
+
+
+class CoreDecoder(nn.Module):
+
+ FRAMES_PER_STEP = 4
+
+ def __init__(self, input_dim, output_dim, cond_size, cond_size2, state_size=24):
+ """ core decoder for RDOVAE
+
+ Computes features from latents, initial state, and quantization index
+
+ """
+
+ super(CoreDecoder, self).__init__()
+
+ # hyper parameters
+ self.input_dim = input_dim
+ self.output_dim = output_dim
+ self.cond_size = cond_size
+ self.cond_size2 = cond_size2
+ self.state_size = state_size
+
+ self.input_size = self.input_dim
+
+ self.concat_size = 4 * self.cond_size + 4 * self.cond_size2
+
+ # layers
+ self.dense_1 = nn.Linear(self.input_size, cond_size2)
+ self.gru_1 = nn.GRU(cond_size2, cond_size, batch_first=True)
+ self.dense_2 = nn.Linear(cond_size, cond_size2)
+ self.gru_2 = nn.GRU(cond_size2, cond_size, batch_first=True)
+ self.dense_3 = nn.Linear(cond_size, cond_size2)
+ self.gru_3 = nn.GRU(cond_size2, cond_size, batch_first=True)
+ self.dense_4 = nn.Linear(cond_size, cond_size2)
+ self.dense_5 = nn.Linear(cond_size2, cond_size2)
+
+ self.output = nn.Linear(self.concat_size, self.FRAMES_PER_STEP * self.output_dim)
+
+
+ self.gru_1_init = nn.Linear(self.state_size, self.cond_size)
+ self.gru_2_init = nn.Linear(self.state_size, self.cond_size)
+ self.gru_3_init = nn.Linear(self.state_size, self.cond_size)
+
+ # initialize weights
+ self.apply(init_weights)
+
+ def forward(self, z, initial_state):
+
+ gru_1_state = torch.tanh(self.gru_1_init(initial_state).permute(1, 0, 2))
+ gru_2_state = torch.tanh(self.gru_2_init(initial_state).permute(1, 0, 2))
+ gru_3_state = torch.tanh(self.gru_3_init(initial_state).permute(1, 0, 2))
+
+ # run decoding layer stack
+ x1 = torch.tanh(self.dense_1(z))
+ x2, _ = self.gru_1(x1, gru_1_state)
+ x3 = torch.tanh(self.dense_2(x2))
+ x4, _ = self.gru_2(x3, gru_2_state)
+ x5 = torch.tanh(self.dense_3(x4))
+ x6, _ = self.gru_3(x5, gru_3_state)
+ x7 = torch.tanh(self.dense_4(x6))
+ x8 = torch.tanh(self.dense_5(x7))
+ x9 = torch.cat((x1, x2, x3, x4, x5, x6, x7, x8), dim=-1)
+
+ # output layer and reshaping
+ x10 = self.output(x9)
+ features = torch.reshape(x10, (x10.size(0), x10.size(1) * self.FRAMES_PER_STEP, x10.size(2) // self.FRAMES_PER_STEP))
+
+ return features
+
+
+class StatisticalModel(nn.Module):
+ def __init__(self, quant_levels, latent_dim):
+ """ Statistical model for latent space
+
+ Computes scaling, deadzone, r, and theta
+
+ """
+
+ super(StatisticalModel, self).__init__()
+
+ # copy parameters
+ self.latent_dim = latent_dim
+ self.quant_levels = quant_levels
+ self.embedding_dim = 6 * latent_dim
+
+ # quantization embedding
+ self.quant_embedding = nn.Embedding(quant_levels, self.embedding_dim)
+
+ # initialize embedding to 0
+ with torch.no_grad():
+ self.quant_embedding.weight[:] = 0
+
+
+ def forward(self, quant_ids):
+ """ takes quant_ids and returns statistical model parameters"""
+
+ x = self.quant_embedding(quant_ids)
+
+ # CAVE: theta_soft is not used anymore. Kick it out?
+ quant_scale = F.softplus(x[..., 0 * self.latent_dim : 1 * self.latent_dim])
+ dead_zone = F.softplus(x[..., 1 * self.latent_dim : 2 * self.latent_dim])
+ theta_soft = torch.sigmoid(x[..., 2 * self.latent_dim : 3 * self.latent_dim])
+ r_soft = torch.sigmoid(x[..., 3 * self.latent_dim : 4 * self.latent_dim])
+ theta_hard = torch.sigmoid(x[..., 4 * self.latent_dim : 5 * self.latent_dim])
+ r_hard = torch.sigmoid(x[..., 5 * self.latent_dim : 6 * self.latent_dim])
+
+
+        return {
+            'quant_embedding' : x,
+ 'quant_scale' : quant_scale,
+ 'dead_zone' : dead_zone,
+ 'r_hard' : r_hard,
+ 'theta_hard' : theta_hard,
+ 'r_soft' : r_soft,
+ 'theta_soft' : theta_soft
+ }
+
+
+class RDOVAE(nn.Module):
+ def __init__(self,
+ feature_dim,
+ latent_dim,
+ quant_levels,
+ cond_size,
+ cond_size2,
+ state_dim=24,
+ split_mode='split',
+ clip_weights=True,
+ pvq_num_pulses=82,
+ state_dropout_rate=0):
+
+ super(RDOVAE, self).__init__()
+
+ self.feature_dim = feature_dim
+ self.latent_dim = latent_dim
+ self.quant_levels = quant_levels
+ self.cond_size = cond_size
+ self.cond_size2 = cond_size2
+ self.split_mode = split_mode
+ self.state_dim = state_dim
+ self.pvq_num_pulses = pvq_num_pulses
+ self.state_dropout_rate = state_dropout_rate
+
+ # submodules encoder and decoder share the statistical model
+ self.statistical_model = StatisticalModel(quant_levels, latent_dim)
+ self.core_encoder = nn.DataParallel(CoreEncoder(feature_dim, latent_dim, cond_size, cond_size2, state_size=state_dim))
+ self.core_decoder = nn.DataParallel(CoreDecoder(latent_dim, feature_dim, cond_size, cond_size2, state_size=state_dim))
+
+ self.enc_stride = CoreEncoder.FRAMES_PER_STEP
+ self.dec_stride = CoreDecoder.FRAMES_PER_STEP
+
+ if clip_weights:
+ self.weight_clip_fn = weight_clip_factory(0.496)
+ else:
+ self.weight_clip_fn = None
+
+ if self.dec_stride % self.enc_stride != 0:
+ raise ValueError(f"get_decoder_chunks_generic: encoder stride does not divide decoder stride")
+
+ def clip_weights(self):
+        if self.weight_clip_fn is not None:
+ self.apply(self.weight_clip_fn)
+
+ def get_decoder_chunks(self, z_frames, mode='split', chunks_per_offset = 4):
+
+ enc_stride = self.enc_stride
+ dec_stride = self.dec_stride
+
+ stride = dec_stride // enc_stride
+
+ chunks = []
+
+ for offset in range(stride):
+            # start is the smallest number congruent to offset mod stride that decodes to a valid range
+ start = offset
+ while enc_stride * (start + 1) - dec_stride < 0:
+ start += stride
+
+ # check if start is a valid index
+ if start >= z_frames:
+                raise ValueError("get_decoder_chunks_generic: range too small")
+
+ # stop is the smallest number outside [0, num_enc_frames] that's congruent to offset mod stride
+ stop = z_frames - (z_frames % stride) + offset
+ while stop < z_frames:
+ stop += stride
+
+ # calculate split points
+ length = (stop - start)
+ if mode == 'split':
+ split_points = [start + stride * int(i * length / chunks_per_offset / stride) for i in range(chunks_per_offset)] + [stop]
+ elif mode == 'random_split':
+ split_points = [stride * x + start for x in random_split(0, (stop - start)//stride - 1, chunks_per_offset - 1, 1)]
+ else:
+                raise ValueError(f"get_decoder_chunks_generic: unknown mode {mode}")
+
+
+ for i in range(chunks_per_offset):
+ # (enc_frame_start, enc_frame_stop, enc_frame_stride, stride, feature_frame_start, feature_frame_stop)
+ # encoder range(i, j, stride) maps to feature range(enc_stride * (i + 1) - dec_stride, enc_stride * j)
+ # provided that i - j = 1 mod stride
+                chunks.append({
+                    'z_start' : split_points[i],
+ 'z_stop' : split_points[i + 1] - stride + 1,
+ 'z_stride' : stride,
+ 'features_start' : enc_stride * (split_points[i] + 1) - dec_stride,
+ 'features_stop' : enc_stride * (split_points[i + 1] - stride + 1)
+ })
+
+ return chunks
+
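For intuition, a sketch of what get_decoder_chunks produces with the model sizes used by train_rdovae.py (defaults assumed): each chunk maps a strided range of latent frames to the contiguous run of feature frames it decodes back to.

    import torch
    from rdovae import RDOVAE

    model = RDOVAE(20, 80, 16, 256, 256)            # feature_dim, latent_dim, quant_levels, cond sizes
    for chunk in model.get_decoder_chunks(16):      # 16 latent frames, default 'split' mode
        print(chunk)                                # z_start/z_stop/z_stride and the feature range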
+
+ def forward(self, features, q_id):
+
+ # calculate statistical model from quantization ID
+ statistical_model = self.statistical_model(q_id)
+
+ # run encoder
+ z, states = self.core_encoder(features)
+
+ # scaling, dead-zone and quantization
+ z = z * statistical_model['quant_scale']
+ z = soft_dead_zone(z, statistical_model['dead_zone'])
+
+ # quantization
+ z_q = hard_quantize(z) / statistical_model['quant_scale']
+ z_n = noise_quantize(z) / statistical_model['quant_scale']
+ states_q = soft_pvq(states, self.pvq_num_pulses)
+
+ if self.state_dropout_rate > 0:
+ drop = torch.rand(states_q.size(0)) < self.state_dropout_rate
+ mask = torch.ones_like(states_q)
+ mask[drop] = 0
+ states_q = states_q * mask
+
+ # decoder
+ chunks = self.get_decoder_chunks(z.size(1), mode=self.split_mode)
+
+ outputs_hq = []
+ outputs_sq = []
+ for chunk in chunks:
+ # decoder with hard quantized input
+ z_dec_reverse = torch.flip(z_q[..., chunk['z_start'] : chunk['z_stop'] : chunk['z_stride'], :], [1])
+ dec_initial_state = states_q[..., chunk['z_stop'] - 1 : chunk['z_stop'], :]
+ features_reverse = self.core_decoder(z_dec_reverse, dec_initial_state)
+ outputs_hq.append((torch.flip(features_reverse, [1]), chunk['features_start'], chunk['features_stop']))
+
+
+ # decoder with soft quantized input
+ z_dec_reverse = torch.flip(z_n[..., chunk['z_start'] : chunk['z_stop'] : chunk['z_stride'], :], [1])
+ features_reverse = self.core_decoder(z_dec_reverse, dec_initial_state)
+ outputs_sq.append((torch.flip(features_reverse, [1]), chunk['features_start'], chunk['features_stop']))
+
+        return {
+            'outputs_hard_quant' : outputs_hq,
+ 'outputs_soft_quant' : outputs_sq,
+ 'z' : z,
+ 'statistical_model' : statistical_model
+ }
+
+ def encode(self, features):
+ """ encoder with quantization and rate estimation """
+
+ z, states = self.core_encoder(features)
+
+ # quantization of initial states
+ states = soft_pvq(states, self.pvq_num_pulses)
+ state_size = m.log2(pvq_codebook_size(self.state_dim, self.pvq_num_pulses))
+
+ return z, states, state_size
+
+ def decode(self, z, initial_state):
+ """ decoder (flips sequences by itself) """
+
+ z_reverse = torch.flip(z, [1])
+ features_reverse = self.core_decoder(z_reverse, initial_state)
+ features = torch.flip(features_reverse, [1])
+
+ return features
+
+ def quantize(self, z, q_ids):
+ """ quantization of latent vectors """
+
+ stats = self.statistical_model(q_ids)
+
+ zq = z * stats['quant_scale']
+ zq = soft_dead_zone(zq, stats['dead_zone'])
+ zq = torch.round(zq)
+
+ sizes = hard_rate_estimate(zq, stats['r_hard'], stats['theta_hard'], reduce=False)
+
+ return zq, sizes
+
+ def unquantize(self, zq, q_ids):
+ """ re-scaling of latent vector """
+
+ stats = self.statistical_model(q_ids)
+
+ z = zq / stats['quant_scale']
+
+ return z
+
+ def freeze_model(self):
+
+ # freeze all parameters
+ for p in self.parameters():
+ p.requires_grad = False
+
+ for p in self.statistical_model.parameters():
+ p.requires_grad = True
+
--- /dev/null
+++ b/dnn/torch/rdovae/requirements.txt
@@ -1,0 +1,5 @@
+numpy
+scipy
+torch
+tqdm
+libs/wexchange-1.2-py3-none-any.whl
\ No newline at end of file
--- /dev/null
+++ b/dnn/torch/rdovae/train_rdovae.py
@@ -1,0 +1,270 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+import os
+import argparse
+
+import torch
+import tqdm
+
+from rdovae import RDOVAE, RDOVAEDataset, distortion_loss, hard_rate_estimate, soft_rate_estimate
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('features', type=str, help='path to feature file in .f32 format')
+parser.add_argument('output', type=str, help='path to output folder')
+
+parser.add_argument('--cuda-visible-devices', type=str, help="comma separated list of cuda visible device indices, default: ''", default="")
+
+
+model_group = parser.add_argument_group(title="model parameters")
+model_group.add_argument('--latent-dim', type=int, help="number of symbols produced by encoder, default: 80", default=80)
+model_group.add_argument('--cond-size', type=int, help="first conditioning size, default: 256", default=256)
+model_group.add_argument('--cond-size2', type=int, help="second conditioning size, default: 256", default=256)
+model_group.add_argument('--state-dim', type=int, help="dimensionality of transferred state, default: 24", default=24)
+model_group.add_argument('--quant-levels', type=int, help="number of quantization levels, default: 16", default=16)
+model_group.add_argument('--lambda-min', type=float, help="minimal value for rate lambda, default: 0.0002", default=2e-4)
+model_group.add_argument('--lambda-max', type=float, help="maximal value for rate lambda, default: 0.0104", default=0.0104)
+model_group.add_argument('--pvq-num-pulses', type=int, help="number of pulses for PVQ, default: 82", default=82)
+model_group.add_argument('--state-dropout-rate', type=float, help="state dropout rate, default: 0", default=0.0)
+
+training_group = parser.add_argument_group(title="training parameters")
+training_group.add_argument('--batch-size', type=int, help="batch size, default: 32", default=32)
+training_group.add_argument('--lr', type=float, help='learning rate, default: 3e-4', default=3e-4)
+training_group.add_argument('--epochs', type=int, help='number of training epochs, default: 100', default=100)
+training_group.add_argument('--sequence-length', type=int, help='sequence length, needs to be divisible by 4, default: 256', default=256)
+training_group.add_argument('--lr-decay-factor', type=float, help='learning rate decay factor, default: 2.5e-5', default=2.5e-5)
+training_group.add_argument('--split-mode', type=str, choices=['split', 'random_split'], help='splitting mode for decoder input, default: split', default='split')
+training_group.add_argument('--enable-first-frame-loss', action='store_true', default=False, help='enables dedicated distortion loss on first 4 decoder frames')
+training_group.add_argument('--initial-checkpoint', type=str, help='initial checkpoint to start training from, default: None', default=None)
+training_group.add_argument('--train-decoder-only', action='store_true', help='freeze encoder and statistical model and train decoder only')
+
+args = parser.parse_args()
+
+# set visible devices
+os.environ['CUDA_VISIBLE_DEVICES'] = args.cuda_visible_devices
+
+# checkpoints
+checkpoint_dir = os.path.join(args.output, 'checkpoints')
+checkpoint = dict()
+os.makedirs(checkpoint_dir, exist_ok=True)
+
+# training parameters
+batch_size = args.batch_size
+lr = args.lr
+epochs = args.epochs
+sequence_length = args.sequence_length
+lr_decay_factor = args.lr_decay_factor
+split_mode = args.split_mode
+# not exposed
+adam_betas = [0.9, 0.99]
+adam_eps = 1e-8
+
+checkpoint['batch_size'] = batch_size
+checkpoint['lr'] = lr
+checkpoint['lr_decay_factor'] = lr_decay_factor
+checkpoint['split_mode'] = split_mode
+checkpoint['epochs'] = epochs
+checkpoint['sequence_length'] = sequence_length
+checkpoint['adam_betas'] = adam_betas
+
+# logging
+log_interval = 10
+
+# device
+device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+# model parameters
+cond_size = args.cond_size
+cond_size2 = args.cond_size2
+latent_dim = args.latent_dim
+quant_levels = args.quant_levels
+lambda_min = args.lambda_min
+lambda_max = args.lambda_max
+state_dim = args.state_dim
+# not exposed
+num_features = 20
+
+
+# training data
+feature_file = args.features
+
+# model
+checkpoint['model_args'] = (num_features, latent_dim, quant_levels, cond_size, cond_size2)
+checkpoint['model_kwargs'] = {'state_dim': state_dim, 'split_mode' : split_mode, 'pvq_num_pulses': args.pvq_num_pulses, 'state_dropout_rate': args.state_dropout_rate}
+model = RDOVAE(*checkpoint['model_args'], **checkpoint['model_kwargs'])
+
+if args.initial_checkpoint is not None:
+ checkpoint = torch.load(args.initial_checkpoint, map_location='cpu')
+ model.load_state_dict(checkpoint['state_dict'], strict=False)
+
+checkpoint['state_dict'] = model.state_dict()
+
+if args.train_decoder_only:
+ if args.initial_checkpoint is None:
+        print("warning: training decoder only without providing initial checkpoint")
+
+ for p in model.core_encoder.module.parameters():
+ p.requires_grad = False
+
+ for p in model.statistical_model.parameters():
+ p.requires_grad = False
+
+# dataloader
+checkpoint['dataset_args'] = (feature_file, sequence_length, num_features, 36)
+checkpoint['dataset_kwargs'] = {'lambda_min': lambda_min, 'lambda_max': lambda_max, 'enc_stride': model.enc_stride, 'quant_levels': quant_levels}
+dataset = RDOVAEDataset(*checkpoint['dataset_args'], **checkpoint['dataset_kwargs'])
+dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=4)
+
+
+
+# optimizer
+params = [p for p in model.parameters() if p.requires_grad]
+optimizer = torch.optim.Adam(params, lr=lr, betas=adam_betas, eps=adam_eps)
+
+
+# learning rate scheduler
+scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer=optimizer, lr_lambda=lambda x : 1 / (1 + lr_decay_factor * x))
+
+if __name__ == '__main__':
+
+ # push model to device
+ model.to(device)
+
+ # training loop
+
+ for epoch in range(1, epochs + 1):
+
+        print(f"training epoch {epoch}...")
+
+ # running stats
+ running_rate_loss = 0
+ running_soft_dist_loss = 0
+ running_hard_dist_loss = 0
+ running_hard_rate_loss = 0
+ running_soft_rate_loss = 0
+ running_total_loss = 0
+ running_rate_metric = 0
+ previous_total_loss = 0
+ running_first_frame_loss = 0
+
+ with tqdm.tqdm(dataloader, unit='batch') as tepoch:
+ for i, (features, rate_lambda, q_ids) in enumerate(tepoch):
+
+ # zero out gradients
+ optimizer.zero_grad()
+
+ # push inputs to device
+ features = features.to(device)
+ q_ids = q_ids.to(device)
+ rate_lambda = rate_lambda.to(device)
+
+
+ rate_lambda_upsamp = torch.repeat_interleave(rate_lambda, 2, 1)
+
+ # run model
+ model_output = model(features, q_ids)
+
+ # collect outputs
+ z = model_output['z']
+ outputs_hard_quant = model_output['outputs_hard_quant']
+ outputs_soft_quant = model_output['outputs_soft_quant']
+ statistical_model = model_output['statistical_model']
+
+ # rate loss
+ hard_rate = hard_rate_estimate(z, statistical_model['r_hard'], statistical_model['theta_hard'], reduce=False)
+ soft_rate = soft_rate_estimate(z, statistical_model['r_soft'], reduce=False)
+ soft_rate_loss = torch.mean(torch.sqrt(rate_lambda) * soft_rate)
+ hard_rate_loss = torch.mean(torch.sqrt(rate_lambda) * hard_rate)
+ rate_loss = (soft_rate_loss + 0.1 * hard_rate_loss)
+ hard_rate_metric = torch.mean(hard_rate)
+
+ ## distortion losses
+
+ # hard quantized decoder input
+ distortion_loss_hard_quant = torch.zeros_like(rate_loss)
+ for dec_features, start, stop in outputs_hard_quant:
+ distortion_loss_hard_quant += distortion_loss(features[..., start : stop, :], dec_features, rate_lambda_upsamp[..., start : stop]) / len(outputs_hard_quant)
+
+ first_frame_loss = torch.zeros_like(rate_loss)
+ for dec_features, start, stop in outputs_hard_quant:
+ first_frame_loss += distortion_loss(features[..., stop-4 : stop, :], dec_features[..., -4:, :], rate_lambda_upsamp[..., stop - 4 : stop]) / len(outputs_hard_quant)
+
+ # soft quantized decoder input
+ distortion_loss_soft_quant = torch.zeros_like(rate_loss)
+ for dec_features, start, stop in outputs_soft_quant:
+ distortion_loss_soft_quant += distortion_loss(features[..., start : stop, :], dec_features, rate_lambda_upsamp[..., start : stop]) / len(outputs_soft_quant)
+
+ # total loss
+ total_loss = rate_loss + (distortion_loss_hard_quant + distortion_loss_soft_quant) / 2
+
+ if args.enable_first_frame_loss:
+ total_loss = total_loss + 0.5 * torch.relu(first_frame_loss - distortion_loss_hard_quant)
+
+
+ total_loss.backward()
+
+ optimizer.step()
+
+ model.clip_weights()
+
+ scheduler.step()
+
+ # collect running stats
+ running_hard_dist_loss += float(distortion_loss_hard_quant.detach().cpu())
+ running_soft_dist_loss += float(distortion_loss_soft_quant.detach().cpu())
+ running_rate_loss += float(rate_loss.detach().cpu())
+ running_rate_metric += float(hard_rate_metric.detach().cpu())
+ running_total_loss += float(total_loss.detach().cpu())
+ running_first_frame_loss += float(first_frame_loss.detach().cpu())
+ running_soft_rate_loss += float(soft_rate_loss.detach().cpu())
+ running_hard_rate_loss += float(hard_rate_loss.detach().cpu())
+
+ if (i + 1) % log_interval == 0:
+ current_loss = (running_total_loss - previous_total_loss) / log_interval
+ tepoch.set_postfix(
+ current_loss=current_loss,
+ total_loss=running_total_loss / (i + 1),
+ dist_hq=running_hard_dist_loss / (i + 1),
+ dist_sq=running_soft_dist_loss / (i + 1),
+ rate_loss=running_rate_loss / (i + 1),
+ rate=running_rate_metric / (i + 1),
+ ffloss=running_first_frame_loss / (i + 1),
+ rateloss_hard=running_hard_rate_loss / (i + 1),
+ rateloss_soft=running_soft_rate_loss / (i + 1)
+ )
+ previous_total_loss = running_total_loss
+
+ # save checkpoint
+        checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch}.pth')
+        checkpoint['state_dict'] = model.state_dict()
+ checkpoint['loss'] = running_total_loss / len(dataloader)
+ checkpoint['epoch'] = epoch
+ torch.save(checkpoint, checkpoint_path)
--- /dev/null
+++ b/dnn/training_tf2/decode_rdovae.py
@@ -1,0 +1,111 @@
+#!/usr/bin/python3
+'''Copyright (c) 2021-2022 Amazon
+ Copyright (c) 2018-2019 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+# Train an LPCNet model
+
+import argparse
+#from plc_loader import PLCLoader
+
+parser = argparse.ArgumentParser(description='Train a PLC model')
+
+parser.add_argument('bits', metavar='<bits file>', help='binary features file (int16)')
+parser.add_argument('output', metavar='<output>', help='output features')
+parser.add_argument('--model', metavar='<model>', default='rdovae', help='PLC model python definition (without .py)')
+group1 = parser.add_mutually_exclusive_group()
+group1.add_argument('--weights', metavar='<input weights>', help='model weights')
+parser.add_argument('--cond-size', metavar='<units>', default=1024, type=int, help='number of units in conditioning network (default 1024)')
+parser.add_argument('--batch-size', metavar='<batch size>', default=1, type=int, help='batch size to use (default 1)')
+parser.add_argument('--seq-length', metavar='<sequence length>', default=1000, type=int, help='sequence length to use (default 1000)')
+
+
+args = parser.parse_args()
+
+import importlib
+rdovae = importlib.import_module(args.model)
+
+import sys
+import numpy as np
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
+import tensorflow.keras.backend as K
+import h5py
+
+import tensorflow as tf
+from rdovae import pvq_quantize
+from rdovae import apply_dead_zone
+
+# Try reducing batch_size if you run out of memory on your GPU
+batch_size = args.batch_size
+
+model, encoder, decoder, qembedding = rdovae.new_rdovae_model(nb_used_features=20, nb_bits=80, batch_size=batch_size, cond_size=args.cond_size)
+model.load_weights(args.weights)
+
+lpc_order = 16
+nbits=80
+
+
+bits_file = args.bits
+sequence_size = args.seq_length
+
+# u for unquantised, load 16 bit PCM samples and convert to mu-law
+
+
+bits = np.memmap(bits_file + "-syms.f32", dtype='float32', mode='r')
+nb_sequences = len(bits)//(40*sequence_size)//batch_size*batch_size
+bits = bits[:nb_sequences*sequence_size*40]
+
+bits = np.reshape(bits, (nb_sequences, sequence_size//2, 20*4))
+print(bits.shape)
+
+lambda_val = 0.001 * np.ones((nb_sequences, sequence_size//2, 1))
+quant_id = np.round(3.8*np.log(lambda_val/.0002)).astype('int16')
+quant_id = quant_id[:,:,0]
+quant_embed = qembedding(quant_id)
+quant_scale = tf.math.softplus(quant_embed[:,:,:nbits])
+dead_zone = tf.math.softplus(quant_embed[:, :, nbits : 2 * nbits])
+
+bits = bits*quant_scale
+bits = np.round(apply_dead_zone([bits, dead_zone]).numpy())
+bits = bits/quant_scale
+
+
+state = np.memmap(bits_file + "-state.f32", dtype='float32', mode='r')
+
+state = np.reshape(state, (nb_sequences, sequence_size//2, 24))
+state = state[:,-1,:]
+state = pvq_quantize(state, 82)
+#state = state/(1e-15+tf.norm(state, axis=-1,keepdims=True))
+
+print("shapes are:")
+print(bits.shape)
+print(state.shape)
+
+bits = bits[:,1::2,:]
+features = decoder.predict([bits, state], batch_size=batch_size)
+
+features.astype('float32').tofile(args.output)
--- a/dnn/training_tf2/dump_lpcnet.py
+++ b/dnn/training_tf2/dump_lpcnet.py
@@ -26,6 +26,7 @@
'''
import os
+import io
import lpcnet
import sys
import numpy as np
@@ -52,11 +53,17 @@
max_mdense_tmp = 1
def printVector(f, vector, name, dtype='float', dotp=False):
+ global array_list
if dotp:
vector = vector.reshape((vector.shape[0]//4, 4, vector.shape[1]//8, 8))
vector = vector.transpose((2, 0, 3, 1))
v = np.reshape(vector, (-1));
#print('static const float ', name, '[', len(v), '] = \n', file=f)
+    if name not in array_list:
+ array_list.append(name)
+        f.write('#ifndef USE_WEIGHTS_FILE\n')
+        f.write('#define WEIGHTS_{}_DEFINED\n'.format(name))
+        f.write('#define WEIGHTS_{}_TYPE WEIGHT_TYPE_{}\n'.format(name, dtype))
    f.write('static const {} {}[{}] = {{\n   '.format(dtype, name, len(v)))
    for i in range(0, len(v)):
        f.write('{}'.format(v[i]))
@@ -69,7 +76,8 @@
else:
f.write(" ")#print(v, file=f)
- f.write('\n};\n\n')+ f.write('\n};\n')+ f.write('#endif\n\n')return;
def printSparseVector(f, A, name, have_diag=True):
@@ -133,11 +141,11 @@
reset_after = 1
neurons = weights[0].shape[1]//3
max_rnn_neurons = max(max_rnn_neurons, neurons)
-    f.write('const SparseGRULayer {} = {{\n   {}_bias,\n   {}_subias,\n   {}_recurrent_weights_diag,\n   {}_recurrent_weights,\n   {}_recurrent_weights_idx,\n   {}, ACTIVATION_{}, {}\n}};\n\n'
-            .format(name, name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
-    hf.write('extern const SparseGRULayer {};\n\n'.format(name));
+    model_struct.write('  SparseGRULayer {};\n'.format(name));
+    model_init.write('  if (sparse_gru_init(&model->{}, arrays, "{}_bias", "{}_subias", "{}_recurrent_weights_diag", "{}_recurrent_weights", "{}_recurrent_weights_idx", {}, ACTIVATION_{}, {})) return 1;\n'
+            .format(name, name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
return True
def dump_grub(self, f, hf, gru_a_size):
@@ -169,9 +177,9 @@
reset_after = 1
neurons = weights[0].shape[1]//3
max_rnn_neurons = max(max_rnn_neurons, neurons)
-    f.write('const GRULayer {} = {{\n   {}_bias,\n   {}_subias,\n   {}_weights,\n   {}_weights_idx,\n   {}_recurrent_weights,\n   {}, {}, ACTIVATION_{}, {}\n}};\n\n'
+    model_struct.write('  GRULayer {};\n'.format(name));
+    model_init.write('  if (gru_init(&model->{}, arrays, "{}_bias", "{}_subias", "{}_weights", "{}_weights_idx", "{}_recurrent_weights", {}, {}, ACTIVATION_{}, {})) return 1;\n'
            .format(name, name, name, name, name, name, gru_a_size, weights[0].shape[1]//3, activation, reset_after))
-    hf.write('extern const GRULayer {};\n\n'.format(name));
return True
def dump_gru_layer_dummy(self, f, hf):
@@ -186,10 +194,10 @@
def dump_dense_layer_impl(name, weights, bias, activation, f, hf):
printVector(f, weights, name + '_weights')
printVector(f, bias, name + '_bias')
-    f.write('const DenseLayer {} = {{\n   {}_bias,\n   {}_weights,\n   {}, {}, ACTIVATION_{}\n}};\n\n'
-            .format(name, name, name, weights.shape[0], weights.shape[1], activation))
hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights.shape[1]))
-    hf.write('extern const DenseLayer {};\n\n'.format(name));
+    model_struct.write('  DenseLayer {};\n'.format(name));
+    model_init.write('  if (dense_init(&model->{}, arrays, "{}_bias", "{}_weights", {}, {}, ACTIVATION_{})) return 1;\n'
+            .format(name, name, name, weights.shape[0], weights.shape[1], activation))
def dump_dense_layer(self, f, hf):
name = self.name
@@ -211,10 +219,10 @@
printVector(f, np.transpose(weights[2], (1, 0)), name + '_factor')
activation = self.activation.__name__.upper()
max_mdense_tmp = max(max_mdense_tmp, weights[0].shape[0]*weights[0].shape[2])
-    f.write('const MDenseLayer {} = {{\n   {}_bias,\n   {}_weights,\n   {}_factor,\n   {}, {}, {}, ACTIVATION_{}\n}};\n\n'
-            .format(name, name, name, name, weights[0].shape[1], weights[0].shape[0], weights[0].shape[2], activation))
hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[0]))
-    hf.write('extern const MDenseLayer {};\n\n'.format(name));
+    model_struct.write('  MDenseLayer {};\n'.format(name));
+    model_init.write('  if (mdense_init(&model->{}, arrays, "{}_bias", "{}_weights", "{}_factor", {}, {}, {}, ACTIVATION_{})) return 1;\n'
+            .format(name, name, name, name, weights[0].shape[1], weights[0].shape[0], weights[0].shape[2], activation))
return False
MDense.dump_layer = dump_mdense_layer
@@ -227,12 +235,12 @@
printVector(f, weights[-1], name + '_bias')
activation = self.activation.__name__.upper()
max_conv_inputs = max(max_conv_inputs, weights[0].shape[1]*weights[0].shape[0])
-    f.write('const Conv1DLayer {} = {{\n   {}_bias,\n   {}_weights,\n   {}, {}, {}, ACTIVATION_{}\n}};\n\n'
-            .format(name, name, name, weights[0].shape[1], weights[0].shape[0], weights[0].shape[2], activation))
hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[2]))
hf.write('#define {}_STATE_SIZE ({}*{})\n'.format(name.upper(), weights[0].shape[1], (weights[0].shape[0]-1)))
hf.write('#define {}_DELAY {}\n'.format(name.upper(), (weights[0].shape[0]-1)//2))
-    hf.write('extern const Conv1DLayer {};\n\n'.format(name));
+    model_struct.write('  Conv1DLayer {};\n'.format(name));
+    model_init.write('  if (conv1d_init(&model->{}, arrays, "{}_bias", "{}_weights", {}, {}, {}, ACTIVATION_{})) return 1;\n'
+            .format(name, name, name, weights[0].shape[1], weights[0].shape[0], weights[0].shape[2], activation))
return True
Conv1D.dump_layer = dump_conv1d_layer
@@ -239,10 +247,10 @@
def dump_embedding_layer_impl(name, weights, f, hf):
printVector(f, weights, name + '_weights')
- f.write('const EmbeddingLayer {} = {{\n {}_weights,\n {}, {}\n}};\n\n'
- .format(name, name, weights.shape[0], weights.shape[1]))
hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights.shape[1]))
- hf.write('extern const EmbeddingLayer {};\n\n'.format(name));
+ model_struct.write(' EmbeddingLayer {};\n'.format(name));
+ model_init.write(' if (embedding_init(&model->{}, arrays, "{}_weights", {}, {})) return 1;\n'
+ .format(name, name, weights.shape[0], weights.shape[1]))
def dump_embedding_layer(self, f, hf):
name = self.name
@@ -281,6 +289,12 @@
f = open(cfile, 'w')
hf = open(hfile, 'w')
+ model_struct = io.StringIO()
+ model_init = io.StringIO()
+ model_struct.write('typedef struct {\n')
+ model_init.write('#ifndef DUMP_BINARY_WEIGHTS\n')
+ model_init.write('int init_lpcnet_model(LPCNetModel *model, const WeightArray *arrays) {\n')
+ array_list = []
f.write('/*This file is automatically generated from a Keras model*/\n')
f.write('/*based on model {}*/\n\n'.format(sys.argv[1]))
@@ -326,13 +340,13 @@
W = model.get_layer('gru_a').get_weights()[0][3*embed_size:,:]
#FIXME: dump only half the biases
b = model.get_layer('gru_a').get_weights()[2]
- dump_dense_layer_impl('gru_a_dense_feature', W, b, 'LINEAR', f, hf)
+ dump_dense_layer_impl('gru_a_dense_feature', W, b[:len(b)//2], 'LINEAR', f, hf)
W = model.get_layer('gru_b').get_weights()[0][model.rnn_units1:,:]
b = model.get_layer('gru_b').get_weights()[2]
# Set biases to zero because they'll be included in the GRU input part
# (we need regular and SU biases)
- dump_dense_layer_impl('gru_b_dense_feature', W, 0*b, 'LINEAR', f, hf)
+ dump_dense_layer_impl('gru_b_dense_feature', W, 0*b[:len(b)//2], 'LINEAR', f, hf)
dump_grub(model.get_layer('gru_b'), f, hf, model.rnn_units1)
layer_list = []
@@ -342,6 +356,19 @@
dump_sparse_gru(model.get_layer('gru_a'), f, hf)
+ f.write('#ifndef USE_WEIGHTS_FILE\n')
+ f.write('const WeightArray lpcnet_arrays[] = {\n')
+ for name in array_list:
+ f.write('#ifdef WEIGHTS_{}_DEFINED\n'.format(name))
+ f.write(' {{"{}", WEIGHTS_{}_TYPE, sizeof({}), {}}},\n'.format(name, name, name, name))
+ f.write('#endif\n')
+ f.write(' {NULL, 0, 0, NULL}\n};\n')
+ f.write('#endif\n')
+
+ model_init.write(' return 0;\n}\n')
+ model_init.write('#endif\n')
+ f.write(model_init.getvalue())
+
hf.write('#define MAX_RNN_NEURONS {}\n\n'.format(max_rnn_neurons))
hf.write('#define MAX_CONV_INPUTS {}\n\n'.format(max_conv_inputs))
hf.write('#define MAX_MDENSE_TMP {}\n\n'.format(max_mdense_tmp))
@@ -350,8 +377,11 @@
hf.write('typedef struct {\n')
for i, name in enumerate(layer_list):
hf.write(' float {}_state[{}_STATE_SIZE];\n'.format(name, name.upper()))
- hf.write('} NNetState;\n')
+ hf.write('} NNetState;\n\n')
+ model_struct.write('} LPCNetModel;\n\n')
+ hf.write(model_struct.getvalue())
+ hf.write('int init_lpcnet_model(LPCNetModel *model, const WeightArray *arrays);\n\n')
hf.write('\n\n#endif\n')
f.close()
--- a/dnn/training_tf2/dump_plc.py
+++ b/dnn/training_tf2/dump_plc.py
@@ -27,6 +27,7 @@
'''
import lpcnet_plc
+import io
import sys
import numpy as np
from tensorflow.keras.optimizers import Adam
@@ -41,11 +42,17 @@
max_conv_inputs = 1
def printVector(f, vector, name, dtype='float', dotp=False):
+ global array_list
if dotp:
vector = vector.reshape((vector.shape[0]//4, 4, vector.shape[1]//8, 8))
vector = vector.transpose((2, 0, 3, 1))
v = np.reshape(vector, (-1));
#print('static const float ', name, '[', len(v), '] = \n', file=f)
+ if name not in array_list:
+ array_list.append(name)
+ f.write('#ifndef USE_WEIGHTS_FILE\n')
+ f.write('#define WEIGHTS_{}_DEFINED\n'.format(name))
+ f.write('#define WEIGHTS_{}_TYPE WEIGHT_TYPE_{}\n'.format(name, dtype))
f.write('static const {} {}[{}] = {{\n '.format(dtype, name, len(v)))
for i in range(0, len(v)):
f.write('{}'.format(v[i]))
@@ -58,7 +65,8 @@
else:
f.write(" ")
#print(v, file=f)
- f.write('\n};\n\n')
+ f.write('\n};\n')
+ f.write('#endif\n\n')
return;
def printSparseVector(f, A, name, have_diag=True):
@@ -122,11 +130,11 @@
reset_after = 1
neurons = weights[0].shape[1]//3
max_rnn_neurons = max(max_rnn_neurons, neurons)
- f.write('const SparseGRULayer {} = {{\n {}_bias,\n {}_subias,\n {}_recurrent_weights_diag,\n {}_recurrent_weights,\n {}_recurrent_weights_idx,\n {}, ACTIVATION_{}, {}\n}};\n\n'
- .format(name, name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
- hf.write('extern const SparseGRULayer {};\n\n'.format(name));
+ model_struct.write(' SparseGRULayer {};\n'.format(name));
+ model_init.write(' if (sparse_gru_init(&model->{}, arrays, "{}_bias", "{}_subias", "{}_recurrent_weights_diag", "{}_recurrent_weights", "{}_recurrent_weights_idx", {}, ACTIVATION_{}, {})) return 1;\n'
+ .format(name, name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
return True
def dump_gru_layer(self, f, hf):
@@ -158,11 +166,11 @@
reset_after = 1
neurons = weights[0].shape[1]//3
max_rnn_neurons = max(max_rnn_neurons, neurons)
- f.write('const GRULayer {} = {{\n {}_bias,\n {}_subias,\n {}_weights,\n {}_weights_idx,\n {}_recurrent_weights,\n {}, {}, ACTIVATION_{}, {}\n}};\n\n'
- .format(name, name, name, name, name, name, weights[0].shape[0], weights[0].shape[1]//3, activation, reset_after))
hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
- hf.write('extern const GRULayer {};\n\n'.format(name));
+ model_struct.write(' GRULayer {};\n'.format(name));
+ model_init.write(' if (gru_init(&model->{}, arrays, "{}_bias", "{}_subias", "{}_weights", "{}_weights_idx", "{}_recurrent_weights", {}, {}, ACTIVATION_{}, {})) return 1;\n'
+ .format(name, name, name, name, name, name, weights[0].shape[0], weights[0].shape[1]//3, activation, reset_after))
return True
GRU.dump_layer = dump_gru_layer
@@ -178,10 +186,10 @@
def dump_dense_layer_impl(name, weights, bias, activation, f, hf):
printVector(f, weights, name + '_weights')
printVector(f, bias, name + '_bias')
- f.write('const DenseLayer {} = {{\n {}_bias,\n {}_weights,\n {}, {}, ACTIVATION_{}\n}};\n\n'
- .format(name, name, name, weights.shape[0], weights.shape[1], activation))
hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights.shape[1]))
- hf.write('extern const DenseLayer {};\n\n'.format(name));
+ model_struct.write(' DenseLayer {};\n'.format(name));
+ model_init.write(' if (dense_init(&model->{}, arrays, "{}_bias", "{}_weights", {}, {}, ACTIVATION_{})) return 1;\n'
+ .format(name, name, name, weights.shape[0], weights.shape[1], activation))
def dump_dense_layer(self, f, hf):
name = self.name
@@ -202,12 +210,12 @@
printVector(f, weights[-1], name + '_bias')
activation = self.activation.__name__.upper()
max_conv_inputs = max(max_conv_inputs, weights[0].shape[1]*weights[0].shape[0])
- f.write('const Conv1DLayer {} = {{\n {}_bias,\n {}_weights,\n {}, {}, {}, ACTIVATION_{}\n}};\n\n'
- .format(name, name, name, weights[0].shape[1], weights[0].shape[0], weights[0].shape[2], activation))
hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[2]))
hf.write('#define {}_STATE_SIZE ({}*{})\n'.format(name.upper(), weights[0].shape[1], (weights[0].shape[0]-1)))
hf.write('#define {}_DELAY {}\n'.format(name.upper(), (weights[0].shape[0]-1)//2))
- hf.write('extern const Conv1DLayer {};\n\n'.format(name));
+ model_struct.write(' Conv1DLayer {};\n'.format(name));
+ model_init.write(' if (conv1d_init(&model->{}, arrays, "{}_bias", "{}_weights", {}, {}, {}, ACTIVATION_{})) return 1;\n'
+ .format(name, name, name, weights[0].shape[1], weights[0].shape[0], weights[0].shape[2], activation))
return True
Conv1D.dump_layer = dump_conv1d_layer
@@ -235,6 +243,12 @@
f = open(cfile, 'w')
hf = open(hfile, 'w')
+model_struct = io.StringIO()
+model_init = io.StringIO()
+model_struct.write('typedef struct {\n')
+model_init.write('#ifndef DUMP_BINARY_WEIGHTS\n')
+model_init.write('int init_plc_model(PLCModel *model, const WeightArray *arrays) {\n')
+array_list = []
f.write('/*This file is automatically generated from a Keras model*/\n')
@@ -250,7 +264,20 @@
layer_list.append(layer.name)
#dump_sparse_gru(model.get_layer('gru_a'), f, hf)
+f.write('#ifndef USE_WEIGHTS_FILE\n')
+f.write('const WeightArray lpcnet_plc_arrays[] = {\n')
+for name in array_list:
+ f.write('#ifdef WEIGHTS_{}_DEFINED\n'.format(name))
+ f.write(' {{"{}", WEIGHTS_{}_TYPE, sizeof({}), {}}},\n'.format(name, name, name, name))
+ f.write('#endif\n')
+f.write(' {NULL, 0, 0, NULL}\n};\n')
+f.write('#endif\n')
+model_init.write(' return 0;\n}\n')
+model_init.write('#endif\n')
+f.write(model_init.getvalue())
+
+
hf.write('#define PLC_MAX_RNN_NEURONS {}\n\n'.format(max_rnn_neurons))
#hf.write('#define PLC_MAX_CONV_INPUTS {}\n\n'.format(max_conv_inputs))
@@ -257,7 +284,11 @@
hf.write('typedef struct {\n')
for i, name in enumerate(layer_list):
hf.write(' float {}_state[{}_STATE_SIZE];\n'.format(name, name.upper()))
-hf.write('} PLCNetState;\n')
+hf.write('} PLCNetState;\n\n')
+
+model_struct.write('} PLCModel;\n\n')
+hf.write(model_struct.getvalue())
+hf.write('int init_plc_model(PLCModel *model, const WeightArray *arrays);\n\n')
hf.write('\n\n#endif\n')
--- /dev/null
+++ b/dnn/training_tf2/dump_rdovae.py
@@ -1,0 +1,306 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+
+import argparse
+from ftplib import parse150
+import os
+
+os.environ['CUDA_VISIBLE_DEVICES'] = ""
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('weights', metavar="<weight file>", type=str, help='model weight file in hdf5 format')+parser.add_argument('--cond-size', type=int, help="conditioning size (default: 256)", default=256)+parser.add_argument('--latent-dim', type=int, help="dimension of latent space (default: 80)", default=80)+parser.add_argument('--quant-levels', type=int, help="number of quantization steps (default: 16)", default=16)+
+args = parser.parse_args()
+
+# now import the heavy stuff
+import tensorflow as tf
+import numpy as np
+from keraslayerdump import dump_conv1d_layer, dump_dense_layer, dump_gru_layer, printVector
+from rdovae import new_rdovae_model
+
+def start_header(header_fid, header_name):
+ header_guard = os.path.basename(header_name)[:-2].upper() + "_H"
+ header_fid.write(
+f"""
+#ifndef {header_guard}
+#define {header_guard}
+
+"""
+ )
+
+def finish_header(header_fid):
+ header_fid.write(
+"""
+#endif
+
+"""
+ )
+
+def start_source(source_fid, header_name, weight_file):
+ source_fid.write(
+f"""
+/* this source file was automatically generated from weight file {weight_file} */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "{header_name}"+
+"""
+ )
+
+def finish_source(source_fid):
+ pass
+
+
+def dump_statistical_model(qembedding, f, fh):
+ w = qembedding.weights[0].numpy()
+ levels, dim = w.shape
+ N = dim // 6
+
+ print("dumping statistical model")+ quant_scales = tf.math.softplus(w[:, : N]).numpy()
+ dead_zone = 0.05 * tf.math.softplus(w[:, N : 2 * N]).numpy()
+ r = tf.math.sigmoid(w[:, 5 * N : 6 * N]).numpy()
+ p0 = tf.math.sigmoid(w[:, 4 * N : 5 * N]).numpy()
+ p0 = 1 - r ** (0.5 + 0.5 * p0)
+
+ quant_scales_q8 = np.round(quant_scales * 2**8).astype(np.uint16)
+ dead_zone_q10 = np.round(dead_zone * 2**10).astype(np.uint16)
+ r_q15 = np.round(r * 2**15).astype(np.uint16)
+ p0_q15 = np.round(p0 * 2**15).astype(np.uint16)
+
+ printVector(f, quant_scales_q8, 'dred_quant_scales_q8', dtype='opus_uint16', static=False)
+ printVector(f, dead_zone_q10, 'dred_dead_zone_q10', dtype='opus_uint16', static=False)
+ printVector(f, r_q15, 'dred_r_q15', dtype='opus_uint16', static=False)
+ printVector(f, p0_q15, 'dred_p0_q15', dtype='opus_uint16', static=False)
+
+ fh.write(
+f"""
+extern const opus_uint16 dred_quant_scales_q8[{levels * N}];
+extern const opus_uint16 dred_dead_zone_q10[{levels * N}];
+extern const opus_uint16 dred_r_q15[{levels * N}];
+extern const opus_uint16 dred_p0_q15[{levels * N}];
+
+"""
+ )
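
For reference, the Q8/Q10/Q15 arrays written above are plain fixed-point versions of the softplus/sigmoid outputs. A minimal sketch of the inverse mapping (an illustrative helper, not part of this patch):

import numpy as np

def dequantize_dred_tables(quant_scales_q8, dead_zone_q10, r_q15, p0_q15):
    # Undo the fixed-point scaling used by dump_statistical_model() above.
    quant_scales = quant_scales_q8.astype(np.float32) / 2**8
    dead_zone = dead_zone_q10.astype(np.float32) / 2**10
    r = r_q15.astype(np.float32) / 2**15
    p0 = p0_q15.astype(np.float32) / 2**15
    return quant_scales, dead_zone, r, p0
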
+
+if __name__ == "__main__":
+
+ model, encoder, decoder, qembedding = new_rdovae_model(20, args.latent_dim, cond_size=args.cond_size, nb_quant=args.quant_levels)
+ model.load_weights(args.weights)
+
+
+
+
+ # encoder
+ encoder_dense_names = [
+ 'enc_dense1',
+ 'enc_dense3',
+ 'enc_dense5',
+ 'enc_dense7',
+ 'enc_dense8',
+ 'gdense1',
+ 'gdense2'
+ ]
+
+ encoder_gru_names = [
+ 'enc_dense2',
+ 'enc_dense4',
+ 'enc_dense6'
+ ]
+
+ encoder_conv1d_names = [
+ 'bits_dense'
+ ]
+
+ source_fid = open("dred_rdovae_enc_data.c", 'w')+ header_fid = open("dred_rdovae_enc_data.h", 'w')+
+ start_header(header_fid, "dred_rdovae_enc_data.h")
+ start_source(source_fid, "dred_rdovae_enc_data.h", os.path.basename(args.weights))
+
+ header_fid.write(
+f"""
+#include "dred_rdovae_constants.h"
+
+#include "nnet.h"
+"""
+ )
+
+ # dump GRUs
+ max_rnn_neurons_enc = max(
+ [
+ dump_gru_layer(encoder.get_layer(name), source_fid, header_fid, dotp=True, sparse=True)
+ for name in encoder_gru_names
+ ]
+ )
+
+ # dump conv layers
+ max_conv_inputs = max(
+ [
+ dump_conv1d_layer(encoder.get_layer(name), source_fid, header_fid)
+ for name in encoder_conv1d_names
+ ]
+ )
+
+ # dump Dense layers
+ for name in encoder_dense_names:
+ layer = encoder.get_layer(name)
+ dump_dense_layer(layer, source_fid, header_fid)
+
+ # some global constants
+ header_fid.write(
+f"""
+
+#define DRED_ENC_MAX_RNN_NEURONS {max_rnn_neurons_enc}
+
+#define DRED_ENC_MAX_CONV_INPUTS {max_conv_inputs}
+
+"""
+ )
+
+ finish_header(header_fid)
+ finish_source(source_fid)
+
+ header_fid.close()
+ source_fid.close()
+
+ # statistical model
+ source_fid = open("dred_rdovae_stats_data.c", 'w')+ header_fid = open("dred_rdovae_stats_data.h", 'w')+
+ start_header(header_fid, "dred_rdovae_stats_data.h")
+ start_source(source_fid, "dred_rdovae_stats_data.h", os.path.basename(args.weights))
+
+ header_fid.write(
+"""
+
+#include "opus_types.h"
+
+"""
+ )
+
+ dump_statistical_model(qembedding, source_fid, header_fid)
+
+ finish_header(header_fid)
+ finish_source(source_fid)
+
+ header_fid.close()
+ source_fid.close()
+
+ # decoder
+ decoder_dense_names = [
+ 'state1',
+ 'state2',
+ 'state3',
+ 'dec_dense1',
+ 'dec_dense3',
+ 'dec_dense5',
+ 'dec_dense7',
+ 'dec_dense8',
+ 'dec_final'
+ ]
+
+ decoder_gru_names = [
+ 'dec_dense2',
+ 'dec_dense4',
+ 'dec_dense6'
+ ]
+
+ source_fid = open("dred_rdovae_dec_data.c", 'w')+ header_fid = open("dred_rdovae_dec_data.h", 'w')+
+ start_header(header_fid, "dred_rdovae_dec_data.h")
+ start_source(source_fid, "dred_rdovae_dec_data.h", os.path.basename(args.weights))
+
+ header_fid.write(
+f"""
+#include "dred_rdovae_constants.h"
+
+#include "nnet.h"
+"""
+ )
+
+
+ # dump GRUs
+ max_rnn_neurons_dec = max(
+ [
+ dump_gru_layer(decoder.get_layer(name), source_fid, header_fid, dotp=True, sparse=True)
+ for name in decoder_gru_names
+ ]
+ )
+
+ # dump Dense layers
+ for name in decoder_dense_names:
+ layer = decoder.get_layer(name)
+ dump_dense_layer(layer, source_fid, header_fid)
+
+ # some global constants
+ header_fid.write(
+f"""
+
+#define DRED_DEC_MAX_RNN_NEURONS {max_rnn_neurons_dec}
+
+"""
+ )
+
+ finish_header(header_fid)
+ finish_source(source_fid)
+
+ header_fid.close()
+ source_fid.close()
+
+ # common constants
+ header_fid = open("dred_rdovae_constants.h", 'w')+ start_header(header_fid, "dred_rdovae_constants.h")
+
+ header_fid.write(
+f"""
+#define DRED_NUM_FEATURES 20
+
+#define DRED_LATENT_DIM {args.latent_dim}
+
+#define DRED_STATE_DIM {24}
+
+#define DRED_NUM_QUANTIZATION_LEVELS {qembedding.weights[0].shape[0]}
+
+#define DRED_MAX_RNN_NEURONS {max(max_rnn_neurons_enc, max_rnn_neurons_dec)}
+
+#define DRED_MAX_CONV_INPUTS {max_conv_inputs}
+"""
+ )
+
+ finish_header(header_fid)
\ No newline at end of file
--- /dev/null
+++ b/dnn/training_tf2/encode_rdovae.py
@@ -1,0 +1,125 @@
+#!/usr/bin/python3
+'''Copyright (c) 2021-2022 Amazon
+ Copyright (c) 2018-2019 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+# Train an LPCNet model
+
+import argparse
+#from plc_loader import PLCLoader
+
+parser = argparse.ArgumentParser(description='Train a PLC model')
+
+parser.add_argument('features', metavar='<features file>', help='binary features file (float32)')
+parser.add_argument('output', metavar='<output>', help='trained model file (.h5)')
+parser.add_argument('--model', metavar='<model>', default='rdovae', help='PLC model python definition (without .py)')
+group1 = parser.add_mutually_exclusive_group()
+group1.add_argument('--weights', metavar='<input weights>', help='model weights')
+parser.add_argument('--cond-size', metavar='<units>', default=1024, type=int, help='number of units in conditioning network (default 1024)')
+parser.add_argument('--batch-size', metavar='<batch size>', default=1, type=int, help='batch size to use (default 128)')
+parser.add_argument('--seq-length', metavar='<sequence length>', default=1000, type=int, help='sequence length to use (default 1000)')
+
+
+args = parser.parse_args()
+
+import importlib
+rdovae = importlib.import_module(args.model)
+
+from rdovae import apply_dead_zone
+
+import sys
+import numpy as np
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
+import tensorflow.keras.backend as K
+import h5py
+
+import tensorflow as tf
+from rdovae import pvq_quantize
+
+# Try reducing batch_size if you run out of memory on your GPU
+batch_size = args.batch_size
+
+model, encoder, decoder, qembedding = rdovae.new_rdovae_model(nb_used_features=20, nb_bits=80, batch_size=batch_size, cond_size=args.cond_size)
+model.load_weights(args.weights)
+
+lpc_order = 16
+
+feature_file = args.features
+nb_features = model.nb_used_features + lpc_order
+nb_used_features = model.nb_used_features
+sequence_size = args.seq_length
+
+# u for unquantised, load 16 bit PCM samples and convert to mu-law
+
+
+features = np.memmap(feature_file, dtype='float32', mode='r')
+nb_sequences = len(features)//(nb_features*sequence_size)//batch_size*batch_size
+features = features[:nb_sequences*sequence_size*nb_features]
+
+features = np.reshape(features, (nb_sequences, sequence_size, nb_features))
+print(features.shape)
+features = features[:, :, :nb_used_features]
+#features = np.random.randn(73600, 1000, 17)
+
+
+bits, gru_state_dec = encoder.predict([features], batch_size=batch_size)
+(gru_state_dec).astype('float32').tofile(args.output + "-state.f32")
+
+
+#dist = rdovae.feat_dist_loss(features, quant_out)
+#rate = rdovae.sq1_rate_loss(features, model_bits)
+#rate2 = rdovae.sq_rate_metric(features, model_bits)
+#print(dist, rate, rate2)
+
+print("shapes are:")+print(bits.shape)
+print(gru_state_dec.shape)
+
+features.astype('float32').tofile(args.output + "-input.f32")+#quant_out.astype('float32').tofile(args.output + "-enc_dec.f32")+nbits=80
+bits.astype('float32').tofile(args.output + "-syms.f32")+
+lambda_val = 0.0002 * np.ones((nb_sequences, sequence_size//2, 1))
+quant_id = np.round(3.8*np.log(lambda_val/.0002)).astype('int16')+quant_id = quant_id[:,:,0]
+quant_embed = qembedding(quant_id)
+quant_scale = tf.math.softplus(quant_embed[:,:,:nbits])
+dead_zone = tf.math.softplus(quant_embed[:, :, nbits : 2 * nbits])
+
+bits = bits*quant_scale
+bits = np.round(apply_dead_zone([bits, dead_zone]).numpy())
+bits = bits/quant_scale
+
+gru_state_dec = pvq_quantize(gru_state_dec, 82)
+#gru_state_dec = gru_state_dec/(1e-15+tf.norm(gru_state_dec, axis=-1,keepdims=True))
+gru_state_dec = gru_state_dec[:,-1,:]
+dec_out = decoder([bits[:,1::2,:], gru_state_dec])
+
+print(dec_out.shape)
+
+dec_out.numpy().astype('float32').tofile(args.output + "-quant_out.f32")
--- /dev/null
+++ b/dnn/training_tf2/fec_encoder.py
@@ -1,0 +1,257 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe and Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+import os
+import subprocess
+import argparse
+
+
+import numpy as np
+from scipy.io import wavfile
+import tensorflow as tf
+
+from rdovae import new_rdovae_model, pvq_quantize, apply_dead_zone, sq_rate_metric
+from fec_packets import write_fec_packets, read_fec_packets
+
+
+debug = False
+
+if debug:
+ args = type('dummy', (object,),
+ {
+ 'input' : 'item1.wav',
+ 'weights' : 'testout/rdovae_alignment_fix_1024_120.h5',
+ 'enc_lambda' : 0.0007,
+ 'output' : "test_0007.fec",
+ 'cond_size' : 1024,
+ 'num_redundancy_frames' : 64,
+ 'extra_delay' : 0,
+ 'dump_data' : './dump_data'
+ })()
+ os.environ['CUDA_VISIBLE_DEVICES']=""
+else:
+ parser = argparse.ArgumentParser(description='Encode redundancy for Opus neural FEC. Designed for use with voip application and 20ms frames')
+
+ parser.add_argument('input', metavar='<input signal>', help='audio input (.wav or .raw or .pcm as int16)')
+ parser.add_argument('weights', metavar='<weights>', help='trained model file (.h5)')
+# parser.add_argument('enc_lambda', metavar='<lambda>', type=float, help='lambda for controlling encoder rate')
+ parser.add_argument('output', type=str, help='output file (will be extended with .fec)')
+
+ parser.add_argument('--dump-data', type=str, default='./dump_data', help='path to dump data executable (default ./dump_data)')
+ parser.add_argument('--cond-size', metavar='<units>', default=1024, type=int, help='number of units in conditioning network (default 1024)')
+ parser.add_argument('--quant-levels', type=int, help="number of quantization steps (default: 40)", default=40)
+ parser.add_argument('--num-redundancy-frames', default=64, type=int, help='number of redundancy frames (20ms) per packet (default 64)')
+ parser.add_argument('--extra-delay', default=0, type=int, help="last features in packet are calculated with the decoder aligned samples, use this option to add extra delay (in samples at 16kHz)")
+ parser.add_argument('--lossfile', type=str, help='file containing loss trace (0 for frame received, 1 for lost)')
+
+ parser.add_argument('--debug-output', action='store_true', help='if set, differently assembled features are written to disk')
+
+ args = parser.parse_args()
+
+model, encoder, decoder, qembedding = new_rdovae_model(nb_used_features=20, nb_bits=80, batch_size=1, nb_quant=args.quant_levels, cond_size=args.cond_size)
+model.load_weights(args.weights)
+
+lpc_order = 16
+
+## prepare input signal
+# SILK frame size is 20ms and LPCNet subframes are 10ms
+subframe_size = 160
+frame_size = 2 * subframe_size
+
+# 91 samples delay to align with SILK decoded frames
+silk_delay = 91
+
+# prepend zeros to have enough history to produce the first package
+zero_history = (args.num_redundancy_frames - 1) * frame_size
+
+# dump data has a (feature) delay of 10ms
+dump_data_delay = 160
+
+total_delay = silk_delay + zero_history + args.extra_delay - dump_data_delay
+
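
With the default settings (num_redundancy_frames=64, extra_delay=0), the delay bookkeeping above works out as follows (worked example only):

subframe_size = 160
frame_size = 2 * subframe_size              # 320 samples = 20 ms at 16 kHz
zero_history = (64 - 1) * frame_size        # 20160 samples of prepended zeros
total_delay = 91 + zero_history + 0 - 160   # silk_delay + zero_history + extra_delay - dump_data_delay = 20091
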
+# load signal
+if args.input.endswith('.raw') or args.input.endswith('.pcm') or args.input.endswith('.sw'):
+ signal = np.fromfile(args.input, dtype='int16')
+
+elif args.input.endswith('.wav'):
+ fs, signal = wavfile.read(args.input)
+else:
+ raise ValueError(f'unknown input signal format: {args.input}')
+
+# fill up last frame with zeros
+padded_signal_length = len(signal) + total_delay
+tail = padded_signal_length % frame_size
+right_padding = (frame_size - tail) % frame_size
+
+signal = np.concatenate((np.zeros(total_delay, dtype=np.int16), signal, np.zeros(right_padding, dtype=np.int16)))
+
+padded_signal_file = os.path.splitext(args.input)[0] + '_padded.raw'
+signal.tofile(padded_signal_file)
+
+# write signal and call dump_data to create features
+
+feature_file = os.path.splitext(args.input)[0] + '_features.f32'
+command = f"{args.dump_data} -test {padded_signal_file} {feature_file}"+r = subprocess.run(command, shell=True)
+if r.returncode != 0:
+ raise RuntimeError(f"command '{command}' failed with exit code {r.returncode}")+
+# load features
+nb_features = model.nb_used_features + lpc_order
+nb_used_features = model.nb_used_features
+
+# load features
+features = np.fromfile(feature_file, dtype='float32')
+num_subframes = len(features) // nb_features
+num_subframes = 2 * (num_subframes // 2)
+num_frames = num_subframes // 2
+
+features = np.reshape(features, (1, -1, nb_features))
+features = features[:, :, :nb_used_features]
+features = features[:, :num_subframes, :]
+
+#variable quantizer depending on the delay
+q0 = 3
+q1 = 15
+quant_id = np.round(q1 + (q0-q1)*np.arange(args.num_redundancy_frames//2)/args.num_redundancy_frames).astype('int16')
+#print(quant_id)
+
+quant_embed = qembedding(quant_id)
+
+# run encoder
+print("running fec encoder...")
+symbols, gru_state_dec = encoder.predict(features)
+
+# apply quantization
+nsymbols = 80
+quant_scale = tf.math.softplus(quant_embed[:, :nsymbols]).numpy()
+dead_zone = tf.math.softplus(quant_embed[:, nsymbols : 2 * nsymbols]).numpy()
+#symbols = apply_dead_zone([symbols, dead_zone]).numpy()
+#qsymbols = np.round(symbols)
+quant_gru_state_dec = pvq_quantize(gru_state_dec, 82)
+
+# rate estimate
+hard_distr_embed = tf.math.sigmoid(quant_embed[:, 4 * nsymbols : ]).numpy()
+#rate_input = np.concatenate((qsymbols, hard_distr_embed, enc_lambda), axis=-1)
+#rates = sq_rate_metric(None, rate_input, reduce=False).numpy()
+
+# run decoder
+input_length = args.num_redundancy_frames // 2
+offset = args.num_redundancy_frames - 1
+
+packets = []
+packet_sizes = []
+
+sym_batch = np.zeros((num_frames-offset, args.num_redundancy_frames//2, nsymbols), dtype='float32')
+quant_state = quant_gru_state_dec[0, offset:num_frames, :]
+#pack symbols for batch processing
+for i in range(offset, num_frames):
+ sym_batch[i-offset, :, :] = symbols[0, i - 2 * input_length + 2 : i + 1 : 2, :]
+
+#quantize symbols
+sym_batch = sym_batch * quant_scale
+sym_batch = apply_dead_zone([sym_batch, dead_zone]).numpy()
+sym_batch = np.round(sym_batch)
+
+hard_distr_embed = np.broadcast_to(hard_distr_embed, (sym_batch.shape[0], sym_batch.shape[1], 2*sym_batch.shape[2]))
+fake_lambda = np.ones((sym_batch.shape[0], sym_batch.shape[1], 1), dtype='float32')
+rate_input = np.concatenate((sym_batch, hard_distr_embed, fake_lambda), axis=-1)
+rates = sq_rate_metric(None, rate_input, reduce=False).numpy()
+#print(rates.shape)
+print("average rate = ", np.mean(rates[args.num_redundancy_frames:,:]))+
+#sym_batch.tofile('qsyms.f32')+
+sym_batch = sym_batch / quant_scale
+#print(sym_batch.shape, quant_state.shape)
+#features = decoder.predict([sym_batch, quant_state])
+features = decoder([sym_batch, quant_state])
+
+#for i in range(offset, num_frames):
+# print(f"processing frame {i - offset}...")+# features = decoder.predict([qsymbols[:, i - 2 * input_length + 2 : i + 1 : 2, :], quant_embed_dec[:, i - 2 * input_length + 2 : i + 1 : 2, :], quant_gru_state_dec[:, i, :]])
+# packets.append(features)
+# packet_size = 8 * int((np.sum(rates[:, i - 2 * input_length + 2 : i + 1 : 2]) + 7) / 8) + 64
+# packet_sizes.append(packet_size)
+
+
+# write packets
+packet_file = args.output + '.fec' if not args.output.endswith('.fec') else args.output
+#write_fec_packets(packet_file, packets, packet_sizes)
+
+
+#print(f"average redundancy rate: {int(round(sum(packet_sizes) / len(packet_sizes) * 50 / 1000))} kbps")
+
+if args.lossfile != None:
+ loss = np.loadtxt(args.lossfile, dtype='int16')
+ fec_out = np.zeros((features.shape[0]*2, features.shape[-1]), dtype='float32')
+ foffset = -2
+ ptr = 0;
+ count = 2;
+ for i in range(features.shape[0]):
+ if (loss[i] == 0) or (i == features.shape[0]-1):
+ fec_out[ptr:ptr+count,:] = features[i, foffset:, :]
+ #print("filled ", count)+ foffset = -2
+ ptr = ptr+count
+ count = 2
+ else:
+ count = count + 2
+ foffset = foffset - 2
+
+ fec_out_full = np.zeros((fec_out.shape[0], nb_features), dtype=np.float32)
+ fec_out_full[:, :nb_used_features] = fec_out
+
+ fec_out_full.tofile(packet_file[:-4] + f'_fec.f32')
+
+
+#create packets array like in the original version for debugging purposes
+for i in range(offset, num_frames):
+ packets.append(features[i-offset:i-offset+1, :, :])
+
+if args.debug_output:
+ import itertools
+
+ #batches = [2, 4]
+ batches = [4]
+ #offsets = [0, 4, 20]
+ offsets = [0, (args.num_redundancy_frames - 2)*2]
+ # sanity checks
+ # 1. concatenate features at offset 0
+ for batch, offset in itertools.product(batches, offsets):
+
+ stop = packets[0].shape[1] - offset
+ print(batch, offset, stop)
+ test_features = np.concatenate([packet[:,stop - batch: stop, :] for packet in packets[::batch//2]], axis=1)
+
+ test_features_full = np.zeros((test_features.shape[1], nb_features), dtype=np.float32)
+ test_features_full[:, :nb_used_features] = test_features[0, :, :]
+
+ print(f"writing debug output {packet_file[:-4] + f'_tf_batch{batch}_offset{offset}.f32'}")+ test_features_full.tofile(packet_file[:-4] + f'_tf_batch{batch}_offset{offset}.f32')+
--- /dev/null
+++ b/dnn/training_tf2/fec_packets.c
@@ -1,0 +1,142 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "fec_packets.h"
+
+int get_fec_frame(const char * const filename, float *features, int packet_index, int subframe_index)
+{
+
+ int16_t version;
+ int16_t header_size;
+ int16_t num_packets;
+ int16_t packet_size;
+ int16_t subframe_size;
+ int16_t subframes_per_packet;
+ int16_t num_features;
+ long offset;
+
+ FILE *fid = fopen(filename, "rb");
+
+ /* read header */
+ if (fread(&version, sizeof(version), 1, fid) != 1) goto error;
+ if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error;
+ if (fread(&num_packets, sizeof(num_packets), 1, fid) != 1) goto error;
+ if (fread(&packet_size, sizeof(packet_size), 1, fid) != 1) goto error;
+ if (fread(&subframe_size, sizeof(subframe_size), 1, fid) != 1) goto error;
+ if (fread(&subframes_per_packet, sizeof(subframes_per_packet), 1, fid) != 1) goto error;
+ if (fread(&num_features, sizeof(num_features), 1, fid) != 1) goto error;
+
+ /* check if indices are valid */
+ if (packet_index >= num_packets || subframe_index >= subframes_per_packet)
+ {+ fprintf(stderr, "get_fec_frame: index out of bounds\n");
+ goto error;
+ }
+
+ /* calculate offset in file (+ 2 is for rate) */
+ offset = header_size + packet_index * packet_size + 2 + subframe_index * subframe_size;
+ fseek(fid, offset, SEEK_SET);
+
+ /* read features */
+ if (fread(features, sizeof(*features), num_features, fid) != num_features) goto error;
+
+ fclose(fid);
+ return 0;
+
+error:
+ fclose(fid);
+ return 1;
+}
+
+int get_fec_rate(const char * const filename, int packet_index)
+{
+ int16_t version;
+ int16_t header_size;
+ int16_t num_packets;
+ int16_t packet_size;
+ int16_t subframe_size;
+ int16_t subframes_per_packet;
+ int16_t num_features;
+ long offset;
+ int16_t rate;
+
+ FILE *fid = fopen(filename, "rb");
+
+ /* read header */
+ if (fread(&version, sizeof(version), 1, fid) != 1) goto error;
+ if (fread(&header_size, sizeof(header_size), 1, fid) != 1) goto error;
+ if (fread(&num_packets, sizeof(num_packets), 1, fid) != 1) goto error;
+ if (fread(&packet_size, sizeof(packet_size), 1, fid) != 1) goto error;
+ if (fread(&subframe_size, sizeof(subframe_size), 1, fid) != 1) goto error;
+ if (fread(&subframes_per_packet, sizeof(subframes_per_packet), 1, fid) != 1) goto error;
+ if (fread(&num_features, sizeof(num_features), 1, fid) != 1) goto error;
+
+ /* check if indices are valid */
+ if (packet_index >= num_packets)
+ {+ fprintf(stderr, "get_fec_rate: index out of bounds\n");
+ goto error;
+ }
+
+ /* calculate offset in file (+ 2 is for rate) */
+ offset = header_size + packet_index * packet_size;
+ fseek(fid, offset, SEEK_SET);
+
+ /* read rate */
+ if (fread(&rate, sizeof(rate), 1, fid) != 1) goto error;
+
+ fclose(fid);
+ return (int) rate;
+
+error:
+ fclose(fid);
+ return -1;
+}
+
+#if 0
+int main()
+{
+ float features[20];
+ int i;
+
+ if (get_fec_frame("../test.fec", &features[0], 0, 127))+ {+ return 1;
+ }
+
+ for (i = 0; i < 20; i ++)
+ {+ printf("%d %f\n", i, features[i]);+ }
+
+ printf("rate: %d\n", get_fec_rate("../test.fec", 0));+
+}
+#endif
\ No newline at end of file
--- /dev/null
+++ b/dnn/training_tf2/fec_packets.h
@@ -1,0 +1,34 @@
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef _FEC_PACKETS_H
+#define _FEC_PACKETS_H
+
+int get_fec_frame(const char * const filename, float *features, int packet_index, int subframe_index);
+int get_fec_rate(const char * const filename, int packet_index);
+
+#endif
\ No newline at end of file
--- /dev/null
+++ b/dnn/training_tf2/fec_packets.py
@@ -1,0 +1,108 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+import numpy as np
+
+
+
+def write_fec_packets(filename, packets, rates=None):
+ """ writes packets in binary format """
+
+ assert np.dtype(np.float32).itemsize == 4
+ assert np.dtype(np.int16).itemsize == 2
+
+ # derive some sizes
+ num_packets = len(packets)
+ subframes_per_packet = packets[0].shape[-2]
+ num_features = packets[0].shape[-1]
+
+ # size of float is 4
+ subframe_size = num_features * 4
+ packet_size = subframe_size * subframes_per_packet + 2 # two bytes for rate
+
+ version = 1
+ # header size (version, header_size, num_packets, packet_size, subframe_size, subframes_per_packet, num_features)
+ header_size = 14
+
+ with open(filename, 'wb') as f:
+
+ # header
+ f.write(np.int16(version).tobytes())
+ f.write(np.int16(header_size).tobytes())
+ f.write(np.int16(num_packets).tobytes())
+ f.write(np.int16(packet_size).tobytes())
+ f.write(np.int16(subframe_size).tobytes())
+ f.write(np.int16(subframes_per_packet).tobytes())
+ f.write(np.int16(num_features).tobytes())
+
+ # packets
+ for i, packet in enumerate(packets):
+ if type(rates) == type(None):
+ rate = 0
+ else:
+ rate = rates[i]
+
+ f.write(np.int16(rate).tobytes())
+
+ features = np.flip(packet, axis=-2)
+ f.write(features.astype(np.float32).tobytes())
+
+
+def read_fec_packets(filename):
+ """ reads packets from binary format """
+
+ assert np.dtype(np.float32).itemsize == 4
+ assert np.dtype(np.int16).itemsize == 2
+
+ with open(filename, 'rb') as f:
+
+ # header
+ version = np.frombuffer(f.read(2), dtype=np.int16).item()
+ header_size = np.frombuffer(f.read(2), dtype=np.int16).item()
+ num_packets = np.frombuffer(f.read(2), dtype=np.int16).item()
+ packet_size = np.frombuffer(f.read(2), dtype=np.int16).item()
+ subframe_size = np.frombuffer(f.read(2), dtype=np.int16).item()
+ subframes_per_packet = np.frombuffer(f.read(2), dtype=np.int16).item()
+ num_features = np.frombuffer(f.read(2), dtype=np.int16).item()
+
+ dummy_features = np.zeros((1, subframes_per_packet, num_features), dtype=np.float32)
+
+ # packets
+ rates = []
+ packets = []
+ for i in range(num_packets):
+
+ rate = np.frombuffer(f.read(2), dtype=np.int16).item()
+ rates.append(rate)
+
+ features = np.reshape(np.frombuffer(f.read(subframe_size * subframes_per_packet), dtype=np.float32), dummy_features.shape)
+ packet = np.flip(features, axis=-2)
+ packets.append(packet)
+
+ return packets
\ No newline at end of file
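
A minimal round-trip sketch for the packet format above (the shapes and rates are made up for illustration; they are not values used elsewhere in this patch):

import numpy as np
from fec_packets import write_fec_packets, read_fec_packets

packets = [np.random.randn(1, 4, 20).astype(np.float32) for _ in range(3)]
rates = [120, 140, 160]
write_fec_packets('example.fec', packets, rates)
decoded = read_fec_packets('example.fec')
assert len(decoded) == len(packets) and decoded[0].shape == packets[0].shape
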
--- /dev/null
+++ b/dnn/training_tf2/keraslayerdump.py
@@ -1,0 +1,189 @@
+'''Copyright (c) 2017-2018 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+""" helper functions for dumping some Keras layers to C files """
+
+import numpy as np
+
+
+def printVector(f, vector, name, dtype='float', dotp=False, static=True):
+ """ prints vector as one-dimensional C array """
+ if dotp:
+ vector = vector.reshape((vector.shape[0]//4, 4, vector.shape[1]//8, 8))
+ vector = vector.transpose((2, 0, 3, 1))
+ v = np.reshape(vector, (-1))
+ if static:
+ f.write('static const {} {}[{}] = {{\n '.format(dtype, name, len(v)))
+ else:
+ f.write('const {} {}[{}] = {{\n '.format(dtype, name, len(v)))
+ for i in range(0, len(v)):
+ f.write('{}'.format(v[i]))
+ if (i!=len(v)-1):
+ f.write(',')
+ else:
+ break;
+ if (i%8==7):
+ f.write("\n ")
+ else:
+ f.write(" ")
+ f.write('\n};\n\n')
+ return vector
+
+def printSparseVector(f, A, name, have_diag=True):
+ N = A.shape[0]
+ M = A.shape[1]
+ W = np.zeros((0,), dtype='int')
+ W0 = np.zeros((0,))
+ if have_diag:
+ diag = np.concatenate([np.diag(A[:,:N]), np.diag(A[:,N:2*N]), np.diag(A[:,2*N:])])
+ A[:,:N] = A[:,:N] - np.diag(np.diag(A[:,:N]))
+ A[:,N:2*N] = A[:,N:2*N] - np.diag(np.diag(A[:,N:2*N]))
+ A[:,2*N:] = A[:,2*N:] - np.diag(np.diag(A[:,2*N:]))
+ printVector(f, diag, name + '_diag')
+ AQ = np.minimum(127, np.maximum(-128, np.round(A*128))).astype('int')
+ idx = np.zeros((0,), dtype='int')
+ for i in range(M//8):
+ pos = idx.shape[0]
+ idx = np.append(idx, -1)
+ nb_nonzero = 0
+ for j in range(N//4):
+ block = A[j*4:(j+1)*4, i*8:(i+1)*8]
+ qblock = AQ[j*4:(j+1)*4, i*8:(i+1)*8]
+ if np.sum(np.abs(block)) > 1e-10:
+ nb_nonzero = nb_nonzero + 1
+ idx = np.append(idx, j*4)
+ vblock = qblock.transpose((1,0)).reshape((-1,))
+ W0 = np.concatenate([W0, block.reshape((-1,))])
+ W = np.concatenate([W, vblock])
+ idx[pos] = nb_nonzero
+ f.write('#ifdef DOT_PROD\n')
+ printVector(f, W, name, dtype='qweight')
+ f.write('#else /*DOT_PROD*/\n')
+ printVector(f, W0, name, dtype='qweight')
+ f.write('#endif /*DOT_PROD*/\n')
+ printVector(f, idx, name + '_idx', dtype='int')
+ return AQ
+
+def dump_sparse_gru(self, f, hf):
+ name = 'sparse_' + self.name
+ print("printing layer " + name + " of type sparse " + self.__class__.__name__)
+ weights = self.get_weights()
+ qweights = printSparseVector(f, weights[1], name + '_recurrent_weights')
+ printVector(f, weights[-1], name + '_bias')
+ subias = weights[-1].copy()
+ subias[1,:] = subias[1,:] - np.sum(qweights*(1./128),axis=0)
+ printVector(f, subias, name + '_subias')
+ if hasattr(self, 'activation'):
+ activation = self.activation.__name__.upper()
+ else:
+ activation = 'TANH'
+ if hasattr(self, 'reset_after') and not self.reset_after:
+ reset_after = 0
+ else:
+ reset_after = 1
+ neurons = weights[0].shape[1]//3
+ max_rnn_neurons = neurons
+ f.write('const SparseGRULayer {} = {{\n {}_bias,\n {}_subias,\n {}_recurrent_weights_diag,\n {}_recurrent_weights,\n {}_recurrent_weights_idx,\n {}, ACTIVATION_{}, {}\n}};\n\n'
+ .format(name, name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
+ hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+ hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+ hf.write('extern const SparseGRULayer {};\n\n'.format(name));
+ return max_rnn_neurons
+
+def dump_gru_layer(self, f, hf, dotp=False, sparse=False):
+ name = self.name
+ print("printing layer " + name + " of type " + self.__class__.__name__)
+ weights = self.get_weights()
+ if sparse:
+ qweight = printSparseVector(f, weights[0], name + '_weights', have_diag=False)
+ else:
+ qweight = printVector(f, weights[0], name + '_weights')
+
+ if dotp:
+ f.write('#ifdef DOT_PROD\n')
+ qweight2 = np.clip(np.round(128.*weights[1]).astype('int'), -128, 127)
+ printVector(f, qweight2, name + '_recurrent_weights', dotp=True, dtype='qweight')
+ f.write('#else /*DOT_PROD*/\n')
+ else:
+ qweight2 = weights[1]
+
+ printVector(f, weights[1], name + '_recurrent_weights')
+ if dotp:
+ f.write('#endif /*DOT_PROD*/\n')
+
+ printVector(f, weights[-1], name + '_bias')
+ subias = weights[-1].copy()
+ subias[0,:] = subias[0,:] - np.sum(qweight*(1./128.),axis=0)
+ subias[1,:] = subias[1,:] - np.sum(qweight2*(1./128.),axis=0)
+ printVector(f, subias, name + '_subias')
+ if hasattr(self, 'activation'):
+ activation = self.activation.__name__.upper()
+ else:
+ activation = 'TANH'
+ if hasattr(self, 'reset_after') and not self.reset_after:
+ reset_after = 0
+ else:
+ reset_after = 1
+ neurons = weights[0].shape[1]//3
+ max_rnn_neurons = neurons
+ f.write('const GRULayer {} = {{\n {}_bias,\n {}_subias,\n {}_weights,\n {},\n {}_recurrent_weights,\n {}, {}, ACTIVATION_{}, {}\n}};\n\n'
+ .format(name, name, name, name, name + "_weights_idx" if sparse else "NULL", name, weights[0].shape[0], weights[0].shape[1]//3, activation, reset_after))
+ hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+ hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+ hf.write('extern const GRULayer {};\n\n'.format(name));
+ return max_rnn_neurons
+
+def dump_dense_layer_impl(name, weights, bias, activation, f, hf):
+ printVector(f, weights, name + '_weights')
+ printVector(f, bias, name + '_bias')
+ f.write('const DenseLayer {} = {{\n {}_bias,\n {}_weights,\n {}, {}, ACTIVATION_{}\n}};\n\n'
+ .format(name, name, name, weights.shape[0], weights.shape[1], activation))
+ hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights.shape[1]))
+ hf.write('extern const DenseLayer {};\n\n'.format(name));
+
+def dump_dense_layer(self, f, hf):
+ name = self.name
+ print("printing layer " + name + " of type " + self.__class__.__name__)
+ weights = self.get_weights()
+ activation = self.activation.__name__.upper()
+ dump_dense_layer_impl(name, weights[0], weights[1], activation, f, hf)
+ return False
+
+def dump_conv1d_layer(self, f, hf):
+ name = self.name
+ print("printing layer " + name + " of type " + self.__class__.__name__)+ weights = self.get_weights()
+ printVector(f, weights[0], name + '_weights')
+ printVector(f, weights[-1], name + '_bias')
+ activation = self.activation.__name__.upper()
+ max_conv_inputs = weights[0].shape[1]*weights[0].shape[0]
+ f.write('const Conv1DLayer {} = {{\n {}_bias,\n {}_weights,\n {}, {}, {}, ACTIVATION_{}\n}};\n\n'+ .format(name, name, name, weights[0].shape[1], weights[0].shape[0], weights[0].shape[2], activation))
+ hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[2]))+ hf.write('#define {}_STATE_SIZE ({}*{})\n'.format(name.upper(), weights[0].shape[1], (weights[0].shape[0]-1)))+ hf.write('#define {}_DELAY {}\n'.format(name.upper(), (weights[0].shape[0]-1)//2))+ hf.write('extern const Conv1DLayer {};\n\n'.format(name));+ return max_conv_inputs
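
A minimal usage sketch for these helpers, with a made-up layer name and in-memory files (illustration only):

import io
import numpy as np
from keraslayerdump import dump_dense_layer_impl

f, hf = io.StringIO(), io.StringIO()
weights = np.zeros((16, 8), dtype='float32')
bias = np.zeros(8, dtype='float32')
dump_dense_layer_impl('example_dense', weights, bias, 'TANH', f, hf)
print(hf.getvalue())   # '#define EXAMPLE_DENSE_OUT_SIZE 8' plus the extern declaration
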
--- a/dnn/training_tf2/plc_loader.py
+++ b/dnn/training_tf2/plc_loader.py
@@ -47,19 +47,25 @@
def __getitem__(self, index):
features = self.features[self.indices[index*self.batch_size:(index+1)*self.batch_size], :, :]
- #lost = (np.random.rand(features.shape[0], features.shape[1]) > .2).astype('float')
+ burg_lost = (np.random.rand(features.shape[0], features.shape[1]) > .1).astype('float')
+ burg_lost = np.reshape(burg_lost, (features.shape[0], features.shape[1], 1))
+ burg_mask = np.tile(burg_lost, (1,1,self.nb_burg_features))
+
lost = self.lost_offset[self.lost_indices[index*self.batch_size:(index+1)*self.batch_size], :]
lost = np.reshape(lost, (features.shape[0], features.shape[1], 1))
lost_mask = np.tile(lost, (1,1,features.shape[2]))
in_features = features*lost_mask
+ in_features[:,:,:self.nb_burg_features] = in_features[:,:,:self.nb_burg_features]*burg_mask
#For the first frame after a loss, we don't have valid features, but the Burg estimate is valid.
- in_features[:,1:,self.nb_burg_features:] = in_features[:,1:,self.nb_burg_features:]*lost_mask[:,:-1,self.nb_burg_features:]
+ #in_features[:,1:,self.nb_burg_features:] = in_features[:,1:,self.nb_burg_features:]*lost_mask[:,:-1,self.nb_burg_features:]
out_lost = np.copy(lost)
- out_lost[:,1:,:] = out_lost[:,1:,:]*out_lost[:,:-1,:]
+ #out_lost[:,1:,:] = out_lost[:,1:,:]*out_lost[:,:-1,:]
out_features = np.concatenate([features[:,:,self.nb_burg_features:], 1.-out_lost], axis=-1)
- inputs = [in_features*lost_mask, lost]
+ burg_sign = 2*burg_lost - 1
+ # last dim is 1 for received packet, 0 for lost packet, and -1 when just the Burg info is missing
+ inputs = [in_features*lost_mask, lost*burg_sign]
outputs = [out_features]
return (inputs, outputs)
--- /dev/null
+++ b/dnn/training_tf2/rdovae.py
@@ -1,0 +1,373 @@
+#!/usr/bin/python3
+'''Copyright (c) 2022 Amazon
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+import math
+import tensorflow as tf
+from tensorflow.keras.models import Model
+from tensorflow.keras.layers import Input, GRU, Dense, Embedding, Reshape, Concatenate, Lambda, Conv1D, Multiply, Add, Bidirectional, MaxPooling1D, Activation, GaussianNoise, AveragePooling1D, RepeatVector
+from tensorflow.compat.v1.keras.layers import CuDNNGRU
+from tensorflow.keras import backend as K
+from tensorflow.keras.constraints import Constraint
+from tensorflow.keras.initializers import Initializer
+from tensorflow.keras.callbacks import Callback
+from tensorflow.keras.regularizers import l1
+import numpy as np
+import h5py
+from uniform_noise import UniformNoise
+
+class WeightClip(Constraint):
+ '''Clips the weights incident to each hidden unit to be inside a range
+ '''
+ def __init__(self, c=2):
+ self.c = c
+
+ def __call__(self, p):
+ # Ensure that abs of adjacent weights don't sum to more than 127. Otherwise there's a risk of
+ # saturation when implementing dot products with SSSE3 or AVX2.
+ return self.c*p/tf.maximum(self.c, tf.repeat(tf.abs(p[:, 1::2])+tf.abs(p[:, 0::2]), 2, axis=1))
+ #return K.clip(p, -self.c, self.c)
+
+ def get_config(self):
+ return {'name': self.__class__.__name__,
+ 'c': self.c}
+
+constraint = WeightClip(0.496)
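
The constraint above rescales each adjacent weight pair so that |w[2i]| + |w[2i+1]| never exceeds c, which keeps the int8 dot-product accumulators used by the DOT_PROD kernels away from saturation. A quick numerical check (illustration only):

import numpy as np
import tensorflow as tf

clip = WeightClip(0.496)
w = tf.constant(np.random.uniform(-2, 2, (8, 16)).astype('float32'))
wc = clip(w)
pair_sums = tf.abs(wc[:, 0::2]) + tf.abs(wc[:, 1::2])
print(float(tf.reduce_max(pair_sums)))   # no larger than ~0.496 for any pair
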
+
+def soft_quantize(x):
+ #x = 4*x
+ #x = x - (.25/np.math.pi)*tf.math.sin(2*np.math.pi*x)
+ #x = x - (.25/np.math.pi)*tf.math.sin(2*np.math.pi*x)
+ #x = x - (.25/np.math.pi)*tf.math.sin(2*np.math.pi*x)
+ return x
+
+def noise_quantize(x):
+ return soft_quantize(x + (K.random_uniform((128, 16, 80))-.5) )
+
+def hard_quantize(x):
+ x = soft_quantize(x)
+ quantized = tf.round(x)
+ return x + tf.stop_gradient(quantized - x)
+
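
hard_quantize() is a straight-through estimator: the forward pass rounds, while gradients flow through as if no rounding had happened. A small check (illustration only):

import tensorflow as tf

x = tf.Variable([0.2, 1.7, -0.6])
with tf.GradientTape() as tape:
    y = hard_quantize(x)               # forward value: [0., 2., -1.]
    loss = tf.reduce_sum(y * y)
print(tape.gradient(loss, x).numpy())  # 2*y, i.e. the rounding is transparent to the gradient
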
+def apply_dead_zone(x):
+ d = x[1]*.05
+ x = x[0]
+ y = x - d*tf.math.tanh(x/(.1+d))
+ return y
+
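
apply_dead_zone() shrinks small values toward zero before rounding (here d is 0.05 times the dead_zone input), which widens the quantizer's zero bin. A NumPy illustration of the core curve y = x - d*tanh(x/(.1+d)):

import numpy as np

def dead_zone_np(x, d):
    # same curve as apply_dead_zone() above, with d given directly
    return x - d * np.tanh(x / (.1 + d))

x = np.array([0.2, 0.6, 1.4])
print(np.round(dead_zone_np(x, d=0.5)))   # [0. 0. 1.] -- small inputs collapse into the zero bin
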
+def rate_loss(y_true,y_pred):
+ log2_e = 1.4427
+ n = y_pred.shape[-1]
+ C = n - log2_e*np.math.log(np.math.gamma(n))
+ k = K.sum(K.abs(y_pred), axis=-1)
+ p = 1.5
+ #rate = C + (n-1)*log2_e*tf.math.log((k**p + (n/5)**p)**(1/p))
+ rate = C + (n-1)*log2_e*tf.math.log(k + .112*n**2/(n/1.8+k) )
+ return K.mean(rate)
+
+eps=1e-6
+def safelog2(x):
+ log2_e = 1.4427
+ return log2_e*tf.math.log(eps+x)
+
+def feat_dist_loss(y_true,y_pred):
+ lambda_1 = 1./K.sqrt(y_pred[:,:,:,-1])
+ y_pred = y_pred[:,:,:,:-1]
+ ceps = y_pred[:,:,:,:18] - y_true[:,:,:18]
+ pitch = 2*(y_pred[:,:,:,18:19] - y_true[:,:,18:19])/(y_true[:,:,18:19] + 2)
+ corr = y_pred[:,:,:,19:] - y_true[:,:,19:]
+ pitch_weight = K.square(K.maximum(0., y_true[:,:,19:]+.5))
+ return K.mean(lambda_1*K.mean(K.square(ceps) + 10*(1/18.)*K.abs(pitch)*pitch_weight + (1/18.)*K.square(corr), axis=-1))
+
+def sq1_rate_loss(y_true,y_pred):
+ lambda_val = K.sqrt(y_pred[:,:,-1])
+ y_pred = y_pred[:,:,:-1]
+ log2_e = 1.4427
+ n = y_pred.shape[-1]//3
+ r = (y_pred[:,:,2*n:])
+ p0 = (y_pred[:,:,n:2*n])
+ p0 = 1-r**(.5+.5*p0)
+ y_pred = y_pred[:,:,:n]
+ y_pred = soft_quantize(y_pred)
+
+ y0 = K.maximum(0., 1. - K.abs(y_pred))**2
+ rate = -y0*safelog2(p0*r**K.abs(y_pred)) - (1-y0)*safelog2(.5*(1-p0)*(1-r)*r**(K.abs(y_pred)-1))
+ rate = -safelog2(-.5*tf.math.log(r)*r**K.abs(y_pred))
+ rate = -safelog2((1-r)/(1+r)*r**K.abs(y_pred))
+ #rate = -safelog2(- tf.math.sinh(.5*tf.math.log(r))* r**K.abs(y_pred) - tf.math.cosh(K.maximum(0., .5 - K.abs(y_pred))*tf.math.log(r)) + 1)
+ rate = lambda_val*K.sum(rate, axis=-1)
+ return K.mean(rate)
+
+def sq2_rate_loss(y_true,y_pred):
+ lambda_val = K.sqrt(y_pred[:,:,-1])
+ y_pred = y_pred[:,:,:-1]
+ log2_e = 1.4427
+ n = y_pred.shape[-1]//3
+ r = y_pred[:,:,2*n:]
+ p0 = y_pred[:,:,n:2*n]
+ p0 = 1-r**(.5+.5*p0)
+ #theta = K.minimum(1., .5 + 0*p0 - 0.04*tf.math.log(r))
+ #p0 = 1-r**theta
+ y_pred = tf.round(y_pred[:,:,:n])
+ y0 = K.maximum(0., 1. - K.abs(y_pred))**2
+ rate = -y0*safelog2(p0*r**K.abs(y_pred)) - (1-y0)*safelog2(.5*(1-p0)*(1-r)*r**(K.abs(y_pred)-1))
+ rate = lambda_val*K.sum(rate, axis=-1)
+ return K.mean(rate)
+
+def sq_rate_metric(y_true,y_pred, reduce=True):
+ y_pred = y_pred[:,:,:-1]
+ log2_e = 1.4427
+ n = y_pred.shape[-1]//3
+ r = y_pred[:,:,2*n:]
+ p0 = y_pred[:,:,n:2*n]
+ p0 = 1-r**(.5+.5*p0)
+ #theta = K.minimum(1., .5 + 0*p0 - 0.04*tf.math.log(r))
+ #p0 = 1-r**theta
+ y_pred = tf.round(y_pred[:,:,:n])
+ y0 = K.maximum(0., 1. - K.abs(y_pred))**2
+ rate = -y0*safelog2(p0*r**K.abs(y_pred)) - (1-y0)*safelog2(.5*(1-p0)*(1-r)*r**(K.abs(y_pred)-1))
+ rate = K.sum(rate, axis=-1)
+ if reduce:
+ rate = K.mean(rate)
+ return rate
+
+def pvq_quant_search(x, k):
+ x = x/tf.reduce_sum(tf.abs(x), axis=-1, keepdims=True)
+ kx = k*x
+ y = tf.round(kx)
+ newk = k
+
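+ # Iteratively rescale the input so that rounding produces a vector whose absolute
+ # values sum to (approximately) k, i.e. a valid PVQ codeword with k unit pulses.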
+ for j in range(10):
+ #print("y = ", y)+ #print("iteration ", j)+ abs_y = tf.abs(y)
+ abs_kx = tf.abs(kx)
+ kk=tf.reduce_sum(abs_y, axis=-1)
+ #print("sums = ", kk)+ plus = 1.000001*tf.reduce_min((abs_y+.5)/(abs_kx+1e-15), axis=-1)
+ minus = .999999*tf.reduce_max((abs_y-.5)/(abs_kx+1e-15), axis=-1)
+ #print("plus = ", plus)+ #print("minus = ", minus)+ factor = tf.where(kk>k, minus, plus)
+ factor = tf.where(kk==k, tf.ones_like(factor), factor)
+ #print("scale = ", factor)+ factor = tf.expand_dims(factor, axis=-1)
+ #newk = newk * (k/kk)**.2
+ newk = newk*factor
+ kx = newk*x
+ #print("newk = ", newk)+ #print("unquantized = ", newk*x)+ y = tf.round(kx)
+
+ #print(y)
+ #print(K.mean(K.sum(K.abs(y), axis=-1)))
+ return y
+
+def pvq_quantize(x, k):
+ x = x/(1e-15+tf.norm(x, axis=-1,keepdims=True))
+ quantized = pvq_quant_search(x, k)
+ quantized = quantized/(1e-15+tf.norm(quantized, axis=-1,keepdims=True))
+ return x + tf.stop_gradient(quantized - x)
+
+
+def var_repeat(x):
+ return tf.repeat(tf.expand_dims(x[0], 1), K.shape(x[1])[1], axis=1)
+
+nb_state_dim = 24
+
+def new_rdovae_encoder(nb_used_features=20, nb_bits=17, bunch=4, nb_quant=40, batch_size=128, cond_size=128, cond_size2=256, training=False):
+ feat = Input(shape=(None, nb_used_features), batch_size=batch_size)
+
+ gru = CuDNNGRU if training else GRU
+ enc_dense1 = Dense(cond_size2, activation='tanh', kernel_constraint=constraint, name='enc_dense1')
+ enc_dense2 = gru(cond_size, return_sequences=True, kernel_constraint=constraint, recurrent_constraint=constraint, name='enc_dense2')
+ enc_dense3 = Dense(cond_size2, activation='tanh', kernel_constraint=constraint, name='enc_dense3')
+ enc_dense4 = gru(cond_size, return_sequences=True, kernel_constraint=constraint, recurrent_constraint=constraint, name='enc_dense4')
+ enc_dense5 = Dense(cond_size2, activation='tanh', kernel_constraint=constraint, name='enc_dense5')
+ enc_dense6 = gru(cond_size, return_sequences=True, kernel_constraint=constraint, recurrent_constraint=constraint, name='enc_dense6')
+ enc_dense7 = Dense(cond_size, activation='tanh', kernel_constraint=constraint, name='enc_dense7')
+ enc_dense8 = Dense(cond_size, activation='tanh', kernel_constraint=constraint, name='enc_dense8')
+
+ #bits_dense = Dense(nb_bits, activation='linear', name='bits_dense')
+ bits_dense = Conv1D(nb_bits, 4, padding='causal', activation='linear', name='bits_dense')
+
+ zero_out = Lambda(lambda x: 0*x)
+ inputs = Reshape((-1, 2*nb_used_features))(feat)
+ d1 = enc_dense1(inputs)
+ d2 = enc_dense2(d1)
+ d3 = enc_dense3(d2)
+ d4 = enc_dense4(d3)
+ d5 = enc_dense5(d4)
+ d6 = enc_dense6(d5)
+ d7 = enc_dense7(d6)
+ d8 = enc_dense8(d7)
+ pre_out = Concatenate()([d1, d2, d3, d4, d5, d6, d7, d8])
+ enc_out = bits_dense(pre_out)
+ global_dense1 = Dense(128, activation='tanh', name='gdense1')
+ global_dense2 = Dense(nb_state_dim, activation='tanh', name='gdense2')
+ global_bits = global_dense2(global_dense1(pre_out))
+
+ encoder = Model([feat], [enc_out, global_bits], name='encoder')
+ return encoder
+
+def new_rdovae_decoder(nb_used_features=20, nb_bits=17, bunch=4, nb_quant=40, batch_size=128, cond_size=128, cond_size2=256, training=False):
+ bits_input = Input(shape=(None, nb_bits), batch_size=batch_size, name="dec_bits")
+ gru_state_input = Input(shape=(nb_state_dim,), batch_size=batch_size, name="dec_state")
+
+
+ gru = CuDNNGRU if training else GRU
+ dec_dense1 = Dense(cond_size2, activation='tanh', kernel_constraint=constraint, name='dec_dense1')
+ dec_dense2 = gru(cond_size, return_sequences=True, kernel_constraint=constraint, recurrent_constraint=constraint, name='dec_dense2')
+ dec_dense3 = Dense(cond_size2, activation='tanh', kernel_constraint=constraint, name='dec_dense3')
+ dec_dense4 = gru(cond_size, return_sequences=True, kernel_constraint=constraint, recurrent_constraint=constraint, name='dec_dense4')
+ dec_dense5 = Dense(cond_size2, activation='tanh', kernel_constraint=constraint, name='dec_dense5')
+ dec_dense6 = gru(cond_size, return_sequences=True, kernel_constraint=constraint, recurrent_constraint=constraint, name='dec_dense6')
+ dec_dense7 = Dense(cond_size, activation='tanh', kernel_constraint=constraint, name='dec_dense7')
+ dec_dense8 = Dense(cond_size, activation='tanh', kernel_constraint=constraint, name='dec_dense8')
+
+ dec_final = Dense(bunch*nb_used_features, activation='linear', name='dec_final')
+
+ time_reverse = Lambda(lambda x: K.reverse(x, 1))
+ #time_reverse = Lambda(lambda x: x)
+ #gru_state_rep = RepeatVector(64//bunch)(gru_state_input)
+
+ #gru_state_rep = Lambda(var_repeat, output_shape=(None, nb_state_dim)) ([gru_state_input, bits_input])
+ gru_state1 = Dense(cond_size, name="state1", activation='tanh')(gru_state_input)
+ gru_state2 = Dense(cond_size, name="state2", activation='tanh')(gru_state_input)
+ gru_state3 = Dense(cond_size, name="state3", activation='tanh')(gru_state_input)
+
+ dec1 = dec_dense1(time_reverse(bits_input))
+ dec2 = dec_dense2(dec1, initial_state=gru_state1)
+ dec3 = dec_dense3(dec2)
+ dec4 = dec_dense4(dec3, initial_state=gru_state2)
+ dec5 = dec_dense5(dec4)
+ dec6 = dec_dense6(dec5, initial_state=gru_state3)
+ dec7 = dec_dense7(dec6)
+ dec8 = dec_dense8(dec7)
+ output = Reshape((-1, nb_used_features))(dec_final(Concatenate()([dec1, dec2, dec3, dec4, dec5, dec6, dec7, dec8])))
+ decoder = Model([bits_input, gru_state_input], time_reverse(output), name='decoder')
+ decoder.nb_bits = nb_bits
+ decoder.bunch = bunch
+ return decoder
+
+def new_split_decoder(decoder):
+ nb_bits = decoder.nb_bits
+ bunch = decoder.bunch
+ bits_input = Input(shape=(None, nb_bits), name="split_bits")
+ gru_state_input = Input(shape=(None,nb_state_dim), name="split_state")
+
+ range_select = Lambda(lambda x: x[0][:,x[1]:x[2],:])
+ elem_select = Lambda(lambda x: x[0][:,x[1],:])
+ points = [0, 100, 200, 300, 400]
+ outputs = []
+ for i in range(len(points)-1):
+ begin = points[i]//bunch
+ end = points[i+1]//bunch
+ state = elem_select([gru_state_input, end-1])
+ bits = range_select([bits_input, begin, end])
+ outputs.append(decoder([bits, state]))
+ output = Concatenate(axis=1)(outputs)
+ split = Model([bits_input, gru_state_input], output, name="split")
+ return split
+
+def tensor_concat(x):
+ #n = x[1]//2
+ #x = x[0]
+ n=2
+ y = []
+ for i in range(n-1):
+ offset = 2 * (n-1-i)
+ tmp = K.concatenate([x[i][:, offset:, :], x[-1][:, -offset:, :]], axis=-2)
+ y.append(tf.expand_dims(tmp, axis=0))
+ y.append(tf.expand_dims(x[-1], axis=0))
+ return Concatenate(axis=0)(y)
+
+
+def new_rdovae_model(nb_used_features=20, nb_bits=17, bunch=4, nb_quant=40, batch_size=128, cond_size=128, cond_size2=256, training=False):
+
+ feat = Input(shape=(None, nb_used_features), batch_size=batch_size)
+ quant_id = Input(shape=(None,), batch_size=batch_size)
+ lambda_val = Input(shape=(None, 1), batch_size=batch_size)
+ lambda_bunched = AveragePooling1D(pool_size=bunch//2, strides=bunch//2, padding="valid")(lambda_val)
+ lambda_up = Lambda(lambda x: K.repeat_elements(x, 2, axis=-2))(lambda_val)
+
+ qembedding = Embedding(nb_quant, 6*nb_bits, name='quant_embed', embeddings_initializer='zeros')
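+ # For each quantizer id, the embedding packs 6*nb_bits values: quantization scale
+ # (nb_bits), dead zone width (nb_bits), soft-quantization distribution parameters
+ # (2*nb_bits) and hard-quantization distribution parameters (2*nb_bits).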
+ quant_embed_dec = qembedding(quant_id)
+ quant_scale = Activation('softplus')(Lambda(lambda x: x[:,:,:nb_bits], name='quant_scale_embed')(quant_embed_dec))
+
+ encoder = new_rdovae_encoder(nb_used_features, nb_bits, bunch, nb_quant, batch_size, cond_size, cond_size2, training=training)
+ ze, gru_state_dec = encoder([feat])
+ ze = Multiply()([ze, quant_scale])
+
+ decoder = new_rdovae_decoder(nb_used_features, nb_bits, bunch, nb_quant, batch_size, cond_size, cond_size2, training=training)
+ split_decoder = new_split_decoder(decoder)
+
+ dead_zone = Activation('softplus')(Lambda(lambda x: x[:,:,nb_bits:2*nb_bits], name='dead_zone_embed')(quant_embed_dec))
+ soft_distr_embed = Activation('sigmoid')(Lambda(lambda x: x[:,:,2*nb_bits:4*nb_bits], name='soft_distr_embed')(quant_embed_dec))
+ hard_distr_embed = Activation('sigmoid')(Lambda(lambda x: x[:,:,4*nb_bits:], name='hard_distr_embed')(quant_embed_dec))
+
+ noisequant = UniformNoise()
+ hardquant = Lambda(hard_quantize)
+ dzone = Lambda(apply_dead_zone)
+ dze = dzone([ze,dead_zone])
+ ndze = noisequant(dze)
+ dze_quant = hardquant(dze)
+
+ div = Lambda(lambda x: x[0]/x[1])
+ dze_quant = div([dze_quant,quant_scale])
+ ndze_unquant = div([ndze,quant_scale])
+
+ mod_select = Lambda(lambda x: x[0][:,x[1]::bunch//2,:])
+ gru_state_dec = Lambda(lambda x: pvq_quantize(x, 82))(gru_state_dec)
+ combined_output = []
+ unquantized_output = []
+ cat = Concatenate(name="out_cat")
+ for i in range(bunch//2):
+ dze_select = mod_select([dze_quant, i])
+ ndze_select = mod_select([ndze_unquant, i])
+ state_select = mod_select([gru_state_dec, i])
+
+ tmp = split_decoder([dze_select, state_select])
+ tmp = cat([tmp, lambda_up])
+ combined_output.append(tmp)
+
+ tmp = split_decoder([ndze_select, state_select])
+ tmp = cat([tmp, lambda_up])
+ unquantized_output.append(tmp)
+
+ concat = Lambda(tensor_concat, name="output")
+ combined_output = concat(combined_output)
+ unquantized_output = concat(unquantized_output)
+
+ e2 = Concatenate(name="hard_bits")([dze, hard_distr_embed, lambda_val])
+ e = Concatenate(name="soft_bits")([dze, soft_distr_embed, lambda_val])
+
+
+ model = Model([feat, quant_id, lambda_val], [combined_output, unquantized_output, e, e2], name="end2end")
+ model.nb_used_features = nb_used_features
+
+ return model, encoder, decoder, qembedding
+
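The WeightClip constraint defined at the top of this file rescales each pair of adjacent columns so that their absolute values never sum to more than the clip constant c; that is the property the saturation comment in __call__ relies on once the weights are quantized to 8 bits. A minimal NumPy sketch of this (illustrative only; the random matrix p stands in for a real weight matrix):

import numpy as np

c = 0.496
rng = np.random.default_rng(0)
p = rng.uniform(-1.0, 1.0, size=(4, 8)).astype(np.float32)  # stand-in weight matrix

# Same operation as WeightClip.__call__, written with NumPy.
pair_sum = np.abs(p[:, 0::2]) + np.abs(p[:, 1::2])
clipped = c * p / np.maximum(c, np.repeat(pair_sum, 2, axis=1))

# Every adjacent (even, odd) pair now satisfies |w0| + |w1| <= c, so the int8
# dot products in the C inference code cannot saturate their accumulators.
assert np.all(np.abs(clipped[:, 0::2]) + np.abs(clipped[:, 1::2]) <= c + 1e-6)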
--- /dev/null
+++ b/dnn/training_tf2/rdovae_exchange.py
@@ -1,0 +1,138 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+
+import argparse
+import os
+import sys
+
+os.environ['CUDA_VISIBLE_DEVICES'] = ""
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('weights', metavar="<weight file>", type=str, help='model weight file in hdf5 format')
+parser.add_argument('output', metavar="<output folder>", type=str, help='output exchange folder')
+parser.add_argument('--cond-size', type=int, help="conditioning size (default: 256)", default=256)
+parser.add_argument('--latent-dim', type=int, help="dimension of latent space (default: 80)", default=80)
+parser.add_argument('--quant-levels', type=int, help="number of quantization steps (default: 16)", default=16)
+
+args = parser.parse_args()
+
+# now import the heavy stuff
+from rdovae import new_rdovae_model
+from wexchange.tf import dump_tf_weights, load_tf_weights
+
+
+exchange_name = {
+ 'enc_dense1' : 'encoder_stack_layer1_dense',
+ 'enc_dense3' : 'encoder_stack_layer3_dense',
+ 'enc_dense5' : 'encoder_stack_layer5_dense',
+ 'enc_dense7' : 'encoder_stack_layer7_dense',
+ 'enc_dense8' : 'encoder_stack_layer8_dense',
+ 'gdense1' : 'encoder_state_layer1_dense',
+ 'gdense2' : 'encoder_state_layer2_dense',
+ 'enc_dense2' : 'encoder_stack_layer2_gru',
+ 'enc_dense4' : 'encoder_stack_layer4_gru',
+ 'enc_dense6' : 'encoder_stack_layer6_gru',
+ 'bits_dense' : 'encoder_stack_layer9_conv',
+ 'qembedding' : 'statistical_model_embedding',
+ 'state1' : 'decoder_state1_dense',
+ 'state2' : 'decoder_state2_dense',
+ 'state3' : 'decoder_state3_dense',
+ 'dec_dense1' : 'decoder_stack_layer1_dense',
+ 'dec_dense3' : 'decoder_stack_layer3_dense',
+ 'dec_dense5' : 'decoder_stack_layer5_dense',
+ 'dec_dense7' : 'decoder_stack_layer7_dense',
+ 'dec_dense8' : 'decoder_stack_layer8_dense',
+ 'dec_final' : 'decoder_stack_layer9_dense',
+ 'dec_dense2' : 'decoder_stack_layer2_gru',
+ 'dec_dense4' : 'decoder_stack_layer4_gru',
+ 'dec_dense6' : 'decoder_stack_layer6_gru'
+}
+
+
+if __name__ == "__main__":
+
+ model, encoder, decoder, qembedding = new_rdovae_model(20, args.latent_dim, cond_size=args.cond_size, nb_quant=args.quant_levels)
+ model.load_weights(args.weights)
+
+ os.makedirs(args.output, exist_ok=True)
+
+ # encoder
+ encoder_dense_names = [
+ 'enc_dense1',
+ 'enc_dense3',
+ 'enc_dense5',
+ 'enc_dense7',
+ 'enc_dense8',
+ 'gdense1',
+ 'gdense2'
+ ]
+
+ encoder_gru_names = [
+ 'enc_dense2',
+ 'enc_dense4',
+ 'enc_dense6'
+ ]
+
+ encoder_conv1d_names = [
+ 'bits_dense'
+ ]
+
+
+ for name in encoder_dense_names + encoder_gru_names + encoder_conv1d_names:
+ print(f"writing layer {exchange_name[name]}...")+ dump_tf_weights(os.path.join(args.output, exchange_name[name]), encoder.get_layer(name))
+
+ # qembedding
+ print(f"writing layer {exchange_name['qembedding']}...")+ dump_tf_weights(os.path.join(args.output, exchange_name['qembedding']), qembedding)
+
+ # decoder
+ decoder_dense_names = [
+ 'state1',
+ 'state2',
+ 'state3',
+ 'dec_dense1',
+ 'dec_dense3',
+ 'dec_dense5',
+ 'dec_dense7',
+ 'dec_dense8',
+ 'dec_final'
+ ]
+
+ decoder_gru_names = [
+ 'dec_dense2',
+ 'dec_dense4',
+ 'dec_dense6'
+ ]
+
+ for name in decoder_dense_names + decoder_gru_names:
+ print(f"writing layer {exchange_name[name]}...")+ dump_tf_weights(os.path.join(args.output, exchange_name[name]), decoder.get_layer(name))
--- /dev/null
+++ b/dnn/training_tf2/rdovae_import.py
@@ -1,0 +1,123 @@
+"""
+/* Copyright (c) 2022 Amazon
+ Written by Jan Buethe */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+"""
+
+
+import argparse
+import os
+import sys
+
+os.environ['CUDA_VISIBLE_DEVICES'] = ""
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('input', metavar="<input folder>", type=str, help='input exchange folder')
+parser.add_argument('weights', metavar="<weight file>", type=str, help='model weight file in hdf5 format')
+parser.add_argument('--cond-size', type=int, help="conditioning size (default: 256)", default=256)
+parser.add_argument('--latent-dim', type=int, help="dimension of latent space (default: 80)", default=80)
+parser.add_argument('--quant-levels', type=int, help="number of quantization steps (default: 16)", default=16)
+
+args = parser.parse_args()
+
+# now import the heavy stuff
+from rdovae import new_rdovae_model
+from wexchange.tf import load_tf_weights
+
+
+exchange_name = {
+ 'enc_dense1' : 'encoder_stack_layer1_dense',
+ 'enc_dense3' : 'encoder_stack_layer3_dense',
+ 'enc_dense5' : 'encoder_stack_layer5_dense',
+ 'enc_dense7' : 'encoder_stack_layer7_dense',
+ 'enc_dense8' : 'encoder_stack_layer8_dense',
+ 'gdense1' : 'encoder_state_layer1_dense',
+ 'gdense2' : 'encoder_state_layer2_dense',
+ 'enc_dense2' : 'encoder_stack_layer2_gru',
+ 'enc_dense4' : 'encoder_stack_layer4_gru',
+ 'enc_dense6' : 'encoder_stack_layer6_gru',
+ 'bits_dense' : 'encoder_stack_layer9_conv',
+ 'qembedding' : 'statistical_model_embedding',
+ 'state1' : 'decoder_state1_dense',
+ 'state2' : 'decoder_state2_dense',
+ 'state3' : 'decoder_state3_dense',
+ 'dec_dense1' : 'decoder_stack_layer1_dense',
+ 'dec_dense3' : 'decoder_stack_layer3_dense',
+ 'dec_dense5' : 'decoder_stack_layer5_dense',
+ 'dec_dense7' : 'decoder_stack_layer7_dense',
+ 'dec_dense8' : 'decoder_stack_layer8_dense',
+ 'dec_final' : 'decoder_stack_layer9_dense',
+ 'dec_dense2' : 'decoder_stack_layer2_gru',
+ 'dec_dense4' : 'decoder_stack_layer4_gru',
+ 'dec_dense6' : 'decoder_stack_layer6_gru'
+}
+
+if __name__ == "__main__":
+
+ model, encoder, decoder, qembedding = new_rdovae_model(20, args.latent_dim, cond_size=args.cond_size, nb_quant=args.quant_levels)
+
+ encoder_layers = [
+ 'enc_dense1',
+ 'enc_dense3',
+ 'enc_dense5',
+ 'enc_dense7',
+ 'enc_dense8',
+ 'gdense1',
+ 'gdense2',
+ 'enc_dense2',
+ 'enc_dense4',
+ 'enc_dense6',
+ 'bits_dense'
+ ]
+
+ decoder_layers = [
+ 'state1',
+ 'state2',
+ 'state3',
+ 'dec_dense1',
+ 'dec_dense3',
+ 'dec_dense5',
+ 'dec_dense7',
+ 'dec_dense8',
+ 'dec_final',
+ 'dec_dense2',
+ 'dec_dense4',
+ 'dec_dense6'
+ ]
+
+ for name in encoder_layers:
+ print(f"loading weight for layer {name}...")+ load_tf_weights(os.path.join(args.input, exchange_name[name]), encoder.get_layer(name))
+
+ print(f"loading weight for layer qembedding...")
+ load_tf_weights(os.path.join(args.input, exchange_name['qembedding']), qembedding)
+
+ for name in decoder_layers:
+ print(f"loading weight for layer {name}...")+ load_tf_weights(os.path.join(args.input, exchange_name[name]), decoder.get_layer(name))
+
+ model.save(args.weights)
\ No newline at end of file
--- /dev/null
+++ b/dnn/training_tf2/train_rdovae.py
@@ -1,0 +1,151 @@
+#!/usr/bin/python3
+'''Copyright (c) 2021-2022 Amazon
+ Copyright (c) 2018-2019 Mozilla
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+'''
+
+# Train an RDO-VAE model
+import tensorflow as tf
+strategy = tf.distribute.MultiWorkerMirroredStrategy()
+
+
+import argparse
+#from plc_loader import PLCLoader
+
+parser = argparse.ArgumentParser(description='Train a quantization model')
+
+parser.add_argument('features', metavar='<features file>', help='binary features file (float32)')
+parser.add_argument('output', metavar='<output>', help='trained model file (.h5)')
+parser.add_argument('--model', metavar='<model>', default='rdovae', help='model python definition (without .py)')
+group1 = parser.add_mutually_exclusive_group()
+group1.add_argument('--quantize', metavar='<input weights>', help='quantize model')
+group1.add_argument('--retrain', metavar='<input weights>', help='continue training model')
+parser.add_argument('--cond-size', metavar='<units>', default=1024, type=int, help='number of units in conditioning network (default 1024)')
+parser.add_argument('--epochs', metavar='<epochs>', default=120, type=int, help='number of epochs to train for (default 120)')
+parser.add_argument('--batch-size', metavar='<batch size>', default=128, type=int, help='batch size to use (default 128)')
+parser.add_argument('--seq-length', metavar='<sequence length>', default=1000, type=int, help='sequence length to use (default 1000)')
+parser.add_argument('--lr', metavar='<learning rate>', type=float, help='learning rate')
+parser.add_argument('--decay', metavar='<decay>', type=float, help='learning rate decay')
+parser.add_argument('--logdir', metavar='<log dir>', help='directory for tensorboard log files')
+
+
+args = parser.parse_args()
+
+import importlib
+rdovae = importlib.import_module(args.model)
+
+import sys
+import numpy as np
+from tensorflow.keras.optimizers import Adam
+from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
+import tensorflow.keras.backend as K
+import h5py
+
+#gpus = tf.config.experimental.list_physical_devices('GPU')
+#if gpus:
+# try:
+# tf.config.experimental.set_virtual_device_configuration(gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])
+# except RuntimeError as e:
+# print(e)
+
+nb_epochs = args.epochs
+
+# Try reducing batch_size if you run out of memory on your GPU
+batch_size = args.batch_size
+
+quantize = args.quantize is not None
+retrain = args.retrain is not None
+
+if quantize:
+ lr = 0.00003
+ decay = 0
+ input_model = args.quantize
+else:
+ lr = 0.001
+ decay = 2.5e-5
+
+if args.lr is not None:
+ lr = args.lr
+
+if args.decay is not None:
+ decay = args.decay
+
+if retrain:
+ input_model = args.retrain
+
+
+opt = Adam(lr, decay=decay, beta_2=0.99)
+
+with strategy.scope():
+ model, encoder, decoder, _ = rdovae.new_rdovae_model(nb_used_features=20, nb_bits=80, batch_size=batch_size, cond_size=args.cond_size, nb_quant=16)
+ model.compile(optimizer=opt, loss=[rdovae.feat_dist_loss, rdovae.feat_dist_loss, rdovae.sq1_rate_loss, rdovae.sq2_rate_loss], loss_weights=[.5, .5, 1., .1], metrics={'hard_bits':rdovae.sq_rate_metric})
+ model.summary()
+
+lpc_order = 16
+
+feature_file = args.features
+nb_features = model.nb_used_features + lpc_order
+nb_used_features = model.nb_used_features
+sequence_size = args.seq_length
+
+# load the binary float32 feature file
+
+
+features = np.memmap(feature_file, dtype='float32', mode='r')
+nb_sequences = len(features)//(nb_features*sequence_size)//batch_size*batch_size
+features = features[:nb_sequences*sequence_size*nb_features]
+
+features = np.reshape(features, (nb_sequences, sequence_size, nb_features))
+print(features.shape)
+features = features[:, :, :nb_used_features]
+
+#lambda_val = np.repeat(np.random.uniform(.0007, .002, (features.shape[0], 1, 1)), features.shape[1]//2, axis=1)
+#quant_id = np.round(10*np.log(lambda_val/.0007)).astype('int16')
+#quant_id = quant_id[:,:,0]
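+# Each sequence gets a random quantizer id in [0, 16); the rate-distortion weight
+# lambda grows exponentially with it, from 2e-4 at id 0 to roughly 1e-2 at id 15.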
+quant_id = np.repeat(np.random.randint(16, size=(features.shape[0], 1, 1), dtype='int16'), features.shape[1]//2, axis=1)
+lambda_val = .0002*np.exp(quant_id/3.8)
+quant_id = quant_id[:,:,0]
+
+# dump models to disk as we go
+checkpoint = ModelCheckpoint('{}_{}_{}.h5'.format(args.output, args.cond_size, '{epoch:02d}'))
+
+if args.retrain is not None:
+ model.load_weights(args.retrain)
+
+if quantize or retrain:
+ #Adapting from an existing model
+ model.load_weights(input_model)
+
+model.save_weights('{}_{}_initial.h5'.format(args.output, args.cond_size))
+
+callbacks = [checkpoint]
+#callbacks = []
+
+if args.logdir is not None:
+ logdir = '{}/{}_{}_logs'.format(args.logdir, args.output, args.cond_size)
+ tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir)
+ callbacks.append(tensorboard_callback)
+
+model.fit([features, quant_id, lambda_val], [features, features, features, features], batch_size=batch_size, epochs=nb_epochs, validation_split=0.0, callbacks=callbacks)
--- /dev/null
+++ b/dnn/training_tf2/uniform_noise.py
@@ -1,0 +1,78 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains the UniformNoise layer."""
+
+
+import tensorflow.compat.v2 as tf
+
+from tensorflow.keras import backend
+
+from tensorflow.keras.layers import Layer
+
+class UniformNoise(Layer):
+ """Apply additive zero-centered uniform noise.
+
+ This is useful to mitigate overfitting
+ (you could see it as a form of random data augmentation).
+ Uniform noise is used here (rather than Gaussian) so that training
+ matches the rounding error of the hard quantizer used at inference time.
+
+ As it is a regularization layer, it is only active at training time.
+
+ Args:
+ stddev: Float, half-width of the noise interval; samples are drawn
+ uniformly from [-stddev, stddev).
+ seed: Integer, optional random seed to enable deterministic behavior.
+
+ Call arguments:
+ inputs: Input tensor (of any rank).
+ training: Python boolean indicating whether the layer should behave in
+ training mode (adding noise) or in inference mode (doing nothing).
+
+ Input shape:
+ Arbitrary. Use the keyword argument `input_shape`
+ (tuple of integers, does not include the samples axis)
+ when using this layer as the first layer in a model.
+
+ Output shape:
+ Same shape as input.
+ """
+
+
+
+
+ def __init__(self, stddev=0.5, seed=None, **kwargs):
+ super().__init__(**kwargs)
+ self.supports_masking = True
+ self.stddev = stddev
+ self.seed = seed
+
+
+ def call(self, inputs, training=None):
+ def noised():
+ return inputs + backend.random_uniform(
+ shape=tf.shape(inputs),
+ minval=-self.stddev,
+ maxval=self.stddev,
+ dtype=inputs.dtype,
+ seed=self.seed,
+ )
+
+ return backend.in_train_phase(noised, inputs, training=training)
+
+ def get_config(self):
+ config = {"stddev": self.stddev}+ base_config = super().get_config()
+ return dict(list(base_config.items()) + list(config.items()))
+
+ def compute_output_shape(self, input_shape):
+ return input_shape
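A minimal sketch of the layer's behaviour (assuming TF 2.x eager execution; not part of the patch): noise is added only when the training flag is set.

import tensorflow as tf
from uniform_noise import UniformNoise

layer = UniformNoise(stddev=0.5)  # same half-width as the +/-0.5 rounding noise
x = tf.zeros((2, 3))

print(layer(x, training=False).numpy())  # unchanged: all zeros
print(layer(x, training=True).numpy())   # zeros plus noise drawn from [-0.5, 0.5)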
--- a/dnn/vec.h
+++ b/dnn/vec.h
@@ -37,7 +37,7 @@
#if defined(__AVX__) || defined(__SSE2__)
#include "vec_avx.h"
-#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && !defined(DISABLE_NEON)
#include "vec_neon.h"
#else
@@ -59,7 +59,7 @@
/* No AVX2/FMA support */
#ifndef LPCNET_TEST
-static inline float celt_exp2(float x)
+static inline float lpcnet_exp2(float x)
 {
 int integer;
float frac;
@@ -77,7 +77,7 @@
res.i = (res.i + (integer<<23)) & 0x7fffffff;
return res.f;
}
-#define celt_exp(x) celt_exp2((x)*1.44269504f)
+#define lpcnet_exp(x) lpcnet_exp2((x)*1.44269504f)
static inline float tanh_approx(float x)
 {
@@ -107,7 +107,7 @@
 {
 int i;
for (i=0;i<N;i++)
- y[i] = celt_exp(x[i]);
+ y[i] = lpcnet_exp(x[i]);
}
static inline void vec_tanh(float *y, const float *x, int N)
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -33,6 +33,7 @@
#define VEC_AVX_H
#include <immintrin.h>
+#include <math.h>
/* Use 8-bit dot products unless disabled or if stuck with SSE2. */
#if (defined(__AVX2__) || defined(__SSSE3__)) && !defined(DISABLE_DOT_PROD)
@@ -41,7 +42,11 @@
#else
+#if defined(_MSC_VER)
+#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
+#else
#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
+#endif
#endif
@@ -81,7 +86,7 @@
#define _mm256_storeu_ps(dst, src) mm256_storeu_ps(dst, src)
-static inline mm256_emu mm256_setzero_ps() {
+static inline mm256_emu mm256_setzero_ps(void) {
 mm256_emu ret;
ret.lo = _mm_setzero_ps();
ret.hi = ret.lo;
@@ -297,7 +302,7 @@
const __m256 K1 = _mm256_set1_ps(0.69583354f);
const __m256 K2 = _mm256_set1_ps(0.22606716f);
const __m256 K3 = _mm256_set1_ps(0.078024523f);
- const __m256 log2_E = _mm256_set1_ps(1.44269504);
+ const __m256 log2_E = _mm256_set1_ps(1.44269504f);
const __m256 max_in = _mm256_set1_ps(50.f);
const __m256 min_in = _mm256_set1_ps(-50.f);
__m256 XF, Y;
@@ -519,7 +524,7 @@
#endif
-static inline float celt_exp(float x)
+static inline float lpcnet_exp(float x)
 {
 float out[8];
__m256 X, Y;
@@ -540,7 +545,7 @@
_mm256_storeu_ps(&y[i], Y);
}
for (;i<N;i++)
- y[i] = celt_exp(x[i]);
+ y[i] = lpcnet_exp(x[i]);
}
#ifdef __AVX__
--- a/dnn/vec_neon.h
+++ b/dnn/vec_neon.h
@@ -33,7 +33,12 @@
#ifndef DISABLE_DOT_PROD
#define DOT_PROD
#endif
+
+#ifdef DOT_PROD
typedef signed char qweight;
+#else
+typedef float qweight;
+#endif
#ifndef LPCNET_TEST
@@ -105,7 +110,7 @@
return vmaxq_f32(min_out, vminq_f32(max_out, num));
}
-static inline float celt_exp(float x)
+static inline float lpcnet_exp(float x)
 {
 float out[4];
float32x4_t X, Y;
@@ -146,7 +151,7 @@
vst1q_f32(&y[i], Y);
}
for (;i<N;i++)
- y[i] = celt_exp(x[i]);
+ y[i] = lpcnet_exp(x[i]);
}
static inline void vec_tanh(float *y, const float *x, int N)
@@ -162,7 +167,7 @@
for (;i<N;i++)
 {
 float ex2;
- ex2 = celt_exp(2*x[i]);
+ ex2 = lpcnet_exp(2*x[i]);
y[i] = (ex2-1)/(ex2+1);
}
}
@@ -180,7 +185,7 @@
for (;i<N;i++)
 {
 float ex;
- ex = celt_exp(x[i]);
+ ex = lpcnet_exp(x[i]);
y[i] = (ex)/(ex+1);
}
}
--- /dev/null
+++ b/dnn/write_lpcnet_weights.c
@@ -1,0 +1,66 @@
+/* Copyright (c) 2023 Amazon */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include <string.h>
+#include "nnet.h"
+
+extern const WeightArray lpcnet_arrays[];
+extern const WeightArray lpcnet_plc_arrays[];
+
+void write_weights(const WeightArray *list, FILE *fout)
+{
+ int i=0;
+ unsigned char zeros[WEIGHT_BLOCK_SIZE] = {0};
+ while (list[i].name != NULL) {
+ WeightHead h;
+ memcpy(h.head, "DNNw", 4);
+ h.version = WEIGHT_BLOB_VERSION;
+ h.type = list[i].type;
+ h.size = list[i].size;
+ h.block_size = (h.size+WEIGHT_BLOCK_SIZE-1)/WEIGHT_BLOCK_SIZE*WEIGHT_BLOCK_SIZE;
+ RNN_CLEAR(h.name, sizeof(h.name));
+ strncpy(h.name, list[i].name, sizeof(h.name));
+ h.name[sizeof(h.name)-1] = 0;
+ celt_assert(sizeof(h) == WEIGHT_BLOCK_SIZE);
+ fwrite(&h, 1, WEIGHT_BLOCK_SIZE, fout);
+ fwrite(list[i].data, 1, h.size, fout);
+ fwrite(zeros, 1, h.block_size-h.size, fout);
+ i++;
+ }
+}
+
+int main(void)
+{
+ FILE *fout = fopen("weights_blob.bin", "wb");
+ write_weights(lpcnet_arrays, fout);
+ write_weights(lpcnet_plc_arrays, fout);
+ fclose(fout);
+ return 0;
+}
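Every block header written by write_weights() begins with the four ASCII bytes "DNNw", so the emitted weights_blob.bin can be sanity-checked without knowing the full header layout. A rough sketch (approximate only, since data bytes could in principle also contain the pattern):

with open("weights_blob.bin", "rb") as f:
    blob = f.read()

assert blob.startswith(b"DNNw"), "not a blob produced by dump_weights_blob"
print("approximate number of weight arrays:", blob.count(b"DNNw"))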
--