shithub: opus

--- a/dnn/dump_data.c

+++ b/dnn/dump_data.c

@@ -138,9 +138,18 @@

   int encode = 0;

   int decode = 0;

   int quantize = 0;

+  int burg = 0;

   srand(getpid());

   st = lpcnet_encoder_create();

   argv0=argv[0];

+  if (argc == 5 && strcmp(argv[1], "-btrain")==0) {

+      burg = 1;

+      training = 1;

+  }

+  if (argc == 4 && strcmp(argv[1], "-btest")==0) {

+      burg = 1;

+      training = 0;

+  }

   if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1;

   if (argc == 5 && strcmp(argv[1], "-qtrain")==0) {

       training = 1;

@@ -236,7 +245,8 @@

     if (count*FRAME_SIZE_5MS>=10000000 && one_pass_completed) break;

     if (training && ++gain_change_count > 2821) {

       float tmp, tmp2;

-      speech_gain = pow(10., (-20+(rand()%40))/20.);

+      speech_gain = pow(10., (-30+(rand()%40))/20.);

+      if (rand()&1) speech_gain = -speech_gain;

       if (rand()%20==0) speech_gain *= .01;

       if (rand()%100==0) speech_gain = 0;

       gain_change_count = 0;

@@ -247,7 +257,6 @@

     biquad(x, mem_hp_x, x, b_hp, a_hp, FRAME_SIZE);

     biquad(x, mem_resp_x, x, b_sig, a_sig, FRAME_SIZE);

-    preemphasis(x, &mem_preemph, x, PREEMPHASIS, FRAME_SIZE);

     for (i=0;i<FRAME_SIZE;i++) {

       float g;

       float f = (float)i/FRAME_SIZE;

@@ -254,6 +263,12 @@

       g = f*speech_gain + (1-f)*old_speech_gain;

       x[i] *= g;

+    if (burg) {

+      float ceps[2*NB_BANDS];

+      burg_cepstral_analysis(ceps, x);

+      fwrite(ceps, sizeof(float), 2*NB_BANDS, ffeat);

+    }

+    preemphasis(x, &mem_preemph, x, PREEMPHASIS, FRAME_SIZE);

     for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5;

     /* PCM is delayed by 1/2 frame to make the features centered on the frames. */

     for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) pcm[i+TRAINING_OFFSET] = float2short(x[i]);

--- a/dnn/freq.c

+++ b/dnn/freq.c

@@ -155,7 +155,7 @@

-void compute_burg_cepstrum(const short *pcm, float *burg_cepstrum, int len, int order) {

+void compute_burg_cepstrum(const float *pcm, float *burg_cepstrum, int len, int order) {

   int i;

   float burg_in[FRAME_SIZE];

   float burg_lpc[LPC_ORDER];

@@ -188,6 +188,19 @@

   dct(burg_cepstrum, Ly);

   burg_cepstrum[0] += - 4;

+}

+void burg_cepstral_analysis(float *ceps, const float *x) {

+  int i;

+  compute_burg_cepstrum(x,                &ceps[0       ], FRAME_SIZE/2, LPC_ORDER);

+  compute_burg_cepstrum(&x[FRAME_SIZE/2], &ceps[NB_BANDS], FRAME_SIZE/2, LPC_ORDER);

+  for (i=0;i<NB_BANDS;i++) {

+    float c0, c1;

+    c0 = ceps[i];

+    c1 = ceps[NB_BANDS+i];

+    ceps[i         ] = .5*(c0+c1);

+    ceps[NB_BANDS+i] = (c0-c1);

+  }

 void compute_band_corr(float *bandE, const kiss_fft_cpx *X, const kiss_fft_cpx *P) {

--- a/dnn/freq.h

+++ b/dnn/freq.h

@@ -47,7 +47,8 @@

 void compute_band_energy(float *bandE, const kiss_fft_cpx *X);

 void compute_band_corr(float *bandE, const kiss_fft_cpx *X, const kiss_fft_cpx *P);

-void compute_burg_cepstrum(const short *pcm, float *burg_cepstrum, int len, int order);

+void compute_burg_cepstrum(const float *pcm, float *burg_cepstrum, int len, int order);

+void burg_cepstral_analysis(float *ceps, const float *x);

 void apply_window(float *x);

 void dct(float *out, const float *in);

--- a/dnn/lpcnet_plc.c

+++ b/dnn/lpcnet_plc.c

@@ -75,7 +75,9 @@

   float x[FRAME_SIZE];

   short output[FRAME_SIZE];

 #if PLC_DNN_PRED

-  float plc_features[NB_FEATURES+1];

+  float plc_features[2*NB_BANDS+NB_FEATURES+1];

+  for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];

+  burg_cepstral_analysis(plc_features, x);

 #endif

   st->enc.pcount = 0;

   if (st->skip_analysis) {

@@ -105,8 +107,8 @@

   process_single_frame(&st->enc, NULL);

 #if PLC_DNN_PRED

   if (st->skip_analysis <= 1) {

-    RNN_COPY(plc_features, st->enc.features[0], NB_FEATURES);

-    plc_features[NB_FEATURES] = 1;

+    RNN_COPY(&plc_features[2*NB_BANDS], st->enc.features[0], NB_FEATURES);

+    plc_features[2*NB_BANDS+NB_FEATURES] = 1;

     compute_plc_pred(&st->plc_net, st->features, plc_features);

 #else

@@ -142,7 +144,7 @@

   int i;

 #endif

   short output[FRAME_SIZE];

-  float zeros[NB_FEATURES+1] = {0};

+  float zeros[2*NB_BANDS+NB_FEATURES+1] = {0};

   st->enc.pcount = 0;

   /* If we concealed the previous frame, finish synthesizing the rest of the samples. */

   /* FIXME: Copy/predict features. */

--- a/dnn/lpcnet_private.h

+++ b/dnn/lpcnet_private.h

@@ -64,6 +64,7 @@

   float features[4][NB_TOTAL_FEATURES];

   float sig_mem[LPC_ORDER];

   int exc_mem;

+  float burg_cepstrum[2*NB_BANDS];

};

 #define PLC_BUF_SIZE (FEATURES_DELAY*FRAME_SIZE + TRAINING_OFFSET)

--- a/dnn/training_tf2/lpcnet_plc.py

+++ b/dnn/training_tf2/lpcnet_plc.py

@@ -62,8 +62,8 @@

 constraint = WeightClip(0.992)

-def new_lpcnet_plc_model(rnn_units=256, nb_used_features=20, batch_size=128, training=False, adaptation=False, quantize=False, cond_size=128):

-    feat = Input(shape=(None, nb_used_features), batch_size=batch_size)

+def new_lpcnet_plc_model(rnn_units=256, nb_used_features=20, nb_burg_features=36, batch_size=128, training=False, adaptation=False, quantize=False, cond_size=128):

+    feat = Input(shape=(None, nb_used_features+nb_burg_features), batch_size=batch_size)

     lost = Input(shape=(None, 1), batch_size=batch_size)

     fdense1 = Dense(cond_size, activation='tanh', name='plc_dense1')

@@ -96,5 +96,6 @@

     model.rnn_units = rnn_units

     model.cond_size = cond_size

     model.nb_used_features = nb_used_features

+    model.nb_burg_features = nb_burg_features

     return model

--- a/dnn/training_tf2/plc_loader.py

+++ b/dnn/training_tf2/plc_loader.py

@@ -29,12 +29,13 @@

 from tensorflow.keras.utils import Sequence

 class PLCLoader(Sequence):

-    def __init__(self, features, lost, batch_size):

+    def __init__(self, features, lost, nb_burg_features, batch_size):

         self.batch_size = batch_size

         self.nb_batches = features.shape[0]//self.batch_size

         self.features = features[:self.nb_batches*self.batch_size, :, :]

         self.lost = lost.astype('float')

         self.lost = self.lost[:(len(self.lost)//features.shape[1]-1)*features.shape[1]]

+        self.nb_burg_features = nb_burg_features

         self.on_epoch_end()

     def on_epoch_end(self):

@@ -51,7 +52,7 @@

         lost = np.reshape(lost, (features.shape[0], features.shape[1], 1))

         lost_mask = np.tile(lost, (1,1,features.shape[2]))

-        out_features = np.concatenate([features, 1.-lost], axis=-1)

+        out_features = np.concatenate([features[:,:,self.nb_burg_features:], 1.-lost], axis=-1)

         inputs = [features*lost_mask, lost]

         outputs = [out_features]

         return (inputs, outputs)

--- a/dnn/training_tf2/train_plc.py

+++ b/dnn/training_tf2/train_plc.py

@@ -140,8 +140,9 @@

 lpc_order = 16

 feature_file = args.features

-nb_features = model.nb_used_features + lpc_order

+nb_features = model.nb_used_features + lpc_order + model.nb_burg_features

 nb_used_features = model.nb_used_features

+nb_burg_features = model.nb_burg_features

 sequence_size = args.seq_length

 # u for unquantised, load 16 bit PCM samples and convert to mu-law

@@ -153,7 +154,7 @@

 features = np.reshape(features, (nb_sequences, sequence_size, nb_features))

-features = features[:, :, :nb_used_features]

+features = features[:, :, :nb_used_features+model.nb_burg_features]

 lost = np.memmap(args.lost_file, dtype='int8', mode='r')

@@ -169,7 +170,7 @@

 model.save_weights('{}_{}_initial.h5'.format(args.output, args.gru_size))

-loader = PLCLoader(features, lost, batch_size)

+loader = PLCLoader(features, lost, nb_burg_features, batch_size)

 callbacks = [checkpoint]

 if args.logdir is not None:

--

⑨