shithub: opus

--- a/dnn/lpcnet.py

+++ b/dnn/lpcnet.py

@@ -10,7 +10,8 @@

 import h5py

 import sys

-rnn_units=128

+rnn_units1=128

+rnn_units2=32

 pcm_bits = 8

 embed_size = 128

 pcm_levels = 2**pcm_bits

@@ -47,7 +48,8 @@

     feat = Input(shape=(None, nb_used_features))

     pitch = Input(shape=(None, 1))

     dec_feat = Input(shape=(None, 128))

-    dec_state = Input(shape=(rnn_units,))

+    dec_state1 = Input(shape=(rnn_units1,))

+    dec_state2 = Input(shape=(rnn_units2,))

     fconv1 = Conv1D(128, 3, padding='same', activation='tanh')

     fconv2 = Conv1D(102, 3, padding='same', activation='tanh')

@@ -70,18 +72,21 @@

     rep = Lambda(lambda x: K.repeat_elements(x, 160, 1))

-    rnn = CuDNNGRU(rnn_units, return_sequences=True, return_state=True)

+    rnn = CuDNNGRU(rnn_units1, return_sequences=True, return_state=True)

+    rnn2 = CuDNNGRU(rnn_units2, return_sequences=True, return_state=True)

     rnn_in = Concatenate()([cpcm, cexc, rep(cfeat)])

     md = MDense(pcm_levels, activation='softmax')

-    gru_out, state = rnn(rnn_in)

-    ulaw_prob = md(gru_out)

+    gru_out1, _ = rnn(rnn_in)

+    gru_out2, _ = rnn2(gru_out1)

+    ulaw_prob = md(gru_out2)

     model = Model([pcm, exc, feat, pitch], ulaw_prob)

     encoder = Model([feat, pitch], cfeat)

     dec_rnn_in = Concatenate()([cpcm, cexc, dec_feat])

-    dec_gru_out, state = rnn(dec_rnn_in, initial_state=dec_state)

-    dec_ulaw_prob = md(dec_gru_out)

+    dec_gru_out1, state1 = rnn(dec_rnn_in, initial_state=dec_state1)

+    dec_gru_out2, state2 = rnn2(dec_gru_out1, initial_state=dec_state2)

+    dec_ulaw_prob = md(dec_gru_out2)

-    decoder = Model([pcm, exc, dec_feat, dec_state], [dec_ulaw_prob, state])

+    decoder = Model([pcm, exc, dec_feat, dec_state1, dec_state2], [dec_ulaw_prob, state1, state2])

     return model, encoder, decoder

--- a/dnn/test_wavenet_audio.py

+++ b/dnn/test_wavenet_audio.py

@@ -59,7 +59,7 @@

 out_data = np.reshape(data, (nb_frames*pcm_chunk_size, 1))

-model.load_weights('wavenet4f2_30.h5')

+model.load_weights('wavenet5d0_19.h5')

 order = 16

@@ -66,7 +66,8 @@

 pcm = 0.*out_data

 fexc = np.zeros((1, 1, 2), dtype='float32')

 iexc = np.zeros((1, 1, 1), dtype='int16')

-state = np.zeros((1, lpcnet.rnn_units), dtype='float32')

+state1 = np.zeros((1, lpcnet.rnn_units1), dtype='float32')

+state2 = np.zeros((1, lpcnet.rnn_units2), dtype='float32')

 for c in range(1, nb_frames):

     cfeat = enc.predict([features[c:c+1, :, :nb_used_features], periods[c:c+1, :, :]])

     for fr in range(1, feature_chunk_size):

@@ -82,7 +83,7 @@

             pred = -sum(a*pcm[f*frame_size + i - 1:f*frame_size + i - order-1:-1, 0])

             fexc[0, 0, 1] = lin2ulaw(pred)

-            p, state = dec.predict([fexc, iexc, cfeat[:, fr:fr+1, :], state])

+            p, state1, state2 = dec.predict([fexc, iexc, cfeat[:, fr:fr+1, :], state1, state2])

             #p = p*p

             #p = p/(1e-18 + np.sum(p))

             p = np.maximum(p-0.001, 0).astype('float64')

--- a/dnn/train_wavenet_audio.py

+++ b/dnn/train_wavenet_audio.py

@@ -86,7 +86,7 @@

 in_data = np.concatenate([in_data, pred], axis=-1)

-checkpoint = ModelCheckpoint('wavenet5b_{epoch:02d}.h5')

+checkpoint = ModelCheckpoint('wavenet5d0_{epoch:02d}.h5')

 #model.load_weights('wavenet4f2_30.h5')

 model.compile(optimizer=Adam(0.001, amsgrad=True, decay=2e-4), loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])

--

⑨