ref: f0ec990dba011b2862c60a8903954d782cb92d19
parent: da7f4c6c99d1f4e4153f97ee5a6bbd07199507f3
author: Jean-Marc Valin <jmvalin@amazon.com>
date: Sun Oct 1 22:23:41 EDT 2023
Switching to neural pitch estimator. Remove old pitch estimator and retrain all models.
--- a/autogen.sh
+++ b/autogen.sh
@@ -9,7 +9,7 @@
srcdir=`dirname $0`
test -n "$srcdir" && cd "$srcdir"
-dnn/download_model.sh 27663d3
+dnn/download_model.sh da7f4c6
echo "Updating build configuration files, please wait...."
--- a/dnn/lpcnet_enc.c
+++ b/dnn/lpcnet_enc.c
@@ -51,6 +51,7 @@
int lpcnet_encoder_init(LPCNetEncState *st) {memset(st, 0, sizeof(*st));
st->exc_mem = lin2ulaw(0.f);
+ pitchdnn_init(&st->pitchdnn);
return 0;
}
@@ -98,7 +99,6 @@
float Ex[NB_BANDS];
float xcorr[PITCH_MAX_PERIOD];
float ener0;
- int sub;
float ener;
/* [b,a]=ellip(2, 2, 20, 1200/8000); */
static const float lp_b[2] = {-0.84946f, 1.f};@@ -163,110 +163,21 @@
}
/*printf("\n");*/}
- /* Cross-correlation on half-frames. */
- for (sub=0;sub<2;sub++) {- int off = sub*FRAME_SIZE/2;
- double ener1;
- celt_pitch_xcorr(&st->exc_buf[PITCH_MAX_PERIOD+off], st->exc_buf+off, xcorr, FRAME_SIZE/2, PITCH_MAX_PERIOD, st->arch);
- ener0 = celt_inner_prod_c(&st->exc_buf[PITCH_MAX_PERIOD+off], &st->exc_buf[PITCH_MAX_PERIOD+off], FRAME_SIZE/2);
- ener1 = celt_inner_prod_c(&st->exc_buf[off], &st->exc_buf[off], FRAME_SIZE/2-1);
- st->frame_weight[sub] = ener0;
- /*printf("%f\n", st->frame_weight[sub]);*/- for (i=0;i<PITCH_MAX_PERIOD;i++) {- ener1 += st->exc_buf[i+off+FRAME_SIZE/2-1]*st->exc_buf[i+off+FRAME_SIZE/2-1];
- ener = 1 + ener0 + ener1;
- st->xc[sub][i] = 2*xcorr[i] / ener;
- ener1 -= st->exc_buf[i+off]*st->exc_buf[i+off];
- }
- if (1) {- /* Upsample correlation by 3x and keep the max. */
- float interpolated[PITCH_MAX_PERIOD]={0};- /* interp=sinc([-3:3]+1/3).*(.5+.5*cos(pi*[-3:3]/4.5)); interp=interp/sum(interp); */
- static const float interp[7] = {0.026184f, -0.098339f, 0.369938f, 0.837891f, -0.184969f, 0.070242f, -0.020947f};- for (i=4;i<PITCH_MAX_PERIOD-4;i++) {- float val1=0, val2=0;
- int j;
- for (j=0;j<7;j++) {- val1 += st->xc[sub][i-3+j]*interp[j];
- val2 += st->xc[sub][i+3-j]*interp[j];
- interpolated[i] = MAX16(st->xc[sub][i], MAX16(val1, val2));
- }
- }
- for (i=4;i<PITCH_MAX_PERIOD-4;i++) {- st->xc[sub][i] = interpolated[i];
- }
- }
-#if 0
- for (i=0;i<PITCH_MAX_PERIOD;i++)
- printf("%f ", st->xc[sub][i]);- printf("\n");-#endif
- }
+ st->dnn_pitch = compute_pitchdnn(&st->pitchdnn, st->if_features, st->xcorr_features);
}
void process_single_frame(LPCNetEncState *st, FILE *ffeat) {- int i;
- int sub;
- int best_i;
- int best[4];
- int pitch_prev[2][PITCH_MAX_PERIOD];
float frame_corr;
- float frame_weight_sum = 1e-15f;
- for(sub=0;sub<2;sub++) frame_weight_sum += st->frame_weight[sub];
- for(sub=0;sub<2;sub++) st->frame_weight[sub] *= (2.f/frame_weight_sum);
- for(sub=0;sub<2;sub++) {- float max_path_all = -1e15f;
- best_i = 0;
- for (i=0;i<PITCH_MAX_PERIOD-2*PITCH_MIN_PERIOD;i++) {- float xc_half = MAX16(MAX16(st->xc[sub][(PITCH_MAX_PERIOD+i)/2], st->xc[sub][(PITCH_MAX_PERIOD+i+2)/2]), st->xc[sub][(PITCH_MAX_PERIOD+i-1)/2]);
- if (st->xc[sub][i] < xc_half*1.1f) st->xc[sub][i] *= .8f;
- }
- for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) {- int j;
- float max_prev;
- max_prev = st->pitch_max_path_all - 6.f;
- pitch_prev[sub][i] = st->best_i;
- for (j=IMAX(-4, -i);j<=4 && i+j<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;j++) {- if (st->pitch_max_path[0][i+j] - .02f*abs(j)*abs(j) > max_prev) {- max_prev = st->pitch_max_path[0][i+j] - .02f*abs(j)*abs(j);
- pitch_prev[sub][i] = i+j;
- }
- }
- st->pitch_max_path[1][i] = max_prev + st->frame_weight[sub]*st->xc[sub][i];
- if (st->pitch_max_path[1][i] > max_path_all) {- max_path_all = st->pitch_max_path[1][i];
- best_i = i;
- }
- }
- /* Renormalize. */
- for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) st->pitch_max_path[1][i] -= max_path_all;
- /*for (i=0;i<PITCH_MAX_PERIOD-PITCH_MIN_PERIOD;i++) printf("%f ", st->pitch_max_path[1][i]);- printf("\n");*/- OPUS_COPY(&st->pitch_max_path[0][0], &st->pitch_max_path[1][0], PITCH_MAX_PERIOD);
- st->pitch_max_path_all = max_path_all;
- st->best_i = best_i;
- }
- best_i = st->best_i;
- frame_corr = 0;
- /* Backward pass. */
- for (sub=1;sub>=0;sub--) {- best[2+sub] = PITCH_MAX_PERIOD-best_i;
- frame_corr += st->frame_weight[sub]*st->xc[sub][best_i];
- best_i = pitch_prev[sub][best_i];
- }
- frame_corr /= 2;
- if (0) {- float xy, xx, yy;
- int pitch = (best[2]+best[3])/2;
- xx = celt_inner_prod_c(&st->lp_buf[PITCH_MAX_PERIOD], &st->lp_buf[PITCH_MAX_PERIOD], FRAME_SIZE);
- yy = celt_inner_prod_c(&st->lp_buf[PITCH_MAX_PERIOD-pitch], &st->lp_buf[PITCH_MAX_PERIOD-pitch], FRAME_SIZE);
- xy = celt_inner_prod_c(&st->lp_buf[PITCH_MAX_PERIOD], &st->lp_buf[PITCH_MAX_PERIOD-pitch], FRAME_SIZE);
- //printf("%f %f\n", frame_corr, xy/sqrt(1e-15+xx*yy));- frame_corr = xy/sqrt(1+xx*yy);
- //frame_corr = MAX32(0, xy/sqrt(1+xx*yy));
- frame_corr = log(1.f+exp(5.f*frame_corr))/log(1+exp(5.f));
- }
- st->features[NB_BANDS] = .01f*(IMAX(66, IMIN(510, best[2]+best[3]))-200);
+ float xy, xx, yy;
+ /*int pitch = (best[2]+best[3])/2;*/
+ int pitch = (int)floor(.5+256./pow(2.f,((1./60.)*((st->dnn_pitch+1.5)*60))));
+ xx = celt_inner_prod_c(&st->lp_buf[PITCH_MAX_PERIOD], &st->lp_buf[PITCH_MAX_PERIOD], FRAME_SIZE);
+ yy = celt_inner_prod_c(&st->lp_buf[PITCH_MAX_PERIOD-pitch], &st->lp_buf[PITCH_MAX_PERIOD-pitch], FRAME_SIZE);
+ xy = celt_inner_prod_c(&st->lp_buf[PITCH_MAX_PERIOD], &st->lp_buf[PITCH_MAX_PERIOD-pitch], FRAME_SIZE);
+ /*printf("%f %f\n", frame_corr, xy/sqrt(1e-15+xx*yy));*/+ frame_corr = xy/sqrt(1+xx*yy);
+ frame_corr = log(1.f+exp(5.f*frame_corr))/log(1+exp(5.f));
+ st->features[NB_BANDS] = st->dnn_pitch;
st->features[NB_BANDS + 1] = frame_corr-.5f;
if (ffeat) {fwrite(st->features, sizeof(float), NB_TOTAL_FEATURES, ffeat);
--- a/dnn/lpcnet_private.h
+++ b/dnn/lpcnet_private.h
@@ -7,9 +7,8 @@
#include "nnet_data.h"
#include "plc_data.h"
#include "kiss99.h"
+#include "pitchdnn.h"
-#define PITCH_MIN_PERIOD 32
-#define PITCH_MAX_PERIOD 256
#define PITCH_FRAME_SIZE 320
#define PITCH_BUF_SIZE (PITCH_MAX_PERIOD+PITCH_FRAME_SIZE)
@@ -44,6 +43,7 @@
};
struct LPCNetEncState{+ PitchDNNState pitchdnn;
int arch;
float analysis_mem[OVERLAP_SIZE];
float mem_preemph;
@@ -50,16 +50,12 @@
kiss_fft_cpx prev_if[PITCH_IF_MAX_FREQ];
float if_features[PITCH_IF_FEATURES];
float xcorr_features[PITCH_MAX_PERIOD - PITCH_MIN_PERIOD];
+ float dnn_pitch;
float pitch_mem[LPC_ORDER];
float pitch_filt;
- float xc[2][PITCH_MAX_PERIOD+1];
- float frame_weight[2];
float exc_buf[PITCH_BUF_SIZE];
float lp_buf[PITCH_BUF_SIZE];
float lp_mem[4];
- float pitch_max_path[2][PITCH_MAX_PERIOD];
- float pitch_max_path_all;
- int best_i;
float last_gain;
int last_period;
float lpc[LPC_ORDER];
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -408,14 +408,14 @@
storing the output as [ out_channels x len2 ].
We assume that the output dimension along the ksize1 axis is 1,
i.e. processing one frame at a time. */
-void conv2d_float(float *out, const float *weights, int in_channels, int out_channels, int ktime, int kheight, const float *in, int len2)
+void conv2d_float(float *out, const float *weights, int in_channels, int out_channels, int ktime, int kheight, const float *in, int height, int hstride)
{int i;
int in_stride;
- in_stride = len2+kheight-1;
- OPUS_CLEAR(out, out_channels*len2);
+ in_stride = height+kheight-1;
for (i=0;i<out_channels;i++) {int m;
+ OPUS_CLEAR(&out[i*hstride], height);
for (m=0;m<in_channels;m++) {int t;
for (t=0;t<ktime;t++) {@@ -422,8 +422,8 @@
int h;
for (h=0;h<kheight;h++) {int j;
- for (j=0;j<len2;j++) {- out[i*len2 + j] += weights[i*in_channels*ktime*kheight + m*ktime*kheight + t*kheight + h] *
+ for (j=0;j<height;j++) {+ out[i*hstride + j] += weights[i*in_channels*ktime*kheight + m*ktime*kheight + t*kheight + h] *
in[t*in_channels*in_stride + m*in_stride + j + h];
}
}
@@ -432,9 +432,9 @@
}
}
-#define MAX_CONV2D_INPUTS 2048
+#define MAX_CONV2D_INPUTS 8192
-void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int len2, int activation)
+void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation)
{int i;
const float *bias;
@@ -441,17 +441,22 @@
float in_buf[MAX_CONV2D_INPUTS];
int time_stride;
celt_assert(in != out);
- time_stride = conv->in_channels*(len2+conv->kheight);
+ time_stride = conv->in_channels*(height+conv->kheight-1);
celt_assert(conv->ktime*time_stride <= MAX_CONV2D_INPUTS);
OPUS_COPY(in_buf, mem, (conv->ktime-1)*time_stride);
OPUS_COPY(&in_buf[(conv->ktime-1)*time_stride], in, time_stride);
OPUS_COPY(mem, &in_buf[time_stride], (conv->ktime-1)*time_stride);
bias = conv->bias;
- conv2d_float(out, conv->float_weights, conv->in_channels, conv->out_channels, conv->ktime, conv->kheight, in_buf, len2);
+ conv2d_float(out, conv->float_weights, conv->in_channels, conv->out_channels, conv->ktime, conv->kheight, in_buf, height, hstride);
if (bias != NULL) {- for (i=0;i<conv->out_channels*len2;i++) out[i] += bias[i];
+ for (i=0;i<conv->out_channels;i++) {+ int j;
+ for (j=0;j<height;j++) out[i*hstride+j] += bias[i];
+ }
}
- compute_activation(out, out, conv->out_channels*len2, activation);
+ for (i=0;i<conv->out_channels;i++) {+ compute_activation(&out[i*hstride], &out[i*hstride], height, activation);
+ }
}
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -189,6 +189,14 @@
int nb_inputs,
int nb_outputs);
+int conv2d_init(Conv2dLayer *layer, const WeightArray *arrays,
+ const char *bias,
+ const char *float_weights,
+ int in_channels,
+ int out_channels,
+ int ktime,
+ int kheight);
+
int mdense_init(MDenseLayer *layer, const WeightArray *arrays,
const char *bias,
const char *input_weights,
@@ -234,7 +242,7 @@
int nb_neurons,
int activation);
-void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int len2, int activation);
+void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation);
int embedding_init(EmbeddingLayer *layer, const WeightArray *arrays,
const char *embedding_weights,
--- a/dnn/parse_lpcnet_weights.c
+++ b/dnn/parse_lpcnet_weights.c
@@ -272,6 +272,31 @@
return 0;
}
+int conv2d_init(Conv2dLayer *layer, const WeightArray *arrays,
+ const char *bias,
+ const char *float_weights,
+ int in_channels,
+ int out_channels,
+ int ktime,
+ int kheight)
+{+ int err;
+ layer->bias = NULL;
+ layer->float_weights = NULL;
+ if (bias != NULL) {+ if ((layer->bias = find_array_check(arrays, bias, out_channels*sizeof(layer->bias[0]))) == NULL) return 1;
+ }
+ if (float_weights != NULL) {+ layer->float_weights = opt_array_check(arrays, float_weights, in_channels*out_channels*ktime*kheight*sizeof(layer->float_weights[0]), &err);
+ if (err) return 1;
+ }
+ layer->in_channels = in_channels;
+ layer->out_channels = out_channels;
+ layer->ktime = ktime;
+ layer->kheight = kheight;
+ return 0;
+}
+
int embedding_init(EmbeddingLayer *layer, const WeightArray *arrays,
const char *embedding_weights,
int nb_inputs,
--- a/dnn/pitchdnn.c
+++ b/dnn/pitchdnn.c
@@ -9,7 +9,7 @@
#include "lpcnet_private.h"
-int compute_pitchdnn(
+float compute_pitchdnn(
PitchDNNState *st,
const float *if_features,
const float *xcorr_features
@@ -18,35 +18,40 @@
float if1_out[DENSE_IF_UPSAMPLER_1_OUT_SIZE];
float downsampler_in[NB_XCORR_FEATURES + DENSE_IF_UPSAMPLER_2_OUT_SIZE];
float downsampler_out[DENSE_DOWNSAMPLER_OUT_SIZE];
- float conv1_tmp1[NB_XCORR_FEATURES + 2] = {0};- float conv1_tmp2[NB_XCORR_FEATURES + 2] = {0};+ float conv1_tmp1[(NB_XCORR_FEATURES + 2)*8] = {0};+ float conv1_tmp2[(NB_XCORR_FEATURES + 2)*8] = {0};float output[DENSE_FINAL_UPSAMPLER_OUT_SIZE];
int i;
int pos=0;
float maxval=-1;
+ float sum=0;
+ float count=0;
PitchDNN *model = &st->model;
-
/* IF */
compute_generic_dense(&model->dense_if_upsampler_1, if1_out, if_features, ACTIVATION_TANH);
compute_generic_dense(&model->dense_if_upsampler_2, &downsampler_in[NB_XCORR_FEATURES], if1_out, ACTIVATION_TANH);
-
/* xcorr*/
OPUS_COPY(&conv1_tmp1[1], xcorr_features, NB_XCORR_FEATURES);
- compute_conv2d(&model->conv2d_1, &conv1_tmp2[1], st->xcorr_mem1, conv1_tmp1, NB_XCORR_FEATURES, ACTIVATION_TANH);
- compute_conv2d(&model->conv2d_1, &conv1_tmp1[1], st->xcorr_mem2, conv1_tmp2, NB_XCORR_FEATURES, ACTIVATION_TANH);
- compute_conv2d(&model->conv2d_1, downsampler_in, st->xcorr_mem3, conv1_tmp1, NB_XCORR_FEATURES, ACTIVATION_TANH);
+ compute_conv2d(&model->conv2d_1, &conv1_tmp2[1], st->xcorr_mem1, conv1_tmp1, NB_XCORR_FEATURES, NB_XCORR_FEATURES+2, ACTIVATION_TANH);
+ compute_conv2d(&model->conv2d_2, &conv1_tmp1[1], st->xcorr_mem2, conv1_tmp2, NB_XCORR_FEATURES, NB_XCORR_FEATURES+2, ACTIVATION_TANH);
+ compute_conv2d(&model->conv2d_3, downsampler_in, st->xcorr_mem3, conv1_tmp1, NB_XCORR_FEATURES, NB_XCORR_FEATURES, ACTIVATION_TANH);
compute_generic_dense(&model->dense_downsampler, downsampler_out, downsampler_in, ACTIVATION_TANH);
compute_generic_gru(&model->gru_1_input, &model->gru_1_recurrent, st->gru_state, downsampler_out);
compute_generic_dense(&model->dense_final_upsampler, output, st->gru_state, ACTIVATION_LINEAR);
-
- for (i=0;i<DENSE_FINAL_UPSAMPLER_OUT_SIZE;i++) {+ for (i=0;i<180;i++) { if (output[i] > maxval) {pos = i;
maxval = output[i];
}
}
- return (1.f/60.f)*pos - 1.5;
+ for (i=IMAX(0, pos-2); i<=IMIN(179, pos+2); i++) {+ float p = exp(output[i]);
+ sum += p*i;
+ count += p;
+ }
+ /*printf("%d %f\n", pos, sum/count);*/+ return (1.f/60.f)*(sum/count) - 1.5;
/*return 256.f/pow(2.f, (1.f/60.f)*i);*/
}
@@ -55,7 +60,11 @@
{int ret;
OPUS_CLEAR(st, 1);
+#ifndef USE_WEIGHTS_FILE
ret = init_pitchdnn(&st->model, pitchdnn_arrays);
+#else
+ ret = 0;
+#endif
celt_assert(ret == 0);
/* FIXME: perform arch detection. */
}
--- a/dnn/pitchdnn.h
+++ b/dnn/pitchdnn.h
@@ -5,8 +5,10 @@
typedef struct PitchDNN PitchDNN;
#include "pitchdnn_data.h"
-#include "lpcnet_private.h"
+#define PITCH_MIN_PERIOD 32
+#define PITCH_MAX_PERIOD 256
+
#define NB_XCORR_FEATURES (PITCH_MAX_PERIOD-PITCH_MIN_PERIOD)
@@ -21,7 +23,7 @@
void pitchdnn_init(PitchDNNState *st);
-int compute_pitchdnn(
+float compute_pitchdnn(
PitchDNNState *st,
const float *if_features,
const float *xcorr_features
--- a/dnn/torch/neural-pitch/training.py
+++ b/dnn/torch/neural-pitch/training.py
@@ -24,6 +24,7 @@
parser.add_argument('--epochs', type=int, help='Number of training epochs',default = 50,required = False) parser.add_argument('--choice_cel', type=str, help='Choice of Cross Entropy Loss (default or robust)',choices=['default','robust'],default = 'default',required = False) parser.add_argument('--prefix', type=str, help="prefix for model export, default: model", default='model')+parser.add_argument('--initial-checkpoint', type=str, help='initial checkpoint to start training from, default: None', default=None)args = parser.parse_args()
@@ -54,6 +55,11 @@
pitch_nn = PitchDNNXcorr(args.xcorr_dimension, args.gru_dim, args.output_dim)
else:
pitch_nn = PitchDNN(3 * args.freq_keep - 2, 224, args.gru_dim, args.output_dim)
+
+if type(args.initial_checkpoint) != type(None):
+ checkpoint = torch.load(args.initial_checkpoint, map_location='cpu')
+ pitch_nn.load_state_dict(checkpoint['state_dict'], strict=False)
+
dataset_training = PitchDNNDataloader(args.features,args.features_pitch,args.confidence_threshold,args.context,args.data_format)
--- a/dnn/torch/rdovae/rdovae/rdovae.py
+++ b/dnn/torch/rdovae/rdovae/rdovae.py
@@ -159,7 +159,7 @@
raise ValueError('distortion loss is designed to work with 20 features')ceps_error = y_pred[..., :18] - y_true[..., :18]
- pitch_error = 2 * (y_pred[..., 18:19] - y_true[..., 18:19]) / (2 + y_true[..., 18:19])
+ pitch_error = 2*(y_pred[..., 18:19] - y_true[..., 18:19])
corr_error = y_pred[..., 19:] - y_true[..., 19:]
pitch_weight = torch.relu(y_true[..., 19:] + 0.5) ** 2
--- a/lpcnet_headers.mk
+++ b/lpcnet_headers.mk
@@ -24,4 +24,6 @@
dnn/dred_rdovae_enc_data.h \
dnn/dred_rdovae_dec.h \
dnn/dred_rdovae_dec_data.h \
-dnn/dred_rdovae_stats_data.h
+dnn/dred_rdovae_stats_data.h \
+dnn/pitchdnn.h \
+dnn/pitchdnn_data.h
--- a/lpcnet_sources.mk
+++ b/lpcnet_sources.mk
@@ -18,6 +18,8 @@
dnn/dred_rdovae_dec.c \
dnn/dred_rdovae_dec_data.c \
dnn/dred_rdovae_stats_data.c \
+dnn/pitchdnn.c \
+dnn/pitchdnn_data.c \
silk/dred_encoder.c \
silk/dred_coding.c \
silk/dred_decoder.c
--- a/silk/dred_config.h
+++ b/silk/dred_config.h
@@ -32,7 +32,7 @@
#define DRED_EXTENSION_ID 126
/* Remove these two completely once DRED gets an extension number assigned. */
-#define DRED_EXPERIMENTAL_VERSION 4
+#define DRED_EXPERIMENTAL_VERSION 5
#define DRED_EXPERIMENTAL_BYTES 2
--
⑨