shithub: opus

Download patch

ref: 2e034f6f312d752440b9e26afa82b0752c34d97b
parent: b0620c0bf9864d9b18ead6b4bb6e0800542a931d
author: Jean-Marc Valin <jmvalin@amazon.com>
date: Mon Nov 13 13:26:31 EST 2023

Adding RTCD for DNN code

Starting with compute_linear()

--- a/Makefile.am
+++ b/Makefile.am
@@ -50,20 +50,32 @@
 if HAVE_RTCD
 CELT_SOURCES += $(CELT_SOURCES_X86_RTCD)
 SILK_SOURCES += $(SILK_SOURCES_X86_RTCD)
+if ENABLE_DEEP_PLC
+LPCNET_SOURCES += $(DNN_SOURCES_X86_RTCD)
 endif
+endif
 if HAVE_SSE
 CELT_SOURCES += $(CELT_SOURCES_SSE)
 endif
 if HAVE_SSE2
 CELT_SOURCES += $(CELT_SOURCES_SSE2)
+if ENABLE_DEEP_PLC
+LPCNET_SOURCES += $(DNN_SOURCES_SSE2)
 endif
+endif
 if HAVE_SSE4_1
 CELT_SOURCES += $(CELT_SOURCES_SSE4_1)
+if ENABLE_DEEP_PLC
+LPCNET_SOURCES += $(DNN_SOURCES_SSE4_1)
 endif
+endif
 if HAVE_AVX2
 CELT_SOURCES += $(CELT_SOURCES_AVX2)
+if ENABLE_DEEP_PLC
+LPCNET_SOURCES += $(DNN_SOURCES_AVX2)
 endif
 endif
+endif
 
 if CPU_ARM
 if HAVE_RTCD
@@ -398,12 +410,14 @@
 endif
 
 if HAVE_SSE2
-SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo)
+SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo) \
+           $(DNN_SOURCES_SSE2:.c=.lo)
 $(SSE2_OBJ): CFLAGS += $(OPUS_X86_SSE2_CFLAGS)
 endif
 
 if HAVE_SSE4_1
 SSE4_1_OBJ = $(CELT_SOURCES_SSE4_1:.c=.lo) \
+             $(DNN_SOURCES_SSE4_1:.c=.lo) \
              $(SILK_SOURCES_SSE4_1:.c=.lo) \
              $(SILK_SOURCES_FIXED_SSE4_1:.c=.lo)
 $(SSE4_1_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS)
@@ -410,7 +424,8 @@
 endif
 
 if HAVE_AVX2
-AVX2_OBJ = $(CELT_SOURCES_AVX2:.c=.lo)
+AVX2_OBJ = $(CELT_SOURCES_AVX2:.c=.lo) \
+           $(DNN_SOURCES_AVX2:.c=.lo)
 $(AVX2_OBJ): CFLAGS += $(OPUS_X86_AVX2_CFLAGS)
 endif
 
--- a/celt/x86/x86cpu.h
+++ b/celt/x86/x86cpu.h
@@ -47,7 +47,7 @@
 # endif
 
 # if defined(OPUS_X86_MAY_HAVE_AVX2)
-#  define MAY_HAVE_AVX2(name) name ## _avx
+#  define MAY_HAVE_AVX2(name) name ## _avx2
 # else
 #  define MAY_HAVE_AVX2(name) name ## _c
 # endif
--- a/dnn/dred_rdovae_dec.c
+++ b/dnn/dred_rdovae_dec.c
@@ -42,12 +42,12 @@
     *init = 1;
 }
 
-void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents)
+void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents, int arch)
 {
     int i;
     RDOVAEDecState dec;
     memset(&dec, 0, sizeof(dec));
-    dred_rdovae_dec_init_states(&dec, model, state);
+    dred_rdovae_dec_init_states(&dec, model, state, arch);
     for (i = 0; i < 2*nb_latents; i += 2)
     {
         dred_rdovae_decode_qframe(
@@ -54,7 +54,8 @@
             &dec,
             model,
             &features[2*i*DRED_NUM_FEATURES],
-            &latents[(i/2)*DRED_LATENT_DIM]);
+            &latents[(i/2)*DRED_LATENT_DIM],
+            arch);
     }
 }
 
@@ -61,14 +62,15 @@
 void dred_rdovae_dec_init_states(
     RDOVAEDecState *h,            /* io: state buffer handle */
     const RDOVAEDec *model,
-    const float *initial_state  /* i: initial state */
+    const float *initial_state,  /* i: initial state */
+    int arch
     )
 {
     float hidden[DEC_HIDDEN_INIT_OUT_SIZE];
     float state_init[DEC_GRU1_STATE_SIZE+DEC_GRU2_STATE_SIZE+DEC_GRU3_STATE_SIZE+DEC_GRU4_STATE_SIZE+DEC_GRU5_STATE_SIZE];
     int counter=0;
-    compute_generic_dense(&model->dec_hidden_init, hidden, initial_state, ACTIVATION_TANH);
-    compute_generic_dense(&model->dec_gru_init, state_init, hidden, ACTIVATION_TANH);
+    compute_generic_dense(&model->dec_hidden_init, hidden, initial_state, ACTIVATION_TANH, arch);
+    compute_generic_dense(&model->dec_gru_init, state_init, hidden, ACTIVATION_TANH, arch);
     OPUS_COPY(h->gru1_state, state_init, DEC_GRU1_STATE_SIZE);
     counter += DEC_GRU1_STATE_SIZE;
     OPUS_COPY(h->gru2_state, &state_init[counter], DEC_GRU2_STATE_SIZE);
@@ -86,7 +88,8 @@
     RDOVAEDecState *dec_state,       /* io: state buffer handle */
     const RDOVAEDec *model,
     float *qframe,              /* o: quadruple feature frame (four concatenated frames in reverse order) */
-    const float *input          /* i: latent vector */
+    const float *input,          /* i: latent vector */
+    int arch
     )
 {
     float buffer[DEC_DENSE1_OUT_SIZE + DEC_GRU1_OUT_SIZE + DEC_GRU2_OUT_SIZE + DEC_GRU3_OUT_SIZE + DEC_GRU4_OUT_SIZE + DEC_GRU5_OUT_SIZE
@@ -94,43 +97,43 @@
     int output_index = 0;
 
     /* run encoder stack and concatenate output in buffer*/
-    compute_generic_dense(&model->dec_dense1, &buffer[output_index], input, ACTIVATION_TANH);
+    compute_generic_dense(&model->dec_dense1, &buffer[output_index], input, ACTIVATION_TANH, arch);
     output_index += DEC_DENSE1_OUT_SIZE;
 
-    compute_generic_gru(&model->dec_gru1_input, &model->dec_gru1_recurrent, dec_state->gru1_state, buffer);
-    compute_glu(&model->dec_glu1, &buffer[output_index], dec_state->gru1_state);
+    compute_generic_gru(&model->dec_gru1_input, &model->dec_gru1_recurrent, dec_state->gru1_state, buffer, arch);
+    compute_glu(&model->dec_glu1, &buffer[output_index], dec_state->gru1_state, arch);
     output_index += DEC_GRU1_OUT_SIZE;
     conv1_cond_init(dec_state->conv1_state, output_index, 1, &dec_state->initialized);
-    compute_generic_conv1d(&model->dec_conv1, &buffer[output_index], dec_state->conv1_state, buffer, output_index, ACTIVATION_TANH);
+    compute_generic_conv1d(&model->dec_conv1, &buffer[output_index], dec_state->conv1_state, buffer, output_index, ACTIVATION_TANH, arch);
     output_index += DEC_CONV1_OUT_SIZE;
 
-    compute_generic_gru(&model->dec_gru2_input, &model->dec_gru2_recurrent, dec_state->gru2_state, buffer);
-    compute_glu(&model->dec_glu2, &buffer[output_index], dec_state->gru2_state);
+    compute_generic_gru(&model->dec_gru2_input, &model->dec_gru2_recurrent, dec_state->gru2_state, buffer, arch);
+    compute_glu(&model->dec_glu2, &buffer[output_index], dec_state->gru2_state, arch);
     output_index += DEC_GRU2_OUT_SIZE;
     conv1_cond_init(dec_state->conv2_state, output_index, 1, &dec_state->initialized);
-    compute_generic_conv1d(&model->dec_conv2, &buffer[output_index], dec_state->conv2_state, buffer, output_index, ACTIVATION_TANH);
+    compute_generic_conv1d(&model->dec_conv2, &buffer[output_index], dec_state->conv2_state, buffer, output_index, ACTIVATION_TANH, arch);
     output_index += DEC_CONV2_OUT_SIZE;
 
-    compute_generic_gru(&model->dec_gru3_input, &model->dec_gru3_recurrent, dec_state->gru3_state, buffer);
-    compute_glu(&model->dec_glu3, &buffer[output_index], dec_state->gru3_state);
+    compute_generic_gru(&model->dec_gru3_input, &model->dec_gru3_recurrent, dec_state->gru3_state, buffer, arch);
+    compute_glu(&model->dec_glu3, &buffer[output_index], dec_state->gru3_state, arch);
     output_index += DEC_GRU3_OUT_SIZE;
     conv1_cond_init(dec_state->conv3_state, output_index, 1, &dec_state->initialized);
-    compute_generic_conv1d(&model->dec_conv3, &buffer[output_index], dec_state->conv3_state, buffer, output_index, ACTIVATION_TANH);
+    compute_generic_conv1d(&model->dec_conv3, &buffer[output_index], dec_state->conv3_state, buffer, output_index, ACTIVATION_TANH, arch);
     output_index += DEC_CONV3_OUT_SIZE;
 
-    compute_generic_gru(&model->dec_gru4_input, &model->dec_gru4_recurrent, dec_state->gru4_state, buffer);
-    compute_glu(&model->dec_glu4, &buffer[output_index], dec_state->gru4_state);
+    compute_generic_gru(&model->dec_gru4_input, &model->dec_gru4_recurrent, dec_state->gru4_state, buffer, arch);
+    compute_glu(&model->dec_glu4, &buffer[output_index], dec_state->gru4_state, arch);
     output_index += DEC_GRU4_OUT_SIZE;
     conv1_cond_init(dec_state->conv4_state, output_index, 1, &dec_state->initialized);
-    compute_generic_conv1d(&model->dec_conv4, &buffer[output_index], dec_state->conv4_state, buffer, output_index, ACTIVATION_TANH);
+    compute_generic_conv1d(&model->dec_conv4, &buffer[output_index], dec_state->conv4_state, buffer, output_index, ACTIVATION_TANH, arch);
     output_index += DEC_CONV4_OUT_SIZE;
 
-    compute_generic_gru(&model->dec_gru5_input, &model->dec_gru5_recurrent, dec_state->gru5_state, buffer);
-    compute_glu(&model->dec_glu5, &buffer[output_index], dec_state->gru5_state);
+    compute_generic_gru(&model->dec_gru5_input, &model->dec_gru5_recurrent, dec_state->gru5_state, buffer, arch);
+    compute_glu(&model->dec_glu5, &buffer[output_index], dec_state->gru5_state, arch);
     output_index += DEC_GRU5_OUT_SIZE;
     conv1_cond_init(dec_state->conv5_state, output_index, 1, &dec_state->initialized);
-    compute_generic_conv1d(&model->dec_conv5, &buffer[output_index], dec_state->conv5_state, buffer, output_index, ACTIVATION_TANH);
+    compute_generic_conv1d(&model->dec_conv5, &buffer[output_index], dec_state->conv5_state, buffer, output_index, ACTIVATION_TANH, arch);
     output_index += DEC_CONV5_OUT_SIZE;
 
-    compute_generic_dense(&model->dec_output, qframe, buffer, ACTIVATION_LINEAR);
+    compute_generic_dense(&model->dec_output, qframe, buffer, ACTIVATION_LINEAR, arch);
 }
--- a/dnn/dred_rdovae_dec.h
+++ b/dnn/dred_rdovae_dec.h
@@ -46,8 +46,8 @@
   float conv5_state[DEC_CONV5_STATE_SIZE];
 };
 
-void dred_rdovae_dec_init_states(RDOVAEDecState *h, const RDOVAEDec *model, const float * initial_state);
-void dred_rdovae_decode_qframe(RDOVAEDecState *h, const RDOVAEDec *model, float *qframe, const float * z);
-void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents);
+void dred_rdovae_dec_init_states(RDOVAEDecState *h, const RDOVAEDec *model, const float * initial_state, int arch);
+void dred_rdovae_decode_qframe(RDOVAEDecState *h, const RDOVAEDec *model, float *qframe, const float * z, int arch);
+void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents, int arch);
 
 #endif
--- a/dnn/dred_rdovae_enc.c
+++ b/dnn/dred_rdovae_enc.c
@@ -50,7 +50,8 @@
     const RDOVAEEnc *model,
     float *latents,                 /* o: latent vector */
     float *initial_state,           /* o: initial state */
-    const float *input              /* i: double feature frame (concatenated) */
+    const float *input,              /* i: double feature frame (concatenated) */
+    int arch
     )
 {
     float padded_latents[DRED_PADDED_LATENT_DIM];
@@ -61,49 +62,49 @@
     int output_index = 0;
 
     /* run encoder stack and concatenate output in buffer*/
-    compute_generic_dense(&model->enc_dense1, &buffer[output_index], input, ACTIVATION_TANH);
+    compute_generic_dense(&model->enc_dense1, &buffer[output_index], input, ACTIVATION_TANH, arch);
     output_index += ENC_DENSE1_OUT_SIZE;
 
-    compute_generic_gru(&model->enc_gru1_input, &model->enc_gru1_recurrent, enc_state->gru1_state, buffer);
+    compute_generic_gru(&model->enc_gru1_input, &model->enc_gru1_recurrent, enc_state->gru1_state, buffer, arch);
     OPUS_COPY(&buffer[output_index], enc_state->gru1_state, ENC_GRU1_OUT_SIZE);
     output_index += ENC_GRU1_OUT_SIZE;
     conv1_cond_init(enc_state->conv1_state, output_index, 1, &enc_state->initialized);
-    compute_generic_conv1d(&model->enc_conv1, &buffer[output_index], enc_state->conv1_state, buffer, output_index, ACTIVATION_TANH);
+    compute_generic_conv1d(&model->enc_conv1, &buffer[output_index], enc_state->conv1_state, buffer, output_index, ACTIVATION_TANH, arch);
     output_index += ENC_CONV1_OUT_SIZE;
 
-    compute_generic_gru(&model->enc_gru2_input, &model->enc_gru2_recurrent, enc_state->gru2_state, buffer);
+    compute_generic_gru(&model->enc_gru2_input, &model->enc_gru2_recurrent, enc_state->gru2_state, buffer, arch);
     OPUS_COPY(&buffer[output_index], enc_state->gru2_state, ENC_GRU2_OUT_SIZE);
     output_index += ENC_GRU2_OUT_SIZE;
     conv1_cond_init(enc_state->conv2_state, output_index, 2, &enc_state->initialized);
-    compute_generic_conv1d_dilation(&model->enc_conv2, &buffer[output_index], enc_state->conv2_state, buffer, output_index, 2, ACTIVATION_TANH);
+    compute_generic_conv1d_dilation(&model->enc_conv2, &buffer[output_index], enc_state->conv2_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
     output_index += ENC_CONV2_OUT_SIZE;
 
-    compute_generic_gru(&model->enc_gru3_input, &model->enc_gru3_recurrent, enc_state->gru3_state, buffer);
+    compute_generic_gru(&model->enc_gru3_input, &model->enc_gru3_recurrent, enc_state->gru3_state, buffer, arch);
     OPUS_COPY(&buffer[output_index], enc_state->gru3_state, ENC_GRU3_OUT_SIZE);
     output_index += ENC_GRU3_OUT_SIZE;
     conv1_cond_init(enc_state->conv3_state, output_index, 2, &enc_state->initialized);
-    compute_generic_conv1d_dilation(&model->enc_conv3, &buffer[output_index], enc_state->conv3_state, buffer, output_index, 2, ACTIVATION_TANH);
+    compute_generic_conv1d_dilation(&model->enc_conv3, &buffer[output_index], enc_state->conv3_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
     output_index += ENC_CONV3_OUT_SIZE;
 
-    compute_generic_gru(&model->enc_gru4_input, &model->enc_gru4_recurrent, enc_state->gru4_state, buffer);
+    compute_generic_gru(&model->enc_gru4_input, &model->enc_gru4_recurrent, enc_state->gru4_state, buffer, arch);
     OPUS_COPY(&buffer[output_index], enc_state->gru4_state, ENC_GRU4_OUT_SIZE);
     output_index += ENC_GRU4_OUT_SIZE;
     conv1_cond_init(enc_state->conv4_state, output_index, 2, &enc_state->initialized);
-    compute_generic_conv1d_dilation(&model->enc_conv4, &buffer[output_index], enc_state->conv4_state, buffer, output_index, 2, ACTIVATION_TANH);
+    compute_generic_conv1d_dilation(&model->enc_conv4, &buffer[output_index], enc_state->conv4_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
     output_index += ENC_CONV4_OUT_SIZE;
 
-    compute_generic_gru(&model->enc_gru5_input, &model->enc_gru5_recurrent, enc_state->gru5_state, buffer);
+    compute_generic_gru(&model->enc_gru5_input, &model->enc_gru5_recurrent, enc_state->gru5_state, buffer, arch);
     OPUS_COPY(&buffer[output_index], enc_state->gru5_state, ENC_GRU5_OUT_SIZE);
     output_index += ENC_GRU5_OUT_SIZE;
     conv1_cond_init(enc_state->conv5_state, output_index, 2, &enc_state->initialized);
-    compute_generic_conv1d_dilation(&model->enc_conv5, &buffer[output_index], enc_state->conv5_state, buffer, output_index, 2, ACTIVATION_TANH);
+    compute_generic_conv1d_dilation(&model->enc_conv5, &buffer[output_index], enc_state->conv5_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
     output_index += ENC_CONV5_OUT_SIZE;
 
-    compute_generic_dense(&model->enc_zdense, padded_latents, buffer, ACTIVATION_LINEAR);
+    compute_generic_dense(&model->enc_zdense, padded_latents, buffer, ACTIVATION_LINEAR, arch);
     OPUS_COPY(latents, padded_latents, DRED_LATENT_DIM);
 
     /* next, calculate initial state */
-    compute_generic_dense(&model->gdense1, state_hidden, buffer, ACTIVATION_TANH);
-    compute_generic_dense(&model->gdense2, padded_state, state_hidden, ACTIVATION_LINEAR);
+    compute_generic_dense(&model->gdense1, state_hidden, buffer, ACTIVATION_TANH, arch);
+    compute_generic_dense(&model->gdense2, padded_state, state_hidden, ACTIVATION_LINEAR, arch);
     OPUS_COPY(initial_state, padded_state, DRED_STATE_DIM);
 }
--- a/dnn/dred_rdovae_enc.h
+++ b/dnn/dred_rdovae_enc.h
@@ -46,7 +46,7 @@
     float conv5_state[2*ENC_CONV5_STATE_SIZE];
 };
 
-void dred_rdovae_encode_dframe(RDOVAEEncState *enc_state, const RDOVAEEnc *model, float *latents, float *initial_state, const float *input);
+void dred_rdovae_encode_dframe(RDOVAEEncState *enc_state, const RDOVAEEnc *model, float *latents, float *initial_state, const float *input, int arch);
 
 
 #endif
--- a/dnn/dump_data.c
+++ b/dnn/dump_data.c
@@ -42,6 +42,7 @@
 #include "lpcnet.h"
 #include "lpcnet_private.h"
 #include "os_support.h"
+#include "cpu_support.h"
 
 
 static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) {
@@ -135,7 +136,9 @@
   FILE *fnoise = NULL;
   float noise_gain = 0;
   long noise_size=0;
+  int arch;
   srand(getpid());
+  arch = opus_select_arch();
   st = lpcnet_encoder_create();
   argv0=argv[0];
   if (argc == 5 && strcmp(argv[1], "-btrain")==0) {
@@ -244,7 +247,7 @@
     for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5;
     /* PCM is delayed by 1/2 frame to make the features centered on the frames. */
     for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) pcm[i+TRAINING_OFFSET] = float2short(x[i]);
-    compute_frame_features(st, x);
+    compute_frame_features(st, x, arch);
 
     if (fpcm) {
         compute_noise(noisebuf, noise_std);
--- a/dnn/fargan.c
+++ b/dnn/fargan.c
@@ -36,6 +36,7 @@
 #include "pitch.h"
 #include "nnet.h"
 #include "lpcnet_private.h"
+#include "cpu_support.h"
 
 #define FARGAN_FEATURES (NB_FEATURES)
 
@@ -52,9 +53,9 @@
   OPUS_COPY(&dense_in[NB_FEATURES], &model->cond_net_pembed.float_weights[IMAX(0,IMIN(period-32, 224))*COND_NET_PEMBED_OUT_SIZE], COND_NET_PEMBED_OUT_SIZE);
   OPUS_COPY(dense_in, features, NB_FEATURES);
 
-  compute_generic_dense(&model->cond_net_fdense1, conv1_in, dense_in, ACTIVATION_TANH);
-  compute_generic_conv1d(&model->cond_net_fconv1, conv2_in, st->cond_conv1_state, conv1_in, COND_NET_FCONV1_IN_SIZE, ACTIVATION_TANH);
-  compute_generic_conv1d(&model->cond_net_fconv2, cond, st->cond_conv2_state, conv2_in, COND_NET_FCONV2_IN_SIZE, ACTIVATION_TANH);
+  compute_generic_dense(&model->cond_net_fdense1, conv1_in, dense_in, ACTIVATION_TANH, st->arch);
+  compute_generic_conv1d(&model->cond_net_fconv1, conv2_in, st->cond_conv1_state, conv1_in, COND_NET_FCONV1_IN_SIZE, ACTIVATION_TANH, st->arch);
+  compute_generic_conv1d(&model->cond_net_fconv2, cond, st->cond_conv2_state, conv2_in, COND_NET_FCONV2_IN_SIZE, ACTIVATION_TANH, st->arch);
 }
 
 static void fargan_deemphasis(float *pcm, float *deemph_mem) {
@@ -84,7 +85,7 @@
   celt_assert(st->cont_initialized);
   model = &st->model;
 
-  compute_generic_dense(&model->sig_net_cond_gain_dense, &gain, cond, ACTIVATION_LINEAR);
+  compute_generic_dense(&model->sig_net_cond_gain_dense, &gain, cond, ACTIVATION_LINEAR, st->arch);
   gain = exp(gain);
   gain_1 = 1.f/(1e-5f + gain);
 
@@ -100,26 +101,26 @@
   OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE], pred, FARGAN_SUBFRAME_SIZE+4);
   OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE+FARGAN_SUBFRAME_SIZE+4], prev, FARGAN_SUBFRAME_SIZE);
 
-  compute_generic_conv1d(&model->sig_net_fwc0_conv, gru1_in, st->fwc0_mem, fwc0_in, SIG_NET_INPUT_SIZE, ACTIVATION_TANH);
+  compute_generic_conv1d(&model->sig_net_fwc0_conv, gru1_in, st->fwc0_mem, fwc0_in, SIG_NET_INPUT_SIZE, ACTIVATION_TANH, st->arch);
   celt_assert(SIG_NET_FWC0_GLU_GATE_OUT_SIZE == model->sig_net_fwc0_glu_gate.nb_outputs);
-  compute_glu(&model->sig_net_fwc0_glu_gate, gru1_in, gru1_in);
+  compute_glu(&model->sig_net_fwc0_glu_gate, gru1_in, gru1_in, st->arch);
 
-  compute_generic_dense(&model->sig_net_gain_dense_out, pitch_gate, gru1_in, ACTIVATION_SIGMOID);
+  compute_generic_dense(&model->sig_net_gain_dense_out, pitch_gate, gru1_in, ACTIVATION_SIGMOID, st->arch);
 
   for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru1_in[SIG_NET_FWC0_GLU_GATE_OUT_SIZE+i] = pitch_gate[0]*pred[i+2];
   OPUS_COPY(&gru1_in[SIG_NET_FWC0_GLU_GATE_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
-  compute_generic_gru(&model->sig_net_gru1_input, &model->sig_net_gru1_recurrent, st->gru1_state, gru1_in);
-  compute_glu(&model->sig_net_gru1_glu_gate, gru2_in, st->gru1_state);
+  compute_generic_gru(&model->sig_net_gru1_input, &model->sig_net_gru1_recurrent, st->gru1_state, gru1_in, st->arch);
+  compute_glu(&model->sig_net_gru1_glu_gate, gru2_in, st->gru1_state, st->arch);
 
   for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru2_in[SIG_NET_GRU1_OUT_SIZE+i] = pitch_gate[1]*pred[i+2];
   OPUS_COPY(&gru2_in[SIG_NET_GRU1_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
-  compute_generic_gru(&model->sig_net_gru2_input, &model->sig_net_gru2_recurrent, st->gru2_state, gru2_in);
-  compute_glu(&model->sig_net_gru2_glu_gate, gru3_in, st->gru2_state);
+  compute_generic_gru(&model->sig_net_gru2_input, &model->sig_net_gru2_recurrent, st->gru2_state, gru2_in, st->arch);
+  compute_glu(&model->sig_net_gru2_glu_gate, gru3_in, st->gru2_state, st->arch);
 
   for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru3_in[SIG_NET_GRU2_OUT_SIZE+i] = pitch_gate[2]*pred[i+2];
   OPUS_COPY(&gru3_in[SIG_NET_GRU2_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
-  compute_generic_gru(&model->sig_net_gru3_input, &model->sig_net_gru3_recurrent, st->gru3_state, gru3_in);
-  compute_glu(&model->sig_net_gru3_glu_gate, &skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE], st->gru3_state);
+  compute_generic_gru(&model->sig_net_gru3_input, &model->sig_net_gru3_recurrent, st->gru3_state, gru3_in, st->arch);
+  compute_glu(&model->sig_net_gru3_glu_gate, &skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE], st->gru3_state, st->arch);
 
   OPUS_COPY(skip_cat, gru2_in, SIG_NET_GRU1_OUT_SIZE);
   OPUS_COPY(&skip_cat[SIG_NET_GRU1_OUT_SIZE], gru3_in, SIG_NET_GRU2_OUT_SIZE);
@@ -127,10 +128,10 @@
   for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE+SIG_NET_GRU3_OUT_SIZE+SIG_NET_FWC0_CONV_OUT_SIZE+i] = pitch_gate[3]*pred[i+2];
   OPUS_COPY(&skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE+SIG_NET_GRU3_OUT_SIZE+SIG_NET_FWC0_CONV_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
 
-  compute_generic_dense(&model->sig_net_skip_dense, skip_out, skip_cat, ACTIVATION_TANH);
-  compute_glu(&model->sig_net_skip_glu_gate, skip_out, skip_out);
+  compute_generic_dense(&model->sig_net_skip_dense, skip_out, skip_cat, ACTIVATION_TANH, st->arch);
+  compute_glu(&model->sig_net_skip_glu_gate, skip_out, skip_out, st->arch);
 
-  compute_generic_dense(&model->sig_net_sig_dense_out, pcm, skip_out, ACTIVATION_TANH);
+  compute_generic_dense(&model->sig_net_sig_dense_out, pcm, skip_out, ACTIVATION_TANH, st->arch);
   for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) pcm[i] *= gain;
 
   OPUS_MOVE(st->pitch_buf, &st->pitch_buf[FARGAN_SUBFRAME_SIZE], PITCH_MAX_PERIOD-FARGAN_SUBFRAME_SIZE);
@@ -174,6 +175,7 @@
 {
   int ret;
   OPUS_CLEAR(st, 1);
+  st->arch = opus_select_arch();
 #ifndef USE_WEIGHTS_FILE
   ret = init_fargan(&st->model, fargan_arrays);
 #else
@@ -180,7 +182,6 @@
   ret = 0;
 #endif
   celt_assert(ret == 0);
-  /* FIXME: perform arch detection. */
 }
 
 int fargan_load_model(FARGANState *st, const unsigned char *data, int len) {
--- a/dnn/lpcnet.h
+++ b/dnn/lpcnet.h
@@ -120,7 +120,7 @@
   * @param [out] features <tt>float[NB_TOTAL_FEATURES]</tt>: Four feature vectors
   * @retval 0 Success
   */
-int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]);
+int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES], int arch);
 
 
 /** Compute features on LPCNET_FRAME_SIZE speech samples (currently 160) and output features for one 10-ms frame.
@@ -129,7 +129,7 @@
   * @param [out] features <tt>float[NB_TOTAL_FEATURES]</tt>: Four feature vectors
   * @retval 0 Success
   */
-int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES]);
+int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES], int arch);
 
 /** Gets the size of an <code>LPCNetState</code> structure.
   * @returns The size in bytes.
--- a/dnn/lpcnet_demo.c
+++ b/dnn/lpcnet_demo.c
@@ -37,6 +37,7 @@
 #include "freq.h"
 #include "os_support.h"
 #include "fargan.h"
+#include "cpu_support.h"
 
 #ifdef USE_WEIGHTS_FILE
 # if __unix__
@@ -99,6 +100,7 @@
 
 int main(int argc, char **argv) {
     int mode=0;
+    int arch;
     FILE *fin, *fout;
 #ifdef USE_WEIGHTS_FILE
     int len;
@@ -105,6 +107,7 @@
     unsigned char *data;
     const char *filename = "weights_blob.bin";
 #endif
+    arch = opus_select_arch();
     if (argc < 4) usage();
     if (strcmp(argv[1], "-features") == 0) mode=MODE_FEATURES;
     else if (strcmp(argv[1], "-fargan-synthesis") == 0) mode=MODE_FARGAN_SYNTHESIS;
@@ -137,7 +140,7 @@
             size_t ret;
             ret = fread(pcm, sizeof(pcm[0]), LPCNET_FRAME_SIZE, fin);
             if (feof(fin) || ret != LPCNET_FRAME_SIZE) break;
-            lpcnet_compute_single_frame_features(net, pcm, features);
+            lpcnet_compute_single_frame_features(net, pcm, features, arch);
             fwrite(features, sizeof(float), NB_TOTAL_FEATURES, fout);
         }
         lpcnet_encoder_destroy(net);
--- a/dnn/lpcnet_enc.c
+++ b/dnn/lpcnet_enc.c
@@ -95,7 +95,7 @@
 
 #define celt_log10(x) (0.3010299957f*celt_log2(x))
 
-void compute_frame_features(LPCNetEncState *st, const float *in) {
+void compute_frame_features(LPCNetEncState *st, const float *in, int arch) {
   float aligned_in[FRAME_SIZE];
   int i;
   float Ly[NB_BANDS];
@@ -142,7 +142,7 @@
   OPUS_COPY(&x[0], st->pitch_mem, LPC_ORDER);
   OPUS_COPY(&x[LPC_ORDER], aligned_in, FRAME_SIZE);
   OPUS_COPY(st->pitch_mem, &aligned_in[FRAME_SIZE-LPC_ORDER], LPC_ORDER);
-  celt_fir(&x[LPC_ORDER], st->lpc, &st->lp_buf[PITCH_MAX_PERIOD], FRAME_SIZE, LPC_ORDER, st->arch);
+  celt_fir(&x[LPC_ORDER], st->lpc, &st->lp_buf[PITCH_MAX_PERIOD], FRAME_SIZE, LPC_ORDER, arch);
   for (i=0;i<FRAME_SIZE;i++) {
     st->exc_buf[PITCH_MAX_PERIOD+i] = st->lp_buf[PITCH_MAX_PERIOD+i] + .7f*st->pitch_filt;
     st->pitch_filt = st->lp_buf[PITCH_MAX_PERIOD+i];
@@ -152,7 +152,7 @@
   {
     double ener1;
     float *buf = st->exc_buf;
-    celt_pitch_xcorr(&buf[PITCH_MAX_PERIOD], buf, xcorr, FRAME_SIZE, PITCH_MAX_PERIOD-PITCH_MIN_PERIOD, st->arch);
+    celt_pitch_xcorr(&buf[PITCH_MAX_PERIOD], buf, xcorr, FRAME_SIZE, PITCH_MAX_PERIOD-PITCH_MIN_PERIOD, arch);
     ener0 = celt_inner_prod_c(&buf[PITCH_MAX_PERIOD], &buf[PITCH_MAX_PERIOD], FRAME_SIZE);
     ener1 = celt_inner_prod_c(&buf[0], &buf[0], FRAME_SIZE-1);
     /*printf("%f\n", st->frame_weight[sub]);*/
@@ -165,7 +165,7 @@
     }
     /*printf("\n");*/
   }
-  st->dnn_pitch = compute_pitchdnn(&st->pitchdnn, st->if_features, st->xcorr_features);
+  st->dnn_pitch = compute_pitchdnn(&st->pitchdnn, st->if_features, st->xcorr_features, arch);
 }
 
 void process_single_frame(LPCNetEncState *st, FILE *ffeat) {
@@ -196,26 +196,26 @@
   }
 }
 
-static int lpcnet_compute_single_frame_features_impl(LPCNetEncState *st, float *x, float features[NB_TOTAL_FEATURES]) {
+static int lpcnet_compute_single_frame_features_impl(LPCNetEncState *st, float *x, float features[NB_TOTAL_FEATURES], int arch) {
   preemphasis(x, &st->mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
-  compute_frame_features(st, x);
+  compute_frame_features(st, x, arch);
   process_single_frame(st, NULL);
   OPUS_COPY(features, &st->features[0], NB_TOTAL_FEATURES);
   return 0;
 }
 
-int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]) {
+int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES], int arch) {
   int i;
   float x[FRAME_SIZE];
   for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
-  lpcnet_compute_single_frame_features_impl(st, x, features);
+  lpcnet_compute_single_frame_features_impl(st, x, features, arch);
   return 0;
 }
 
-int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES]) {
+int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES], int arch) {
   int i;
   float x[FRAME_SIZE];
   for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
-  lpcnet_compute_single_frame_features_impl(st, x, features);
+  lpcnet_compute_single_frame_features_impl(st, x, features, arch);
   return 0;
 }
--- a/dnn/lpcnet_plc.c
+++ b/dnn/lpcnet_plc.c
@@ -33,6 +33,7 @@
 #include "plc_data.h"
 #include "os_support.h"
 #include "common.h"
+#include "cpu_support.h"
 
 #ifndef M_PI
 #define M_PI 3.141592653
@@ -54,6 +55,7 @@
 
 int lpcnet_plc_init(LPCNetPLCState *st) {
   int ret;
+  st->arch = opus_select_arch();
   fargan_init(&st->fargan);
   lpcnet_encoder_init(&st->enc);
   st->analysis_pos = PLC_BUF_SIZE;
@@ -109,10 +111,10 @@
   float dense_out[PLC_DENSE1_OUT_SIZE];
   PLCNetState *net = &st->plc_net;
   celt_assert(st->loaded);
-  _lpcnet_compute_dense(&st->model.plc_dense1, dense_out, in);
-  compute_gruB(&st->model.plc_gru1, zeros, net->plc_gru1_state, dense_out);
-  compute_gruB(&st->model.plc_gru2, zeros, net->plc_gru2_state, net->plc_gru1_state);
-  _lpcnet_compute_dense(&st->model.plc_out, out, net->plc_gru2_state);
+  _lpcnet_compute_dense(&st->model.plc_dense1, dense_out, in, st->arch);
+  compute_gruB(&st->model.plc_gru1, zeros, net->plc_gru1_state, dense_out, st->arch);
+  compute_gruB(&st->model.plc_gru2, zeros, net->plc_gru2_state, net->plc_gru1_state, st->arch);
+  _lpcnet_compute_dense(&st->model.plc_out, out, net->plc_gru2_state, st->arch);
 }
 
 static int get_fec_or_pred(LPCNetPLCState *st, float *out) {
@@ -164,7 +166,7 @@
       float plc_features[2*NB_BANDS+NB_FEATURES+1];
       for (i=0;i<FRAME_SIZE;i++) x[i] = 32768.f*st->pcm[st->analysis_pos+i];
       burg_cepstral_analysis(plc_features, x);
-      lpcnet_compute_single_frame_features_float(&st->enc, x, st->features);
+      lpcnet_compute_single_frame_features_float(&st->enc, x, st->features, st->arch);
       if ((st->analysis_gap && count > 0) || count > 1) {
         queue_features(st, st->features);
         OPUS_COPY(&plc_features[2*NB_BANDS], st->features, NB_FEATURES);
--- a/dnn/lpcnet_private.h
+++ b/dnn/lpcnet_private.h
@@ -24,7 +24,6 @@
 
 struct LPCNetEncState{
   PitchDNNState pitchdnn;
-  int arch;
   float analysis_mem[OVERLAP_SIZE];
   float mem_preemph;
   kiss_fft_cpx prev_if[PITCH_IF_MAX_FREQ];
@@ -67,7 +66,7 @@
 
 void preemphasis(float *y, float *mem, const float *x, float coef, int N);
 
-void compute_frame_features(LPCNetEncState *st, const float *in);
+void compute_frame_features(LPCNetEncState *st, const float *in, int arch);
 
 void lpcnet_reset_signal(LPCNetState *lpcnet);
 void run_frame_network(LPCNetState *lpcnet, float *gru_a_condition, float *gru_b_condition, float *lpc, const float *features);
@@ -79,7 +78,6 @@
 void lpcnet_synthesize_impl(LPCNetState *lpcnet, const float *features, opus_int16 *output, int N, int preload);
 void lpcnet_synthesize_blend_impl(LPCNetState *lpcnet, const opus_int16 *pcm_in, opus_int16 *output, int N);
 void process_single_frame(LPCNetEncState *st, FILE *ffeat);
-int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]);
 
 void process_single_frame(LPCNetEncState *st, FILE *ffeat);
 
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -69,43 +69,9 @@
    return x < 0 ? 0 : x;
 }
 
-static void compute_linear(const LinearLayer *linear, float *out, const float *in)
+void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation, int arch)
 {
-   int i, M, N;
-   const float *bias;
-   celt_assert(in != out);
-   bias = linear->bias;
-   M = linear->nb_inputs;
-   N = linear->nb_outputs;
-   if (linear->float_weights != NULL) {
-     if (linear->weights_idx != NULL) sparse_sgemv8x4(out, linear->float_weights, linear->weights_idx, N, in);
-     else sgemv(out, linear->float_weights, N, M, N, in);
-   } else if (linear->weights != NULL) {
-     if (linear->weights_idx != NULL) sparse_cgemv8x4(out, linear->weights, linear->weights_idx, linear->scale, N, M, in);
-     else cgemv8x4(out, linear->weights, linear->scale, N, M, in);
-     /* Only use SU biases on for integer matrices on SU archs. */
-#ifdef USE_SU_BIAS
-     bias = linear->subias;
-#endif
-   }
-   else OPUS_CLEAR(out, N);
-   if (bias != NULL) {
-      for (i=0;i<N;i++) out[i] += bias[i];
-   }
-   if (linear->diag) {
-      /* Diag is only used for GRU recurrent weights. */
-      celt_assert(3*M == N);
-      for (i=0;i<M;i++) {
-         out[i] += linear->diag[i]*in[i];
-         out[i+M] += linear->diag[i+M]*in[i];
-         out[i+2*M] += linear->diag[i+2*M]*in[i];
-      }
-   }
-}
-
-void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation)
-{
-   compute_linear(layer, output, input);
+   compute_linear(layer, output, input, arch);
    compute_activation(output, output, layer->nb_outputs, activation);
 }
 
@@ -112,7 +78,7 @@
 #define MAX_RNN_NEURONS_ALL IMAX(IMAX(FARGAN_MAX_RNN_NEURONS, PLC_MAX_RNN_NEURONS), DRED_MAX_RNN_NEURONS)
 
 
-void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in)
+void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in, int arch)
 {
   int i;
   int N;
@@ -129,8 +95,8 @@
   h = &zrh[2*N];
   celt_assert(recurrent_weights->nb_outputs <= 3*MAX_RNN_NEURONS_ALL);
   celt_assert(in != state);
-  compute_linear(input_weights, zrh, in);
-  compute_linear(recurrent_weights, recur, state);
+  compute_linear(input_weights, zrh, in, arch);
+  compute_linear(recurrent_weights, recur, state, arch);
   for (i=0;i<2*N;i++)
      zrh[i] += recur[i];
   compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
@@ -143,12 +109,12 @@
      state[i] = h[i];
 }
 
-void compute_glu(const LinearLayer *layer, float *output, const float *input)
+void compute_glu(const LinearLayer *layer, float *output, const float *input, int arch)
 {
    int i;
    float act2[MAX_INPUTS];
    celt_assert(layer->nb_inputs == layer->nb_outputs);
-   compute_linear(layer, act2, input);
+   compute_linear(layer, act2, input, arch);
    compute_activation(act2, act2, layer->nb_outputs, ACTIVATION_SIGMOID);
    if (input == output) {
      /* Give a vectorization hint to the compiler for the in-place case. */
@@ -194,7 +160,7 @@
    }
 }
 
-void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input)
+void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input, int arch)
 {
    LinearLayer matrix;
    celt_assert(input != output);
@@ -207,7 +173,7 @@
    matrix.nb_inputs = layer->nb_inputs;
    matrix.nb_outputs = layer->nb_neurons;
    matrix.scale = NULL;
-   compute_linear(&matrix, output, input);
+   compute_linear(&matrix, output, input, arch);
    compute_activation(output, output, layer->nb_neurons, layer->activation);
 }
 
@@ -218,7 +184,7 @@
 #endif
 #define MAX_IDX_SIZE 8192
 
-void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input)
+void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input, int arch)
 {
   LinearLayer in_matrix, rec_matrix;
   int i, M, N;
@@ -262,13 +228,13 @@
   rec_matrix.float_weights = NULL;
 #endif
   rec_matrix.weights_idx = NULL;
-  compute_generic_gru(&in_matrix, &rec_matrix, state, input);
+  compute_generic_gru(&in_matrix, &rec_matrix, state, input, arch);
 }
 
 
 #define MAX_CONV_INPUTS_ALL DRED_MAX_CONV_INPUTS
 
-void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation)
+void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation, int arch)
 {
    float tmp[MAX_CONV_INPUTS_ALL];
    celt_assert(input != output);
@@ -275,12 +241,12 @@
    celt_assert(layer->nb_inputs <= MAX_CONV_INPUTS_ALL);
    OPUS_COPY(tmp, mem, layer->nb_inputs-input_size);
    OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size);
-   compute_linear(layer, output, tmp);
+   compute_linear(layer, output, tmp, arch);
    compute_activation(output, output, layer->nb_outputs, activation);
    OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size);
 }
 
-void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation)
+void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation, int arch)
 {
    float tmp[MAX_CONV_INPUTS_ALL];
    int ksize = layer->nb_inputs/input_size;
@@ -290,7 +256,7 @@
    if (dilation==1) OPUS_COPY(tmp, mem, layer->nb_inputs-input_size);
    else for (i=0;i<ksize-1;i++) OPUS_COPY(&tmp[i*input_size], &mem[i*input_size*dilation], input_size);
    OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size);
-   compute_linear(layer, output, tmp);
+   compute_linear(layer, output, tmp, arch);
    compute_activation(output, output, layer->nb_outputs, activation);
    if (dilation==1) OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size);
    else {
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -126,18 +126,18 @@
   int dim;
 } EmbeddingLayer;
 
-void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation);
-void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in);
-void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation);
-void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation);
-void compute_glu(const LinearLayer *layer, float *output, const float *input);
-void compute_gated_activation(const LinearLayer *layer, float *output, const float *input, int activation);
+void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation, int arch);
+void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in, int arch);
+void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation, int arch);
+void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation, int arch);
+void compute_glu(const LinearLayer *layer, float *output, const float *input, int arch);
+void compute_gated_activation(const LinearLayer *layer, float *output, const float *input, int activation, int arch);
 
 void compute_activation(float *output, const float *input, int N, int activation);
 
-void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input);
+void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input, int arch);
 
-void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input);
+void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input, int arch);
 
 
 
@@ -187,6 +187,27 @@
   int reset_after);
 
 void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation);
+
+
+
+void compute_linear_c(const LinearLayer *linear, float *out, const float *in);
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2)
+#include "x86/dnn_x86.h"
+#endif
+
+#ifndef OVERRIDE_COMPUTE_LINEAR
+#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_c(linear, out, in))
+#endif
+
+#if defined(__x86_64__) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)
+#if defined(_MSC_VER)
+#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
+#else
+#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
+#endif
+#endif
+
 
 
 #endif /* NNET_H_ */
--- /dev/null
+++ b/dnn/nnet_arch.h
@@ -1,0 +1,76 @@
+/* Copyright (c) 2018-2019 Mozilla
+                 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef NNET_ARCH_H
+#define NNET_ARCH_H
+
+#include "nnet.h"
+#include "arch.h"
+#include "os_support.h"
+#include "vec.h"
+
+#define CAT_SUFFIX2(a,b) a ## b
+#define CAT_SUFFIX(a,b) CAT_SUFFIX2(a, b)
+
+#define RTCD_SUF(name) CAT_SUFFIX(name, RTCD_ARCH)
+
+void RTCD_SUF(compute_linear_) (const LinearLayer *linear, float *out, const float *in)
+{
+   int i, M, N;
+   const float *bias;
+   celt_assert(in != out);
+   bias = linear->bias;
+   M = linear->nb_inputs;
+   N = linear->nb_outputs;
+   if (linear->float_weights != NULL) {
+     if (linear->weights_idx != NULL) sparse_sgemv8x4(out, linear->float_weights, linear->weights_idx, N, in);
+     else sgemv(out, linear->float_weights, N, M, N, in);
+   } else if (linear->weights != NULL) {
+     if (linear->weights_idx != NULL) sparse_cgemv8x4(out, linear->weights, linear->weights_idx, linear->scale, N, M, in);
+     else cgemv8x4(out, linear->weights, linear->scale, N, M, in);
+     /* Only use SU biases for integer matrices on SU archs. */
+#ifdef USE_SU_BIAS
+     bias = linear->subias;
+#endif
+   }
+   else OPUS_CLEAR(out, N);
+   if (bias != NULL) {
+      for (i=0;i<N;i++) out[i] += bias[i];
+   }
+   if (linear->diag) {
+      /* Diag is only used for GRU recurrent weights. */
+      celt_assert(3*M == N);
+      for (i=0;i<M;i++) {
+         out[i] += linear->diag[i]*in[i];
+         out[i+M] += linear->diag[i+M]*in[i];
+         out[i+2*M] += linear->diag[i+2*M]*in[i];
+      }
+   }
+}
+
+
+#endif
--- /dev/null
+++ b/dnn/nnet_default.c
@@ -1,0 +1,35 @@
+/* Copyright (c) 2018-2019 Mozilla
+                 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#define RTCD_ARCH c
+
+#include "nnet_arch.h"
--- a/dnn/pitchdnn.c
+++ b/dnn/pitchdnn.c
@@ -12,7 +12,8 @@
 float compute_pitchdnn(
     PitchDNNState *st,
     const float *if_features,
-    const float *xcorr_features
+    const float *xcorr_features,
+    int arch
     )
 {
   float if1_out[DENSE_IF_UPSAMPLER_1_OUT_SIZE];
@@ -28,16 +29,16 @@
   float count=0;
   PitchDNN *model = &st->model;
   /* IF */
-  compute_generic_dense(&model->dense_if_upsampler_1, if1_out, if_features, ACTIVATION_TANH);
-  compute_generic_dense(&model->dense_if_upsampler_2, &downsampler_in[NB_XCORR_FEATURES], if1_out, ACTIVATION_TANH);
+  compute_generic_dense(&model->dense_if_upsampler_1, if1_out, if_features, ACTIVATION_TANH, arch);
+  compute_generic_dense(&model->dense_if_upsampler_2, &downsampler_in[NB_XCORR_FEATURES], if1_out, ACTIVATION_TANH, arch);
   /* xcorr*/
   OPUS_COPY(&conv1_tmp1[1], xcorr_features, NB_XCORR_FEATURES);
   compute_conv2d(&model->conv2d_1, &conv1_tmp2[1], st->xcorr_mem1, conv1_tmp1, NB_XCORR_FEATURES, NB_XCORR_FEATURES+2, ACTIVATION_TANH);
   compute_conv2d(&model->conv2d_2, downsampler_in, st->xcorr_mem2, conv1_tmp2, NB_XCORR_FEATURES, NB_XCORR_FEATURES, ACTIVATION_TANH);
 
-  compute_generic_dense(&model->dense_downsampler, downsampler_out, downsampler_in, ACTIVATION_TANH);
-  compute_generic_gru(&model->gru_1_input, &model->gru_1_recurrent, st->gru_state, downsampler_out);
-  compute_generic_dense(&model->dense_final_upsampler, output, st->gru_state, ACTIVATION_LINEAR);
+  compute_generic_dense(&model->dense_downsampler, downsampler_out, downsampler_in, ACTIVATION_TANH, arch);
+  compute_generic_gru(&model->gru_1_input, &model->gru_1_recurrent, st->gru_state, downsampler_out, arch);
+  compute_generic_dense(&model->dense_final_upsampler, output, st->gru_state, ACTIVATION_LINEAR, arch);
   for (i=0;i<180;i++) {
     if (output[i] > maxval) {
       pos = i;
@@ -65,7 +66,6 @@
   ret = 0;
 #endif
   celt_assert(ret == 0);
-  /* FIXME: perform arch detection. */
 }
 
 int pitchdnn_load_model(PitchDNNState *st, const unsigned char *data, int len) {
--- a/dnn/pitchdnn.h
+++ b/dnn/pitchdnn.h
@@ -27,7 +27,8 @@
 float compute_pitchdnn(
     PitchDNNState *st,
     const float *if_features,
-    const float *xcorr_features
+    const float *xcorr_features,
+    int arch
     );
 
 #endif
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@@ -655,11 +655,6 @@
   return res;
 }
 
-#if defined(_MSC_VER)
-#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
-#else
-#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
-#endif
 
 #else
 
--- /dev/null
+++ b/dnn/x86/dnn_x86.h
@@ -1,0 +1,78 @@
+/* Copyright (c) 2011-2019 Mozilla
+                 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef DNN_X86_H
+#define DNN_X86_H
+
+#include "cpu_support.h"
+#include "opus_types.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2)
+void compute_linear_sse2(const LinearLayer *linear, float *out, const float *in);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+void compute_linear_sse4_1(const LinearLayer *linear, float *out, const float *in);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_AVX2)
+void compute_linear_avx2(const LinearLayer *linear, float *out, const float *in);
+#endif
+
+
+#if defined(OPUS_X86_PRESUME_AVX2)
+
+#define OVERRIDE_COMPUTE_LINEAR
+#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_avx2(linear, out, in))
+
+#elif defined(OPUS_X86_PRESUME_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)
+
+#define OVERRIDE_COMPUTE_LINEAR
+#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse4_1(linear, out, in))
+
+#elif defined(OPUS_X86_PRESUME_SSE2) && !defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+
+#define OVERRIDE_COMPUTE_LINEAR
+#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse2(linear, out, in))
+
+#elif defined(OPUS_HAVE_RTCD) && (defined(OPUS_X86_MAY_HAVE_AVX2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2))
+
+extern void (*const DNN_COMPUTE_LINEAR_IMPL[OPUS_ARCHMASK + 1])(
+                    const LinearLayer *linear,
+                    float *out,
+                    const float *in
+                    );
+
+#define OVERRIDE_COMPUTE_LINEAR
+#define compute_linear(linear, out, in, arch) \
+    ((*DNN_COMPUTE_LINEAR_IMPL[(arch) & OPUS_ARCHMASK])(linear, out, in))
+
+#endif
+
+
+
+#endif /* DNN_X86_H */
--- /dev/null
+++ b/dnn/x86/nnet_avx2.c
@@ -1,0 +1,38 @@
+/* Copyright (c) 2018-2019 Mozilla
+                 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef __AVX2__
+#error nnet_avx2.c is being compiled without AVX2 enabled
+#endif
+
+#define RTCD_ARCH avx2
+
+#include "nnet_arch.h"
--- /dev/null
+++ b/dnn/x86/nnet_sse2.c
@@ -1,0 +1,38 @@
+/* Copyright (c) 2018-2019 Mozilla
+                 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef __SSE2__
+#error nnet_sse2.c is being compiled without SSE2 enabled
+#endif
+
+#define RTCD_ARCH sse2
+
+#include "nnet_arch.h"
--- /dev/null
+++ b/dnn/x86/nnet_sse4_1.c
@@ -1,0 +1,38 @@
+/* Copyright (c) 2018-2019 Mozilla
+                 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef __SSE4_1__
+#error nnet_sse4_1.c is being compiled without SSE4.1 enabled
+#endif
+
+#define RTCD_ARCH sse4_1
+
+#include "nnet_arch.h"
--- /dev/null
+++ b/dnn/x86/x86_dnn_map.c
@@ -1,0 +1,54 @@
+/* Copyright (c) 2018-2019 Mozilla
+                 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "x86/x86cpu.h"
+#include "nnet.h"
+
+#if defined(OPUS_HAVE_RTCD)
+
+#if (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_AVX2))
+
+void (*const DNN_COMPUTE_LINEAR_IMPL[OPUS_ARCHMASK + 1])(
+         const LinearLayer *linear,
+         float *out,
+         const float *in
+) = {
+  compute_linear_c,                /* non-sse */
+  compute_linear_c,                /* sse */
+  MAY_HAVE_SSE2(compute_linear),
+  MAY_HAVE_SSE4_1(compute_linear), /* sse4.1 */
+  MAY_HAVE_AVX2(compute_linear)    /* avx2 */
+};
+
+#endif
+
+
+#endif
--- a/lpcnet_headers.mk
+++ b/lpcnet_headers.mk
@@ -12,7 +12,9 @@
 dnn/vec_avx.h \
 dnn/vec_neon.h \
 dnn/pitchdnn.h \
-dnn/pitchdnn_data.h
+dnn/pitchdnn_data.h \
+dnn/x86/dnn_x86.h \
+dnn/nnet_arch.h
 
 DRED_HEAD = \
 silk/dred_coding.h \
--- a/lpcnet_sources.mk
+++ b/lpcnet_sources.mk
@@ -7,6 +7,7 @@
 dnn/lpcnet_plc.c \
 dnn/lpcnet_tables.c \
 dnn/nnet.c \
+dnn/nnet_default.c \
 dnn/plc_data.c \
 dnn/parse_lpcnet_weights.c \
 dnn/pitchdnn.c \
@@ -21,3 +22,8 @@
 silk/dred_encoder.c \
 silk/dred_coding.c \
 silk/dred_decoder.c
+
+DNN_SOURCES_X86_RTCD = dnn/x86/x86_dnn_map.c
+DNN_SOURCES_AVX2 = dnn/x86/nnet_avx2.c
+DNN_SOURCES_SSE4_1 = dnn/x86/nnet_sse4_1.c
+DNN_SOURCES_SSE2 = dnn/x86/nnet_sse2.c
--- a/silk/dred_encoder.c
+++ b/silk/dred_encoder.c
@@ -87,7 +87,7 @@
     dred_encoder_reset(enc);
 }
 
-static void dred_process_frame(DREDEnc *enc)
+static void dred_process_frame(DREDEnc *enc, int arch)
 {
     float feature_buffer[2 * 36];
     float input_buffer[2*DRED_NUM_FEATURES] = {0};
@@ -97,8 +97,8 @@
     OPUS_MOVE(enc->latents_buffer + DRED_LATENT_DIM, enc->latents_buffer, (DRED_MAX_FRAMES - 1) * DRED_LATENT_DIM);
 
     /* calculate LPCNet features */
-    lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer, feature_buffer);
-    lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer + DRED_FRAME_SIZE, feature_buffer + 36);
+    lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer, feature_buffer, arch);
+    lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer + DRED_FRAME_SIZE, feature_buffer + 36, arch);
 
     /* prepare input buffer (discard LPC coefficients) */
     OPUS_COPY(input_buffer, feature_buffer, DRED_NUM_FEATURES);
@@ -105,7 +105,7 @@
     OPUS_COPY(input_buffer + DRED_NUM_FEATURES, feature_buffer + 36, DRED_NUM_FEATURES);
 
     /* run RDOVAE encoder */
-    dred_rdovae_encode_dframe(&enc->rdovae_enc, &enc->model, enc->latents_buffer, enc->state_buffer, input_buffer);
+    dred_rdovae_encode_dframe(&enc->rdovae_enc, &enc->model, enc->latents_buffer, enc->state_buffer, input_buffer, arch);
     enc->latents_buffer_fill = IMIN(enc->latents_buffer_fill+1, DRED_NUM_REDUNDANCY_FRAMES);
 }
 
@@ -188,7 +188,7 @@
     }
 }
 
-void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay)
+void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay, int arch)
 {
     int curr_offset16k;
     int frame_size16k = frame_size * 16000 / enc->Fs;
@@ -206,7 +206,7 @@
         if (enc->input_buffer_fill >= 2*DRED_FRAME_SIZE)
         {
             curr_offset16k += 320;
-            dred_process_frame(enc);
+            dred_process_frame(enc, arch);
             enc->input_buffer_fill -= 2*DRED_FRAME_SIZE;
             OPUS_MOVE(&enc->input_buffer[0], &enc->input_buffer[2*DRED_FRAME_SIZE], enc->input_buffer_fill);
             /* 15 ms (6*2.5 ms) is the ideal offset for DRED because it corresponds to our vocoder look-ahead. */
--- a/silk/dred_encoder.h
+++ b/silk/dred_encoder.h
@@ -64,7 +64,7 @@
 
 void dred_deinit_encoder(DREDEnc *enc);
 
-void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay);
+void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay, int arch);
 
 int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunks, int max_bytes);
 
--- a/src/opus_decoder.c
+++ b/src/opus_decoder.c
@@ -1424,7 +1424,7 @@
       OPUS_COPY(dst, src, 1);
    if (dst->process_stage == 2)
       return OPUS_OK;
-   DRED_rdovae_decode_all(&dred_dec->model, dst->fec_features, dst->state, dst->latents, dst->nb_latents);
+   DRED_rdovae_decode_all(&dred_dec->model, dst->fec_features, dst->state, dst->latents, dst->nb_latents, dred_dec->arch);
    dst->process_stage = 2;
    return OPUS_OK;
 #else
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@@ -1715,7 +1715,7 @@
 #ifdef ENABLE_DRED
     if ( st->dred_duration > 0 && st->dred_encoder.loaded ) {
         /* DRED Encoder */
-        dred_compute_latents( &st->dred_encoder, &pcm_buf[total_buffer*st->channels], frame_size, total_buffer );
+        dred_compute_latents( &st->dred_encoder, &pcm_buf[total_buffer*st->channels], frame_size, total_buffer, st->arch );
     } else {
         st->dred_encoder.latents_buffer_fill = 0;
     }
--