shithub: opus

--- a/dnn/nnet.c

+++ b/dnn/nnet.c

@@ -212,91 +212,3 @@

      OPUS_COPY(&mem[input_size*dilation*(ksize-1)-input_size], input, input_size);

-/* Computes non-padded convolution for input [ ksize1 x in_channels x (len2+ksize2) ],

-   kernel [ out_channels x in_channels x ksize1 x ksize2 ],

-   storing the output as [ out_channels x len2 ].

-   We assume that the output dimension along the ksize1 axis is 1,

-   i.e. processing one frame at a time. */

-static void conv2d_float(float *out, const float *weights, int in_channels, int out_channels, int ktime, int kheight, const float *in, int height, int hstride)

-{

-   int i;

-   int in_stride;

-   in_stride = height+kheight-1;

-   for (i=0;i<out_channels;i++) {

-      int m;

-      OPUS_CLEAR(&out[i*hstride], height);

-      for (m=0;m<in_channels;m++) {

-         int t;

-         for (t=0;t<ktime;t++) {

-            int h;

-            for (h=0;h<kheight;h++) {

-               int j;

-               for (j=0;j<height;j++) {

-                  out[i*hstride + j] += weights[i*in_channels*ktime*kheight + m*ktime*kheight + t*kheight + h] *

-                                     in[t*in_channels*in_stride + m*in_stride + j + h];

-               }

-            }

-         }

-      }

-   }

-}

-static void conv2d_3x3_float(float *out, const float *weights, int in_channels, int out_channels, const float *in, int height, int hstride)

-{

-   int i;

-   int in_stride;

-   int kheight, ktime;

-   kheight = ktime = 3;

-   in_stride = height+kheight-1;

-   for (i=0;i<out_channels;i++) {

-      int m;

-      OPUS_CLEAR(&out[i*hstride], height);

-      for (m=0;m<in_channels;m++) {

-         int j;

-         for (j=0;j<height;j++) {

-            /* Unrolled version of previous function -- compiler will figure out the indexing simplifications. */

-            out[i*hstride + j] += weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 0]*in[0*in_channels*in_stride + m*in_stride + j + 0]

-                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 1]*in[0*in_channels*in_stride + m*in_stride + j + 1]

-                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 2]*in[0*in_channels*in_stride + m*in_stride + j + 2]

-                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 0]*in[1*in_channels*in_stride + m*in_stride + j + 0]

-                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 1]*in[1*in_channels*in_stride + m*in_stride + j + 1]

-                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 2]*in[1*in_channels*in_stride + m*in_stride + j + 2]

-                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 0]*in[2*in_channels*in_stride + m*in_stride + j + 0]

-                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 1]*in[2*in_channels*in_stride + m*in_stride + j + 1]

-                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 2]*in[2*in_channels*in_stride + m*in_stride + j + 2];

-               }

-      }

-   }

-}

-#define MAX_CONV2D_INPUTS 8192

-void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation, int arch)

-{

-   int i;

-   const float *bias;

-   float in_buf[MAX_CONV2D_INPUTS];

-   int time_stride;

-   celt_assert(in != out);

-   time_stride = conv->in_channels*(height+conv->kheight-1);

-   celt_assert(conv->ktime*time_stride <= MAX_CONV2D_INPUTS);

-   OPUS_COPY(in_buf, mem, (conv->ktime-1)*time_stride);

-   OPUS_COPY(&in_buf[(conv->ktime-1)*time_stride], in, time_stride);

-   OPUS_COPY(mem, &in_buf[time_stride], (conv->ktime-1)*time_stride);

-   bias = conv->bias;

-   if (conv->kheight == 3 && conv->ktime == 3)

-     conv2d_3x3_float(out, conv->float_weights, conv->in_channels, conv->out_channels, in_buf, height, hstride);

-   else

-     conv2d_float(out, conv->float_weights, conv->in_channels, conv->out_channels, conv->ktime, conv->kheight, in_buf, height, hstride);

-   if (bias != NULL) {

-     for (i=0;i<conv->out_channels;i++) {

-       int j;

-       for (j=0;j<height;j++) out[i*hstride+j] += bias[i];

-     }

-   }

-   for (i=0;i<conv->out_channels;i++) {

-     compute_activation(&out[i*hstride], &out[i*hstride], height, activation, arch);

-   }

-}

--- a/dnn/nnet.h

+++ b/dnn/nnet.h

@@ -185,13 +185,12 @@

   int activation,

   int reset_after);

-void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation, int arch);

 void compute_linear_c(const LinearLayer *linear, float *out, const float *in);

 void compute_activation_c(float *output, const float *input, int N, int activation);

+void compute_conv2d_c(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation);

 #if defined(OPUS_X86_MAY_HAVE_SSE2)

 #include "x86/dnn_x86.h"

 #endif

@@ -204,6 +203,9 @@

 #define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_c(output, input, N, activation))

 #endif

+#ifndef OVERRIDE_COMPUTE_CONV2D

+#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) ((void)(arch),compute_conv2d_c(conv, out, mem, in, height, hstride, activation))

+#endif

 #if defined(__x86_64__) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)

 #if defined(_MSC_VER)

--- a/dnn/nnet_arch.h

+++ b/dnn/nnet_arch.h

@@ -127,5 +127,93 @@

+/* Computes non-padded convolution for input [ ksize1 x in_channels x (len2+ksize2-1) ],

+   kernel [ out_channels x in_channels x ksize1 x ksize2 ],

+   storing the output as [ out_channels x len2 ].

+   We assume that the output dimension along the ksize1 axis is 1,

+   i.e. processing one frame at a time. */

+static void conv2d_float(float *out, const float *weights, int in_channels, int out_channels, int ktime, int kheight, const float *in, int height, int hstride)

+{

+   int i;

+   int in_stride;

+   in_stride = height+kheight-1;

+   for (i=0;i<out_channels;i++) {

+      int m;

+      OPUS_CLEAR(&out[i*hstride], height);

+      for (m=0;m<in_channels;m++) {

+         int t;

+         for (t=0;t<ktime;t++) {

+            int h;

+            for (h=0;h<kheight;h++) {

+               int j;

+               for (j=0;j<height;j++) {

+                  out[i*hstride + j] += weights[i*in_channels*ktime*kheight + m*ktime*kheight + t*kheight + h] *

+                                     in[t*in_channels*in_stride + m*in_stride + j + h];

+               }

+            }

+         }

+      }

+   }

+}

+/* There are no intrinsics in this function (or the one above) because the gcc auto-vectorizer (and hopefully those of other compilers) is smart enough to

+   produce the right code by itself based on the compile flags. */

+static void conv2d_3x3_float(float *out, const float *weights, int in_channels, int out_channels, const float *in, int height, int hstride)

+{

+   int i;

+   int in_stride;

+   int kheight, ktime;

+   kheight = ktime = 3;

+   in_stride = height+kheight-1;

+   for (i=0;i<out_channels;i++) {

+      int m;

+      OPUS_CLEAR(&out[i*hstride], height);

+      for (m=0;m<in_channels;m++) {

+         int j;

+         for (j=0;j<height;j++) {

+            /* Unrolled version of previous function -- compiler will figure out the indexing simplifications. */

+            out[i*hstride + j] += weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 0]*in[0*in_channels*in_stride + m*in_stride + j + 0]

+                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 1]*in[0*in_channels*in_stride + m*in_stride + j + 1]

+                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 0*kheight + 2]*in[0*in_channels*in_stride + m*in_stride + j + 2]

+                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 0]*in[1*in_channels*in_stride + m*in_stride + j + 0]

+                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 1]*in[1*in_channels*in_stride + m*in_stride + j + 1]

+                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 1*kheight + 2]*in[1*in_channels*in_stride + m*in_stride + j + 2]

+                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 0]*in[2*in_channels*in_stride + m*in_stride + j + 0]

+                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 1]*in[2*in_channels*in_stride + m*in_stride + j + 1]

+                                + weights[i*in_channels*ktime*kheight + m*ktime*kheight + 2*kheight + 2]*in[2*in_channels*in_stride + m*in_stride + j + 2];

+               }

+      }

+   }

+}

+#define MAX_CONV2D_INPUTS 8192

+void RTCD_SUF(compute_conv2d_)(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation)

+{

+   int i;

+   const float *bias;

+   float in_buf[MAX_CONV2D_INPUTS];

+   int time_stride;

+   celt_assert(in != out);

+   time_stride = conv->in_channels*(height+conv->kheight-1);

+   celt_assert(conv->ktime*time_stride <= MAX_CONV2D_INPUTS);

+   OPUS_COPY(in_buf, mem, (conv->ktime-1)*time_stride);

+   OPUS_COPY(&in_buf[(conv->ktime-1)*time_stride], in, time_stride);

+   OPUS_COPY(mem, &in_buf[time_stride], (conv->ktime-1)*time_stride);

+   bias = conv->bias;

+   if (conv->kheight == 3 && conv->ktime == 3)

+     conv2d_3x3_float(out, conv->float_weights, conv->in_channels, conv->out_channels, in_buf, height, hstride);

+   else

+     conv2d_float(out, conv->float_weights, conv->in_channels, conv->out_channels, conv->ktime, conv->kheight, in_buf, height, hstride);

+   if (bias != NULL) {

+     for (i=0;i<conv->out_channels;i++) {

+       int j;

+       for (j=0;j<height;j++) out[i*hstride+j] += bias[i];

+     }

+   }

+   for (i=0;i<conv->out_channels;i++) {

+     RTCD_SUF(compute_activation_)(&out[i*hstride], &out[i*hstride], height, activation);

+   }

+}

 #endif

--- a/dnn/x86/dnn_x86.h

+++ b/dnn/x86/dnn_x86.h

@@ -34,16 +34,19 @@

 #if defined(OPUS_X86_MAY_HAVE_SSE2)

 void compute_linear_sse2(const LinearLayer *linear, float *out, const float *in);

 void compute_activation_sse2(float *output, const float *input, int N, int activation);

+void compute_conv2d_sse2(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation);

 #endif

 #if defined(OPUS_X86_MAY_HAVE_SSE4_1)

 void compute_linear_sse4_1(const LinearLayer *linear, float *out, const float *in);

 void compute_activation_sse4_1(float *output, const float *input, int N, int activation);

+void compute_conv2d_sse4_1(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation);

 #endif

 #if defined(OPUS_X86_MAY_HAVE_AVX2)

 void compute_linear_avx2(const LinearLayer *linear, float *out, const float *in);

 void compute_activation_avx2(float *output, const float *input, int N, int activation);

+void compute_conv2d_avx2(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation);

 #endif

@@ -53,6 +56,8 @@

 #define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_avx2(linear, out, in))

 #define OVERRIDE_COMPUTE_ACTIVATION

 #define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_avx2(output, input, N, activation))

+#define OVERRIDE_COMPUTE_CONV2D

+#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) ((void)(arch),compute_conv2d_avx2(conv, out, mem, in, height, hstride, activation))

 #elif defined(OPUS_X86_PRESUME_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)

@@ -60,6 +65,8 @@

 #define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse4_1(linear, out, in))

 #define OVERRIDE_COMPUTE_ACTIVATION

 #define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_sse4_1(output, input, N, activation))

+#define OVERRIDE_COMPUTE_CONV2D

+#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) ((void)(arch),compute_conv2d_sse4_1(conv, out, mem, in, height, hstride, activation))

 #elif defined(OPUS_X86_PRESUME_SSE2) && !defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_MAY_HAVE_SSE4_1)

@@ -67,6 +74,8 @@

 #define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse2(linear, out, in))

 #define OVERRIDE_COMPUTE_ACTIVATION

 #define compute_activation(output, input, N, activation, arch) ((void)(arch),compute_activation_sse2(output, input, N, activation))

+#define OVERRIDE_COMPUTE_CONV2D

+#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) ((void)(arch),compute_conv2d_sse2(conv, out, mem, in, height, hstride, activation))

 #elif defined(OPUS_HAVE_RTCD) && (defined(OPUS_X86_MAY_HAVE_AVX2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2))

@@ -89,6 +98,20 @@

 #define OVERRIDE_COMPUTE_ACTIVATION

 #define compute_activation(output, input, N, activation, arch) \

     ((*DNN_COMPUTE_ACTIVATION_IMPL[(arch) & OPUS_ARCHMASK])(output, input, N, activation))

+extern void (*const DNN_COMPUTE_CONV2D_IMPL[OPUS_ARCHMASK + 1])(

+                    const Conv2dLayer *conv,

+                    float *out,

+                    float *mem,

+                    const float *in,

+                    int height,

+                    int hstride,

+                    int activation

+                    );

+#define OVERRIDE_COMPUTE_CONV2D

+#define compute_conv2d(conv, out, mem, in, height, hstride, activation, arch) \

+    ((*DNN_COMPUTE_CONV2D_IMPL[(arch) & OPUS_ARCHMASK])(conv, out, mem, in, height, hstride, activation))

 #endif

--- a/dnn/x86/x86_dnn_map.c

+++ b/dnn/x86/x86_dnn_map.c

@@ -61,6 +61,22 @@

   MAY_HAVE_AVX2(compute_activation)  /* avx  */

};

+void (*const DNN_COMPUTE_CONV2D_IMPL[OPUS_ARCHMASK + 1])(

+         const Conv2dLayer *conv,

+         float *out,

+         float *mem,

+         const float *in,

+         int height,

+         int hstride,

+         int activation

+) = {

+  compute_conv2d_c,                /* non-sse */

+  compute_conv2d_c,

+  MAY_HAVE_SSE2(compute_conv2d),

+  MAY_HAVE_SSE4_1(compute_conv2d), /* sse4.1  */

+  MAY_HAVE_AVX2(compute_conv2d)  /* avx  */

+};

 #endif

--

⑨