shithub: libvpx

--- a/test/convolve_test.cc

+++ b/test/convolve_test.cc

@@ -33,9 +33,9 @@

 typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,

                              uint8_t *dst, ptrdiff_t dst_stride,

-                             const int16_t *filter_x, int filter_x_stride,

-                             const int16_t *filter_y, int filter_y_stride,

-                             int w, int h);

+                             const InterpKernel *filter, int x0_q4,

+                             int x_step_q4, int y0_q4, int y_step_q4, int w,

+                             int h);

 typedef void (*WrapperFilterBlock2d8Func)(

     const uint8_t *src_ptr, const unsigned int src_stride,

@@ -550,7 +550,7 @@

   vpx_usec_timer_start(&timer);

   for (int n = 0; n < kNumTests; ++n) {

-    UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,

+    UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0,

                    width, height);

   vpx_usec_timer_mark(&timer);

@@ -570,7 +570,7 @@

   vpx_usec_timer_start(&timer);

   for (int n = 0; n < kNumTests; ++n) {

-    UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0,

+    UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0,

                    width, height);

   vpx_usec_timer_mark(&timer);

@@ -585,7 +585,7 @@

   uint8_t *const out = output();

   ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, kOutputStride,

-                                          NULL, 0, NULL, 0, Width(), Height()));

+                                          NULL, 0, 0, 0, 0, Width(), Height()));

   CheckGuardBlocks();

@@ -604,7 +604,7 @@

   CopyOutputToRef();

   ASM_REGISTER_STATE_CHECK(UUT_->copy_[1](in, kInputStride, out, kOutputStride,

-                                          NULL, 0, NULL, 0, Width(), Height()));

+                                          NULL, 0, 0, 0, 0, Width(), Height()));

   CheckGuardBlocks();

@@ -621,12 +621,10 @@

 TEST_P(ConvolveTest, CopyHoriz) {

   uint8_t *const in = input();

   uint8_t *const out = output();

-  DECLARE_ALIGNED(256, const int16_t,

-                  filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };

   ASM_REGISTER_STATE_CHECK(UUT_->sh8_[0](in, kInputStride, out, kOutputStride,

-                                         filter8, 16, filter8, 16, Width(),

-                                         Height()));

+                                         vp9_filter_kernels[0], 0, 16, 0, 16,

+                                         Width(), Height()));

   CheckGuardBlocks();

@@ -641,12 +639,10 @@

 TEST_P(ConvolveTest, CopyVert) {

   uint8_t *const in = input();

   uint8_t *const out = output();

-  DECLARE_ALIGNED(256, const int16_t,

-                  filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };

   ASM_REGISTER_STATE_CHECK(UUT_->sv8_[0](in, kInputStride, out, kOutputStride,

-                                         filter8, 16, filter8, 16, Width(),

-                                         Height()));

+                                         vp9_filter_kernels[0], 0, 16, 0, 16,

+                                         Width(), Height()));

   CheckGuardBlocks();

@@ -661,12 +657,10 @@

 TEST_P(ConvolveTest, Copy2D) {

   uint8_t *const in = input();

   uint8_t *const out = output();

-  DECLARE_ALIGNED(256, const int16_t,

-                  filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };

   ASM_REGISTER_STATE_CHECK(UUT_->shv8_[0](in, kInputStride, out, kOutputStride,

-                                          filter8, 16, filter8, 16, Width(),

-                                          Height()));

+                                          vp9_filter_kernels[0], 0, 16, 0, 16,

+                                          Width(), Height()));

   CheckGuardBlocks();

@@ -702,7 +696,6 @@

-const int16_t kInvalidFilter[8] = { 0 };

 const WrapperFilterBlock2d8Func wrapper_filter_block2d_8[2] = {

   wrapper_filter_block2d_8_c, wrapper_filter_average_block2d_8_c

 };

@@ -755,21 +748,21 @@

                                       Width(), Height(), UUT_->use_highbd_);

           if (filter_x && filter_y)

-            ASM_REGISTER_STATE_CHECK(UUT_->hv8_[i](

-                in, kInputStride, out, kOutputStride, filters[filter_x], 16,

-                filters[filter_y], 16, Width(), Height()));

+            ASM_REGISTER_STATE_CHECK(

+                UUT_->hv8_[i](in, kInputStride, out, kOutputStride, filters,

+                              filter_x, 16, filter_y, 16, Width(), Height()));

           else if (filter_y)

-            ASM_REGISTER_STATE_CHECK(UUT_->v8_[i](

-                in, kInputStride, out, kOutputStride, kInvalidFilter, 16,

-                filters[filter_y], 16, Width(), Height()));

+            ASM_REGISTER_STATE_CHECK(

+                UUT_->v8_[i](in, kInputStride, out, kOutputStride, filters, 0,

+                             16, filter_y, 16, Width(), Height()));

           else if (filter_x)

-            ASM_REGISTER_STATE_CHECK(UUT_->h8_[i](

-                in, kInputStride, out, kOutputStride, filters[filter_x], 16,

-                kInvalidFilter, 16, Width(), Height()));

+            ASM_REGISTER_STATE_CHECK(

+                UUT_->h8_[i](in, kInputStride, out, kOutputStride, filters,

+                             filter_x, 16, 0, 16, Width(), Height()));

           else

-            ASM_REGISTER_STATE_CHECK(UUT_->copy_[i](

-                in, kInputStride, out, kOutputStride, kInvalidFilter, 0,

-                kInvalidFilter, 0, Width(), Height()));

+            ASM_REGISTER_STATE_CHECK(UUT_->copy_[i](in, kInputStride, out,

+                                                    kOutputStride, NULL, 0, 0,

+                                                    0, 0, Width(), Height()));

           CheckGuardBlocks();

@@ -853,21 +846,21 @@

                                        filters[filter_y], ref, kOutputStride,

                                        Width(), Height(), UUT_->use_highbd_);

             if (filter_x && filter_y)

-              ASM_REGISTER_STATE_CHECK(UUT_->hv8_[0](

-                  in, kInputStride, out, kOutputStride, filters[filter_x], 16,

-                  filters[filter_y], 16, Width(), Height()));

+              ASM_REGISTER_STATE_CHECK(

+                  UUT_->hv8_[0](in, kInputStride, out, kOutputStride, filters,

+                                filter_x, 16, filter_y, 16, Width(), Height()));

             else if (filter_y)

-              ASM_REGISTER_STATE_CHECK(UUT_->v8_[0](

-                  in, kInputStride, out, kOutputStride, kInvalidFilter, 16,

-                  filters[filter_y], 16, Width(), Height()));

+              ASM_REGISTER_STATE_CHECK(

+                  UUT_->v8_[0](in, kInputStride, out, kOutputStride, filters, 0,

+                               16, filter_y, 16, Width(), Height()));

             else if (filter_x)

-              ASM_REGISTER_STATE_CHECK(UUT_->h8_[0](

-                  in, kInputStride, out, kOutputStride, filters[filter_x], 16,

-                  kInvalidFilter, 16, Width(), Height()));

+              ASM_REGISTER_STATE_CHECK(

+                  UUT_->h8_[0](in, kInputStride, out, kOutputStride, filters,

+                               filter_x, 16, 0, 16, Width(), Height()));

             else

-              ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](

-                  in, kInputStride, out, kOutputStride, kInvalidFilter, 0,

-                  kInvalidFilter, 0, Width(), Height()));

+              ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out,

+                                                      kOutputStride, NULL, 0, 0,

+                                                      0, 0, Width(), Height()));

             for (int y = 0; y < Height(); ++y) {

               for (int x = 0; x < Width(); ++x)

@@ -897,8 +890,8 @@

     for (int step = 1; step <= 32; ++step) {

       /* Test the horizontal and vertical filters in combination. */

       ASM_REGISTER_STATE_CHECK(

-          UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap[frac],

-                         step, eighttap[frac], step, Width(), Height()));

+          UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap, frac,

+                         step, frac, step, Width(), Height()));

       CheckGuardBlocks();

@@ -917,14 +910,14 @@

 using std::tr1::make_tuple;

 #if CONFIG_VP9_HIGHBITDEPTH

-#define WRAP(func, bd)                                                         \

-  void wrap_##func##_##bd(                                                     \

-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                  \

-      ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride,      \

-      const int16_t *filter_y, int filter_y_stride, int w, int h) {            \

-    vpx_highbd_##func(reinterpret_cast<const uint16_t *>(src), src_stride,     \

-                      reinterpret_cast<uint16_t *>(dst), dst_stride, filter_x, \

-                      filter_x_stride, filter_y, filter_y_stride, w, h, bd);   \

+#define WRAP(func, bd)                                                       \

+  void wrap_##func##_##bd(                                                   \

+      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \

+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,           \

+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {               \

+    vpx_highbd_##func(reinterpret_cast<const uint16_t *>(src), src_stride,   \

+                      reinterpret_cast<uint16_t *>(dst), dst_stride, filter, \

+                      x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);         \

 #if HAVE_SSE2 && ARCH_X86_64

--- a/vp9/common/vp9_reconinter.h

+++ b/vp9/common/vp9_reconinter.h

@@ -26,9 +26,9 @@

                                    const struct scale_factors *sf, int w, int h,

                                    int ref, const InterpKernel *kernel, int xs,

                                    int ys) {

-  sf->predict[subpel_x != 0][subpel_y != 0][ref](

-      src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y],

-      ys, w, h);

+  sf->predict[subpel_x != 0][subpel_y != 0][ref](src, src_stride, dst,

+                                                 dst_stride, kernel, subpel_x,

+                                                 xs, subpel_y, ys, w, h);

 #if CONFIG_VP9_HIGHBITDEPTH

@@ -37,8 +37,8 @@

     const int subpel_x, const int subpel_y, const struct scale_factors *sf,

     int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) {

   sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](

-      src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y],

-      ys, w, h, bd);

+      src, src_stride, dst, dst_stride, kernel, subpel_x, xs, subpel_y, ys, w,

+      h, bd);

 #endif  // CONFIG_VP9_HIGHBITDEPTH

--- a/vp9/encoder/vp9_denoiser.c

+++ b/vp9/encoder/vp9_denoiser.c

@@ -390,12 +390,12 @@

   if (decision == FILTER_BLOCK) {

-    vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0,

-                      NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2,

+    vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, 0,

+                      0, 0, num_4x4_blocks_wide_lookup[bs] << 2,

                       num_4x4_blocks_high_lookup[bs] << 2);

   } else {  // COPY_BLOCK

-    vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0,

-                      NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2,

+    vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, 0,

+                      0, 0, num_4x4_blocks_wide_lookup[bs] << 2,

                       num_4x4_blocks_high_lookup[bs] << 2);

   *denoiser_decision = decision;

--- a/vp9/encoder/vp9_encoder.c

+++ b/vp9/encoder/vp9_encoder.c

@@ -2645,15 +2645,14 @@

         if (src->flags & YV12_FLAG_HIGHBITDEPTH) {

           vpx_highbd_convolve8(CONVERT_TO_SHORTPTR(src_ptr), src_stride,

-                               CONVERT_TO_SHORTPTR(dst_ptr), dst_stride,

-                               kernel[x_q4 & 0xf], 16 * src_w / dst_w,

-                               kernel[y_q4 & 0xf], 16 * src_h / dst_h,

-                               16 / factor, 16 / factor, bd);

+                               CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, kernel,

+                               x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,

+                               16 * src_h / dst_h, 16 / factor, 16 / factor,

+                               bd);

         } else {

-          vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,

-                        kernel[x_q4 & 0xf], 16 * src_w / dst_w,

-                        kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor,

-                        16 / factor);

+          vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel,

+                        x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,

+                        16 * src_h / dst_h, 16 / factor, 16 / factor);

--- a/vp9/encoder/vp9_frame_scale.c

+++ b/vp9/encoder/vp9_frame_scale.c

@@ -43,10 +43,9 @@

                                  (x / factor) * src_w / dst_w;

         uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);

-        vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,

-                      kernel[x_q4 & 0xf], 16 * src_w / dst_w,

-                      kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor,

-                      16 / factor);

+        vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel,

+                      x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf,

+                      16 * src_h / dst_h, 16 / factor, 16 / factor);

--- a/vp9/encoder/vp9_pickmode.c

+++ b/vp9/encoder/vp9_pickmode.c

@@ -2162,15 +2162,15 @@

           vpx_highbd_convolve_copy(

               CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride,

               CONVERT_TO_SHORTPTR(this_mode_pred->data), this_mode_pred->stride,

-              NULL, 0, NULL, 0, bw, bh, xd->bd);

+              NULL, 0, 0, 0, 0, bw, bh, xd->bd);

         else

           vpx_convolve_copy(best_pred->data, best_pred->stride,

                             this_mode_pred->data, this_mode_pred->stride, NULL,

-                            0, NULL, 0, bw, bh);

+                            0, 0, 0, 0, bw, bh);

 #else

         vpx_convolve_copy(best_pred->data, best_pred->stride,

                           this_mode_pred->data, this_mode_pred->stride, NULL, 0,

-                          NULL, 0, bw, bh);

+                          0, 0, 0, bw, bh);

 #endif  // CONFIG_VP9_HIGHBITDEPTH

         best_pred = this_mode_pred;

@@ -2264,14 +2264,14 @@

       if (cm->use_highbitdepth)

         vpx_highbd_convolve_copy(

             CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride,

-            CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, NULL, 0,

+            CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, 0, 0, 0,

             bw, bh, xd->bd);

       else

         vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,

-                          pd->dst.stride, NULL, 0, NULL, 0, bw, bh);

+                          pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh);

 #else

       vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf,

-                        pd->dst.stride, NULL, 0, NULL, 0, bw, bh);

+                        pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh);

 #endif  // CONFIG_VP9_HIGHBITDEPTH

--- a/vp9/encoder/vp9_rdopt.c

+++ b/vp9/encoder/vp9_rdopt.c

@@ -600,7 +600,7 @@

 #if CONFIG_VP9_HIGHBITDEPTH

       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

         vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16,

-                                 32, NULL, 0, NULL, 0, bs, bs, xd->bd);

+                                 32, NULL, 0, 0, 0, 0, bs, bs, xd->bd);

         if (xd->lossless) {

           vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd);

         } else {

@@ -623,7 +623,7 @@

         recon = CONVERT_TO_BYTEPTR(recon16);

       } else {

 #endif  // CONFIG_VP9_HIGHBITDEPTH

-        vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs);

+        vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs);

         switch (tx_size) {

           case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, *eob); break;

           case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, *eob); break;

--- a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c

+++ b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c

@@ -137,15 +137,14 @@

 void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride,

                                      uint16_t *dst, ptrdiff_t dst_stride,

-                                     const int16_t *filter_x, int x_step_q4,

-                                     const int16_t *filter_y,  // unused

-                                     int y_step_q4,            // unused

+                                     const InterpKernel *filter, int x0_q4,

+                                     int x_step_q4, int y0_q4, int y_step_q4,

                                      int w, int h, int bd) {

   if (x_step_q4 != 16) {

-    vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,

-                                 x_step_q4, filter_y, y_step_q4, w, h, bd);

+    vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter,

+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);

   } else {

-    const int16x8_t filters = vld1q_s16(filter_x);

+    const int16x8_t filters = vld1q_s16(filter[x0_q4]);

     const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

     uint16x8_t t0, t1, t2, t3;

@@ -337,15 +336,15 @@

 void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src,

                                          ptrdiff_t src_stride, uint16_t *dst,

                                          ptrdiff_t dst_stride,

-                                         const int16_t *filter_x, int x_step_q4,

-                                         const int16_t *filter_y,  // unused

-                                         int y_step_q4,            // unused

-                                         int w, int h, int bd) {

+                                         const InterpKernel *filter, int x0_q4,

+                                         int x_step_q4, int y0_q4,

+                                         int y_step_q4, int w, int h, int bd) {

   if (x_step_q4 != 16) {

-    vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,

-                                     x_step_q4, filter_y, y_step_q4, w, h, bd);

+    vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,

+                                     x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,

+                                     bd);

   } else {

-    const int16x8_t filters = vld1q_s16(filter_x);

+    const int16x8_t filters = vld1q_s16(filter[x0_q4]);

     const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

     uint16x8_t t0, t1, t2, t3;

@@ -566,15 +565,14 @@

 void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride,

                                     uint16_t *dst, ptrdiff_t dst_stride,

-                                    const int16_t *filter_x,  // unused

-                                    int x_step_q4,            // unused

-                                    const int16_t *filter_y, int y_step_q4,

+                                    const InterpKernel *filter, int x0_q4,

+                                    int x_step_q4, int y0_q4, int y_step_q4,

                                     int w, int h, int bd) {

   if (y_step_q4 != 16) {

-    vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,

-                                x_step_q4, filter_y, y_step_q4, w, h, bd);

+    vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                                x_step_q4, y0_q4, y_step_q4, w, h, bd);

   } else {

-    const int16x8_t filters = vld1q_s16(filter_y);

+    const int16x8_t filters = vld1q_s16(filter[y0_q4]);

     const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

     assert(!((intptr_t)dst & 3));

@@ -732,15 +730,15 @@

 void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src,

                                         ptrdiff_t src_stride, uint16_t *dst,

                                         ptrdiff_t dst_stride,

-                                        const int16_t *filter_x,  // unused

-                                        int x_step_q4,            // unused

-                                        const int16_t *filter_y, int y_step_q4,

+                                        const InterpKernel *filter, int x0_q4,

+                                        int x_step_q4, int y0_q4, int y_step_q4,

                                         int w, int h, int bd) {

   if (y_step_q4 != 16) {

-    vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,

-                                    x_step_q4, filter_y, y_step_q4, w, h, bd);

+    vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,

+                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,

+                                    bd);

   } else {

-    const int16x8_t filters = vld1q_s16(filter_y);

+    const int16x8_t filters = vld1q_s16(filter[y0_q4]);

     const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

     assert(!((intptr_t)dst & 3));

--- a/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c

+++ b/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c

@@ -15,13 +15,14 @@

 void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride,

                                   uint16_t *dst, ptrdiff_t dst_stride,

-                                  const int16_t *filter_x, int filter_x_stride,

-                                  const int16_t *filter_y, int filter_y_stride,

+                                  const InterpKernel *filter, int x0_q4,

+                                  int x_step_q4, int y0_q4, int y_step_q4,

                                   int w, int h, int bd) {

-  (void)filter_x;

-  (void)filter_x_stride;

-  (void)filter_y;

-  (void)filter_y_stride;

+  (void)filter;

+  (void)x0_q4;

+  (void)x_step_q4;

+  (void)y0_q4;

+  (void)y_step_q4;

   (void)bd;

   if (w < 8) {  // avg4

--- a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c

+++ b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c

@@ -15,13 +15,14 @@

 void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,

                                    uint16_t *dst, ptrdiff_t dst_stride,

-                                   const int16_t *filter_x, int filter_x_stride,

-                                   const int16_t *filter_y, int filter_y_stride,

+                                   const InterpKernel *filter, int x0_q4,

+                                   int x_step_q4, int y0_q4, int y_step_q4,

                                    int w, int h, int bd) {

-  (void)filter_x;

-  (void)filter_x_stride;

-  (void)filter_y;

-  (void)filter_y_stride;

+  (void)filter;

+  (void)x0_q4;

+  (void)x_step_q4;

+  (void)y0_q4;

+  (void)y_step_q4;

   (void)bd;

   if (w < 8) {  // copy4

--- a/vpx_dsp/arm/highbd_vpx_convolve_neon.c

+++ b/vpx_dsp/arm/highbd_vpx_convolve_neon.c

@@ -15,10 +15,9 @@

 void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride,

                                uint16_t *dst, ptrdiff_t dst_stride,

-                               const int16_t *filter_x, int x_step_q4,

-                               const int16_t *filter_y, int y_step_q4, int w,

+                               const InterpKernel *filter, int x0_q4,

+                               int x_step_q4, int y0_q4, int y_step_q4, int w,

                                int h, int bd) {

-  const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));

   // + 1 to make it divisible by 4

   uint16_t temp[64 * 136];

   const int intermediate_height =

@@ -29,20 +28,19 @@

    * buffer which has lots of extra room and is subsequently discarded this is

    * safe if somewhat less than ideal.   */

   vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,

-                                  filter_x, x_step_q4, filter_y, y_step_q4, w,

+                                  filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,

                                   intermediate_height, bd);

   /* Step into the temp buffer 3 lines to get the actual frame data */

-  vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x,

-                                 x_step_q4, filter_y, y_step_q4, w, h, bd);

+  vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter,

+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd);

 void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,

                                    uint16_t *dst, ptrdiff_t dst_stride,

-                                   const int16_t *filter_x, int x_step_q4,

-                                   const int16_t *filter_y, int y_step_q4,

+                                   const InterpKernel *filter, int x0_q4,

+                                   int x_step_q4, int y0_q4, int y_step_q4,

                                    int w, int h, int bd) {

-  const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));

   // + 1 to make it divisible by 4

   uint16_t temp[64 * 136];

   const int intermediate_height =

@@ -52,8 +50,9 @@

    * to average the values after both passes.

    */

   vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,

-                                  filter_x, x_step_q4, filter_y, y_step_q4, w,

+                                  filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w,

                                   intermediate_height, bd);

-  vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x,

-                                     x_step_q4, filter_y, y_step_q4, w, h, bd);

+  vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter,

+                                     x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,

+                                     bd);

--- a/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm

+++ b/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm

@@ -42,10 +42,11 @@

 ; r1    int src_stride

 ; r2    uint8_t *dst

 ; r3    int dst_stride

-; sp[]const int16_t *filter_x

-; sp[]int x_step_q4

-; sp[]const int16_t *filter_y ; unused

-; sp[]int y_step_q4           ; unused

+; sp[]const int16_t *filter

+; sp[]int x0_q4

+; sp[]int x_step_q4 ; unused

+; sp[]int y0_q4

+; sp[]int y_step_q4 ; unused

 ; sp[]int w

 ; sp[]int h

@@ -54,11 +55,11 @@

     sub             r0, r0, #3              ; adjust for taps

-    ldr             r5, [sp, #32]           ; filter_x

-    ldr             r6, [sp, #48]           ; w

-    ldr             r7, [sp, #52]           ; h

+    ldrd            r4, r5, [sp, #32]       ; filter, x0_q4

+    add             r4, r5, lsl #4

+    ldrd            r6, r7, [sp, #52]       ; w, h

-    vld1.s16        {q0}, [r5]              ; filter_x

+    vld1.s16        {q0}, [r4]              ; filter

     sub             r8, r1, r1, lsl #2      ; -src_stride * 3

     add             r8, r8, #4              ; -src_stride * 3 + 4

@@ -127,7 +128,7 @@

     sub             r2, r2, r3, lsl #2      ; reset for store

-    ; src[] * filter_x

+    ; src[] * filter

     MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24

     MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26

     MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27

@@ -184,11 +185,13 @@

     sub             r0, r0, r1

     sub             r0, r0, r1, lsl #1

-    ldr             r4, [sp, #32]           ; filter_y

-    ldr             r6, [sp, #40]           ; w

-    ldr             lr, [sp, #44]           ; h

+    ldr             r4, [sp, #24]           ; filter

+    ldr             r5, [sp, #36]           ; y0_q4

+    add             r4, r5, lsl #4

+    ldr             r6, [sp, #44]           ; w

+    ldr             lr, [sp, #48]           ; h

-    vld1.s16        {q0}, [r4]              ; filter_y

+    vld1.s16        {q0}, [r4]              ; filter

     lsl             r1, r1, #1

     lsl             r3, r3, #1

@@ -232,7 +235,7 @@

     pld             [r7]

     pld             [r4]

-    ; src[] * filter_y

+    ; src[] * filter

     MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24

     pld             [r7, r1]

--- a/vpx_dsp/arm/vpx_convolve8_neon.c

+++ b/vpx_dsp/arm/vpx_convolve8_neon.c

@@ -125,11 +125,10 @@

 void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,

                               uint8_t *dst, ptrdiff_t dst_stride,

-                              const int16_t *filter_x, int x_step_q4,

-                              const int16_t *filter_y,  // unused

-                              int y_step_q4,            // unused

-                              int w, int h) {

-  const int16x8_t filters = vld1q_s16(filter_x);

+                              const InterpKernel *filter, int x0_q4,

+                              int x_step_q4, int y0_q4, int y_step_q4, int w,

+                              int h) {

+  const int16x8_t filters = vld1q_s16(filter[x0_q4]);

   uint8x8_t t0, t1, t2, t3;

   assert(!((intptr_t)dst & 3));

@@ -137,8 +136,8 @@

   assert(x_step_q4 == 16);

   (void)x_step_q4;

+  (void)y0_q4;

   (void)y_step_q4;

-  (void)filter_y;

   src -= 3;

@@ -390,11 +389,10 @@

 void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,

                                   uint8_t *dst, ptrdiff_t dst_stride,

-                                  const int16_t *filter_x, int x_step_q4,

-                                  const int16_t *filter_y,  // unused

-                                  int y_step_q4,            // unused

+                                  const InterpKernel *filter, int x0_q4,

+                                  int x_step_q4, int y0_q4, int y_step_q4,

                                   int w, int h) {

-  const int16x8_t filters = vld1q_s16(filter_x);

+  const int16x8_t filters = vld1q_s16(filter[x0_q4]);

   uint8x8_t t0, t1, t2, t3;

   assert(!((intptr_t)dst & 3));

@@ -402,8 +400,8 @@

   assert(x_step_q4 == 16);

   (void)x_step_q4;

+  (void)y0_q4;

   (void)y_step_q4;

-  (void)filter_y;

   src -= 3;

@@ -692,19 +690,18 @@

 void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,

                              uint8_t *dst, ptrdiff_t dst_stride,

-                             const int16_t *filter_x,  // unused

-                             int x_step_q4,            // unused

-                             const int16_t *filter_y, int y_step_q4, int w,

+                             const InterpKernel *filter, int x0_q4,

+                             int x_step_q4, int y0_q4, int y_step_q4, int w,

                              int h) {

-  const int16x8_t filters = vld1q_s16(filter_y);

+  const int16x8_t filters = vld1q_s16(filter[y0_q4]);

   assert(!((intptr_t)dst & 3));

   assert(!(dst_stride & 3));

   assert(y_step_q4 == 16);

+  (void)x0_q4;

   (void)x_step_q4;

   (void)y_step_q4;

-  (void)filter_x;

   src -= 3 * src_stride;

@@ -864,19 +861,18 @@

 void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,

                                  uint8_t *dst, ptrdiff_t dst_stride,

-                                 const int16_t *filter_x,  // unused

-                                 int x_step_q4,            // unused

-                                 const int16_t *filter_y, int y_step_q4, int w,

+                                 const InterpKernel *filter, int x0_q4,

+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,

                                  int h) {

-  const int16x8_t filters = vld1q_s16(filter_y);

+  const int16x8_t filters = vld1q_s16(filter[y0_q4]);

   assert(!((intptr_t)dst & 3));

   assert(!(dst_stride & 3));

   assert(y_step_q4 == 16);

+  (void)x0_q4;

   (void)x_step_q4;

   (void)y_step_q4;

-  (void)filter_x;

   src -= 3 * src_stride;

--- a/vpx_dsp/arm/vpx_convolve8_neon_asm.asm

+++ b/vpx_dsp/arm/vpx_convolve8_neon_asm.asm

@@ -42,10 +42,11 @@

 ; r1    int src_stride

 ; r2    uint8_t *dst

 ; r3    int dst_stride

-; sp[]const int16_t *filter_x

-; sp[]int x_step_q4

-; sp[]const int16_t *filter_y ; unused

-; sp[]int y_step_q4           ; unused

+; sp[]const InterpKernel *filter

+; sp[]int x0_q4

+; sp[]int x_step_q4 ; unused

+; sp[]int y0_q4

+; sp[]int y_step_q4 ; unused

 ; sp[]int w

 ; sp[]int h

@@ -54,11 +55,11 @@

     sub             r0, r0, #3              ; adjust for taps

-    ldr             r5, [sp, #32]           ; filter_x

-    ldr             r6, [sp, #48]           ; w

-    ldr             r7, [sp, #52]           ; h

+    ldrd            r4, r5, [sp, #32]       ; filter, x0_q4

+    add             r4, r5, lsl #4

+    ldrd            r6, r7, [sp, #52]       ; w, h

-    vld1.s16        {q0}, [r5]              ; filter_x

+    vld1.s16        {q0}, [r4]              ; filter

     sub             r8, r1, r1, lsl #2      ; -src_stride * 3

     add             r8, r8, #4              ; -src_stride * 3 + 4

@@ -119,7 +120,7 @@

     pld             [r5, r1, lsl #1]

-    ; src[] * filter_x

+    ; src[] * filter

     MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24

     MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26

     MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27

@@ -173,11 +174,13 @@

     sub             r0, r0, r1

     sub             r0, r0, r1, lsl #1

-    ldr             r4, [sp, #32]           ; filter_y

-    ldr             r6, [sp, #40]           ; w

-    ldr             lr, [sp, #44]           ; h

+    ldr             r4, [sp, #24]           ; filter

+    ldr             r5, [sp, #36]           ; y0_q4

+    add             r4, r5, lsl #4

+    ldr             r6, [sp, #44]           ; w

+    ldr             lr, [sp, #48]           ; h

-    vld1.s16        {q0}, [r4]              ; filter_y

+    vld1.s16        {q0}, [r4]              ; filter

     lsl             r1, r1, #1

     lsl             r3, r3, #1

@@ -216,7 +219,7 @@

     pld             [r5]

     pld             [r8]

-    ; src[] * filter_y

+    ; src[] * filter

     MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24

     pld             [r5, r3]

--- a/vpx_dsp/arm/vpx_convolve_avg_neon.c

+++ b/vpx_dsp/arm/vpx_convolve_avg_neon.c

@@ -15,13 +15,13 @@

 void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride,

                            uint8_t *dst, ptrdiff_t dst_stride,

-                           const int16_t *filter_x, int filter_x_stride,

-                           const int16_t *filter_y, int filter_y_stride, int w,

-                           int h) {

-  (void)filter_x;

-  (void)filter_x_stride;

-  (void)filter_y;

-  (void)filter_y_stride;

+                           const InterpKernel *filter, int x0_q4, int x_step_q4,

+                           int y0_q4, int y_step_q4, int w, int h) {

+  (void)filter;

+  (void)x0_q4;

+  (void)x_step_q4;

+  (void)y0_q4;

+  (void)y_step_q4;

   if (w < 8) {  // avg4

     uint8x8_t s0, s1;

--- a/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm

+++ b/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm

@@ -17,7 +17,7 @@

 |vpx_convolve_avg_neon| PROC

     push                {r4-r6, lr}

-    ldrd                r4, r5, [sp, #32]

+    ldrd                r4, r5, [sp, #36]

     mov                 r6, r2

     cmp                 r4, #32

--- a/vpx_dsp/arm/vpx_convolve_copy_neon.c

+++ b/vpx_dsp/arm/vpx_convolve_copy_neon.c

@@ -15,13 +15,14 @@

 void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,

                             uint8_t *dst, ptrdiff_t dst_stride,

-                            const int16_t *filter_x, int filter_x_stride,

-                            const int16_t *filter_y, int filter_y_stride, int w,

+                            const InterpKernel *filter, int x0_q4,

+                            int x_step_q4, int y0_q4, int y_step_q4, int w,

                             int h) {

-  (void)filter_x;

-  (void)filter_x_stride;

-  (void)filter_y;

-  (void)filter_y_stride;

+  (void)filter;

+  (void)x0_q4;

+  (void)x_step_q4;

+  (void)y0_q4;

+  (void)y_step_q4;

   if (w < 8) {  // copy4

     do {

--- a/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm

+++ b/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm

@@ -17,7 +17,7 @@

 |vpx_convolve_copy_neon| PROC

     push                {r4-r5, lr}

-    ldrd                r4, r5, [sp, #28]

+    ldrd                r4, r5, [sp, #32]

     cmp                 r4, #32

     bgt                 copy64

--- a/vpx_dsp/arm/vpx_convolve_neon.c

+++ b/vpx_dsp/arm/vpx_convolve_neon.c

@@ -15,8 +15,8 @@

 #include "vpx_ports/mem.h"

 void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,

-                        ptrdiff_t dst_stride, const int16_t *filter_x,

-                        int x_step_q4, const int16_t *filter_y, int y_step_q4,

+                        ptrdiff_t dst_stride, const InterpKernel *filter,

+                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,

                         int w, int h) {

   /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the

    * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).

@@ -33,19 +33,19 @@

    * height and filter a multiple of 4 lines. Since this goes in to the temp

    * buffer which has lots of extra room and is subsequently discarded this is

    * safe if somewhat less than ideal.   */

-  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x,

-                           x_step_q4, filter_y, y_step_q4, w,

+  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,

+                           x0_q4, x_step_q4, y0_q4, y_step_q4, w,

                            intermediate_height);

   /* Step into the temp buffer 3 lines to get the actual frame data */

-  vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, x_step_q4,

-                          filter_y, y_step_q4, w, h);

+  vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,

+                          x_step_q4, y0_q4, y_step_q4, w, h);

 void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,

                             uint8_t *dst, ptrdiff_t dst_stride,

-                            const int16_t *filter_x, int x_step_q4,

-                            const int16_t *filter_y, int y_step_q4, int w,

+                            const InterpKernel *filter, int x0_q4,

+                            int x_step_q4, int y0_q4, int y_step_q4, int w,

                             int h) {

   uint8_t temp[64 * 72];

   const int intermediate_height = h + 7;

@@ -56,9 +56,9 @@

   /* This implementation has the same issues as above. In addition, we only want

    * to average the values after both passes.

    */

-  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x,

-                           x_step_q4, filter_y, y_step_q4, w,

+  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,

+                           x0_q4, x_step_q4, y0_q4, y_step_q4, w,

                            intermediate_height);

-  vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x,

-                              x_step_q4, filter_y, y_step_q4, w, h);

+  vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,

+                              x_step_q4, y0_q4, y_step_q4, w, h);

--- a/vpx_dsp/mips/convolve2_avg_dspr2.c

+++ b/vpx_dsp/mips/convolve2_avg_dspr2.c

@@ -219,9 +219,10 @@

 void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,

                                   uint8_t *dst, ptrdiff_t dst_stride,

-                                  const int16_t *filter_x, int x_step_q4,

-                                  const int16_t *filter_y, int y_step_q4, int w,

-                                  int h) {

+                                  const InterpKernel *filter, int x0_q4,

+                                  int32_t x_step_q4, int y0_q4, int y_step_q4,

+                                  int w, int h) {

+  const int16_t *const filter_y = filter[y0_q4];

   uint32_t pos = 38;

   assert(y_step_q4 == 16);

@@ -247,8 +248,8 @@

h);

       break;

     default:

-      vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,

-                               x_step_q4, filter_y, y_step_q4, w, h);

+      vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                               x_step_q4, y0_q4, y_step_q4, w, h);

       break;

--- a/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c

+++ b/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c

@@ -751,9 +751,10 @@

 void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,

                                    uint8_t *dst, ptrdiff_t dst_stride,

-                                   const int16_t *filter_x, int x_step_q4,

-                                   const int16_t *filter_y, int y_step_q4,

+                                   const InterpKernel *filter, int x0_q4,

+                                   int32_t x_step_q4, int y0_q4, int y_step_q4,

                                    int w, int h) {

+  const int16_t *const filter_x = filter[x0_q4];

   uint32_t pos = 38;

   assert(x_step_q4 == 16);

@@ -793,8 +794,8 @@

h);

       break;

     default:

-      vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,

-                                x_step_q4, filter_y, y_step_q4, w, h);

+      vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                                x_step_q4, y0_q4, y_step_q4, w, h);

       break;

--- a/vpx_dsp/mips/convolve2_horiz_dspr2.c

+++ b/vpx_dsp/mips/convolve2_horiz_dspr2.c

@@ -628,9 +628,10 @@

 void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,

                                uint8_t *dst, ptrdiff_t dst_stride,

-                               const int16_t *filter_x, int x_step_q4,

-                               const int16_t *filter_y, int y_step_q4, int w,

-                               int h) {

+                               const InterpKernel *filter, int x0_q4,

+                               int32_t x_step_q4, int y0_q4, int y_step_q4,

+                               int w, int h) {

+  const int16_t *const filter_x = filter[x0_q4];

   uint32_t pos = 38;

   assert(x_step_q4 == 16);

@@ -672,8 +673,8 @@

                                  (int32_t)dst_stride, filter_x, (int32_t)h);

       break;

     default:

-      vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,

-                            x_step_q4, filter_y, y_step_q4, w, h);

+      vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                            x_step_q4, y0_q4, y_step_q4, w, h);

       break;

--- a/vpx_dsp/mips/convolve2_vert_dspr2.c

+++ b/vpx_dsp/mips/convolve2_vert_dspr2.c

@@ -201,9 +201,10 @@

 void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,

                               uint8_t *dst, ptrdiff_t dst_stride,

-                              const int16_t *filter_x, int x_step_q4,

-                              const int16_t *filter_y, int y_step_q4, int w,

-                              int h) {

+                              const InterpKernel *filter, int x0_q4,

+                              int32_t x_step_q4, int y0_q4, int y_step_q4,

+                              int w, int h) {

+  const int16_t *const filter_y = filter[y0_q4];

   uint32_t pos = 38;

   assert(y_step_q4 == 16);

@@ -228,8 +229,8 @@

       convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);

       break;

     default:

-      vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,

-                           x_step_q4, filter_y, y_step_q4, w, h);

+      vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                           x_step_q4, y0_q4, y_step_q4, w, h);

       break;

--- a/vpx_dsp/mips/convolve8_avg_dspr2.c

+++ b/vpx_dsp/mips/convolve8_avg_dspr2.c

@@ -334,15 +334,16 @@

 void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,

                                   uint8_t *dst, ptrdiff_t dst_stride,

-                                  const int16_t *filter_x, int x_step_q4,

-                                  const int16_t *filter_y, int y_step_q4, int w,

-                                  int h) {

+                                  const InterpKernel *filter, int x0_q4,

+                                  int32_t x_step_q4, int y0_q4, int y_step_q4,

+                                  int w, int h) {

+  const int16_t *const filter_y = filter[y0_q4];

   assert(y_step_q4 == 16);

   assert(((const int32_t *)filter_y)[1] != 0x800000);

   if (((const int32_t *)filter_y)[0] == 0) {

-    vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,

-                                 x_step_q4, filter_y, y_step_q4, w, h);

+    vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter,

+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);

   } else {

     uint32_t pos = 38;

@@ -367,8 +368,8 @@

h);

         break;

       default:

-        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,

-                                 x_step_q4, filter_y, y_step_q4, w, h);

+        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,

+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);

         break;

@@ -376,8 +377,8 @@

 void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,

                              uint8_t *dst, ptrdiff_t dst_stride,

-                             const int16_t *filter_x, int x_step_q4,

-                             const int16_t *filter_y, int y_step_q4, int w,

+                             const InterpKernel *filter, int x0_q4,

+                             int32_t x_step_q4, int y0_q4, int y_step_q4, int w,

                              int h) {

   /* Fixed size intermediate buffer places limits on parameters. */

   DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);

@@ -390,24 +391,26 @@

   if (intermediate_height < h) intermediate_height = h;

-  vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x,

-                      x_step_q4, filter_y, y_step_q4, w, intermediate_height);

+  vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter,

+                      x0_q4, x_step_q4, y0_q4, y_step_q4, w,

+                      intermediate_height);

-  vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x,

-                         x_step_q4, filter_y, y_step_q4, w, h);

+  vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter, x0_q4,

+                         x_step_q4, y0_q4, y_step_q4, w, h);

 void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,

                             uint8_t *dst, ptrdiff_t dst_stride,

-                            const int16_t *filter_x, int filter_x_stride,

-                            const int16_t *filter_y, int filter_y_stride, int w,

+                            const InterpKernel *filter, int x0_q4,

+                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,

                             int h) {

   int x, y;

   uint32_t tp1, tp2, tn1, tp3, tp4, tn2;

-  (void)filter_x;

-  (void)filter_x_stride;

-  (void)filter_y;

-  (void)filter_y_stride;

+  (void)filter;

+  (void)x0_q4;

+  (void)x_step_q4;

+  (void)y0_q4;

+  (void)y_step_q4;

   /* prefetch data to cache memory */

   prefetch_load(src);

--- a/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c

+++ b/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c

@@ -938,15 +938,16 @@

 void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,

                                    uint8_t *dst, ptrdiff_t dst_stride,

-                                   const int16_t *filter_x, int x_step_q4,

-                                   const int16_t *filter_y, int y_step_q4,

+                                   const InterpKernel *filter, int x0_q4,

+                                   int32_t x_step_q4, int y0_q4, int y_step_q4,

                                    int w, int h) {

+  const int16_t *const filter_x = filter[x0_q4];

   assert(x_step_q4 == 16);

   assert(((const int32_t *)filter_x)[1] != 0x800000);

   if (((const int32_t *)filter_x)[0] == 0) {

-    vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,

-                                  x_step_q4, filter_y, y_step_q4, w, h);

+    vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter,

+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);

   } else {

     uint32_t pos = 38;

@@ -987,9 +988,8 @@

h);

         break;

       default:

-        vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride,

-                                  filter_x, x_step_q4, filter_y, y_step_q4, w,

-                                  h);

+        vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, filter,

+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);

         break;

--- a/vpx_dsp/mips/convolve8_dspr2.c

+++ b/vpx_dsp/mips/convolve8_dspr2.c

@@ -1296,9 +1296,11 @@

 void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,

-                         ptrdiff_t dst_stride, const int16_t *filter_x,

-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,

+                         ptrdiff_t dst_stride, const InterpKernel *filter,

+                         int x0_q4, int32_t x_step_q4, int y0_q4, int y_step_q4,

                          int w, int h) {

+  const int16_t *const filter_x = filter[x0_q4];

+  const int16_t *const filter_y = filter[y0_q4];

   DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);

   int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;

   uint32_t pos = 38;

@@ -1395,14 +1397,15 @@

 void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,

                              uint8_t *dst, ptrdiff_t dst_stride,

-                             const int16_t *filter_x, int filter_x_stride,

-                             const int16_t *filter_y, int filter_y_stride,

-                             int w, int h) {

+                             const InterpKernel *filter, int x0_q4,

+                             int x_step_q4, int y0_q4, int y_step_q4, int w,

+                             int h) {

   int x, y;

-  (void)filter_x;

-  (void)filter_x_stride;

-  (void)filter_y;

-  (void)filter_y_stride;

+  (void)filter;

+  (void)x0_q4;

+  (void)x_step_q4;

+  (void)y0_q4;

+  (void)y_step_q4;

   /* prefetch data to cache memory */

   prefetch_load(src);

--- a/vpx_dsp/mips/convolve8_horiz_dspr2.c

+++ b/vpx_dsp/mips/convolve8_horiz_dspr2.c

@@ -818,15 +818,16 @@

 void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,

                                uint8_t *dst, ptrdiff_t dst_stride,

-                               const int16_t *filter_x, int x_step_q4,

-                               const int16_t *filter_y, int y_step_q4, int w,

+                               const InterpKernel *filter, int x0_q4,

+                               int x_step_q4, int y0_q4, int y_step_q4, int w,

                                int h) {

+  const int16_t *const filter_x = filter[x0_q4];

   assert(x_step_q4 == 16);

   assert(((const int32_t *)filter_x)[1] != 0x800000);

   if (((const int32_t *)filter_x)[0] == 0) {

-    vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,

-                              x_step_q4, filter_y, y_step_q4, w, h);

+    vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4,

+                              x_step_q4, y0_q4, y_step_q4, w, h);

   } else {

     uint32_t pos = 38;

@@ -868,8 +869,8 @@

                                 (int32_t)dst_stride, filter_x, (int32_t)h);

         break;

       default:

-        vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x,

-                              x_step_q4, filter_y, y_step_q4, w, h);

+        vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter,

+                              x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);

         break;

--- a/vpx_dsp/mips/convolve8_vert_dspr2.c

+++ b/vpx_dsp/mips/convolve8_vert_dspr2.c

@@ -318,15 +318,16 @@

 void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,

                               uint8_t *dst, ptrdiff_t dst_stride,

-                              const int16_t *filter_x, int x_step_q4,

-                              const int16_t *filter_y, int y_step_q4, int w,

+                              const InterpKernel *filter, int x0_q4,

+                              int x_step_q4, int y0_q4, int y_step_q4, int w,

                               int h) {

+  const int16_t *const filter_y = filter[y0_q4];

   assert(y_step_q4 == 16);

   assert(((const int32_t *)filter_y)[1] != 0x800000);

   if (((const int32_t *)filter_y)[0] == 0) {

-    vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,

-                             x_step_q4, filter_y, y_step_q4, w, h);

+    vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4,

+                             x_step_q4, y0_q4, y_step_q4, w, h);

   } else {

     uint32_t pos = 38;

@@ -349,8 +350,8 @@

         convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);

         break;

       default:

-        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,

-                             x_step_q4, filter_y, y_step_q4, w, h);

+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                             x_step_q4, y0_q4, y_step_q4, w, h);

         break;

--- a/vpx_dsp/mips/convolve_common_dspr2.h

+++ b/vpx_dsp/mips/convolve_common_dspr2.h

@@ -24,21 +24,21 @@

 #if HAVE_DSPR2

 void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,

                                uint8_t *dst, ptrdiff_t dst_stride,

-                               const int16_t *filter_x, int x_step_q4,

-                               const int16_t *filter_y, int y_step_q4, int w,

-                               int h);

+                               const InterpKernel *filter, int x0_q4,

+                               int32_t x_step_q4, int y0_q4, int y_step_q4,

+                               int w, int h);

 void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,

                                    uint8_t *dst, ptrdiff_t dst_stride,

-                                   const int16_t *filter_x, int x_step_q4,

-                                   const int16_t *filter_y, int y_step_q4,

+                                   const InterpKernel *filter, int x0_q4,

+                                   int32_t x_step_q4, int y0_q4, int y_step_q4,

                                    int w, int h);

 void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,

                                   uint8_t *dst, ptrdiff_t dst_stride,

-                                  const int16_t *filter_x, int x_step_q4,

-                                  const int16_t *filter_y, int y_step_q4, int w,

-                                  int h);

+                                  const InterpKernel *filter, int x0_q4,

+                                  int32_t x_step_q4, int y0_q4, int y_step_q4,

+                                  int w, int h);

 void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,

                          ptrdiff_t dst_stride, const int16_t *filter, int w,

@@ -46,9 +46,9 @@

 void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,

                               uint8_t *dst, ptrdiff_t dst_stride,

-                              const int16_t *filter_x, int x_step_q4,

-                              const int16_t *filter_y, int y_step_q4, int w,

-                              int h);

+                              const InterpKernel *filter, int x0_q4,

+                              int32_t x_step_q4, int y0_q4, int y_step_q4,

+                              int w, int h);

 #endif  // #if HAVE_DSPR2

 #ifdef __cplusplus

--- a/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c

+++ b/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c

@@ -633,9 +633,10 @@

 void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,

                                  uint8_t *dst, ptrdiff_t dst_stride,

-                                 const int16_t *filter_x, int x_step_q4,

-                                 const int16_t *filter_y, int y_step_q4, int w,

+                                 const InterpKernel *filter, int x0_q4,

+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,

                                  int h) {

+  const int16_t *const filter_x = filter[x0_q4];

   int8_t cnt, filt_hor[8];

   assert(x_step_q4 == 16);

@@ -668,8 +669,8 @@

                                           (int32_t)dst_stride, &filt_hor[3], h);

         break;

       default:

-        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,

-                                  x_step_q4, filter_y, y_step_q4, w, h);

+        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,

+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);

         break;

   } else {

@@ -695,8 +696,8 @@

                                           (int32_t)dst_stride, filt_hor, h);

         break;

       default:

-        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,

-                                  x_step_q4, filter_y, y_step_q4, w, h);

+        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,

+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);

         break;

--- a/vpx_dsp/mips/vpx_convolve8_avg_msa.c

+++ b/vpx_dsp/mips/vpx_convolve8_avg_msa.c

@@ -516,9 +516,10 @@

 void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,

                            uint8_t *dst, ptrdiff_t dst_stride,

-                           const int16_t *filter_x, int x_step_q4,

-                           const int16_t *filter_y, int y_step_q4, int w,

-                           int h) {

+                           const InterpKernel *filter, int x0_q4, int x_step_q4,

+                           int y0_q4, int y_step_q4, int w, int h) {

+  const int16_t *const filter_x = filter[x0_q4];

+  const int16_t *const filter_y = filter[y0_q4];

   int8_t cnt, filt_hor[8], filt_ver[8];

   assert(x_step_q4 == 16);

@@ -560,14 +561,14 @@

                                                &filt_hor[3], &filt_ver[3], h);

         break;

       default:

-        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,

-                            x_step_q4, filter_y, y_step_q4, w, h);

+        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                            x_step_q4, y0_q4, y_step_q4, w, h);

         break;

   } else if (((const int32_t *)filter_x)[0] == 0 ||

              ((const int32_t *)filter_y)[0] == 0) {

-    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,

-                        filter_y, y_step_q4, w, h);

+    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                        x_step_q4, y0_q4, y_step_q4, w, h);

   } else {

     switch (w) {

       case 4:

@@ -596,8 +597,8 @@

                                                filt_ver, h);

         break;

       default:

-        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,

-                            x_step_q4, filter_y, y_step_q4, w, h);

+        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                            x_step_q4, y0_q4, y_step_q4, w, h);

         break;

--- a/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c

+++ b/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c

@@ -605,9 +605,10 @@

 void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,

                                 uint8_t *dst, ptrdiff_t dst_stride,

-                                const int16_t *filter_x, int x_step_q4,

-                                const int16_t *filter_y, int y_step_q4, int w,

+                                const InterpKernel *filter, int x0_q4,

+                                int x_step_q4, int y0_q4, int y_step_q4, int w,

                                 int h) {

+  const int16_t *const filter_y = filter[y0_q4];

   int8_t cnt, filt_ver[8];

   assert(y_step_q4 == 16);

@@ -640,8 +641,8 @@

                                           (int32_t)dst_stride, &filt_ver[3], h);

         break;

       default:

-        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,

-                                 x_step_q4, filter_y, y_step_q4, w, h);

+        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,

+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);

         break;

   } else {

@@ -668,8 +669,8 @@

                                           (int32_t)dst_stride, filt_ver, h);

         break;

       default:

-        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,

-                                 x_step_q4, filter_y, y_step_q4, w, h);

+        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,

+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);

         break;

--- a/vpx_dsp/mips/vpx_convolve8_horiz_msa.c

+++ b/vpx_dsp/mips/vpx_convolve8_horiz_msa.c

@@ -621,9 +621,10 @@

 void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,

                              uint8_t *dst, ptrdiff_t dst_stride,

-                             const int16_t *filter_x, int x_step_q4,

-                             const int16_t *filter_y, int y_step_q4, int w,

+                             const InterpKernel *filter, int x0_q4,

+                             int x_step_q4, int y0_q4, int y_step_q4, int w,

                              int h) {

+  const int16_t *const filter_x = filter[x0_q4];

   int8_t cnt, filt_hor[8];

   assert(x_step_q4 == 16);

@@ -656,8 +657,8 @@

                              &filt_hor[3], h);

         break;

       default:

-        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,

-                              x_step_q4, filter_y, y_step_q4, w, h);

+        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                              x_step_q4, y0_q4, y_step_q4, w, h);

         break;

   } else {

@@ -683,8 +684,8 @@

                              filt_hor, h);

         break;

       default:

-        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,

-                              x_step_q4, filter_y, y_step_q4, w, h);

+        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                              x_step_q4, y0_q4, y_step_q4, w, h);

         break;

--- a/vpx_dsp/mips/vpx_convolve8_msa.c

+++ b/vpx_dsp/mips/vpx_convolve8_msa.c

@@ -541,9 +541,11 @@

 void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,

-                       ptrdiff_t dst_stride, const int16_t *filter_x,

-                       int32_t x_step_q4, const int16_t *filter_y,

+                       ptrdiff_t dst_stride, const InterpKernel *filter,

+                       int x0_q4, int32_t x_step_q4, int y0_q4,

                        int32_t y_step_q4, int32_t w, int32_t h) {

+  const int16_t *const filter_x = filter[x0_q4];

+  const int16_t *const filter_y = filter[y0_q4];

   int8_t cnt, filt_hor[8], filt_ver[8];

   assert(x_step_q4 == 16);

@@ -585,14 +587,14 @@

                                   &filt_ver[3], (int32_t)h);

         break;

       default:

-        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,

-                        filter_y, y_step_q4, w, h);

+        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                        x_step_q4, y0_q4, y_step_q4, w, h);

         break;

   } else if (((const int32_t *)filter_x)[0] == 0 ||

              ((const int32_t *)filter_y)[0] == 0) {

-    vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,

-                    filter_y, y_step_q4, w, h);

+    vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,

+                    y0_q4, y_step_q4, w, h);

   } else {

     switch (w) {

       case 4:

@@ -621,8 +623,8 @@

                                   (int32_t)h);

         break;

       default:

-        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,

-                        filter_y, y_step_q4, w, h);

+        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                        x_step_q4, y0_q4, y_step_q4, w, h);

         break;

--- a/vpx_dsp/mips/vpx_convolve8_vert_msa.c

+++ b/vpx_dsp/mips/vpx_convolve8_vert_msa.c

@@ -628,9 +628,10 @@

 void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,

                             uint8_t *dst, ptrdiff_t dst_stride,

-                            const int16_t *filter_x, int x_step_q4,

-                            const int16_t *filter_y, int y_step_q4, int w,

+                            const InterpKernel *filter, int x0_q4,

+                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,

                             int h) {

+  const int16_t *const filter_y = filter[y0_q4];

   int8_t cnt, filt_ver[8];

   assert(y_step_q4 == 16);

@@ -663,8 +664,8 @@

                              &filt_ver[3], h);

         break;

       default:

-        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,

-                             x_step_q4, filter_y, y_step_q4, w, h);

+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                             x_step_q4, y0_q4, y_step_q4, w, h);

         break;

   } else {

@@ -690,8 +691,8 @@

                              filt_ver, h);

         break;

       default:

-        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,

-                             x_step_q4, filter_y, y_step_q4, w, h);

+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                             x_step_q4, y0_q4, y_step_q4, w, h);

         break;

--- a/vpx_dsp/mips/vpx_convolve_avg_msa.c

+++ b/vpx_dsp/mips/vpx_convolve_avg_msa.c

@@ -189,13 +189,14 @@

 void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride,

                           uint8_t *dst, ptrdiff_t dst_stride,

-                          const int16_t *filter_x, int32_t filter_x_stride,

-                          const int16_t *filter_y, int32_t filter_y_stride,

+                          const InterpKernel *filter, int x0_q4,

+                          int32_t x_step_q4, int y0_q4, int32_t y_step_q4,

                           int32_t w, int32_t h) {

-  (void)filter_x;

-  (void)filter_y;

-  (void)filter_x_stride;

-  (void)filter_y_stride;

+  (void)filter;

+  (void)x0_q4;

+  (void)x_step_q4;

+  (void)y0_q4;

+  (void)y_step_q4;

   switch (w) {

     case 4: {

--- a/vpx_dsp/mips/vpx_convolve_copy_msa.c

+++ b/vpx_dsp/mips/vpx_convolve_copy_msa.c

@@ -199,13 +199,14 @@

 void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,

                            uint8_t *dst, ptrdiff_t dst_stride,

-                           const int16_t *filter_x, int32_t filter_x_stride,

-                           const int16_t *filter_y, int32_t filter_y_stride,

+                           const InterpKernel *filter, int x0_q4,

+                           int32_t x_step_q4, int y0_q4, int32_t y_step_q4,

                            int32_t w, int32_t h) {

-  (void)filter_x;

-  (void)filter_y;

-  (void)filter_x_stride;

-  (void)filter_y_stride;

+  (void)filter;

+  (void)x0_q4;

+  (void)x_step_q4;

+  (void)y0_q4;

+  (void)y_step_q4;

   switch (w) {

     case 4: {

--- a/vpx_dsp/ppc/vpx_convolve_vsx.c

+++ b/vpx_dsp/ppc/vpx_convolve_vsx.c

@@ -53,13 +53,13 @@

 void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride,

                            uint8_t *dst, ptrdiff_t dst_stride,

-                           const int16_t *filter_x, int32_t filter_x_stride,

-                           const int16_t *filter_y, int32_t filter_y_stride,

-                           int32_t w, int32_t h) {

-  (void)filter_x;

-  (void)filter_y;

-  (void)filter_x_stride;

-  (void)filter_y_stride;

+                           const InterpKernel *filter, int x0_q4, int x_step_q4,

+                           int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) {

+  (void)filter;

+  (void)x0_q4;

+  (void)x_step_q4;

+  (void)y0_q4;

+  (void)y_step_q4;

   switch (w) {

     case 16: {

@@ -132,14 +132,8 @@

 void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,

                           uint8_t *dst, ptrdiff_t dst_stride,

-                          const int16_t *filter_x, int32_t filter_x_stride,

-                          const int16_t *filter_y, int32_t filter_y_stride,

-                          int32_t w, int32_t h) {

-  (void)filter_x;

-  (void)filter_y;

-  (void)filter_x_stride;

-  (void)filter_y_stride;

+                          const InterpKernel *filter, int x0_q4, int x_step_q4,

+                          int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) {

   switch (w) {

     case 16: {

       avg_w16(src, src_stride, dst, dst_stride, h);

@@ -154,8 +148,8 @@

       break;

     default: {

-      vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter_x,

-                         filter_x_stride, filter_y, filter_y_stride, w, h);

+      vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                         x_step_q4, y0_q4, y_step_q4, w, h);

       break;

@@ -299,9 +293,9 @@

 static inline void convolve(const uint8_t *src, ptrdiff_t src_stride,

                             uint8_t *dst, ptrdiff_t dst_stride,

-                            const InterpKernel *const x_filters, int x0_q4,

-                            int x_step_q4, const InterpKernel *const y_filters,

-                            int y0_q4, int y_step_q4, int w, int h) {

+                            const InterpKernel *const filter, int x0_q4,

+                            int x_step_q4, int y0_q4, int y_step_q4, int w,

+                            int h) {

   // Note: Fixed size intermediate buffer, temp, places limits on parameters.

   // 2d filtering proceeds in 2 steps:

   //   (1) Interpolate horizontally into an intermediate buffer, temp.

@@ -324,95 +318,77 @@

   assert(x_step_q4 <= 32);

   convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,

-                 x_filters, x0_q4, x_step_q4, w, intermediate_height);

-  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,

-                y_filters, y0_q4, y_step_q4, w, h);

+                 filter, x0_q4, x_step_q4, w, intermediate_height);

+  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,

+                y0_q4, y_step_q4, w, h);

 void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,

                              uint8_t *dst, ptrdiff_t dst_stride,

-                             const int16_t *filter_x, int x_step_q4,

-                             const int16_t *filter_y, int y_step_q4, int w,

+                             const InterpKernel *filter, int x0_q4,

+                             int x_step_q4, int y0_q4, int y_step_q4, int w,

                              int h) {

-  const InterpKernel *const filters_x = get_filter_base(filter_x);

-  const int x0_q4 = get_filter_offset(filter_x, filters_x);

-  (void)filter_y;

+  (void)y0_q4;

   (void)y_step_q4;

-  convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,

-                 w, h);

+  convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w,

+                 h);

 void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,

                                  uint8_t *dst, ptrdiff_t dst_stride,

-                                 const int16_t *filter_x, int x_step_q4,

-                                 const int16_t *filter_y, int y_step_q4, int w,

+                                 const InterpKernel *filter, int x0_q4,

+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,

                                  int h) {

-  const InterpKernel *const filters_x = get_filter_base(filter_x);

-  const int x0_q4 = get_filter_offset(filter_x, filters_x);

-  (void)filter_y;

+  (void)y0_q4;

   (void)y_step_q4;

-  convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,

-                     x_step_q4, w, h);

+  convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,

+                     w, h);

 void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,

                             uint8_t *dst, ptrdiff_t dst_stride,

-                            const int16_t *filter_x, int x_step_q4,

-                            const int16_t *filter_y, int y_step_q4, int w,

+                            const InterpKernel *filter, int x0_q4,

+                            int x_step_q4, int y0_q4, int y_step_q4, int w,

                             int h) {

-  const InterpKernel *const filters_y = get_filter_base(filter_y);

-  const int y0_q4 = get_filter_offset(filter_y, filters_y);

-  (void)filter_x;

+  (void)x0_q4;

   (void)x_step_q4;

-  convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,

-                w, h);

+  convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,

+                h);

 void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,

                                 uint8_t *dst, ptrdiff_t dst_stride,

-                                const int16_t *filter_x, int x_step_q4,

-                                const int16_t *filter_y, int y_step_q4, int w,

+                                const InterpKernel *filter, int x0_q4,

+                                int x_step_q4, int y0_q4, int y_step_q4, int w,

                                 int h) {

-  const InterpKernel *const filters_y = get_filter_base(filter_y);

-  const int y0_q4 = get_filter_offset(filter_y, filters_y);

-  (void)filter_x;

+  (void)x0_q4;

   (void)x_step_q4;

-  convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,

-                    y_step_q4, w, h);

+  convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4,

+                    w, h);

 void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,

-                       ptrdiff_t dst_stride, const int16_t *filter_x,

-                       int x_step_q4, const int16_t *filter_y, int y_step_q4,

+                       ptrdiff_t dst_stride, const InterpKernel *filter,

+                       int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,

                        int w, int h) {

-  const InterpKernel *const filters_x = get_filter_base(filter_x);

-  const int x0_q4 = get_filter_offset(filter_x, filters_x);

-  const InterpKernel *const filters_y = get_filter_base(filter_y);

-  const int y0_q4 = get_filter_offset(filter_y, filters_y);

-  convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,

-           filters_y, y0_q4, y_step_q4, w, h);

+  convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4,

+           y_step_q4, w, h);

 void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,

                            uint8_t *dst, ptrdiff_t dst_stride,

-                           const int16_t *filter_x, int x_step_q4,

-                           const int16_t *filter_y, int y_step_q4, int w,

-                           int h) {

+                           const InterpKernel *filter, int x0_q4, int x_step_q4,

+                           int y0_q4, int y_step_q4, int w, int h) {

   // Fixed size intermediate buffer places limits on parameters.

   DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);

   assert(w <= 64);

   assert(h <= 64);

-  vpx_convolve8_vsx(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y,

+  vpx_convolve8_vsx(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,

                     y_step_q4, w, h);

-  vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);

+  vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);

--- a/vpx_dsp/vpx_convolve.c

+++ b/vpx_dsp/vpx_convolve.c

@@ -114,10 +114,9 @@

 static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,

-                     ptrdiff_t dst_stride, const InterpKernel *const x_filters,

-                     int x0_q4, int x_step_q4,

-                     const InterpKernel *const y_filters, int y0_q4,

-                     int y_step_q4, int w, int h) {

+                     ptrdiff_t dst_stride, const InterpKernel *filter,

+                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,

+                     int h) {

   // Note: Fixed size intermediate buffer, temp, places limits on parameters.

   // 2d filtering proceeds in 2 steps:

   //   (1) Interpolate horizontally into an intermediate buffer, temp.

@@ -140,87 +139,64 @@

   assert(x_step_q4 <= 32);

   convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,

-                 x_filters, x0_q4, x_step_q4, w, intermediate_height);

-  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,

-                y_filters, y0_q4, y_step_q4, w, h);

+                 filter, x0_q4, x_step_q4, w, intermediate_height);

+  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,

+                y0_q4, y_step_q4, w, h);

 void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,

                            uint8_t *dst, ptrdiff_t dst_stride,

-                           const int16_t *filter_x, int x_step_q4,

-                           const int16_t *filter_y, int y_step_q4, int w,

-                           int h) {

-  const InterpKernel *const filters_x = get_filter_base(filter_x);

-  const int x0_q4 = get_filter_offset(filter_x, filters_x);

-  (void)filter_y;

+                           const InterpKernel *filter, int x0_q4, int x_step_q4,

+                           int y0_q4, int y_step_q4, int w, int h) {

+  (void)y0_q4;

   (void)y_step_q4;

-  convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,

-                 w, h);

+  convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w,

+                 h);

 void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,

                                uint8_t *dst, ptrdiff_t dst_stride,

-                               const int16_t *filter_x, int x_step_q4,

-                               const int16_t *filter_y, int y_step_q4, int w,

+                               const InterpKernel *filter, int x0_q4,

+                               int x_step_q4, int y0_q4, int y_step_q4, int w,

                                int h) {

-  const InterpKernel *const filters_x = get_filter_base(filter_x);

-  const int x0_q4 = get_filter_offset(filter_x, filters_x);

-  (void)filter_y;

+  (void)y0_q4;

   (void)y_step_q4;

-  convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,

-                     x_step_q4, w, h);

+  convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,

+                     w, h);

 void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,

                           uint8_t *dst, ptrdiff_t dst_stride,

-                          const int16_t *filter_x, int x_step_q4,

-                          const int16_t *filter_y, int y_step_q4, int w,

-                          int h) {

-  const InterpKernel *const filters_y = get_filter_base(filter_y);

-  const int y0_q4 = get_filter_offset(filter_y, filters_y);

-  (void)filter_x;

+                          const InterpKernel *filter, int x0_q4, int x_step_q4,

+                          int y0_q4, int y_step_q4, int w, int h) {

+  (void)x0_q4;

   (void)x_step_q4;

-  convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,

-                w, h);

+  convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,

+                h);

 void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,

                               uint8_t *dst, ptrdiff_t dst_stride,

-                              const int16_t *filter_x, int x_step_q4,

-                              const int16_t *filter_y, int y_step_q4, int w,

+                              const InterpKernel *filter, int x0_q4,

+                              int x_step_q4, int y0_q4, int y_step_q4, int w,

                               int h) {

-  const InterpKernel *const filters_y = get_filter_base(filter_y);

-  const int y0_q4 = get_filter_offset(filter_y, filters_y);

-  (void)filter_x;

+  (void)x0_q4;

   (void)x_step_q4;

-  convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,

-                    y_step_q4, w, h);

+  convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4,

+                    w, h);

 void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,

-                     ptrdiff_t dst_stride, const int16_t *filter_x,

-                     int x_step_q4, const int16_t *filter_y, int y_step_q4,

-                     int w, int h) {

-  const InterpKernel *const filters_x = get_filter_base(filter_x);

-  const int x0_q4 = get_filter_offset(filter_x, filters_x);

-  const InterpKernel *const filters_y = get_filter_base(filter_y);

-  const int y0_q4 = get_filter_offset(filter_y, filters_y);

-  convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,

-           filters_y, y0_q4, y_step_q4, w, h);

+                     ptrdiff_t dst_stride, const InterpKernel *filter,

+                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,

+                     int h) {

+  convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4,

+           y_step_q4, w, h);

 void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,

-                         ptrdiff_t dst_stride, const int16_t *filter_x,

-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,

+                         ptrdiff_t dst_stride, const InterpKernel *filter,

+                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,

                          int w, int h) {

   // Fixed size intermediate buffer places limits on parameters.

   DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);

@@ -227,21 +203,22 @@

   assert(w <= 64);

   assert(h <= 64);

-  vpx_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y,

+  vpx_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,

                   y_step_q4, w, h);

-  vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);

+  vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);

 void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,

-                         ptrdiff_t dst_stride, const int16_t *filter_x,

-                         int filter_x_stride, const int16_t *filter_y,

-                         int filter_y_stride, int w, int h) {

+                         ptrdiff_t dst_stride, const InterpKernel *filter,

+                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,

+                         int w, int h) {

   int r;

-  (void)filter_x;

-  (void)filter_x_stride;

-  (void)filter_y;

-  (void)filter_y_stride;

+  (void)filter;

+  (void)x0_q4;

+  (void)x_step_q4;

+  (void)y0_q4;

+  (void)y_step_q4;

   for (r = h; r > 0; --r) {

     memcpy(dst, src, w);

@@ -251,15 +228,16 @@

 void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,

-                        ptrdiff_t dst_stride, const int16_t *filter_x,

-                        int filter_x_stride, const int16_t *filter_y,

-                        int filter_y_stride, int w, int h) {

+                        ptrdiff_t dst_stride, const InterpKernel *filter,

+                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,

+                        int w, int h) {

   int x, y;

-  (void)filter_x;

-  (void)filter_x_stride;

-  (void)filter_y;

-  (void)filter_y_stride;

+  (void)filter;

+  (void)x0_q4;

+  (void)x_step_q4;

+  (void)y0_q4;

+  (void)y_step_q4;

   for (y = 0; y < h; ++y) {

     for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);

@@ -269,53 +247,52 @@

 void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,

-                        ptrdiff_t dst_stride, const int16_t *filter_x,

-                        int x_step_q4, const int16_t *filter_y, int y_step_q4,

+                        ptrdiff_t dst_stride, const InterpKernel *filter,

+                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,

                         int w, int h) {

-  vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,

-                        filter_y, y_step_q4, w, h);

+  vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                        x_step_q4, y0_q4, y_step_q4, w, h);

 void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,

-                       ptrdiff_t dst_stride, const int16_t *filter_x,

-                       int x_step_q4, const int16_t *filter_y, int y_step_q4,

+                       ptrdiff_t dst_stride, const InterpKernel *filter,

+                       int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,

                        int w, int h) {

-  vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,

-                       filter_y, y_step_q4, w, h);

+  vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                       x_step_q4, y0_q4, y_step_q4, w, h);

 void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,

-                     ptrdiff_t dst_stride, const int16_t *filter_x,

-                     int x_step_q4, const int16_t *filter_y, int y_step_q4,

-                     int w, int h) {

-  vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,

-                  filter_y, y_step_q4, w, h);

+                     ptrdiff_t dst_stride, const InterpKernel *filter,

+                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,

+                     int h) {

+  vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,

+                  y0_q4, y_step_q4, w, h);

 void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,

                             uint8_t *dst, ptrdiff_t dst_stride,

-                            const int16_t *filter_x, int x_step_q4,

-                            const int16_t *filter_y, int y_step_q4, int w,

+                            const InterpKernel *filter, int x0_q4,

+                            int x_step_q4, int y0_q4, int y_step_q4, int w,

                             int h) {

-  vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,

-                            x_step_q4, filter_y, y_step_q4, w, h);

+  vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                            x_step_q4, y0_q4, y_step_q4, w, h);

 void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,

                            uint8_t *dst, ptrdiff_t dst_stride,

-                           const int16_t *filter_x, int x_step_q4,

-                           const int16_t *filter_y, int y_step_q4, int w,

-                           int h) {

-  vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,

-                           x_step_q4, filter_y, y_step_q4, w, h);

+                           const InterpKernel *filter, int x0_q4, int x_step_q4,

+                           int y0_q4, int y_step_q4, int w, int h) {

+  vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                           x_step_q4, y0_q4, y_step_q4, w, h);

 void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,

-                         ptrdiff_t dst_stride, const int16_t *filter_x,

-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,

+                         ptrdiff_t dst_stride, const InterpKernel *filter,

+                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,

                          int w, int h) {

-  vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,

-                      filter_y, y_step_q4, w, h);

+  vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,

+                      x_step_q4, y0_q4, y_step_q4, w, h);

 #if CONFIG_VP9_HIGHBITDEPTH

@@ -417,9 +394,9 @@

 static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride,

                             uint16_t *dst, ptrdiff_t dst_stride,

-                            const InterpKernel *const x_filters, int x0_q4,

-                            int x_step_q4, const InterpKernel *const y_filters,

-                            int y0_q4, int y_step_q4, int w, int h, int bd) {

+                            const InterpKernel *filter, int x0_q4,

+                            int x_step_q4, int y0_q4, int y_step_q4, int w,

+                            int h, int bd) {

   // Note: Fixed size intermediate buffer, temp, places limits on parameters.

   // 2d filtering proceeds in 2 steps:

   //   (1) Interpolate horizontally into an intermediate buffer, temp.

@@ -442,90 +419,73 @@

   assert(x_step_q4 <= 32);

   highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,

-                        temp, 64, x_filters, x0_q4, x_step_q4, w,

+                        temp, 64, filter, x0_q4, x_step_q4, w,

                         intermediate_height, bd);

   highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,

-                       y_filters, y0_q4, y_step_q4, w, h, bd);

+                       filter, y0_q4, y_step_q4, w, h, bd);

 void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride,

                                   uint16_t *dst, ptrdiff_t dst_stride,

-                                  const int16_t *filter_x, int x_step_q4,

-                                  const int16_t *filter_y, int y_step_q4, int w,

-                                  int h, int bd) {

-  const InterpKernel *const filters_x = get_filter_base(filter_x);

-  const int x0_q4 = get_filter_offset(filter_x, filters_x);

-  (void)filter_y;

+                                  const InterpKernel *filter, int x0_q4,

+                                  int x_step_q4, int y0_q4, int y_step_q4,

+                                  int w, int h, int bd) {

+  (void)y0_q4;

   (void)y_step_q4;

-  highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,

+  highbd_convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4,

                         x_step_q4, w, h, bd);

 void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride,

                                       uint16_t *dst, ptrdiff_t dst_stride,

-                                      const int16_t *filter_x, int x_step_q4,

-                                      const int16_t *filter_y, int y_step_q4,

+                                      const InterpKernel *filter, int x0_q4,

+                                      int x_step_q4, int y0_q4, int y_step_q4,

                                       int w, int h, int bd) {

-  const InterpKernel *const filters_x = get_filter_base(filter_x);

-  const int x0_q4 = get_filter_offset(filter_x, filters_x);

-  (void)filter_y;

+  (void)y0_q4;

   (void)y_step_q4;

-  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,

+  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4,

                             x_step_q4, w, h, bd);

 void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride,

                                  uint16_t *dst, ptrdiff_t dst_stride,

-                                 const int16_t *filter_x, int x_step_q4,

-                                 const int16_t *filter_y, int y_step_q4, int w,

+                                 const InterpKernel *filter, int x0_q4,

+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,

                                  int h, int bd) {

-  const InterpKernel *const filters_y = get_filter_base(filter_y);

-  const int y0_q4 = get_filter_offset(filter_y, filters_y);

-  (void)filter_x;

+  (void)x0_q4;

   (void)x_step_q4;

-  highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,

+  highbd_convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4,

                        y_step_q4, w, h, bd);

 void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride,

                                      uint16_t *dst, ptrdiff_t dst_stride,

-                                     const int16_t *filter_x, int x_step_q4,

-                                     const int16_t *filter_y, int y_step_q4,

+                                     const InterpKernel *filter, int x0_q4,

+                                     int x_step_q4, int y0_q4, int y_step_q4,

                                      int w, int h, int bd) {

-  const InterpKernel *const filters_y = get_filter_base(filter_y);

-  const int y0_q4 = get_filter_offset(filter_y, filters_y);

-  (void)filter_x;

+  (void)x0_q4;

   (void)x_step_q4;

-  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,

+  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4,

                            y_step_q4, w, h, bd);

 void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride,

                             uint16_t *dst, ptrdiff_t dst_stride,

-                            const int16_t *filter_x, int x_step_q4,

-                            const int16_t *filter_y, int y_step_q4, int w,

+                            const InterpKernel *filter, int x0_q4,

+                            int x_step_q4, int y0_q4, int y_step_q4, int w,

                             int h, int bd) {

-  const InterpKernel *const filters_x = get_filter_base(filter_x);

-  const int x0_q4 = get_filter_offset(filter_x, filters_x);

-  const InterpKernel *const filters_y = get_filter_base(filter_y);

-  const int y0_q4 = get_filter_offset(filter_y, filters_y);

-  highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,

-                  filters_y, y0_q4, y_step_q4, w, h, bd);

+  highbd_convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,

+                  y0_q4, y_step_q4, w, h, bd);

 void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride,

                                 uint16_t *dst, ptrdiff_t dst_stride,

-                                const int16_t *filter_x, int x_step_q4,

-                                const int16_t *filter_y, int y_step_q4, int w,

+                                const InterpKernel *filter, int x0_q4,

+                                int x_step_q4, int y0_q4, int y_step_q4, int w,

                                 int h, int bd) {

   // Fixed size intermediate buffer places limits on parameters.

   DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);

@@ -532,23 +492,24 @@

   assert(w <= 64);

   assert(h <= 64);

-  vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4,

-                         filter_y, y_step_q4, w, h, bd);

-  vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h,

+  vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4,

+                         y0_q4, y_step_q4, w, h, bd);

+  vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h,

                             bd);

 void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride,

                                 uint16_t *dst, ptrdiff_t dst_stride,

-                                const int16_t *filter_x, int filter_x_stride,

-                                const int16_t *filter_y, int filter_y_stride,

-                                int w, int h, int bd) {

+                                const InterpKernel *filter, int x0_q4,

+                                int x_step_q4, int y0_q4, int y_step_q4, int w,

+                                int h, int bd) {

   int r;

-  (void)filter_x;

-  (void)filter_x_stride;

-  (void)filter_y;

-  (void)filter_y_stride;

+  (void)filter;

+  (void)x0_q4;

+  (void)x_step_q4;

+  (void)y0_q4;

+  (void)y_step_q4;

   (void)bd;

   for (r = h; r > 0; --r) {

@@ -560,15 +521,16 @@

 void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride,

                                uint16_t *dst, ptrdiff_t dst_stride,

-                               const int16_t *filter_x, int filter_x_stride,

-                               const int16_t *filter_y, int filter_y_stride,

-                               int w, int h, int bd) {

+                               const InterpKernel *filter, int x0_q4,

+                               int x_step_q4, int y0_q4, int y_step_q4, int w,

+                               int h, int bd) {

   int x, y;

-  (void)filter_x;

-  (void)filter_x_stride;

-  (void)filter_y;

-  (void)filter_y_stride;

+  (void)filter;

+  (void)x0_q4;

+  (void)x_step_q4;

+  (void)y0_q4;

+  (void)y_step_q4;

   (void)bd;

   for (y = 0; y < h; ++y) {

--- a/vpx_dsp/vpx_convolve.h

+++ b/vpx_dsp/vpx_convolve.h

@@ -19,15 +19,15 @@

 typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,

                               uint8_t *dst, ptrdiff_t dst_stride,

-                              const int16_t *filter_x, int x_step_q4,

-                              const int16_t *filter_y, int y_step_q4, int w,

+                              const InterpKernel *filter, int x0_q4,

+                              int x_step_q4, int y0_q4, int y_step_q4, int w,

                               int h);

 #if CONFIG_VP9_HIGHBITDEPTH

 typedef void (*highbd_convolve_fn_t)(const uint16_t *src, ptrdiff_t src_stride,

                                      uint16_t *dst, ptrdiff_t dst_stride,

-                                     const int16_t *filter_x, int x_step_q4,

-                                     const int16_t *filter_y, int y_step_q4,

+                                     const InterpKernel *filter, int x0_q4,

+                                     int x_step_q4, int y0_q4, int y_step_q4,

                                      int w, int h, int bd);

 #endif

--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl

+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -6,6 +6,7 @@

 #include "vpx/vpx_integer.h"

 #include "vpx_dsp/vpx_dsp_common.h"

+#include "vpx_dsp/vpx_filter.h"

EOF

@@ -331,69 +332,69 @@

 # Sub Pixel Filters

-add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";

+add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";

 specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx/;

-add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";

+add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";

 specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx/;

-add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";

+add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";

 specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx/;

-add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";

+add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";

 specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/;

-add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";

+add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";

 specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx/;

-add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";

+add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";

 specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa vsx/;

-add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";

+add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";

 specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa vsx/;

-add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";

+add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";

 specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa vsx/;

-add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";

+add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";

 specialize qw/vpx_scaled_2d ssse3/;

-add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";

+add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";

-add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";

+add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";

-add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";

+add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";

-add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";

+add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";

-add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";

+add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";

 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {

   # Sub Pixel Filters

-  add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";

+  add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";

   specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/;

-  add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";

+  add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";

   specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/;

-  add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";

+  add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";

   specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64";

-  add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";

+  add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";

   specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64";

-  add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";

+  add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";

   specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64";

-  add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";

+  add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";

   specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64";

-  add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";

+  add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";

   specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64";

-  add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";

+  add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";

   specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64";

 }  # CONFIG_VP9_HIGHBITDEPTH

--- a/vpx_dsp/vpx_filter.h

+++ b/vpx_dsp/vpx_filter.h

@@ -26,17 +26,6 @@

 typedef int16_t InterpKernel[SUBPEL_TAPS];

-static INLINE const InterpKernel *get_filter_base(const int16_t *filter) {

-  // NOTE: This assumes that the filter table is 256-byte aligned.

-  // TODO(agrange) Modify to make independent of table alignment.

-  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));

-}

-static INLINE int get_filter_offset(const int16_t *f,

-                                    const InterpKernel *base) {

-  return (int)((const InterpKernel *)(intptr_t)f - base);

-}

 #ifdef __cplusplus

 }  // extern "C"

 #endif

--- a/vpx_dsp/x86/convolve.h

+++ b/vpx_dsp/x86/convolve.h

@@ -20,14 +20,15 @@

                                 uint8_t *output_ptr, ptrdiff_t out_pitch,

                                 uint32_t output_height, const int16_t *filter);

-#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)         \

+#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt)         \

   void vpx_convolve8_##name##_##opt(                                         \

       const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \

-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,          \

-      const int16_t *filter_y, int y_step_q4, int w, int h) {                \

-    (void)filter_x;                                                          \

+      ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4,    \

+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {               \

+    const int16_t *filter = filter_kernel[offset];                           \

+    (void)x0_q4;                                                             \

     (void)x_step_q4;                                                         \

-    (void)filter_y;                                                          \

+    (void)y0_q4;                                                             \

     (void)y_step_q4;                                                         \

     assert(filter[3] != 128);                                                \

     assert(step_q4 == 16);                                                   \

@@ -64,32 +65,36 @@

     }                                                                        \

-#define FUN_CONV_2D(avg, opt)                                                 \

-  void vpx_convolve8_##avg##opt(                                              \

-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                 \

-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,           \

-      const int16_t *filter_y, int y_step_q4, int w, int h) {                 \

-    assert(filter_x[3] != 128);                                               \

-    assert(filter_y[3] != 128);                                               \

-    assert(w <= 64);                                                          \

-    assert(h <= 64);                                                          \

-    assert(x_step_q4 == 16);                                                  \

-    assert(y_step_q4 == 16);                                                  \

-    if (filter_x[0] | filter_x[1] | filter_x[2]) {                            \

-      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]);                          \

-      vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \

-                                filter_x, x_step_q4, filter_y, y_step_q4, w,  \

-                                h + 7);                                       \

-      vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride,   \

-                                      filter_x, x_step_q4, filter_y,          \

-                                      y_step_q4, w, h);                       \

-    } else {                                                                  \

-      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]);                          \

-      vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter_x,        \

-                                x_step_q4, filter_y, y_step_q4, w, h + 1);    \

-      vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter_x,  \

-                                      x_step_q4, filter_y, y_step_q4, w, h);  \

-    }                                                                         \

+#define FUN_CONV_2D(avg, opt)                                                  \

+  void vpx_convolve8_##avg##opt(                                               \

+      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                  \

+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,             \

+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h) {                 \

+    const int16_t *filter_x = filter[x0_q4];                                   \

+    const int16_t *filter_y = filter[y0_q4];                                   \

+    (void)filter_y;                                                            \

+    assert(filter_x[3] != 128);                                                \

+    assert(filter_y[3] != 128);                                                \

+    assert(w <= 64);                                                           \

+    assert(h <= 64);                                                           \

+    assert(x_step_q4 == 16);                                                   \

+    assert(y_step_q4 == 16);                                                   \

+    if (filter_x[0] | filter_x[1] | filter_x[2]) {                             \

+      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]);                           \

+      vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64,  \

+                                filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \

+                                h + 7);                                        \

+      vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride,    \

+                                      filter, x0_q4, x_step_q4, y0_q4,         \

+                                      y_step_q4, w, h);                        \

+    } else {                                                                   \

+      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]);                           \

+      vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4,    \

+                                x_step_q4, y0_q4, y_step_q4, w, h + 1);        \

+      vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter,     \

+                                      x0_q4, x_step_q4, y0_q4, y_step_q4, w,   \

+                                      h);                                      \

+    }                                                                          \

 #if CONFIG_VP9_HIGHBITDEPTH

@@ -101,95 +106,97 @@

                                        unsigned int output_height,

                                        const int16_t *filter, int bd);

-#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \

-  void vpx_highbd_convolve8_##name##_##opt(                               \

-      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,           \

-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,       \

-      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {     \

-    if (step_q4 == 16 && filter[3] != 128) {                              \

-      if (filter[0] | filter[1] | filter[2]) {                            \

-        while (w >= 16) {                                                 \

-          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                \

-              src_start, src_stride, dst, dst_stride, h, filter, bd);     \

-          src += 16;                                                      \

-          dst += 16;                                                      \

-          w -= 16;                                                        \

-        }                                                                 \

-        while (w >= 8) {                                                  \

-          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                 \

-              src_start, src_stride, dst, dst_stride, h, filter, bd);     \

-          src += 8;                                                       \

-          dst += 8;                                                       \

-          w -= 8;                                                         \

-        }                                                                 \

-        while (w >= 4) {                                                  \

-          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                 \

-              src_start, src_stride, dst, dst_stride, h, filter, bd);     \

-          src += 4;                                                       \

-          dst += 4;                                                       \

-          w -= 4;                                                         \

-        }                                                                 \

-      } else {                                                            \

-        while (w >= 16) {                                                 \

-          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                \

-              src, src_stride, dst, dst_stride, h, filter, bd);           \

-          src += 16;                                                      \

-          dst += 16;                                                      \

-          w -= 16;                                                        \

-        }                                                                 \

-        while (w >= 8) {                                                  \

-          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                 \

-              src, src_stride, dst, dst_stride, h, filter, bd);           \

-          src += 8;                                                       \

-          dst += 8;                                                       \

-          w -= 8;                                                         \

-        }                                                                 \

-        while (w >= 4) {                                                  \

-          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                 \

-              src, src_stride, dst, dst_stride, h, filter, bd);           \

-          src += 4;                                                       \

-          dst += 4;                                                       \

-          w -= 4;                                                         \

-        }                                                                 \

-      }                                                                   \

-    }                                                                     \

-    if (w) {                                                              \

-      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride,   \

-                                      filter_x, x_step_q4, filter_y,      \

-                                      y_step_q4, w, h, bd);               \

-    }                                                                     \

-  }

-#define HIGH_FUN_CONV_2D(avg, opt)                                            \

-  void vpx_highbd_convolve8_##avg##opt(                                       \

+#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt)     \

+  void vpx_highbd_convolve8_##name##_##opt(                                   \

       const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,               \

-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,           \

-      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {         \

-    assert(w <= 64);                                                          \

-    assert(h <= 64);                                                          \

-    if (x_step_q4 == 16 && y_step_q4 == 16) {                                 \

-      if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) {  \

-        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]);                       \

-        vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride,    \

-                                         fdata2, 64, filter_x, x_step_q4,     \

-                                         filter_y, y_step_q4, w, h + 7, bd);  \

-        vpx_highbd_convolve8_##avg##vert_##opt(                               \

-            fdata2 + 192, 64, dst, dst_stride, filter_x, x_step_q4, filter_y, \

-            y_step_q4, w, h, bd);                                             \

+      ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4,     \

+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {        \

+    const int16_t *filter = filter_kernel[offset];                            \

+    if (step_q4 == 16 && filter[3] != 128) {                                  \

+      if (filter[0] | filter[1] | filter[2]) {                                \

+        while (w >= 16) {                                                     \

+          vpx_highbd_filter_block1d16_##dir##8_##avg##opt(                    \

+              src_start, src_stride, dst, dst_stride, h, filter, bd);         \

+          src += 16;                                                          \

+          dst += 16;                                                          \

+          w -= 16;                                                            \

+        }                                                                     \

+        while (w >= 8) {                                                      \

+          vpx_highbd_filter_block1d8_##dir##8_##avg##opt(                     \

+              src_start, src_stride, dst, dst_stride, h, filter, bd);         \

+          src += 8;                                                           \

+          dst += 8;                                                           \

+          w -= 8;                                                             \

+        }                                                                     \

+        while (w >= 4) {                                                      \

+          vpx_highbd_filter_block1d4_##dir##8_##avg##opt(                     \

+              src_start, src_stride, dst, dst_stride, h, filter, bd);         \

+          src += 4;                                                           \

+          dst += 4;                                                           \

+          w -= 4;                                                             \

+        }                                                                     \

       } else {                                                                \

-        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]);                       \

-        vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64,         \

-                                         filter_x, x_step_q4, filter_y,       \

-                                         y_step_q4, w, h + 1, bd);            \

-        vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride,   \

-                                               filter_x, x_step_q4, filter_y, \

-                                               y_step_q4, w, h, bd);          \

+        while (w >= 16) {                                                     \

+          vpx_highbd_filter_block1d16_##dir##2_##avg##opt(                    \

+              src, src_stride, dst, dst_stride, h, filter, bd);               \

+          src += 16;                                                          \

+          dst += 16;                                                          \

+          w -= 16;                                                            \

+        }                                                                     \

+        while (w >= 8) {                                                      \

+          vpx_highbd_filter_block1d8_##dir##2_##avg##opt(                     \

+              src, src_stride, dst, dst_stride, h, filter, bd);               \

+          src += 8;                                                           \

+          dst += 8;                                                           \

+          w -= 8;                                                             \

+        }                                                                     \

+        while (w >= 4) {                                                      \

+          vpx_highbd_filter_block1d4_##dir##2_##avg##opt(                     \

+              src, src_stride, dst, dst_stride, h, filter, bd);               \

+          src += 4;                                                           \

+          dst += 4;                                                           \

+          w -= 4;                                                             \

+        }                                                                     \

       }                                                                       \

-    } else {                                                                  \

-      vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride,         \

-                                    filter_x, x_step_q4, filter_y, y_step_q4, \

-                                    w, h, bd);                                \

     }                                                                         \

+    if (w) {                                                                  \

+      vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride,       \

+                                      filter_kernel, x0_q4, x_step_q4, y0_q4, \

+                                      y_step_q4, w, h, bd);                   \

+    }                                                                         \

+  }

+#define HIGH_FUN_CONV_2D(avg, opt)                                             \

+  void vpx_highbd_convolve8_##avg##opt(                                        \

+      const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst,                \

+      ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4,             \

+      int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) {         \

+    const int16_t *filter_x = filter[x0_q4];                                   \

+    assert(w <= 64);                                                           \

+    assert(h <= 64);                                                           \

+    if (x_step_q4 == 16 && y_step_q4 == 16) {                                  \

+      if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) {   \

+        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]);                        \

+        vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride,     \

+                                         fdata2, 64, filter, x0_q4, x_step_q4, \

+                                         y0_q4, y_step_q4, w, h + 7, bd);      \

+        vpx_highbd_convolve8_##avg##vert_##opt(                                \

+            fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4,       \

+            y0_q4, y_step_q4, w, h, bd);                                       \

+      } else {                                                                 \

+        DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]);                        \

+        vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter,  \

+                                         x0_q4, x_step_q4, y0_q4, y_step_q4,   \

+                                         w, h + 1, bd);                        \

+        vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride,    \

+                                               filter, x0_q4, x_step_q4,       \

+                                               y0_q4, y_step_q4, w, h, bd);    \

+      }                                                                        \

+    } else {                                                                   \

+      vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter,  \

+                                    x0_q4, x_step_q4, y0_q4, y_step_q4, w, h,  \

+                                    bd);                                       \

+    }                                                                          \

 #endif  // CONFIG_VP9_HIGHBITDEPTH

--- a/vpx_dsp/x86/highbd_convolve_avx2.c

+++ b/vpx_dsp/x86/highbd_convolve_avx2.c

@@ -18,13 +18,14 @@

 void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,

                                    uint16_t *dst, ptrdiff_t dst_stride,

-                                   const int16_t *filter_x, int filter_x_stride,

-                                   const int16_t *filter_y, int filter_y_stride,

+                                   const InterpKernel *filter, int x0_q4,

+                                   int x_step_q4, int y0_q4, int y_step_q4,

                                    int width, int h, int bd) {

-  (void)filter_x;

-  (void)filter_y;

-  (void)filter_x_stride;

-  (void)filter_y_stride;

+  (void)filter;

+  (void)x0_q4;

+  (void)x_step_q4;

+  (void)y0_q4;

+  (void)y_step_q4;

   (void)bd;

   assert(width % 4 == 0);

@@ -99,13 +100,14 @@

 void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,

                                   uint16_t *dst, ptrdiff_t dst_stride,

-                                  const int16_t *filter_x, int filter_x_stride,

-                                  const int16_t *filter_y, int filter_y_stride,

+                                  const InterpKernel *filter, int x0_q4,

+                                  int x_step_q4, int y0_q4, int y_step_q4,

                                   int width, int h, int bd) {

-  (void)filter_x;

-  (void)filter_y;

-  (void)filter_x_stride;

-  (void)filter_y_stride;

+  (void)filter;

+  (void)x0_q4;

+  (void)x_step_q4;

+  (void)y0_q4;

+  (void)y_step_q4;

   (void)bd;

   assert(width % 4 == 0);

@@ -1073,8 +1075,8 @@

 #define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2

 #define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2

-HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);

-HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);

+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);

+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);

 HIGH_FUN_CONV_2D(, avx2);

 void vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t,

@@ -1098,8 +1100,8 @@

 #define vpx_highbd_filter_block1d4_v2_avg_avx2 \

   vpx_highbd_filter_block1d4_v2_avg_sse2

-HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, avx2);

-HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,

+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);

+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,

                  avx2);

 HIGH_FUN_CONV_2D(avg_, avx2);

--- a/vpx_dsp/x86/vpx_asm_stubs.c

+++ b/vpx_dsp/x86/vpx_asm_stubs.c

@@ -41,38 +41,38 @@

 // void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,

 //                               uint8_t *dst, ptrdiff_t dst_stride,

-//                               const int16_t *filter_x, int x_step_q4,

-//                               const int16_t *filter_y, int y_step_q4,

+//                               const InterpKernel *filter, int x0_q4,

+//                               int32_t x_step_q4, int y0_q4, int y_step_q4,

 //                               int w, int h);

 // void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,

 //                              uint8_t *dst, ptrdiff_t dst_stride,

-//                              const int16_t *filter_x, int x_step_q4,

-//                              const int16_t *filter_y, int y_step_q4,

+//                              const InterpKernel *filter, int x0_q4,

+//                              int32_t x_step_q4, int y0_q4, int y_step_q4,

 //                              int w, int h);

 // void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,

 //                                   uint8_t *dst, ptrdiff_t dst_stride,

-//                                   const int16_t *filter_x, int x_step_q4,

-//                                   const int16_t *filter_y, int y_step_q4,

-//                                   int w, int h);

+//                                   const InterpKernel *filter, int x0_q4,

+//                                   int32_t x_step_q4, int y0_q4,

+//                                   int y_step_q4, int w, int h);

 // void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,

 //                                  uint8_t *dst, ptrdiff_t dst_stride,

-//                                  const int16_t *filter_x, int x_step_q4,

-//                                  const int16_t *filter_y, int y_step_q4,

+//                                  const InterpKernel *filter, int x0_q4,

+//                                  int32_t x_step_q4, int y0_q4, int y_step_q4,

 //                                  int w, int h);

-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);

-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);

-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);

-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);

+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2);

+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2);

+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2);

+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2);

 // void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,

 //                         uint8_t *dst, ptrdiff_t dst_stride,

-//                         const int16_t *filter_x, int x_step_q4,

-//                         const int16_t *filter_y, int y_step_q4,

+//                         const InterpKernel *filter, int x0_q4,

+//                         int32_t x_step_q4, int y0_q4, int y_step_q4,

 //                         int w, int h);

 // void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,

 //                             uint8_t *dst, ptrdiff_t dst_stride,

-//                             const int16_t *filter_x, int x_step_q4,

-//                             const int16_t *filter_y, int y_step_q4,

+//                             const InterpKernel *filter, int x0_q4,

+//                             int32_t x_step_q4, int y0_q4, int y_step_q4,

 //                             int w, int h);

 FUN_CONV_2D(, sse2);

 FUN_CONV_2D(avg_, sse2);

@@ -140,22 +140,22 @@

 //                                         const int16_t *filter_y,

 //                                         int y_step_q4,

 //                                         int w, int h, int bd);

-HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);

-HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);

-HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2);

-HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,

+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2);

+HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2);

+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2);

+HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,

                  sse2);

 // void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,

 //                                uint8_t *dst, ptrdiff_t dst_stride,

-//                                const int16_t *filter_x, int x_step_q4,

-//                                const int16_t *filter_y, int y_step_q4,

+//                                const InterpKernel *filter, int x0_q4,

+//                                int32_t x_step_q4, int y0_q4, int y_step_q4,

 //                                int w, int h, int bd);

 // void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,

 //                                    uint8_t *dst, ptrdiff_t dst_stride,

-//                                    const int16_t *filter_x, int x_step_q4,

-//                                    const int16_t *filter_y, int y_step_q4,

-//                                    int w, int h, int bd);

+//                                    const InterpKernel *filter, int x0_q4,

+//                                    int32_t x_step_q4, int y0_q4,

+//                                    int y_step_q4, int w, int h, int bd);

 HIGH_FUN_CONV_2D(, sse2);

 HIGH_FUN_CONV_2D(avg_, sse2);

 #endif  // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64

--- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm

+++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm

@@ -20,14 +20,14 @@

 %endif

 %ifidn %2, highbd

 %define pavg pavgw

-cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \

+cglobal %2_convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \

                                               dst, dst_stride, \

-                                              fx, fxs, fy, fys, w, h, bd

+                                              f, fxo, fxs, fyo, fys, w, h, bd

 %else

 %define pavg pavgb

-cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \

+cglobal convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \

                                            dst, dst_stride, \

-                                           fx, fxs, fy, fys, w, h

+                                           f, fxo, fxs, fyo, fys, w, h

 %endif

   mov r4d, dword wm

 %ifidn %2, highbd

--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c

+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c

@@ -554,21 +554,21 @@

 #define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3

 // void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,

 //                                uint8_t *dst, ptrdiff_t dst_stride,

-//                                const int16_t *filter_x, int x_step_q4,

-//                                const int16_t *filter_y, int y_step_q4,

+//                                const InterpKernel *filter, int x0_q4,

+//                                int32_t x_step_q4, int y0_q4, int y_step_q4,

 //                                int w, int h);

 // void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,

 //                               uint8_t *dst, ptrdiff_t dst_stride,

-//                               const int16_t *filter_x, int x_step_q4,

-//                               const int16_t *filter_y, int y_step_q4,

+//                               const InterpKernel *filter, int x0_q4,

+//                               int32_t x_step_q4, int y0_q4, int y_step_q4,

 //                               int w, int h);

-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);

-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);

+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);

+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);

 // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,

 //                          uint8_t *dst, ptrdiff_t dst_stride,

-//                          const int16_t *filter_x, int x_step_q4,

-//                          const int16_t *filter_y, int y_step_q4,

+//                          const InterpKernel *filter, int x0_q4,

+//                          int32_t x_step_q4, int y0_q4, int y_step_q4,

 //                          int w, int h);

 FUN_CONV_2D(, avx2);

 #endif  // HAVE_AX2 && HAVE_SSSE3

--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c

+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c

@@ -306,29 +306,28 @@

 // void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,

 //                                uint8_t *dst, ptrdiff_t dst_stride,

-//                                const int16_t *filter_x, int x_step_q4,

-//                                const int16_t *filter_y, int y_step_q4,

+//                                const InterpKernel *filter, int x0_q4,

+//                                int32_t x_step_q4, int y0_q4, int y_step_q4,

 //                                int w, int h);

 // void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,

 //                               uint8_t *dst, ptrdiff_t dst_stride,

-//                               const int16_t *filter_x, int x_step_q4,

-//                               const int16_t *filter_y, int y_step_q4,

+//                               const InterpKernel *filter, int x0_q4,

+//                               int32_t x_step_q4, int y0_q4, int y_step_q4,

 //                               int w, int h);

 // void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,

 //                                    uint8_t *dst, ptrdiff_t dst_stride,

-//                                    const int16_t *filter_x, int x_step_q4,

-//                                    const int16_t *filter_y, int y_step_q4,

-//                                    int w, int h);

+//                                    const InterpKernel *filter, int x0_q4,

+//                                    int32_t x_step_q4, int y0_q4,

+//                                    int y_step_q4, int w, int h);

 // void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,

 //                                   uint8_t *dst, ptrdiff_t dst_stride,

-//                                   const int16_t *filter_x, int x_step_q4,

-//                                   const int16_t *filter_y, int y_step_q4,

-//                                   int w, int h);

-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);

-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);

-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);

-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,

-            ssse3);

+//                                   const InterpKernel *filter, int x0_q4,

+//                                   int32_t x_step_q4, int y0_q4,

+//                                   int y_step_q4, int w, int h);

+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3);

+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , ssse3);

+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3);

+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, ssse3);

 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \

                       out2, out3, out4, out5, out6, out7)                 \

@@ -813,9 +812,9 @@

 static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,

                              uint8_t *dst, ptrdiff_t dst_stride,

-                             const InterpKernel *const x_filters, int x0_q4,

-                             int x_step_q4, const InterpKernel *const y_filters,

-                             int y0_q4, int y_step_q4, int w, int h) {

+                             const InterpKernel *const filter, int x0_q4,

+                             int x_step_q4, int y0_q4, int y_step_q4, int w,

+                             int h) {

   // Note: Fixed size intermediate buffer, temp, places limits on parameters.

   // 2d filtering proceeds in 2 steps:

   //   (1) Interpolate horizontally into an intermediate buffer, temp.

@@ -840,49 +839,43 @@

   if (w >= 8) {

     scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),

-                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,

-                            w, intermediate_height);

+                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,

+                            intermediate_height);

   } else {

     scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),

-                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,

-                            w, intermediate_height);

+                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,

+                            intermediate_height);

   if (w >= 16) {

     scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,

-                            dst_stride, y_filters, y0_q4, y_step_q4, w, h);

+                            dst_stride, filter, y0_q4, y_step_q4, w, h);

   } else if (w == 8) {

     scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,

-                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);

+                           dst_stride, filter, y0_q4, y_step_q4, w, h);

   } else {

     scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,

-                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);

+                           dst_stride, filter, y0_q4, y_step_q4, w, h);

 void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,

-                         ptrdiff_t dst_stride, const int16_t *filter_x,

-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,

+                         ptrdiff_t dst_stride, const InterpKernel *filter,

+                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,

                          int w, int h) {

-  const InterpKernel *const filters_x = get_filter_base(filter_x);

-  const int x0_q4 = get_filter_offset(filter_x, filters_x);

-  const InterpKernel *const filters_y = get_filter_base(filter_y);

-  const int y0_q4 = get_filter_offset(filter_y, filters_y);

-  scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4,

-                   x_step_q4, filters_y, y0_q4, y_step_q4, w, h);

+  scaledconvolve2d(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,

+                   y0_q4, y_step_q4, w, h);

 // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,

 //                          uint8_t *dst, ptrdiff_t dst_stride,

-//                          const int16_t *filter_x, int x_step_q4,

-//                          const int16_t *filter_y, int y_step_q4,

+//                          const InterpKernel *filter, int x0_q4,

+//                          int32_t x_step_q4, int y0_q4, int y_step_q4,

 //                          int w, int h);

 // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,

 //                              uint8_t *dst, ptrdiff_t dst_stride,

-//                              const int16_t *filter_x, int x_step_q4,

-//                              const int16_t *filter_y, int y_step_q4,

+//                              const InterpKernel *filter, int x0_q4,

+//                              int32_t x_step_q4, int y0_q4, int y_step_q4,

 //                              int w, int h);

 FUN_CONV_2D(, ssse3);

 FUN_CONV_2D(avg_, ssse3);

--

⑨