shithub: libvpx

ref: 10823f54681747b9f64deb3002531c95cc67d17f
parent: 66c1ff6850fd53bcf5c17247569bea1d700d6247
author: Jonathan Wright <jonathan.wright@arm.com>
date: Sat May 22 18:07:25 EDT 2021

Merge transpose and permute in Neon SDOT vertical convolution

The original dot-product implementation of vpx_convolve8_vert_neon
used a separate transpose before and after the convolution operation.
This patch merges the first transpose with the TBL permute (necessary
before using SDOT to compute the convolution) to significantly reduce
the amount of data re-arrangement. This new approach also allows for
more effective data re-use between loop iterations.

Co-authored-by: James Greenhalgh <james.greenhalgh@arm.com>

Bug: b/181236880
Change-Id: I87fe4dadd312c3ad6216943b71a5410ddf4a1b5b
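
For reference, a minimal scalar model of the merged transpose-and-concatenate
step (an illustration of the layout described in the patch, not part of the
change itself; the helper name is hypothetical):

    #include <stdint.h>

    /* Scalar model of transpose_concat_4x4: rearrange four 4-sample rows
     * into one 16-byte block laid out as
     * 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
     * so that the four samples feeding each SDOT lane are contiguous. */
    static void transpose_concat_4x4_model(const int8_t a[4][4],
                                           int8_t b[16]) {
      int row, col;
      for (col = 0; col < 4; ++col)
        for (row = 0; row < 4; ++row)
          b[4 * col + row] = a[row][col];
    }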

--- a/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -31,31 +31,8 @@
 // instructions. This optimization is much faster in speed unit test, but slowed
 // down the whole decoder by 5%.
 
-static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
-                                const uint8x8_t s0, const uint8x8_t s1,
-                                const uint8x8_t s2, const uint8x8_t s3,
-                                const uint8x8_t s4, const uint8x8_t s5,
-                                const uint8x8_t s6, const uint8x8_t s7) {
-  vst1_u8(s, s0);
-  s += p;
-  vst1_u8(s, s1);
-  s += p;
-  vst1_u8(s, s2);
-  s += p;
-  vst1_u8(s, s3);
-  s += p;
-  vst1_u8(s, s4);
-  s += p;
-  vst1_u8(s, s5);
-  s += p;
-  vst1_u8(s, s6);
-  s += p;
-  vst1_u8(s, s7);
-}
-
 #if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
     (__ARM_FEATURE_DOTPROD == 1)
-
 DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
   0, 1, 2,  3,  1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
   4, 5, 6,  7,  5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9,  10,
@@ -62,6 +39,64 @@
   8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
 };
 
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+  0, 8,  16, 24, 1, 9,  17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+  4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+  /* Shift left and insert new last column in transposed 4x4 block. */
+  1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+  /* Shift left and insert two new columns in transposed 4x4 block. */
+  2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+  /* Shift left and insert three new columns in transposed 4x4 block. */
+  3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+static INLINE void transpose_concat_4x4(int8x8_t *a0, int8x8_t *a1,
+                                        int8x8_t *a2, int8x8_t *a3,
+                                        int8x16_t *b,
+                                        const uint8x16_t permute_tbl) {
+  /* Transpose 8-bit elements and concatenate result rows as follows:
+   * a0: 00, 01, 02, 03, XX, XX, XX, XX
+   * a1: 10, 11, 12, 13, XX, XX, XX, XX
+   * a2: 20, 21, 22, 23, XX, XX, XX, XX
+   * a3: 30, 31, 32, 33, XX, XX, XX, XX
+   *
+   * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   *
+   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+   * as an argument is preferable to loading it directly from memory as this
+   * inline helper is called many times from the same parent function.
+   */
+
+  int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } };
+  *b = vqtbl2q_s8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(int8x8_t *a0, int8x8_t *a1,
+                                        int8x8_t *a2, int8x8_t *a3,
+                                        int8x16_t *b0, int8x16_t *b1,
+                                        const uint8x16x2_t permute_tbl) {
+  /* Transpose 8-bit elements and concatenate result rows as follows:
+   * a0: 00, 01, 02, 03, 04, 05, 06, 07
+   * a1: 10, 11, 12, 13, 14, 15, 16, 17
+   * a2: 20, 21, 22, 23, 24, 25, 26, 27
+   * a3: 30, 31, 32, 33, 34, 35, 36, 37
+   *
+   * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+   * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+   *
+   * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+   * as an argument is preferable to loading it directly from memory as this
+   * inline helper is called many times from the same parent function.
+   */
+
+  int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } };
+  *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
+  *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+}
+
 void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *filter, int x0_q4,
@@ -270,6 +305,28 @@
 
 #else
 
+static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
+                                const uint8x8_t s0, const uint8x8_t s1,
+                                const uint8x8_t s2, const uint8x8_t s3,
+                                const uint8x8_t s4, const uint8x8_t s5,
+                                const uint8x8_t s6, const uint8x8_t s7) {
+  vst1_u8(s, s0);
+  s += p;
+  vst1_u8(s, s1);
+  s += p;
+  vst1_u8(s, s2);
+  s += p;
+  vst1_u8(s, s3);
+  s += p;
+  vst1_u8(s, s4);
+  s += p;
+  vst1_u8(s, s5);
+  s += p;
+  vst1_u8(s, s6);
+  s += p;
+  vst1_u8(s, s7);
+}
+
 void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *filter, int x0_q4,
@@ -826,7 +883,11 @@
   const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
   const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);
   const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
-  const uint8x16_t range_limit = vdupq_n_u8(128);
+  const uint8x8_t range_limit = vdup_n_u8(128);
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+  uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+  int8x16x2_t samples_LUT;
 
   assert(!((intptr_t)dst & 3));
   assert(!(dst_stride & 3));
@@ -839,154 +900,196 @@
   src -= 3 * src_stride;
 
   if (w == 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, d01, d23;
-    uint8x16_t s0, s1, s2, s3;
+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
     int32x4_t d0, d1, d2, d3;
+    uint8x8_t d01, d23;
 
-    load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-    transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
-    src += 8 * src_stride;
+    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+    src += 4 * src_stride;
+    t4 = vld1_u8(src);
+    src += src_stride;
+    t5 = vld1_u8(src);
+    src += src_stride;
+    t6 = vld1_u8(src);
+    src += src_stride;
 
+    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+    s7 = vdup_n_s8(0);
+    s8 = vdup_n_s8(0);
+    s9 = vdup_n_s8(0);
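+    /* s7-s9 are zero placeholders: the s4567, s5678 and s6789 blocks built
+     * from them below are recomputed from real samples by the merge step
+     * inside the loop before they feed a convolution. */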
+
+    /* This operation combines a conventional transpose and the sample permute
+     * (see horizontal case) required before computing the dot product.
+     */
+    transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl);
+    transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl);
+    transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl);
+    transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl);
+    transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl);
+    transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl);
+    transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl);
+
     do {
-      load_u8_8x4(src, src_stride, &t8, &t9, &t10, &t11);
-      transpose_u8_8x4(&t8, &t9, &t10, &t11);
-      s0 = vcombine_u8(t0, t8);
-      s1 = vcombine_u8(t1, t9);
-      s2 = vcombine_u8(t2, t10);
-      s3 = vcombine_u8(t3, t11);
+      uint8x8_t t7, t8, t9, t10;
 
-      d0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl);
-      d1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl);
-      d2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl);
-      d3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl);
+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
 
+      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+      transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl);
+
+      /* Merge new data into block from previous iteration. */
+      samples_LUT.val[0] = s3456;
+      samples_LUT.val[1] = s78910;
+      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+      d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters);
+      d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters);
+      d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters);
+      d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters);
+
       d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
       d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
-      transpose_u8_4x4(&d01, &d23);
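+      /* d01 packs output rows 0 and 1 into its two 32-bit lanes and d23
+       * rows 2 and 3, so the rows are stored in order with no final
+       * transpose. */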
 
       vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
       dst += dst_stride;
-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
-      dst += dst_stride;
       vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
       dst += dst_stride;
+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
+      dst += dst_stride;
       vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
       dst += dst_stride;
 
-      t0 = vext_u8(t0, t8, 4);
-      t1 = vext_u8(t1, t9, 4);
-      t2 = vext_u8(t2, t10, 4);
-      t3 = vext_u8(t3, t11, 4);
+      /* Prepare block for next iteration - re-using as much as possible. */
+      /* Shuffle everything up four rows. */
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
       src += 4 * src_stride;
       h -= 4;
     } while (h > 0);
-  } else if (h == 4) {
-    const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
-    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, d04, d15, d26, d37;
-    uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
-    int32x4_t d0, d1, d2, d3, d4, d5, d6, d7;
+  } else {
+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+        s6789_hi, s78910_lo, s78910_hi;
+    uint8x8_t d0, d1, d2, d3;
     const uint8_t *s;
     uint8_t *d;
+    int height;
 
     do {
+      height = h;
       s = src;
       d = dst;
-      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-      s += 8 * src_stride;
-      t8 = vld1_u8(s);
+
+      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+      s += 4 * src_stride;
+      t4 = vld1_u8(s);
       s += src_stride;
-      t9 = vld1_u8(s);
+      t5 = vld1_u8(s);
       s += src_stride;
-      t10 = vld1_u8(s);
+      t6 = vld1_u8(s);
       s += src_stride;
 
-      transpose_u8_8x16(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10,
-                        vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0),
-                        vdup_n_u8(0), &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+      s7 = vdup_n_s8(0);
+      s8 = vdup_n_s8(0);
+      s9 = vdup_n_s8(0);
 
-      d0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl);
-      d1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl);
-      d2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl);
-      d3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl);
-      d4 = convolve8_4_dot(s4, filters, correction, range_limit, permute_tbl);
-      d5 = convolve8_4_dot(s5, filters, correction, range_limit, permute_tbl);
-      d6 = convolve8_4_dot(s6, filters, correction, range_limit, permute_tbl);
-      d7 = convolve8_4_dot(s7, filters, correction, range_limit, permute_tbl);
+      /* This operation combines a conventional transpose and the sample permute
+       * (see horizontal case) required before computing the dot product.
+       */
+      transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi,
+                           tran_concat_tbl);
+      transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi,
+                           tran_concat_tbl);
 
-      d04 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d4)), 7);
-      d15 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d1), vqmovn_s32(d5)), 7);
-      d26 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d6)), 7);
-      d37 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d3), vqmovn_s32(d7)), 7);
+      do {
+        uint8x8_t t7, t8, t9, t10;
 
-      transpose_u8_8x4(&d04, &d15, &d26, &d37);
+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
 
-      vst1_u8(d, d04);
-      d += dst_stride;
-      vst1_u8(d, d15);
-      d += dst_stride;
-      vst1_u8(d, d26);
-      d += dst_stride;
-      vst1_u8(d, d37);
-      d += dst_stride;
+        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
 
-      src += 8;
-      dst += 8;
-      w -= 8;
-    } while (w > 0);
-  } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
-    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14,
-        t15, d0, d1, d2, d3, d4, d5, d6, d7;
-    uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
-    const uint8_t *s;
-    uint8_t *d;
-    int height;
+        transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi,
+                             tran_concat_tbl);
 
-    do {
-      height = h;
-      s = src;
-      d = dst;
-      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-      s += 8 * src_stride;
+        /* Merge new data into block from previous iteration. */
+        samples_LUT.val[0] = s3456_lo;
+        samples_LUT.val[1] = s78910_lo;
+        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
 
-      do {
-        load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14,
-                    &t15);
-        transpose_u8_8x8(&t8, &t9, &t10, &t11, &t12, &t13, &t14, &t15);
-        s0 = vcombine_u8(t0, t8);
-        s1 = vcombine_u8(t1, t9);
-        s2 = vcombine_u8(t2, t10);
-        s3 = vcombine_u8(t3, t11);
-        s4 = vcombine_u8(t4, t12);
-        s5 = vcombine_u8(t5, t13);
-        s6 = vcombine_u8(t6, t14);
-        s7 = vcombine_u8(t7, t15);
+        samples_LUT.val[0] = s3456_hi;
+        samples_LUT.val[1] = s78910_hi;
+        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
 
-        d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl);
-        d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl);
-        d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl);
-        d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl);
-        d4 = convolve8_8_dot(s4, filters, correction, range_limit, permute_tbl);
-        d5 = convolve8_8_dot(s5, filters, correction, range_limit, permute_tbl);
-        d6 = convolve8_8_dot(s6, filters, correction, range_limit, permute_tbl);
-        d7 = convolve8_8_dot(s7, filters, correction, range_limit, permute_tbl);
+        d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+                                     correction, filters);
+        d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+                                     correction, filters);
+        d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+                                     correction, filters);
+        d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+                                     correction, filters);
+        vst1_u8(d + 0 * dst_stride, d0);
+        vst1_u8(d + 1 * dst_stride, d1);
+        vst1_u8(d + 2 * dst_stride, d2);
+        vst1_u8(d + 3 * dst_stride, d3);
 
-        transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
-        store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+        /* Prepare block for next iteration - re-using as much as possible. */
+        /* Shuffle everything up four rows. */
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
 
-        t0 = t8;
-        t1 = t9;
-        t2 = t10;
-        t3 = t11;
-        t4 = t12;
-        t5 = t13;
-        t6 = t14;
-        t7 = t15;
-        s += 8 * src_stride;
-        d += 8 * dst_stride;
-        height -= 8;
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
       } while (height > 0);
       src += 8;
       dst += 8;
--- a/vpx_dsp/arm/vpx_convolve8_neon.h
+++ b/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -75,6 +75,21 @@
 #if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
     (__ARM_FEATURE_DOTPROD == 1)
 
+static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo,
+                                                const int8x16_t samples_hi,
+                                                const int32x4_t correction,
+                                                const int8x8_t filters) {
+  /* Sample range-clamping and permutation are performed by the caller. */
+  int32x4_t sum;
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
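+  /* 'correction' is 128 * sum(filters), computed by the caller: each
+   * unsigned sample x was biased to x - 128 for the signed dot product,
+   * and sum(f[k] * x[k]) = sum(f[k] * (x[k] - 128)) + 128 * sum(f[k]). */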
+  sum = vdotq_lane_s32(correction, samples_lo, filters, 0);
+  sum = vdotq_lane_s32(sum, samples_hi, filters, 1);
+
+  /* Narrowing and packing is performed by the caller. */
+  return sum;
+}
+
 static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples,
                                         const int8x8_t filters,
                                         const int32x4_t correction,
@@ -98,6 +113,29 @@
 
   /* Narrowing and packing is performed by the caller. */
   return sum;
+}
+
+static INLINE uint8x8_t convolve8_8_dot_partial(const int8x16_t samples0_lo,
+                                                const int8x16_t samples0_hi,
+                                                const int8x16_t samples1_lo,
+                                                const int8x16_t samples1_hi,
+                                                const int32x4_t correction,
+                                                const int8x8_t filters) {
+  /* Sample range-clamping and permutation are performed by the caller. */
+  int32x4_t sum0, sum1;
+  int16x8_t sum;
+
+  /* Accumulate dot product into 'correction' to account for range clamp. */
+  /* First 4 output values. */
+  sum0 = vdotq_lane_s32(correction, samples0_lo, filters, 0);
+  sum0 = vdotq_lane_s32(sum0, samples0_hi, filters, 1);
+  /* Second 4 output values. */
+  sum1 = vdotq_lane_s32(correction, samples1_lo, filters, 0);
+  sum1 = vdotq_lane_s32(sum1, samples1_hi, filters, 1);
+
+  /* Narrow and re-pack. */
+  sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+  return vqrshrun_n_s16(sum, 7);
 }
 
 static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples,