shithub: libvpx

--- a/vpx_dsp/arm/vpx_convolve8_neon.c

+++ b/vpx_dsp/arm/vpx_convolve8_neon.c

@@ -303,6 +303,467 @@

+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,

+                             uint8_t *dst, ptrdiff_t dst_stride,

+                             const InterpKernel *filter, int x0_q4,

+                             int x_step_q4, int y0_q4, int y_step_q4, int w,

+                             int h) {

+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));

+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);

+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));

+  const uint8x8_t range_limit = vdup_n_u8(128);

+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);

+  uint8x8_t t0, t1, t2, t3, t4, t5, t6;

+  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

+  int8x16x2_t samples_LUT;

+  assert(!((intptr_t)dst & 3));

+  assert(!(dst_stride & 3));

+  assert(y_step_q4 == 16);

+  (void)x0_q4;

+  (void)x_step_q4;

+  (void)y_step_q4;

+  src -= 3 * src_stride;

+  if (w == 4) {

+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);

+    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;

+    int32x4_t d0, d1, d2, d3;

+    uint8x8_t d01, d23;

+    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);

+    src += 4 * src_stride;

+    t4 = vld1_u8(src);

+    src += src_stride;

+    t5 = vld1_u8(src);

+    src += src_stride;

+    t6 = vld1_u8(src);

+    src += src_stride;

+    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */

+    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));

+    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));

+    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));

+    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));

+    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));

+    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));

+    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));

+    s7 = vdup_n_s8(0);

+    s8 = vdup_n_s8(0);

+    s9 = vdup_n_s8(0);

+    /* This operation combines a conventional transpose and the sample permute

+     * (see horizontal case) required before computing the dot product.

+     */

+    transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl);

+    transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl);

+    transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl);

+    transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl);

+    transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl);

+    transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl);

+    transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl);

+    do {

+      uint8x8_t t7, t8, t9, t10;

+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);

+      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));

+      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));

+      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));

+      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));

+      transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl);

+      /* Merge new data into block from previous iteration. */

+      samples_LUT.val[0] = s3456;

+      samples_LUT.val[1] = s78910;

+      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);

+      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);

+      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

+      d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters);

+      d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters);

+      d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters);

+      d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters);

+      d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);

+      d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);

+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);

+      dst += dst_stride;

+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);

+      dst += dst_stride;

+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);

+      dst += dst_stride;

+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);

+      dst += dst_stride;

+      /* Prepare block for next iteration - re-using as much as possible. */

+      /* Shuffle everything up four rows. */

+      s0123 = s4567;

+      s1234 = s5678;

+      s2345 = s6789;

+      s3456 = s78910;

+      src += 4 * src_stride;

+      h -= 4;

+    } while (h > 0);

+  } else {

+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);

+    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,

+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,

+        s6789_hi, s78910_lo, s78910_hi;

+    uint8x8_t d0, d1, d2, d3;

+    const uint8_t *s;

+    uint8_t *d;

+    int height;

+    do {

+      height = h;

+      s = src;

+      d = dst;

+      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);

+      s += 4 * src_stride;

+      t4 = vld1_u8(s);

+      s += src_stride;

+      t5 = vld1_u8(s);

+      s += src_stride;

+      t6 = vld1_u8(s);

+      s += src_stride;

+      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */

+      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));

+      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));

+      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));

+      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));

+      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));

+      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));

+      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));

+      s7 = vdup_n_s8(0);

+      s8 = vdup_n_s8(0);

+      s9 = vdup_n_s8(0);

+      /* This operation combines a conventional transpose and the sample permute

+       * (see horizontal case) required before computing the dot product.

+       */

+      transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi,

+                           tran_concat_tbl);

+      transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi,

+                           tran_concat_tbl);

+      transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi,

+                           tran_concat_tbl);

+      transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi,

+                           tran_concat_tbl);

+      transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi,

+                           tran_concat_tbl);

+      transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi,

+                           tran_concat_tbl);

+      transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi,

+                           tran_concat_tbl);

+      do {

+        uint8x8_t t7, t8, t9, t10;

+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);

+        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));

+        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));

+        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));

+        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));

+        transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi,

+                             tran_concat_tbl);

+        /* Merge new data into block from previous iteration. */

+        samples_LUT.val[0] = s3456_lo;

+        samples_LUT.val[1] = s78910_lo;

+        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);

+        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);

+        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

+        samples_LUT.val[0] = s3456_hi;

+        samples_LUT.val[1] = s78910_hi;

+        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);

+        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);

+        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

+        d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,

+                                     correction, filters);

+        d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,

+                                     correction, filters);

+        d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,

+                                     correction, filters);

+        d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,

+                                     correction, filters);

+        vst1_u8(d + 0 * dst_stride, d0);

+        vst1_u8(d + 1 * dst_stride, d1);

+        vst1_u8(d + 2 * dst_stride, d2);

+        vst1_u8(d + 3 * dst_stride, d3);

+        /* Prepare block for next iteration - re-using as much as possible. */

+        /* Shuffle everything up four rows. */

+        s0123_lo = s4567_lo;

+        s0123_hi = s4567_hi;

+        s1234_lo = s5678_lo;

+        s1234_hi = s5678_hi;

+        s2345_lo = s6789_lo;

+        s2345_hi = s6789_hi;

+        s3456_lo = s78910_lo;

+        s3456_hi = s78910_hi;

+        s += 4 * src_stride;

+        d += 4 * dst_stride;

+        height -= 4;

+      } while (height > 0);

+      src += 8;

+      dst += 8;

+      w -= 8;

+    } while (w > 0);

+  }

+}

+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,

+                                 uint8_t *dst, ptrdiff_t dst_stride,

+                                 const InterpKernel *filter, int x0_q4,

+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,

+                                 int h) {

+  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));

+  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);

+  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));

+  const uint8x8_t range_limit = vdup_n_u8(128);

+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);

+  uint8x8_t t0, t1, t2, t3, t4, t5, t6;

+  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

+  int8x16x2_t samples_LUT;

+  assert(!((intptr_t)dst & 3));

+  assert(!(dst_stride & 3));

+  assert(y_step_q4 == 16);

+  (void)x0_q4;

+  (void)x_step_q4;

+  (void)y_step_q4;

+  src -= 3 * src_stride;

+  if (w == 4) {

+    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);

+    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;

+    int32x4_t d0, d1, d2, d3;

+    uint8x8_t d01, d23, dd01, dd23;

+    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);

+    src += 4 * src_stride;

+    t4 = vld1_u8(src);

+    src += src_stride;

+    t5 = vld1_u8(src);

+    src += src_stride;

+    t6 = vld1_u8(src);

+    src += src_stride;

+    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */

+    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));

+    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));

+    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));

+    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));

+    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));

+    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));

+    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));

+    s7 = vdup_n_s8(0);

+    s8 = vdup_n_s8(0);

+    s9 = vdup_n_s8(0);

+    /* This operation combines a conventional transpose and the sample permute

+     * (see horizontal case) required before computing the dot product.

+     */

+    transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl);

+    transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl);

+    transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl);

+    transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl);

+    transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl);

+    transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl);

+    transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl);

+    do {

+      uint8x8_t t7, t8, t9, t10;

+      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);

+      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));

+      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));

+      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));

+      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));

+      transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl);

+      /* Merge new data into block from previous iteration. */

+      samples_LUT.val[0] = s3456;

+      samples_LUT.val[1] = s78910;

+      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);

+      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);

+      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

+      d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters);

+      d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters);

+      d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters);

+      d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters);

+      d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);

+      d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);

+      dd01 = load_u8(dst + 0 * dst_stride, dst_stride);

+      dd23 = load_u8(dst + 2 * dst_stride, dst_stride);

+      d01 = vrhadd_u8(d01, dd01);

+      d23 = vrhadd_u8(d23, dd23);

+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);

+      dst += dst_stride;

+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);

+      dst += dst_stride;

+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);

+      dst += dst_stride;

+      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);

+      dst += dst_stride;

+      /* Prepare block for next iteration - re-using as much as possible. */

+      /* Shuffle everything up four rows. */

+      s0123 = s4567;

+      s1234 = s5678;

+      s2345 = s6789;

+      s3456 = s78910;

+      src += 4 * src_stride;

+      h -= 4;

+    } while (h > 0);

+  } else {

+    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);

+    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,

+        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,

+        s6789_hi, s78910_lo, s78910_hi;

+    uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;

+    const uint8_t *s;

+    uint8_t *d;

+    int height;

+    do {

+      height = h;

+      s = src;

+      d = dst;

+      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);

+      s += 4 * src_stride;

+      t4 = vld1_u8(s);

+      s += src_stride;

+      t5 = vld1_u8(s);

+      s += src_stride;

+      t6 = vld1_u8(s);

+      s += src_stride;

+      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */

+      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));

+      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));

+      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));

+      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));

+      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));

+      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));

+      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));

+      s7 = vdup_n_s8(0);

+      s8 = vdup_n_s8(0);

+      s9 = vdup_n_s8(0);

+      /* This operation combines a conventional transpose and the sample permute

+       * (see horizontal case) required before computing the dot product.

+       */

+      transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi,

+                           tran_concat_tbl);

+      transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi,

+                           tran_concat_tbl);

+      transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi,

+                           tran_concat_tbl);

+      transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi,

+                           tran_concat_tbl);

+      transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi,

+                           tran_concat_tbl);

+      transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi,

+                           tran_concat_tbl);

+      transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi,

+                           tran_concat_tbl);

+      do {

+        uint8x8_t t7, t8, t9, t10;

+        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);

+        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));

+        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));

+        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));

+        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));

+        transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi,

+                             tran_concat_tbl);

+        /* Merge new data into block from previous iteration. */

+        samples_LUT.val[0] = s3456_lo;

+        samples_LUT.val[1] = s78910_lo;

+        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);

+        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);

+        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

+        samples_LUT.val[0] = s3456_hi;

+        samples_LUT.val[1] = s78910_hi;

+        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);

+        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);

+        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

+        d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,

+                                     correction, filters);

+        d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,

+                                     correction, filters);

+        d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,

+                                     correction, filters);

+        d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,

+                                     correction, filters);

+        dd0 = vld1_u8(d + 0 * dst_stride);

+        dd1 = vld1_u8(d + 1 * dst_stride);

+        dd2 = vld1_u8(d + 2 * dst_stride);

+        dd3 = vld1_u8(d + 3 * dst_stride);

+        d0 = vrhadd_u8(d0, dd0);

+        d1 = vrhadd_u8(d1, dd1);

+        d2 = vrhadd_u8(d2, dd2);

+        d3 = vrhadd_u8(d3, dd3);

+        vst1_u8(d + 0 * dst_stride, d0);

+        vst1_u8(d + 1 * dst_stride, d1);

+        vst1_u8(d + 2 * dst_stride, d2);

+        vst1_u8(d + 3 * dst_stride, d3);

+        /* Prepare block for next iteration - re-using as much as possible. */

+        /* Shuffle everything up four rows. */

+        s0123_lo = s4567_lo;

+        s0123_hi = s4567_hi;

+        s1234_lo = s5678_lo;

+        s1234_hi = s5678_hi;

+        s2345_lo = s6789_lo;

+        s2345_hi = s6789_hi;

+        s3456_lo = s78910_lo;

+        s3456_hi = s78910_hi;

+        s += 4 * src_stride;

+        d += 4 * dst_stride;

+        height -= 4;

+      } while (height > 0);

+      src += 8;

+      dst += 8;

+      w -= 8;

+    } while (w > 0);

+  }

+}

 #else

 static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,

@@ -870,241 +1331,11 @@

-#endif

-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \

-    (__ARM_FEATURE_DOTPROD == 1)

 void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,

                              uint8_t *dst, ptrdiff_t dst_stride,

                              const InterpKernel *filter, int x0_q4,

                              int x_step_q4, int y0_q4, int y_step_q4, int w,

                              int h) {

-  const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));

-  const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128);

-  const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));

-  const uint8x8_t range_limit = vdup_n_u8(128);

-  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);

-  uint8x8_t t0, t1, t2, t3, t4, t5, t6;

-  int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;

-  int8x16x2_t samples_LUT;

-  assert(!((intptr_t)dst & 3));

-  assert(!(dst_stride & 3));

-  assert(y_step_q4 == 16);

-  (void)x0_q4;

-  (void)x_step_q4;

-  (void)y_step_q4;

-  src -= 3 * src_stride;

-  if (w == 4) {

-    const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);

-    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;

-    int32x4_t d0, d1, d2, d3;

-    uint8x8_t d01, d23;

-    load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);

-    src += 4 * src_stride;

-    t4 = vld1_u8(src);

-    src += src_stride;

-    t5 = vld1_u8(src);

-    src += src_stride;

-    t6 = vld1_u8(src);

-    src += src_stride;

-    /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */

-    s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));

-    s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));

-    s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));

-    s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));

-    s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));

-    s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));

-    s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));

-    s7 = vdup_n_s8(0);

-    s8 = vdup_n_s8(0);

-    s9 = vdup_n_s8(0);

-    /* This operation combines a conventional transpose and the sample permute

-     * (see horizontal case) required before computing the dot product.

-     */

-    transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl);

-    transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl);

-    transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl);

-    transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl);

-    transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl);

-    transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl);

-    transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl);

-    do {

-      uint8x8_t t7, t8, t9, t10;

-      load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);

-      s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));

-      s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));

-      s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));

-      s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));

-      transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl);

-      /* Merge new data into block from previous iteration. */

-      samples_LUT.val[0] = s3456;

-      samples_LUT.val[1] = s78910;

-      s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);

-      s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);

-      s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

-      d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters);

-      d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters);

-      d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters);

-      d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters);

-      d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);

-      d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);

-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);

-      dst += dst_stride;

-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);

-      dst += dst_stride;

-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);

-      dst += dst_stride;

-      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);

-      dst += dst_stride;

-      /* Prepare block for next iteration - re-using as much as possible. */

-      /* Shuffle everything up four rows. */

-      s0123 = s4567;

-      s1234 = s5678;

-      s2345 = s6789;

-      s3456 = s78910;

-      src += 4 * src_stride;

-      h -= 4;

-    } while (h > 0);

-  } else {

-    const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);

-    int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,

-        s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,

-        s6789_hi, s78910_lo, s78910_hi;

-    uint8x8_t d0, d1, d2, d3;

-    const uint8_t *s;

-    uint8_t *d;

-    int height;

-    do {

-      height = h;

-      s = src;

-      d = dst;

-      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);

-      s += 4 * src_stride;

-      t4 = vld1_u8(s);

-      s += src_stride;

-      t5 = vld1_u8(s);

-      s += src_stride;

-      t6 = vld1_u8(s);

-      s += src_stride;

-      /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */

-      s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));

-      s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));

-      s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));

-      s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));

-      s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));

-      s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));

-      s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));

-      s7 = vdup_n_s8(0);

-      s8 = vdup_n_s8(0);

-      s9 = vdup_n_s8(0);

-      /* This operation combines a conventional transpose and the sample permute

-       * (see horizontal case) required before computing the dot product.

-       */

-      transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi,

-                           tran_concat_tbl);

-      transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi,

-                           tran_concat_tbl);

-      transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi,

-                           tran_concat_tbl);

-      transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi,

-                           tran_concat_tbl);

-      transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi,

-                           tran_concat_tbl);

-      transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi,

-                           tran_concat_tbl);

-      transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi,

-                           tran_concat_tbl);

-      do {

-        uint8x8_t t7, t8, t9, t10;

-        load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);

-        s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));

-        s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));

-        s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));

-        s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));

-        transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi,

-                             tran_concat_tbl);

-        /* Merge new data into block from previous iteration. */

-        samples_LUT.val[0] = s3456_lo;

-        samples_LUT.val[1] = s78910_lo;

-        s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);

-        s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);

-        s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

-        samples_LUT.val[0] = s3456_hi;

-        samples_LUT.val[1] = s78910_hi;

-        s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);

-        s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);

-        s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);

-        d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,

-                                     correction, filters);

-        d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,

-                                     correction, filters);

-        d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,

-                                     correction, filters);

-        d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,

-                                     correction, filters);

-        vst1_u8(d + 0 * dst_stride, d0);

-        vst1_u8(d + 1 * dst_stride, d1);

-        vst1_u8(d + 2 * dst_stride, d2);

-        vst1_u8(d + 3 * dst_stride, d3);

-        /* Prepare block for next iteration - re-using as much as possible. */

-        /* Shuffle everything up four rows. */

-        s0123_lo = s4567_lo;

-        s0123_hi = s4567_hi;

-        s1234_lo = s5678_lo;

-        s1234_hi = s5678_hi;

-        s2345_lo = s6789_lo;

-        s2345_hi = s6789_hi;

-        s3456_lo = s78910_lo;

-        s3456_hi = s78910_hi;

-        s += 4 * src_stride;

-        d += 4 * dst_stride;

-        height -= 4;

-      } while (height > 0);

-      src += 8;

-      dst += 8;

-      w -= 8;

-    } while (w > 0);

-  }

-}

-#else

-void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,

-                             uint8_t *dst, ptrdiff_t dst_stride,

-                             const InterpKernel *filter, int x0_q4,

-                             int x_step_q4, int y0_q4, int y_step_q4, int w,

-                             int h) {

   const int16x8_t filters = vld1q_s16(filter[y0_q4]);

   assert(!((intptr_t)dst & 3));

@@ -1272,8 +1503,6 @@

-#endif

 void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,

                                  uint8_t *dst, ptrdiff_t dst_stride,

                                  const InterpKernel *filter, int x0_q4,

@@ -1464,3 +1693,5 @@

     } while (w != 0);

+#endif

--

⑨