ref: e03f2d062a845392d547e9cef4fa2e54f660a17d
parent: ea54dbe2a89d3eb4edbdbbf1810180984467c6aa
author: Martin Storsjö <martin@martin.st>
date: Sat Apr 4 19:49:51 EDT 2020
arm64: mc: NEON implementation of emu_edge for 16bpc Relative speedup over C code: Cortex A53 A72 A73 emu_edge_w4_16bpc_neon: 2.49 1.53 1.91 emu_edge_w8_16bpc_neon: 2.27 1.55 1.90 emu_edge_w16_16bpc_neon: 2.46 1.46 2.09 emu_edge_w32_16bpc_neon: 2.20 1.39 1.73 emu_edge_w64_16bpc_neon: 1.65 1.00 1.46 emu_edge_w128_16bpc_neon: 1.55 1.44 1.54
--- a/src/arm/64/mc16.S
+++ b/src/arm/64/mc16.S
@@ -3407,3 +3407,163 @@
warp
warp t
+
+// void dav1d_emu_edge_16bpc_neon(
+// const intptr_t bw, const intptr_t bh,
+// const intptr_t iw, const intptr_t ih,
+// const intptr_t x, const intptr_t y,
+// pixel *dst, const ptrdiff_t dst_stride,
+// const pixel *ref, const ptrdiff_t ref_stride)
+function emu_edge_16bpc_neon, export=1
+ ldp x8, x9, [sp]
+
+ // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ // ref += iclip(x, 0, iw - 1)
+ sub x12, x3, #1 // ih - 1
+ cmp x5, x3
+ sub x13, x2, #1 // iw - 1
+ csel x12, x12, x5, ge // min(y, ih - 1)
+ cmp x4, x2
+ bic x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
+ csel x13, x13, x4, ge // min(x, iw - 1)
+ bic x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
+ madd x8, x12, x9, x8 // ref += iclip() * stride
+ add x8, x8, x13, lsl #1 // ref += iclip()
+
+ // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ // top_ext = iclip(-y, 0, bh - 1)
+ add x10, x5, x1 // y + bh
+ neg x5, x5 // -y
+ sub x10, x10, x3 // y + bh - ih
+ sub x12, x1, #1 // bh - 1
+ cmp x10, x1
+ bic x5, x5, x5, asr #63 // max(-y, 0)
+ csel x10, x10, x12, lt // min(y + bh - ih, bh-1)
+ cmp x5, x1
+ bic x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
+ csel x5, x5, x12, lt // min(max(-y, 0), bh-1)
+
+ // right_ext = iclip(x + bw - iw, 0, bw - 1)
+ // left_ext = iclip(-x, 0, bw - 1)
+ add x11, x4, x0 // x + bw
+ neg x4, x4 // -x
+ sub x11, x11, x2 // x + bw - iw
+ sub x13, x0, #1 // bw - 1
+ cmp x11, x0
+ bic x4, x4, x4, asr #63 // max(-x, 0)
+ csel x11, x11, x13, lt // min(x + bw - iw, bw-1)
+ cmp x4, x0
+ bic x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
+ csel x4, x4, x13, lt // min(max(-x, 0), bw - 1)
+
+ // center_h = bh - top_ext - bottom_ext
+ // dst += top_ext * PXSTRIDE(dst_stride)
+ // center_w = bw - left_ext - right_ext
+ sub x1, x1, x5 // bh - top_ext
+ madd x6, x5, x7, x6
+ sub x2, x0, x4 // bw - left_ext
+ sub x1, x1, x10 // center_h = bh - top_ext - bottom_ext
+ sub x2, x2, x11 // center_w = bw - left_ext - right_ext
+
+ mov x14, x6 // backup of dst
+
+.macro v_loop need_left, need_right
+0:
+.if \need_left
+ ld1r {v0.8h}, [x8]
+ mov x12, x6 // out = dst
+ mov x3, x4
+ mov v1.16b, v0.16b
+1:
+ subs x3, x3, #16
+ st1 {v0.8h, v1.8h}, [x12], #32
+ b.gt 1b
+.endif
+ mov x13, x8
+ add x12, x6, x4, lsl #1 // out = dst + left_ext
+ mov x3, x2
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64
+ subs x3, x3, #32
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64
+ b.gt 1b
+.if \need_right
+ add x3, x8, x2, lsl #1 // in + center_w
+ sub x3, x3, #2 // in + center_w - 1
+ add x12, x6, x4, lsl #1 // dst + left_ext
+ ld1r {v0.8h}, [x3]
+ add x12, x12, x2, lsl #1 // out = dst + left_ext + center_w
+ mov x3, x11
+ mov v1.16b, v0.16b
+1:
+ subs x3, x3, #16
+ st1 {v0.8h, v1.8h}, [x12], #32
+ b.gt 1b
+.endif
+
+ subs x1, x1, #1 // center_h--
+ add x6, x6, x7
+ add x8, x8, x9
+ b.gt 0b
+.endm
+
+ cbz x4, 2f
+ // need_left
+ cbz x11, 3f
+ // need_left + need_right
+ v_loop 1, 1
+ b 5f
+
+2:
+ // !need_left
+ cbz x11, 4f
+ // !need_left + need_right
+ v_loop 0, 1
+ b 5f
+
+3:
+ // need_left + !need_right
+ v_loop 1, 0
+ b 5f
+
+4:
+ // !need_left + !need_right
+ v_loop 0, 0
+
+5:
+
+ cbz x10, 3f
+ // need_bottom
+ sub x8, x6, x7 // ref = dst - stride
+ mov x4, x0
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64
+ mov x3, x10
+2:
+ subs x3, x3, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
+ b.gt 2b
+ msub x6, x7, x10, x6 // dst -= bottom_ext * stride
+ subs x4, x4, #32 // bw -= 32
+ add x6, x6, #64 // dst += 32
+ b.gt 1b
+
+3:
+ cbz x5, 3f
+ // need_top
+ msub x6, x7, x5, x14 // dst = stored_dst - top_ext * stride
+1:
+ ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64
+ mov x3, x5
+2:
+ subs x3, x3, #1
+ st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
+ b.gt 2b
+ msub x6, x7, x5, x6 // dst -= top_ext * stride
+ subs x0, x0, #32 // bw -= 32
+ add x6, x6, #64 // dst += 32
+ b.gt 1b
+
+3:
+ ret
+endfunc
--- a/src/arm/mc_init_tmpl.c
+++ b/src/arm/mc_init_tmpl.c
@@ -112,7 +112,7 @@
c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
#endif
-#if BITDEPTH == 8 && ARCH_AARCH64
+#if ARCH_AARCH64
c->emu_edge = BF(dav1d_emu_edge, neon);
#endif
}