ref: 1e852dc1af7e72eafd4f815274eb63faab85185b
parent: ed1298182b54d50bed64c232e31bd88290556e67
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Tue Nov 6 06:42:32 EST 2018
AVX2 for emu_edge. emu_edge_w4_8bpc_c: 226.7, emu_edge_w4_8bpc_avx2: 72.6; emu_edge_w8_8bpc_c: 317.7, emu_edge_w8_8bpc_avx2: 87.9; emu_edge_w16_8bpc_c: 499.2, emu_edge_w16_8bpc_avx2: 92.1; emu_edge_w32_8bpc_c: 617.4, emu_edge_w32_8bpc_avx2: 165.0; emu_edge_w64_8bpc_c: 1579.0, emu_edge_w64_8bpc_avx2: 412.3; emu_edge_w128_8bpc_c: 3266.9, emu_edge_w128_8bpc_avx2: 1548.0
--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -3281,4 +3281,198 @@
jg .w128_loop
RET
+INIT_YMM avx2
+cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
+ bottomext, rightext
+ ; we assume that the buffer (stride) is larger than width, so we can
+ ; safely overwrite by a few bytes (the row loops below move 32 bytes per step)
+
+ ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ xor r12d, r12d ; r12 = 0: lower clip bound for every iclip() below
+ lea r10, [ihq-1] ; default: ih - 1 (kept when y >= ih)
+ cmp yq, ihq
+ cmovl r10, yq ; y < ih: take y
+ test yq, yq
+ cmovl r10, r12 ; y < 0: take 0
+ imul r10, sstrideq
+ add srcq, r10
+
+ ; ref += iclip(x, 0, iw - 1)
+ lea r10, [iwq-1] ; default: iw - 1 (kept when x >= iw)
+ cmp xq, iwq
+ cmovl r10, xq ; x < iw: take x
+ test xq, xq
+ cmovl r10, r12 ; x < 0: take 0
+ add srcq, r10
+
+ ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ lea bottomextq, [yq+bhq]
+ sub bottomextq, ihq
+ lea r3, [bhq-1] ; r3 = bh - 1 (upper bound); lea keeps the flags set by sub
+ cmovl bottomextq, r12 ; y + bh < ih: clamp to 0
+
+ DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
+ bottomext, rightext
+
+ ; top_ext = iclip(-y, 0, bh - 1); topext is the register that held y
+ neg topextq
+ cmovl topextq, r12 ; -y < 0: clamp to 0
+ cmp bottomextq, bhq
+ cmovge bottomextq, r3 ; bottom_ext >= bh: clamp to bh - 1
+ cmp topextq, bhq
+ cmovge topextq, r3 ; top_ext >= bh: clamp to bh - 1 (>= so that top_ext == bh is clipped too)
+
+ ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+ lea rightextq, [xq+bwq]
+ sub rightextq, iwq
+ lea r2, [bwq-1] ; r2 = bw - 1 (upper bound); lea keeps the flags set by sub
+ cmovl rightextq, r12 ; x + bw < iw: clamp to 0
+
+ DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
+ bottomext, rightext
+
+ ; left_ext = iclip(-x, 0, bw - 1); leftext is the register that held x
+ neg leftextq
+ cmovl leftextq, r12 ; -x < 0: clamp to 0
+ cmp rightextq, bwq
+ cmovge rightextq, r2 ; right_ext >= bw: clamp to bw - 1
+ cmp leftextq, bwq
+ cmovge leftextq, r2 ; left_ext >= bw: clamp to bw - 1
+
+ DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
+ dst, dstride, src, sstride, bottomext, rightext
+
+ ; center_h = bh - top_ext - bottom_ext (centerh is the register that held bh)
+ lea r3, [bottomextq+topextq]
+ sub centerhq, r3
+
+ ; blk += top_ext * PXSTRIDE(dst_stride)
+ mov r2, topextq
+ imul r2, dstrideq
+ add dstq, r2
+ mov r9m, dstq ; remember the first center row; .top reloads it as its copy source
+
+ ; center_w = bw - left_ext - right_ext
+ mov centerwq, bwq
+ lea r3, [rightextq+leftextq]
+ sub centerwq, r3
+
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+.v_loop_%3: ; one pass per center row; runs at least once (count checked at the bottom)
+%if %1
+ test leftextq, leftextq
+ jz .body_%3
+
+ ; left extension
+ xor r3, r3
+ vpbroadcastb m0, [srcq] ; replicate the first source pixel of this row
+.left_loop_%3:
+ mova [dstq+r3], m0 ; aligned store, so dstq is expected to be 32-byte aligned
+ add r3, 32
+ cmp r3, leftextq
+ jl .left_loop_%3 ; may run past left_ext; the body stores below rewrite the excess
+
+ ; body
+.body_%3:
+ lea r12, [dstq+leftextq] ; r12 = start of the center part (no longer the zero constant)
+%endif
+ xor r3, r3
+.body_loop_%3:
+ movu m0, [srcq+r3]
+%if %1
+ movu [r12+r3], m0
+%else
+ movu [dstq+r3], m0
+%endif
+ add r3, 32
+ cmp r3, centerwq
+ jl .body_loop_%3
+
+%if %2
+ ; right extension
+ test rightextq, rightextq
+ jz .body_loop_end_%3
+%if %1
+ add r12, centerwq
+%else
+ lea r12, [dstq+centerwq]
+%endif
+ xor r3, r3
+ vpbroadcastb m0, [srcq+centerwq-1] ; replicate the last source pixel of this row
+.right_loop_%3:
+ movu [r12+r3], m0
+ add r3, 32
+ cmp r3, rightextq
+ jl .right_loop_%3
+
+.body_loop_end_%3:
+%endif
+ add dstq, dstrideq
+ add srcq, sstrideq
+ dec centerhq
+ jg .v_loop_%3
+%endmacro
+ ; dispatch once on which horizontal extensions exist, so each row loop is specialized
+ test leftextq, leftextq
+ jnz .need_left_ext
+ test rightextq, rightextq
+ jnz .need_right_ext
+ v_loop 0, 0, 0
+ jmp .body_done
+
+.need_left_ext:
+ test rightextq, rightextq
+ jnz .need_left_right_ext
+ v_loop 1, 0, 1
+ jmp .body_done
+
+.need_left_right_ext:
+ v_loop 1, 1, 2
+ jmp .body_done
+
+.need_right_ext:
+ v_loop 0, 1, 3
+
+.body_done:
+ ; bottom edge extension: copy the last written row downwards bottom_ext times
+ test bottomextq, bottomextq
+ jz .top
+ mov srcq, dstq
+ sub srcq, dstrideq ; dst was advanced past the last row, so step back one row
+ xor r1, r1 ; r1 = byte offset of the current 32-byte column
+.bottom_x_loop:
+ mova m0, [srcq+r1]
+ lea r3, [dstq+r1]
+ mov r4, bottomextq ; r4 = rows still to write in this column
+.bottom_y_loop:
+ mova [r3], m0
+ add r3, dstrideq
+ dec r4
+ jg .bottom_y_loop
+ add r1, 32
+ cmp r1, bwq
+ jl .bottom_x_loop
+
+.top:
+ ; top edge extension: copy the first center row into the top_ext rows above it
+ test topextq, topextq
+ jz .end
+ mov srcq, r9m ; first center row, saved before the row loop
+ mov dstq, dstm ; dst as originally passed in
+ xor r1, r1 ; r1 = byte offset of the current 32-byte column
+.top_x_loop:
+ mova m0, [srcq+r1]
+ lea r3, [dstq+r1]
+ mov r4, topextq ; r4 = rows still to write in this column
+.top_y_loop:
+ mova [r3], m0
+ add r3, dstrideq
+ dec r4
+ jg .top_y_loop
+ add r1, 32
+ cmp r1, bwq
+ jl .top_x_loop
+
+.end:
+ RET
%endif ; ARCH_X86_64
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -58,6 +58,8 @@
decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);
+decl_emu_edge_fn(dav1d_emu_edge_avx2);
+
void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
#define init_mc_fn(type, name, suffix) \
c->mc[type] = dav1d_put_##name##_##suffix
@@ -97,5 +99,7 @@
c->warp8x8 = dav1d_warp_affine_8x8_avx2;
c->warp8x8t = dav1d_warp_affine_8x8t_avx2;
+
+ c->emu_edge = dav1d_emu_edge_avx2;
#endif
}