shithub: dav1d

Download patch

ref: 1e852dc1af7e72eafd4f815274eb63faab85185b
parent: ed1298182b54d50bed64c232e31bd88290556e67
author: Ronald S. Bultje <rsbultje@gmail.com>
date: Tue Nov 6 06:42:32 EST 2018

AVX2 for emu_edge

emu_edge_w4_8bpc_c: 226.7
emu_edge_w4_8bpc_avx2: 72.6
emu_edge_w8_8bpc_c: 317.7
emu_edge_w8_8bpc_avx2: 87.9
emu_edge_w16_8bpc_c: 499.2
emu_edge_w16_8bpc_avx2: 92.1
emu_edge_w32_8bpc_c: 617.4
emu_edge_w32_8bpc_avx2: 165.0
emu_edge_w64_8bpc_c: 1579.0
emu_edge_w64_8bpc_avx2: 412.3
emu_edge_w128_8bpc_c: 3266.9
emu_edge_w128_8bpc_avx2: 1548.0

--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -3281,4 +3281,198 @@
     jg .w128_loop
     RET
 
+INIT_YMM avx2
+cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
+                             bottomext, rightext
+    ; we assume that the buffer (stride) is larger than width, so we can
+    ; safely overwrite by a few bytes
+
+    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+    xor                r12d, r12d
+    lea                 r10, [ihq-1]
+    cmp                  yq, ihq
+    cmovl               r10, yq
+    test                 yq, yq
+    cmovl               r10, r12
+    imul                r10, sstrideq
+    add                srcq, r10
+
+    ; ref += iclip(x, 0, iw - 1)
+    lea                 r10, [iwq-1]
+    cmp                  xq, iwq
+    cmovl               r10, xq
+    test                 xq, xq
+    cmovl               r10, r12
+    add                srcq, r10
+
+    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+    lea          bottomextq, [yq+bhq]
+    sub          bottomextq, ihq
+    lea                  r3, [bhq-1]
+    cmovl        bottomextq, r12
+
+    DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
+                bottomext, rightext
+
+    ; top_ext = iclip(-y, 0, bh - 1)
+    neg             topextq
+    cmovl           topextq, r12
+    cmp          bottomextq, bhq
+    cmovge       bottomextq, r3
+    cmp             topextq, bhq
+    cmovg           topextq, r3
+
+    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+    lea           rightextq, [xq+bwq]
+    sub           rightextq, iwq
+    lea                  r2, [bwq-1]
+    cmovl         rightextq, r12
+
+    DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
+                bottomext, rightext
+
+    ; left_ext = iclip(-x, 0, bw - 1)
+    neg            leftextq
+    cmovl          leftextq, r12
+    cmp           rightextq, bwq
+    cmovge        rightextq, r2
+    cmp            leftextq, bwq
+    cmovge         leftextq, r2
+
+    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
+                dst, dstride, src, sstride, bottomext, rightext
+
+    ; center_h = bh - top_ext - bottom_ext
+    lea                  r3, [bottomextq+topextq]
+    sub            centerhq, r3
+
+    ; blk += top_ext * PXSTRIDE(dst_stride)
+    mov                  r2, topextq
+    imul                 r2, dstrideq
+    add                dstq, r2
+    mov                 r9m, dstq
+
+    ; center_w = bw - left_ext - right_ext
+    mov            centerwq, bwq
+    lea                  r3, [rightextq+leftextq]
+    sub            centerwq, r3
+
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+.v_loop_%3:
+%if %1
+    test           leftextq, leftextq
+    jz .body_%3
+
+    ; left extension
+    xor                  r3, r3
+    vpbroadcastb         m0, [srcq]
+.left_loop_%3:
+    mova          [dstq+r3], m0
+    add                  r3, 32
+    cmp                  r3, leftextq
+    jl .left_loop_%3
+
+    ; body
+.body_%3:
+    lea                 r12, [dstq+leftextq]
+%endif
+    xor                  r3, r3
+.body_loop_%3:
+    movu                 m0, [srcq+r3]
+%if %1
+    movu           [r12+r3], m0
+%else
+    movu          [dstq+r3], m0
+%endif
+    add                  r3, 32
+    cmp                  r3, centerwq
+    jl .body_loop_%3
+
+%if %2
+    ; right extension
+    test          rightextq, rightextq
+    jz .body_loop_end_%3
+%if %1
+    add                 r12, centerwq
+%else
+    lea                 r12, [dstq+centerwq]
+%endif
+    xor                  r3, r3
+    vpbroadcastb         m0, [srcq+centerwq-1]
+.right_loop_%3:
+    movu           [r12+r3], m0
+    add                  r3, 32
+    cmp                  r3, rightextq
+    jl .right_loop_%3
+
+.body_loop_end_%3:
+%endif
+    add                dstq, dstrideq
+    add                srcq, sstrideq
+    dec            centerhq
+    jg .v_loop_%3
+%endmacro
+
+    test           leftextq, leftextq
+    jnz .need_left_ext
+    test          rightextq, rightextq
+    jnz .need_right_ext
+    v_loop                0, 0, 0
+    jmp .body_done
+
+.need_left_ext:
+    test          rightextq, rightextq
+    jnz .need_left_right_ext
+    v_loop                1, 0, 1
+    jmp .body_done
+
+.need_left_right_ext:
+    v_loop                1, 1, 2
+    jmp .body_done
+
+.need_right_ext:
+    v_loop                0, 1, 3
+
+.body_done:
+    ; bottom edge extension
+    test         bottomextq, bottomextq
+    jz .top
+    mov                srcq, dstq
+    sub                srcq, dstrideq
+    xor                  r1, r1
+.bottom_x_loop:
+    mova                 m0, [srcq+r1]
+    lea                  r3, [dstq+r1]
+    mov                  r4, bottomextq
+.bottom_y_loop:
+    mova               [r3], m0
+    add                  r3, dstrideq
+    dec                  r4
+    jg .bottom_y_loop
+    add                  r1, 32
+    cmp                  r1, bwq
+    jl .bottom_x_loop
+
+.top:
+    ; top edge extension
+    test            topextq, topextq
+    jz .end
+    mov                srcq, r9m
+    mov                dstq, dstm
+    xor                  r1, r1
+.top_x_loop:
+    mova                 m0, [srcq+r1]
+    lea                  r3, [dstq+r1]
+    mov                  r4, topextq
+.top_y_loop:
+    mova               [r3], m0
+    add                  r3, dstrideq
+    dec                  r4
+    jg .top_y_loop
+    add                  r1, 32
+    cmp                  r1, bwq
+    jl .top_x_loop
+
+.end:
+    RET
 %endif ; ARCH_X86_64
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -58,6 +58,8 @@
 decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
 decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);
 
+decl_emu_edge_fn(dav1d_emu_edge_avx2);
+
 void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
 #define init_mc_fn(type, name, suffix) \
     c->mc[type] = dav1d_put_##name##_##suffix
@@ -97,5 +99,7 @@
 
     c->warp8x8  = dav1d_warp_affine_8x8_avx2;
     c->warp8x8t = dav1d_warp_affine_8x8t_avx2;
+
+    c->emu_edge = dav1d_emu_edge_avx2;
 #endif
 }