ref: 973d11bfab8296ed172790ddbb79538c3daa5566
parent: 22e144fb4c5dae7580c149d8f98f888e3d688b26
author: Henrik Gramner <gramner@twoorioles.com>
date: Mon Nov 5 09:13:02 EST 2018
x86: Add warp8x8 and warp8x8t AVX2 asm
--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -30,6 +30,10 @@
SECTION_RODATA 32
+warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
+ db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
+warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
+ db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
@@ -47,8 +51,9 @@
pw_1024: times 2 dw 1024
pw_2048: times 2 dw 2048
pw_8192: times 2 dw 8192
-pd_32: dd 32
-pd_512: dd 512
+pd_32: dd 32
+pd_512: dd 512
+pd_32768: dd 32768
cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
@@ -123,6 +128,8 @@
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+cextern mc_warp_filter
+
SECTION .text
INIT_XMM avx2
@@ -2626,6 +2633,211 @@
sub r6d, 1<<8
jg .hv_w8_loop0
RET
+
+%macro WARP_V 5 ; dst, 01, 23, 45, 67
+ ; Vertical pass for two 4-pixel groups (rows interleaved pairwise in the
+ ; %2..%5 source registers). For each of 8 output columns an 8-tap filter is
+ ; fetched from mc_warp_filter; my advances by delta per column and by gamma
+ ; per row. Filter entries are 8 bytes each, indexed by the top bits of the
+ ; 10-bit fractional position (hence the shr by 10 and *8 scaling).
+ ; Can be done using gathers, but that's terribly slow on many CPUs
+ lea tmp1d, [myq+deltaq*1]
+ lea tmp2d, [myq+deltaq*2]
+ shr myd, 10
+ shr tmp1d, 10
+ movq xm8, [filterq+myq *8]
+ movq xm10, [filterq+tmp1q*8]
+ lea tmp1d, [tmp2q+deltaq*1]
+ lea myd, [tmp2q+deltaq*2]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq xm0, [filterq+tmp2q*8]
+ movq xm9, [filterq+tmp1q*8]
+ lea tmp1d, [myq+deltaq*1]
+ lea tmp2d, [myq+deltaq*2]
+ shr myd, 10
+ shr tmp1d, 10
+ vinserti128 m8, [filterq+myq *8], 1 ; a e
+ vinserti128 m10, [filterq+tmp1q*8], 1 ; b f
+ lea tmp1d, [tmp2q+deltaq*1]
+ lea myd, [tmp2q+gammaq] ; my += gamma
+ shr tmp2d, 10
+ shr tmp1d, 10
+ punpcklbw m8, m10
+ vpbroadcastq m10, [filterq+tmp2q*8] ; c g
+ vpblendd m0, m10, 0x30
+ vpbroadcastq m10, [filterq+tmp1q*8] ; d h
+ vpblendd m9, m10, 0x30
+ punpcklbw m0, m9
+ ; Transpose the 8 filters so that tap k of all columns lands in one
+ ; register, ready for pmaddwd against the interleaved row data.
+ punpcklwd m9, m8, m0
+ punpckhwd m8, m0
+ pxor m10, m10 ; zero for the byte->word (<< 8) expansion below
+ punpcklbw m0, m9, m8
+ punpckhbw m9, m8
+ punpcklbw m8, m10, m0 ; a0 a4 b0 b4 c0 c4 d0 d4 << 8
+ punpckhbw m0, m10, m0 ; a1 a5 b1 b5 c1 c5 d1 d5 << 8
+ pmaddwd m%2, m8
+ pmaddwd m0, m%3
+ punpcklbw m8, m10, m9 ; a2 a6 b2 b6 c2 c6 d2 d6 << 8
+ punpckhbw m9, m10, m9 ; a3 a7 b3 b7 c3 c7 d3 d7 << 8
+ pmaddwd m8, m%4
+ pmaddwd m9, m%5
+ ; Sum the four tap-pair products into %1, then rotate the row history
+ ; (01<-23, 23<-45, 45<-67) so the next invocation reuses 6 of 8 rows.
+ paddd m0, m%2
+ mova m%2, m%3
+ paddd m0, m8
+ mova m%3, m%4
+ paddd m%1, m0, m9
+ mova m%4, m%5
+%endmacro
+
+; void warp_affine_8x8t(int16_t *tmp, ptrdiff_t ts, ...)
+; Variant that stores the 16-bit intermediate (for compound prediction)
+; instead of packing to 8-bit pixels. All the filtering is shared with
+; warp_affine_8x8 by tail-calling into its .main/.main2/.end labels;
+; only the final shift/pack/store differs.
+cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts
+%if WIN64
+ sub rsp, 0xa0
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_avx2).main
+.loop:
+ ; m11/m0 hold two rows of 18-bit sums upshifted by 13 (see .h);
+ ; shift down to the 16-bit intermediate range and round via pmulhrsw.
+ psrad m11, 13
+ psrad m0, 13
+ packssdw m11, m0
+ pmulhrsw m11, m14 ; (x + (1 << 6)) >> 7
+ vpermq m0, m11, q3120 ; un-interleave the two rows
+ mova [tmpq+tsq*0], xm0
+ vextracti128 [tmpq+tsq*2], m0, 1
+ dec r4d
+ jz mangle(private_prefix %+ _warp_affine_8x8_avx2).end
+ call mangle(private_prefix %+ _warp_affine_8x8_avx2).main2
+ lea tmpq, [tmpq+tsq*4]
+ jmp .loop
+
+; void warp_affine_8x8(pixel *dst, ptrdiff_t ds, const pixel *src,
+;                      ptrdiff_t ss, const int16_t *abcd, int mx, int my)
+; 8-bit output variant. abcd holds the per-pixel/per-row deltas:
+; [0]=alpha (mx step per column), [1]=beta (mx step per row),
+; [2]=delta (my step per column), [3]=gamma (my step per row).
+cglobal warp_affine_8x8, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
+ beta, filter, tmp1, delta, my, gamma
+%if WIN64
+ ; Reserve spill space for xmm6-15 (saved inside .main) and tell the
+ ; cglobal machinery about it so RET restores them.
+ sub rsp, 0xa0
+ %assign xmm_regs_used 16
+ %assign stack_size_padded 0xa0
+ %assign stack_offset stack_offset+stack_size_padded
+%endif
+ call .main
+ jmp .start
+.loop:
+ call .main2 ; produces two more rows in m11/m0
+ lea dstq, [dstq+dsq*2]
+.start:
+ ; Sums are upshifted by 13 (see .h): >>17 plus pmulhrsw yields the
+ ; final (x + (1 << 10)) >> 11 rounding, then pack to unsigned bytes.
+ psrad m11, 17
+ psrad m0, 17
+ packssdw m11, m0
+ pmulhrsw m11, m14 ; (x + (1 << 10)) >> 11
+ vextracti128 xm0, m11, 1
+ packuswb xm11, xm0
+ pshufd xm0, xm11, q3120 ; un-interleave the two rows
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ dec r4d
+ jg .loop
+.end:
+ RET
+ALIGN function_align
+.main:
+ ; Setup + the first 7 horizontal rows; leaves the vertical loop primed.
+ ; Stack args offset by one (r4m -> r5m etc.) due to call
+%if WIN64
+ mov abcdq, r5m
+ mov mxd, r6m
+ ; Save callee-saved xmm6-15 (Win64 ABI) into the area reserved by the
+ ; caller; xmm6/7 go in the caller-visible home space via stack_offset.
+ movaps [rsp+stack_offset+0x10], xmm6
+ movaps [rsp+stack_offset+0x20], xmm7
+ movaps [rsp+0x28], xmm8
+ movaps [rsp+0x38], xmm9
+ movaps [rsp+0x48], xmm10
+ movaps [rsp+0x58], xmm11
+ movaps [rsp+0x68], xmm12
+ movaps [rsp+0x78], xmm13
+ movaps [rsp+0x88], xmm14
+ movaps [rsp+0x98], xmm15
+%endif
+ movsx alphad, word [abcdq+2*0]
+ movsx betad, word [abcdq+2*1]
+ mova m12, [warp_8x8_shufA]
+ mova m13, [warp_8x8_shufB]
+ vpbroadcastd m14, [pw_8192] ; pmulhrsw rounding factor for both passes
+ vpbroadcastd m15, [pd_32768]
+ lea filterq, [mc_warp_filter]
+ lea tmp1q, [ssq*3+3]
+ add mxd, 512+(64<<10) ; bias so filter indices are non-negative
+ lea tmp2d, [alphaq*3]
+ add tmp2d, tmp2d
+ sub srcq, tmp1q ; src -= src_stride*3 + 3
+ sub betad, tmp2d ; beta -= alpha*6
+ mov myd, r7m
+ ; Filter rows 0-6 horizontally and pair them up (rows n and n+1
+ ; interleaved per dword) as input for the vertical pass.
+ call .h
+ psrld m1, m0, 16
+ call .h
+ pblendw m1, m0, 0xaa ; 01
+ psrld m2, m0, 16
+ call .h
+ pblendw m2, m0, 0xaa ; 12
+ psrld m3, m0, 16
+ call .h
+ pblendw m3, m0, 0xaa ; 23
+ psrld m4, m0, 16
+ call .h
+ pblendw m4, m0, 0xaa ; 34
+ psrld m5, m0, 16
+ call .h
+ pblendw m5, m0, 0xaa ; 45
+ psrld m6, m0, 16
+ call .h
+ pblendw m6, m0, 0xaa ; 56
+ movsx deltad, word [abcdq+2*2]
+ movsx gammad, word [abcdq+2*3]
+ add myd, 512+(64<<10) ; same non-negative bias as mx
+ mov r4d, 4 ; 4 iterations x 2 rows = 8 output rows
+ lea tmp1d, [deltaq*3]
+ add tmp1d, tmp1d
+ sub gammad, tmp1d ; gamma -= delta*6
+.main2:
+ ; Filter two new source rows and run the vertical filter twice,
+ ; producing two output rows in m11 and m0.
+ call .h
+ psrld m7, m6, 16
+ pblendw m7, m0, 0xaa ; 67
+ WARP_V 11, 1, 3, 5, 7
+ call .h
+ psrld m7, 16
+ pblendw m7, m0, 0xaa ; 78
+ WARP_V 0, 2, 4, 6, 7
+ ret
+ALIGN function_align
+.h:
+ ; Horizontal 8-tap filter of one source row; mx advances by alpha per
+ ; column and by beta per row. Filter selection mirrors WARP_V: the top
+ ; bits of the 10-bit position index 8-byte entries in mc_warp_filter.
+ lea tmp1d, [mxq+alphaq*1]
+ lea tmp2d, [mxq+alphaq*2]
+ shr mxd, 10
+ shr tmp1d, 10
+ vbroadcasti128 m10, [srcq] ; same 16 source bytes in both lanes
+ movq xm8, [filterq+mxq *8]
+ movhps xm8, [filterq+tmp1q*8]
+ lea tmp1d, [tmp2q+alphaq*1]
+ lea mxd, [tmp2q+alphaq*2]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq xm9, [filterq+tmp2q*8]
+ movhps xm9, [filterq+tmp1q*8]
+ lea tmp1d, [mxq+alphaq*1]
+ lea tmp2d, [mxq+alphaq*2]
+ shr mxd, 10
+ shr tmp1d, 10
+ vpbroadcastq m0, [filterq+mxq *8]
+ vpblendd m8, m0, 0x30
+ vpbroadcastq m0, [filterq+tmp1q*8]
+ vpblendd m8, m0, 0xc0 ; 0 1 4 5
+ pshufb m0, m10, m12 ; gather source bytes for columns 0 1 4 5
+ pmaddubsw m0, m8
+ lea tmp1d, [tmp2q+alphaq*1]
+ lea mxd, [tmp2q+betaq] ; mx += beta
+ shr tmp2d, 10
+ shr tmp1d, 10
+ vpbroadcastq m8, [filterq+tmp2q*8]
+ vpblendd m9, m8, 0x30
+ vpbroadcastq m8, [filterq+tmp1q*8]
+ vpblendd m9, m8, 0xc0 ; 2 3 6 7
+ pshufb m10, m13 ; gather source bytes for columns 2 3 6 7
+ pmaddubsw m10, m9
+ add srcq, ssq
+ phaddw m0, m10 ; combine tap-pair sums per column
+ pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
+ paddd m0, m15 ; rounded 14-bit result in upper 16 bits of dword
+ ret
%macro BIDIR_FN 1 ; op
%1 0
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -55,6 +55,9 @@
decl_mask_fn(dav1d_mask_avx2);
decl_w_mask_fn(dav1d_w_mask_420_avx2);
+decl_warp8x8_fn(dav1d_warp_affine_8x8_avx2);
+decl_warp8x8t_fn(dav1d_warp_affine_8x8t_avx2);
+
void bitfn(dav1d_mc_dsp_init_x86)(Dav1dMCDSPContext *const c) {
#define init_mc_fn(type, name, suffix) \
c->mc[type] = dav1d_put_##name##_##suffix
@@ -91,5 +94,8 @@
c->w_avg = dav1d_w_avg_avx2;
c->mask = dav1d_mask_avx2;
c->w_mask[2] = dav1d_w_mask_420_avx2;
+
+ c->warp8x8 = dav1d_warp_affine_8x8_avx2;
+ c->warp8x8t = dav1d_warp_affine_8x8t_avx2;
#endif
}