ref: 5bc43169057a4b2386ffd828ae1c2bba8f6ddab2
parent: 785f00feccb5ed7c5739fc72bdcb9b422d8386ca
author: Henrik Gramner <gramner@twoorioles.com>
date: Fri May 31 18:01:44 EDT 2019
x86: Optimize warp8x8 AVX2 asm
--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -2639,55 +2639,47 @@
jg .hv_w8_loop0
RET
-%macro WARP_V 5 ; dst, 01, 23, 45, 67
+%macro WARP_V 5 ; dst, 02, 46, 13, 57
; Can be done using gathers, but that's terribly slow on many CPU:s
- lea tmp1d, [myq+deltaq*1]
- lea tmp2d, [myq+deltaq*2]
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
shr myd, 10
shr tmp1d, 10
movq xm8, [filterq+myq *8]
- movq xm10, [filterq+tmp1q*8]
- lea tmp1d, [tmp2q+deltaq*1]
- lea myd, [tmp2q+deltaq*2]
+ vinserti128 m8, [filterq+tmp1q*8], 1 ; a e
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+deltaq*1]
shr tmp2d, 10
shr tmp1d, 10
movq xm0, [filterq+tmp2q*8]
- movq xm9, [filterq+tmp1q*8]
- lea tmp1d, [myq+deltaq*1]
- lea tmp2d, [myq+deltaq*2]
+ vinserti128 m0, [filterq+tmp1q*8], 1 ; b f
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
shr myd, 10
shr tmp1d, 10
- vinserti128 m8, [filterq+myq *8], 1 ; a e
- vinserti128 m10, [filterq+tmp1q*8], 1 ; b f
- lea tmp1d, [tmp2q+deltaq*1]
+ movq xm9, [filterq+myq *8]
+ vinserti128 m9, [filterq+tmp1q*8], 1 ; c g
+ lea tmp1d, [tmp2q+deltaq*4]
lea myd, [tmp2q+gammaq] ; my += gamma
shr tmp2d, 10
shr tmp1d, 10
- punpcklbw m8, m10
- vpbroadcastq m10, [filterq+tmp2q*8] ; c g
- vpblendd m0, m10, 0x30
- vpbroadcastq m10, [filterq+tmp1q*8] ; d h
- vpblendd m9, m10, 0x30
- punpcklbw m0, m9
- punpcklwd m9, m8, m0
- punpckhwd m8, m0
- pxor m10, m10
- punpcklbw m0, m9, m8
- punpckhbw m9, m8
- punpcklbw m8, m10, m0 ; a0 a4 b0 b4 c0 c4 d0 d4 << 8
- punpckhbw m0, m10, m0 ; a1 a5 b1 b5 c1 c5 d1 d5 << 8
+ punpcklwd m8, m0
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1 ; d h
+ punpcklwd m0, m9, m0
+ punpckldq m9, m8, m0
+ punpckhdq m0, m8, m0
+ punpcklbw m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
+ punpckhbw m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
pmaddwd m%2, m8
- pmaddwd m0, m%3
- punpcklbw m8, m10, m9 ; a2 a6 b2 b6 c2 c6 d2 d6 << 8
- punpckhbw m9, m10, m9 ; a3 a7 b3 b7 c3 c7 d3 d7 << 8
+ pmaddwd m9, m%3
+ punpcklbw m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
+ punpckhbw m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
pmaddwd m8, m%4
- pmaddwd m9, m%5
- paddd m0, m%2
- mova m%2, m%3
+ pmaddwd m0, m%5
+ paddd m%2, m9
paddd m0, m8
- mova m%3, m%4
- paddd m%1, m0, m9
- mova m%4, m%5
+ paddd m%1, m0, m%2
%endmacro
cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts
@@ -2696,13 +2688,13 @@
%endif
call mangle(private_prefix %+ _warp_affine_8x8_avx2).main
.loop:
- psrad m11, 13
+ psrad m7, 13
psrad m0, 13
- packssdw m11, m0
- pmulhrsw m11, m14 ; (x + (1 << 6)) >> 7
- vpermq m0, m11, q3120
- mova [tmpq+tsq*0], xm0
- vextracti128 [tmpq+tsq*2], m0, 1
+ packssdw m7, m0
+ pmulhrsw m7, m14 ; (x + (1 << 6)) >> 7
+ vpermq m7, m7, q3120
+ mova [tmpq+tsq*0], xm7
+ vextracti128 [tmpq+tsq*2], m7, 1
dec r4d
jz mangle(private_prefix %+ _warp_affine_8x8_avx2).end
call mangle(private_prefix %+ _warp_affine_8x8_avx2).main2
@@ -2723,15 +2715,15 @@
call .main2
lea dstq, [dstq+dsq*2]
.start:
- psrad m11, 17
- psrad m0, 17
- packssdw m11, m0
- pmulhrsw m11, m14 ; (x + (1 << 10)) >> 11
- vextracti128 xm0, m11, 1
- packuswb xm11, xm0
- pshufd xm0, xm11, q3120
- movq [dstq+dsq*0], xm0
- movhps [dstq+dsq*1], xm0
+ psrad m7, 18
+ psrad m0, 18
+ packusdw m7, m0
+ pavgw m7, m11 ; (x + (1 << 10)) >> 11
+ vextracti128 xm0, m7, 1
+ packuswb xm7, xm0
+ pshufd xm7, xm7, q3120
+ movq [dstq+dsq*0], xm7
+ movhps [dstq+dsq*1], xm7
dec r4d
jg .loop
.end:
@@ -2759,83 +2751,82 @@
mova m13, [warp_8x8_shufB]
vpbroadcastd m14, [pw_8192]
vpbroadcastd m15, [pd_32768]
+ pxor m11, m11
lea filterq, [mc_warp_filter]
lea tmp1q, [ssq*3+3]
add mxd, 512+(64<<10)
lea tmp2d, [alphaq*3]
- add tmp2d, tmp2d
sub srcq, tmp1q ; src -= src_stride*3 + 3
- sub betad, tmp2d ; beta -= alpha*6
+ sub betad, tmp2d ; beta -= alpha*3
mov myd, r7m
call .h
psrld m1, m0, 16
call .h
- pblendw m1, m0, 0xaa ; 01
- psrld m2, m0, 16
+ psrld m4, m0, 16
call .h
- pblendw m2, m0, 0xaa ; 12
- psrld m3, m0, 16
+ pblendw m1, m0, 0xaa ; 02
call .h
- pblendw m3, m0, 0xaa ; 23
- psrld m4, m0, 16
+ pblendw m4, m0, 0xaa ; 13
call .h
- pblendw m4, m0, 0xaa ; 34
- psrld m5, m0, 16
+ psrld m2, m1, 16
+ pblendw m2, m0, 0xaa ; 24
call .h
- pblendw m5, m0, 0xaa ; 45
- psrld m6, m0, 16
+ psrld m5, m4, 16
+ pblendw m5, m0, 0xaa ; 35
call .h
- pblendw m6, m0, 0xaa ; 56
+ psrld m3, m2, 16
+ pblendw m3, m0, 0xaa ; 46
movsx deltad, word [abcdq+2*2]
movsx gammad, word [abcdq+2*3]
add myd, 512+(64<<10)
mov r4d, 4
lea tmp1d, [deltaq*3]
- add tmp1d, tmp1d
- sub gammad, tmp1d ; gamma -= delta*6
+ sub gammad, tmp1d ; gamma -= delta*3
.main2:
call .h
- psrld m7, m6, 16
- pblendw m7, m0, 0xaa ; 67
- WARP_V 11, 1, 3, 5, 7
+ psrld m6, m5, 16
+ pblendw m6, m0, 0xaa ; 57
+ WARP_V 7, 1, 3, 4, 6
call .h
- psrld m7, 16
- pblendw m7, m0, 0xaa ; 78
- WARP_V 0, 2, 4, 6, 7
+ mova m1, m2
+ mova m2, m3
+ psrld m3, 16
+ pblendw m3, m0, 0xaa ; 68
+ WARP_V 0, 4, 6, 1, 3
+ mova m4, m5
+ mova m5, m6
ret
ALIGN function_align
.h:
- lea tmp1d, [mxq+alphaq*1]
- lea tmp2d, [mxq+alphaq*2]
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ vbroadcasti128 m10, [srcq]
shr mxd, 10
shr tmp1d, 10
- vbroadcasti128 m10, [srcq]
movq xm8, [filterq+mxq *8]
- movhps xm8, [filterq+tmp1q*8]
- lea tmp1d, [tmp2q+alphaq*1]
- lea mxd, [tmp2q+alphaq*2]
+ vinserti128 m8, [filterq+tmp1q*8], 1
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+alphaq*1]
shr tmp2d, 10
shr tmp1d, 10
- movq xm9, [filterq+tmp2q*8]
- movhps xm9, [filterq+tmp1q*8]
- lea tmp1d, [mxq+alphaq*1]
- lea tmp2d, [mxq+alphaq*2]
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
shr mxd, 10
shr tmp1d, 10
- vpbroadcastq m0, [filterq+mxq *8]
- vpblendd m8, m0, 0x30
- vpbroadcastq m0, [filterq+tmp1q*8]
- vpblendd m8, m0, 0xc0 ; 0 1 4 5
- pshufb m0, m10, m12
- pmaddubsw m0, m8
- lea tmp1d, [tmp2q+alphaq*1]
+ movq xm9, [filterq+mxq *8]
+ vinserti128 m9, [filterq+tmp1q*8], 1
+ lea tmp1d, [tmp2q+alphaq*4]
lea mxd, [tmp2q+betaq] ; mx += beta
shr tmp2d, 10
shr tmp1d, 10
- vpbroadcastq m8, [filterq+tmp2q*8]
- vpblendd m9, m8, 0x30
- vpbroadcastq m8, [filterq+tmp1q*8]
- vpblendd m9, m8, 0xc0 ; 2 3 6 7
+ punpcklqdq m8, m0 ; 0 1 4 5
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1
+ punpcklqdq m9, m0 ; 2 3 6 7
+ pshufb m0, m10, m12
+ pmaddubsw m0, m8
pshufb m10, m13
pmaddubsw m10, m9
add srcq, ssq
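
The .start path above now rounds with psrad 18 + packusdw + pavgw against the zeroed m11 instead of psrad 17 + packssdw + pmulhrsw with pw_8192. A minimal standalone C sketch (not part of the patch; the loop bound and helper names are illustrative) checking that the two sequences produce the same rounded shift on non-negative 32-bit lanes (negative sums end up clamped to zero by packusdw/packuswb in either variant):

#include <assert.h>
#include <stdint.h>

/* old store path: psrad 17, packssdw, pmulhrsw with pw_8192
 * pmulhrsw(a, 8192) = (a*8192*2 + 0x8000) >> 16 */
static int32_t round_old(int32_t x)
{
    int32_t a = x >> 17; /* psrad m11, 17 (packssdw: no clamp needed in this range) */
    return (int32_t)(((int64_t)a * 8192 * 2 + 0x8000) >> 16);
}

/* new store path: psrad 18, packusdw, pavgw with zero
 * pavgw(b, 0) = (b + 1) >> 1 */
static int32_t round_new(int32_t x)
{
    int32_t b = x >> 18; /* psrad m7, 18 (packusdw: no clamp needed in this range) */
    return (b + 1) >> 1;
}

int main(void)
{
    for (int32_t x = 0; x < 1 << 24; x++) /* both equal (x + (1 << 18)) >> 19 here */
        assert(round_old(x) == round_new(x));
    return 0;
}

pavgw with a zero register performs the final rounded halving directly, so this path no longer needs the pmulhrsw multiply (pw_8192 remains in use by the warp_affine_8x8t variant).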