ref: ea74e3d513206fcdda4316f3f1303df47b890d48
parent: 07261e8c38ab709f86a279df996e9e0ec0e9c508
author: Victorien Le Couviour--Tuffet <victorien@videolan.org>
date: Thu Jun 11 09:11:55 EDT 2020
x86: Add prep_8tap_scaled AVX2 asm mct_scaled_8tap_regular_w4_8bpc_c: 872.1 mct_scaled_8tap_regular_w4_8bpc_avx2: 125.6 mct_scaled_8tap_regular_w4_dy1_8bpc_c: 886.3 mct_scaled_8tap_regular_w4_dy1_8bpc_avx2: 84.0 mct_scaled_8tap_regular_w4_dy2_8bpc_c: 1189.1 mct_scaled_8tap_regular_w4_dy2_8bpc_avx2: 84.7 mct_scaled_8tap_regular_w8_8bpc_c: 2261.0 mct_scaled_8tap_regular_w8_8bpc_avx2: 306.2 mct_scaled_8tap_regular_w8_dy1_8bpc_c: 2189.9 mct_scaled_8tap_regular_w8_dy1_8bpc_avx2: 233.8 mct_scaled_8tap_regular_w8_dy2_8bpc_c: 3060.3 mct_scaled_8tap_regular_w8_dy2_8bpc_avx2: 282.8 mct_scaled_8tap_regular_w16_8bpc_c: 4335.3 mct_scaled_8tap_regular_w16_8bpc_avx2: 680.7 mct_scaled_8tap_regular_w16_dy1_8bpc_c: 5137.2 mct_scaled_8tap_regular_w16_dy1_8bpc_avx2: 578.6 mct_scaled_8tap_regular_w16_dy2_8bpc_c: 7878.4 mct_scaled_8tap_regular_w16_dy2_8bpc_avx2: 774.6 mct_scaled_8tap_regular_w32_8bpc_c: 17871.9 mct_scaled_8tap_regular_w32_8bpc_avx2: 2954.8 mct_scaled_8tap_regular_w32_dy1_8bpc_c: 18594.7 mct_scaled_8tap_regular_w32_dy1_8bpc_avx2: 2073.9 mct_scaled_8tap_regular_w32_dy2_8bpc_c: 28696.0 mct_scaled_8tap_regular_w32_dy2_8bpc_avx2: 2852.1 mct_scaled_8tap_regular_w64_8bpc_c: 46967.5 mct_scaled_8tap_regular_w64_8bpc_avx2: 7527.5 mct_scaled_8tap_regular_w64_dy1_8bpc_c: 45564.2 mct_scaled_8tap_regular_w64_dy1_8bpc_avx2: 5262.9 mct_scaled_8tap_regular_w64_dy2_8bpc_c: 72793.3 mct_scaled_8tap_regular_w64_dy2_8bpc_avx2: 7535.9 mct_scaled_8tap_regular_w128_8bpc_c: 111190.8 mct_scaled_8tap_regular_w128_8bpc_avx2: 19386.8 mct_scaled_8tap_regular_w128_dy1_8bpc_c: 122625.0 mct_scaled_8tap_regular_w128_dy1_8bpc_avx2: 15376.1 mct_scaled_8tap_regular_w128_dy2_8bpc_c: 197120.6 mct_scaled_8tap_regular_w128_dy2_8bpc_avx2: 21871.0
--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -244,7 +244,7 @@
%endmacro
%macro SCALED_JMP_TABLE 1-*
- %xdefine %1_table (%%table - 2)
+ %xdefine %1_table (%%table - %2)
%xdefine %%base mangle(private_prefix %+ _%1)
%%table:
%rep %0 - 1
@@ -253,7 +253,7 @@
%endrep
%rotate 1
%%dy_1024:
- %xdefine %1_dy1_table (%%dy_1024 - 2)
+ %xdefine %1_dy1_table (%%dy_1024 - %2)
%rep %0 - 1
dw %%base %+ .dy1_w%2 - %%base
%rotate 1
@@ -260,7 +260,7 @@
%endrep
%rotate 1
%%dy_2048:
- %xdefine %1_dy2_table (%%dy_2048 - 2)
+ %xdefine %1_dy2_table (%%dy_2048 - %2)
%rep %0 - 1
dw %%base %+ .dy2_w%2 - %%base
%rotate 1
@@ -280,6 +280,7 @@
HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE put_8tap_scaled_avx2, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128
@@ -3914,54 +3915,159 @@
RET
%endmacro
-%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+%macro movifprep 2 ; dst, src -- emits a mov of src into dst only when the shared body is assembled for prep; expands to nothing for put
+ %if isprep ; isprep is assigned 0 (put) or 1 (prep) at the top of MC_8TAP_SCALED
+ mov %1, %2
+ %endif
+%endmacro
-%if WIN64
-DECLARE_REG_TMP 6, 5
-%else
-DECLARE_REG_TMP 6, 8
-%endif
-PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
-PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
-PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
-PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
-PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
-PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
-PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
-PUT_8TAP_SCALED_FN sharp, SHARP, SHARP
-PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+%macro REMAP_REG 2 ; dst_idx, src_idx -- redefine the names r<dst_idx>, r<dst_idx>q and r<dst_idx>d to what the r<src_idx> names expand to at this point (xdefine expands eagerly); no other size suffixes are touched
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+%endmacro
-%if required_stack_alignment <= STACK_ALIGNMENT
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0 ; prep only: shift the register names down by one, so rN (N = 1..14) names what r(N-1) named before; presumably this lets the shared body keep put's register numbering although prep has one argument fewer (no ds) -- confirm
+ %if isprep ; expands to nothing for put
+ %xdefine r14_save r14
+ %assign %%i 14
+ %rep 14 ; descending N = 14..1, so r(N-1) still has its original meaning when rN is redefined
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i-1
+ %endrep
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0 ; prep only: undo MCT_8TAP_SCALED_REMAP_REGS_TO_PREV so r1..r14 get their original meaning back
+ %if isprep ; expands to nothing for put
+ %assign %%i 1
+ %rep 13 ; ascending N = 1..13: rN takes the current r(N+1), which still carries the shifted meaning, i.e. the original rN
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i+1
+ %endrep
+ %xdefine r14 r14_save
+ %undef r14_save
+ %endif ; NOTE(review): only the plain r14 name is restored from r14_save; r14q and r14d keep the shifted meaning -- confirm nothing after this point relies on them
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged (optional, default 1) -- emit RET under the default register names; when the argument is nonzero, re-apply the shifted prep mapping afterwards
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT ; RET below is expanded with the unshifted register names (no-op for put)
+ RET
+ %if %1 ; nonzero: restore the shifted names for the code assembled after this return; the visible callers pass 0 only at the final .ret label
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6] -- horizontal pass over two consecutive source rows; result is left in m<dst>, all seven tmp registers are overwritten and srcq is advanced by 2*ssq
+ movq xm%1, [srcq+ r4] ; first row: gather source bytes at the eight column offsets r4, r6, r7, r9, r10, r11, r13 and rX
+ movq xm%2, [srcq+ r6]
+ movhps xm%1, [srcq+ r7]
+ movhps xm%2, [srcq+ r9]
+ vinserti128 m%1, [srcq+r10], 1
+ vinserti128 m%2, [srcq+r11], 1
+ vpbroadcastq m%5, [srcq+r13] ; the r13 / rX columns are broadcast here and blended into the top qword further down
+ vpbroadcastq m%6, [srcq+ rX]
+ add srcq, ssq
+ movq xm%3, [srcq+ r4] ; second row: same gather pattern into the second register pair
+ movq xm%4, [srcq+ r6]
+ movhps xm%3, [srcq+ r7]
+ movhps xm%4, [srcq+ r9]
+ vinserti128 m%3, [srcq+r10], 1
+ vinserti128 m%4, [srcq+r11], 1
+ vpbroadcastq m%7, [srcq+r13]
+ vpbroadcastq m%8, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m%1, m%5, 0xc0 ; mask 0xc0 takes dwords 6-7 (the top qword) from the broadcast register
+ vpblendd m%2, m%6, 0xc0
+ vpblendd m%3, m%7, 0xc0
+ vpblendd m%4, m%8, 0xc0
+ pmaddubsw m%1, m15 ; multiply source bytes by the coefficients in m15 / m10 and add adjacent pairs; both registers are expected to be set up by the caller
+ pmaddubsw m%2, m10
+ pmaddubsw m%3, m15
+ pmaddubsw m%4, m10
+ phaddw m%1, m%2 ; horizontal adds fold the partial sums of both rows into m<dst>
+ phaddw m%3, m%4
+ phaddw m%1, m%3
+ pmulhrsw m%1, m12 ; rounding scale by m12, which MC_8TAP_SCALED loads from pw_8192
+%endmacro
+
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isprep 0
+ %if required_stack_alignment <= STACK_ALIGNMENT
cglobal put_8tap_scaled, 4, 15, 16, 96, dst, ds, src, ss, w, h, mx, my, dx, dy
-%else
+ %else
cglobal put_8tap_scaled, 4, 14, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %endif
+ %xdefine base_reg r12
+ %define rndshift 10
+%else
+ %assign isprep 1
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled, 4, 15, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
+ %xdefine tmp_stridem r14q
+ %else
+cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
+ %define tmp_stridem qword [rsp+104]
+ %endif
+ %xdefine base_reg r11
+ %define rndshift 6
%endif
- lea r12, [put_8tap_scaled_avx2]
-%define base r12-put_8tap_scaled_avx2
+ lea base_reg, [%1_8tap_scaled_avx2]
+%define base base_reg-%1_8tap_scaled_avx2
tzcnt wd, wm
vpbroadcastd m8, dxm
+%if isprep && UNIX64
+ movd xm14, mxd
+ vpbroadcastd m14, xm14
+ mov r5d, t0d
+ DECLARE_REG_TMP 5, 7
+%else
vpbroadcastd m14, mxm
+%endif
mov dyd, dym
-%if WIN64
+%ifidn %1, put
+ %if WIN64
mov r8d, hm
- DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
- %define hm r5m
-%else
- DEFINE_ARGS dst, ds, src, ss, w, h, _, my, _, dy, ss3
- %define hm r6m
-%endif
-%if required_stack_alignment > STACK_ALIGNMENT
- %define dsm [rsp+96]
- %define rX r1
- %define rXd r1d
-%else
- %define dsm dsq
+ DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r5m
+ %define dxm r8m
+ %else
+ DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm r6m
+ %endif
+ %if required_stack_alignment > STACK_ALIGNMENT
+ %define dsm [rsp+96]
+ %define rX r1
+ %define rXd r1d
+ %else
+ %define dsm dsq
+ %define rX r14
+ %define rXd r14d
+ %endif
+%else ; prep
+ %if WIN64
+ mov r7d, hm
+ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r4m
+ %define dxm r7m
+ %else
+ DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm [rsp+96]
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
%define rX r14
%define rXd r14d
%endif
vpbroadcastd m10, [base+pd_0x3ff]
vpbroadcastd m12, [base+pw_8192]
+%ifidn %1, put
vpbroadcastd m13, [base+pd_512]
+%else
+ vpbroadcastd m13, [base+pd_32]
+%endif
pxor m9, m9
lea ss3q, [ssq*3]
movzx r7d, t1b
@@ -3973,9 +4079,10 @@
je .dy1
cmp dyd, 2048
je .dy2
- movzx wd, word [base+put_8tap_scaled_avx2_table+wq*2]
- add wq, r12
+ movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
+ add wq, base_reg
jmp wq
+%ifidn %1, put
.w2:
mov myd, mym
movzx t0d, t0b
@@ -4086,6 +4193,7 @@
punpcklwd xm2, xm1, xm5 ; 45 56
punpckhwd xm4, xm1, xm5 ; 67 __
jmp .w2_loop
+%endif
.w4:
mov myd, mym
vbroadcasti128 m7, [base+rescale_mul]
@@ -4175,11 +4283,16 @@
paddd xm6, xm7
paddd xm4, xm13
paddd xm4, xm6
- psrad xm4, 10
+ psrad xm4, rndshift
packssdw xm4, xm4
+%ifidn %1, put
packuswb xm4, xm4
movd [dstq], xm4
add dstq, dsq
+%else
+ movq [tmpq], xm4
+ add tmpq, 8
+%endif
dec hd
jz .ret
add myd, dyd
@@ -4222,7 +4335,9 @@
lea srcq, [srcq+ssq*2]
jmp .w4_loop
.w8:
+%ifidn %1, put
movifnidn dsm, dsq
+%endif
shr t0d, 16
sub srcq, 3
movd xm15, t0d
@@ -4269,42 +4384,10 @@
pblendvb m15, m11, m5
pblendvb m10, m11, m6
vbroadcasti128 m14, [base+subpel_s_shuf8]
-%macro PUT_8TAP_SCALED_H 8 ; dst, tmp[0-6]
- movq xm%1, [srcq+r4]
- movq xm%2, [srcq+r6]
- movhps xm%1, [srcq+r7]
- movhps xm%2, [srcq+r9]
- vinserti128 m%1, [srcq+r10], 1
- vinserti128 m%2, [srcq+r11], 1
- vpbroadcastq m%5, [srcq+r13]
- vpbroadcastq m%6, [srcq+rX]
- add srcq, ssq
- movq xm%3, [srcq+r4]
- movq xm%4, [srcq+r6]
- movhps xm%3, [srcq+r7]
- movhps xm%4, [srcq+r9]
- vinserti128 m%3, [srcq+r10], 1
- vinserti128 m%4, [srcq+r11], 1
- vpbroadcastq m%7, [srcq+r13]
- vpbroadcastq m%8, [srcq+rX]
- add srcq, ssq
- vpblendd m%1, m%5, 0xc0
- vpblendd m%2, m%6, 0xc0
- vpblendd m%3, m%7, 0xc0
- vpblendd m%4, m%8, 0xc0
- pmaddubsw m%1, m15
- pmaddubsw m%2, m10
- pmaddubsw m%3, m15
- pmaddubsw m%4, m10
- phaddw m%1, m%2
- phaddw m%3, m%4
- phaddw m%1, m%3
- pmulhrsw m%1, m12
-%endmacro
- PUT_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
- PUT_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
- PUT_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
- PUT_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
mov myd, mym
mov dyd, dym
pshufb m0, m14 ; 01a 01b
@@ -4335,12 +4418,17 @@
paddd m6, m7
paddd m4, m13
paddd m4, m6
- psrad m4, 10
- packssdw m4, m4
- vpermq m4, m4, q3120
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
packuswb xm4, xm4
movq [dstq], xm4
add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
dec hd
jz .ret
add myd, dyd
@@ -4354,11 +4442,11 @@
mov r9d, [rsp+12]
jz .w8_skip_line
vpbroadcastq m6, [srcq+r13]
- vpbroadcastq m7, [srcq+rX]
- movq xm4, [srcq+r4]
- movq xm5, [srcq+r6]
- movhps xm4, [srcq+r7]
- movhps xm5, [srcq+r9]
+ vpbroadcastq m7, [srcq+ rX]
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
vinserti128 m4, [srcq+r10], 1
vinserti128 m5, [srcq+r11], 1
add srcq, ssq
@@ -4386,22 +4474,22 @@
mova m1, m2
mova m2, m3
vpbroadcastq m7, [srcq+r13]
- vpbroadcastq m8, [srcq+rX]
- movq xm3, [srcq+r4]
- movq xm4, [srcq+r6]
- movhps xm3, [srcq+r7]
- movhps xm4, [srcq+r9]
+ vpbroadcastq m8, [srcq+ rX]
+ movq xm3, [srcq+ r4]
+ movq xm4, [srcq+ r6]
+ movhps xm3, [srcq+ r7]
+ movhps xm4, [srcq+ r9]
vinserti128 m3, [srcq+r10], 1
vinserti128 m4, [srcq+r11], 1
add srcq, ssq
- movq xm5, [srcq+r4]
- movq xm6, [srcq+r6]
- movhps xm5, [srcq+r7]
- movhps xm6, [srcq+r9]
+ movq xm5, [srcq+ r4]
+ movq xm6, [srcq+ r6]
+ movhps xm5, [srcq+ r7]
+ movhps xm6, [srcq+ r9]
vinserti128 m5, [srcq+r10], 1
vinserti128 m6, [srcq+r11], 1
vpbroadcastq m9, [srcq+r13]
- vpbroadcastq m11, [srcq+rX]
+ vpbroadcastq m11, [srcq+ rX]
add srcq, ssq
mov myd, [rsp+16]
mov dyd, dym
@@ -4423,18 +4511,24 @@
pmulhrsw m3, m12
jmp .w8_loop
.w16:
- mov dword [rsp+48], 1 << 1
+ mov dword [rsp+48], 2
+ movifprep tmp_stridem, 32
jmp .w_start
.w32:
- mov dword [rsp+48], 1 << 3
+ mov dword [rsp+48], 4
+ movifprep tmp_stridem, 64
jmp .w_start
.w64:
- mov dword [rsp+48], 1 << 7
+ mov dword [rsp+48], 8
+ movifprep tmp_stridem, 128
jmp .w_start
.w128:
- mov dword [rsp+48], 1 << 15
+ mov dword [rsp+48], 16
+ movifprep tmp_stridem, 256
.w_start:
+%ifidn %1, put
movifnidn dsm, dsq
+%endif
shr t0d, 16
sub srcq, 3
pmaddwd m8, [base+rescale_mul]
@@ -4441,26 +4535,26 @@
movd xm15, t0d
mov [rsp+72], t0d
mov [rsp+56], srcq
- mov [rsp+64], dstq
+ mov [rsp+64], r0q ; dstq / tmpq
%if UNIX64
mov hm, hd
%endif
- shl dword r8m, 3 ; dx*8
+ shl dword dxm, 3 ; dx*8
vpbroadcastd m15, xm15
paddd m14, m8 ; mx+dx*[0-7]
jmp .hloop
.hloop_prep:
- shr dword [rsp+48], 1
+ dec dword [rsp+48]
jz .ret
- add qword [rsp+64], 8
+ add qword [rsp+64], 8*(isprep+1)
mov hd, hm
- vpbroadcastd m8, r8m
+ vpbroadcastd m8, dxm
vpbroadcastd m10, [base+pd_0x3ff]
paddd m14, m8, [rsp+16]
vpbroadcastd m15, [rsp+72]
pxor m9, m9
mov srcq, [rsp+56]
- mov dstq, [rsp+64]
+ mov r0q, [rsp+64] ; dstq / tmpq
.hloop:
vpbroadcastq m11, [base+pq_0x40000000]
pand m6, m14, m10
@@ -4477,14 +4571,14 @@
pextrd r13d, xm7, 1
pextrd rXd, xm7, 3
movu [rsp+16], m14
- movq xm15, [base+subpel_filters+r4*8]
- movq xm10, [base+subpel_filters+r6*8]
- movhps xm15, [base+subpel_filters+r7*8]
- movhps xm10, [base+subpel_filters+r9*8]
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
vinserti128 m15, [base+subpel_filters+r10*8], 1
vinserti128 m10, [base+subpel_filters+r11*8], 1
vpbroadcastq m9, [base+subpel_filters+r13*8]
- vpbroadcastq m8, [base+subpel_filters+rX*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
psrld m14, 10
vextracti128 xm7, m14, 1
mova [rsp], xm14
@@ -4503,10 +4597,10 @@
pblendvb m15, m11, m5
pblendvb m10, m11, m6
vbroadcasti128 m14, [base+subpel_s_shuf8]
- PUT_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
- PUT_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
- PUT_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
- PUT_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
mov myd, mym
mov dyd, dym
pshufb m0, m14 ; 01a 01b
@@ -4537,12 +4631,17 @@
paddd m6, m7
paddd m4, m13
paddd m4, m6
- psrad m4, 10
- packssdw m4, m4
- vpermq m4, m4, q3120
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
packuswb xm4, xm4
movq [dstq], xm4
add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
dec hd
jz .hloop_prep
add myd, dyd
@@ -4556,11 +4655,11 @@
mov r9d, [rsp+12]
jz .skip_line
vpbroadcastq m6, [srcq+r13]
- vpbroadcastq m7, [srcq+rX]
- movq xm4, [srcq+r4]
- movq xm5, [srcq+r6]
- movhps xm4, [srcq+r7]
- movhps xm5, [srcq+r9]
+ vpbroadcastq m7, [srcq+ rX]
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
vinserti128 m4, [srcq+r10], 1
vinserti128 m5, [srcq+r11], 1
add srcq, ssq
@@ -4588,22 +4687,22 @@
mova m1, m2
mova m2, m3
vpbroadcastq m7, [srcq+r13]
- vpbroadcastq m8, [srcq+rX]
- movq xm3, [srcq+r4]
- movq xm4, [srcq+r6]
- movhps xm3, [srcq+r7]
- movhps xm4, [srcq+r9]
+ vpbroadcastq m8, [srcq+ rX]
+ movq xm3, [srcq+ r4]
+ movq xm4, [srcq+ r6]
+ movhps xm3, [srcq+ r7]
+ movhps xm4, [srcq+ r9]
vinserti128 m3, [srcq+r10], 1
vinserti128 m4, [srcq+r11], 1
add srcq, ssq
- movq xm5, [srcq+r4]
- movq xm6, [srcq+r6]
- movhps xm5, [srcq+r7]
- movhps xm6, [srcq+r9]
+ movq xm5, [srcq+ r4]
+ movq xm6, [srcq+ r6]
+ movhps xm5, [srcq+ r7]
+ movhps xm6, [srcq+ r9]
vinserti128 m5, [srcq+r10], 1
vinserti128 m6, [srcq+r11], 1
vpbroadcastq m9, [srcq+r13]
- vpbroadcastq m11, [srcq+rX]
+ vpbroadcastq m11, [srcq+ rX]
add srcq, ssq
mov myd, [rsp+52]
mov dyd, dym
@@ -4625,9 +4724,10 @@
pmulhrsw m3, m12
jmp .vloop
.dy1:
- movzx wd, word [base+put_8tap_scaled_avx2_dy1_table+wq*2]
- add wq, r12
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
+ add wq, base_reg
jmp wq
+%ifidn %1, put
.dy1_w2:
mov myd, mym
movzx t0d, t0b
@@ -4706,7 +4806,7 @@
mova xm4, xm1
paddd xm5, xm6
paddd xm5, xm7
- psrad xm5, 10
+ psrad xm5, rndshift
packssdw xm5, xm5
packuswb xm5, xm5
pextrw [dstq+dsq*0], xm5, 0
@@ -4715,6 +4815,7 @@
sub hd, 2
jg .dy1_w2_loop
RET
+%endif
.dy1_w4:
mov myd, mym
vbroadcasti128 m7, [base+rescale_mul]
@@ -4810,19 +4911,27 @@
pmaddwd m6, m2, m10
paddd m4, m5
paddd m4, m6
- psrad m4, 10
+ psrad m4, rndshift
vextracti128 xm5, m4, 1
packssdw xm4, xm5
+%ifidn %1, put
packuswb xm4, xm4
pshuflw xm4, xm4, q3120
movd [dstq+dsq*0], xm4
pextrd [dstq+dsq*1], xm4, 1
lea dstq, [dstq+dsq*2]
+%else
+ pshufd xm4, xm4, q3120
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
sub hd, 2
jg .dy1_w4_loop
- RET
+ MC_8TAP_SCALED_RET
.dy1_w8:
+%ifidn %1, put
movifnidn dsm, dsq
+%endif
shr t0d, 16
sub srcq, 3
movd xm15, t0d
@@ -4843,14 +4952,14 @@
pextrd r11d, xm7, 2
pextrd r13d, xm7, 1
pextrd rXd, xm7, 3
- movq xm15, [base+subpel_filters+r4*8]
- movq xm10, [base+subpel_filters+r6*8]
- movhps xm15, [base+subpel_filters+r7*8]
- movhps xm10, [base+subpel_filters+r9*8]
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
vinserti128 m15, [base+subpel_filters+r10*8], 1
vinserti128 m10, [base+subpel_filters+r11*8], 1
vpbroadcastq m9, [base+subpel_filters+r13*8]
- vpbroadcastq m8, [base+subpel_filters+rX*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
psrld m14, 10
vextracti128 xm7, m14, 1
movd r4d, xm14
@@ -4869,10 +4978,10 @@
pblendvb m15, m11, m5
pblendvb m10, m11, m6
vbroadcasti128 m14, [base+subpel_s_shuf8]
- PUT_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
- PUT_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
- PUT_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
- PUT_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
mov myd, mym
movu [rsp], m10
pshufb m0, m14 ; 01a 01b
@@ -4902,22 +5011,27 @@
paddd m6, m7
paddd m4, m13
paddd m4, m6
- psrad m4, 10
- packssdw m4, m4
- vpermq m4, m4, q3120
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
packuswb xm4, xm4
movq [dstq], xm4
add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
dec hd
jz .ret
- movq xm4, [srcq+r4]
- movq xm5, [srcq+r6]
- movhps xm4, [srcq+r7]
- movhps xm5, [srcq+r9]
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
vinserti128 m4, [srcq+r10], 1
vinserti128 m5, [srcq+r11], 1
vpbroadcastq m6, [srcq+r13]
- vpbroadcastq m7, [srcq+rX]
+ vpbroadcastq m7, [srcq+ rX]
add srcq, ssq
pshufb m0, m14
pshufb m1, m14
@@ -4937,18 +5051,24 @@
pblendw m3, m4, 0xaa
jmp .dy1_w8_loop
.dy1_w16:
- mov dword [rsp+72], 1 << 1
+ mov dword [rsp+72], 2
+ movifprep tmp_stridem, 32
jmp .dy1_w_start
.dy1_w32:
- mov dword [rsp+72], 1 << 3
+ mov dword [rsp+72], 4
+ movifprep tmp_stridem, 64
jmp .dy1_w_start
.dy1_w64:
- mov dword [rsp+72], 1 << 7
+ mov dword [rsp+72], 8
+ movifprep tmp_stridem, 128
jmp .dy1_w_start
.dy1_w128:
- mov dword [rsp+72], 1 << 15
+ mov dword [rsp+72], 16
+ movifprep tmp_stridem, 256
.dy1_w_start:
+%ifidn %1, put
movifnidn dsm, dsq
+%endif
shr t0d, 16
sub srcq, 3
pmaddwd m8, [base+rescale_mul]
@@ -4955,26 +5075,26 @@
movd xm15, t0d
mov [rsp+76], t0d
mov [rsp+80], srcq
- mov [rsp+88], dstq
+ mov [rsp+88], r0q ; dstq / tmpq
%if UNIX64
mov hm, hd
%endif
- shl dword r8m, 3 ; dx*8
+ shl dword dxm, 3 ; dx*8
vpbroadcastd m15, xm15
paddd m14, m8 ; mx+dx*[0-7]
jmp .dy1_hloop
.dy1_hloop_prep:
- shr dword [rsp+72], 1
+ dec dword [rsp+72]
jz .ret
- add qword [rsp+88], 8
+ add qword [rsp+88], 8*(isprep+1)
mov hd, hm
- vpbroadcastd m8, r8m
+ vpbroadcastd m8, dxm
vpbroadcastd m10, [base+pd_0x3ff]
paddd m14, m8, [rsp+32]
vpbroadcastd m15, [rsp+76]
pxor m9, m9
mov srcq, [rsp+80]
- mov dstq, [rsp+88]
+ mov r0q, [rsp+88] ; dstq / tmpq
.dy1_hloop:
vpbroadcastq m11, [base+pq_0x40000000]
pand m6, m14, m10
@@ -4991,14 +5111,14 @@
pextrd r13d, xm7, 1
pextrd rXd, xm7, 3
movu [rsp+32], m14
- movq xm15, [base+subpel_filters+r4*8]
- movq xm10, [base+subpel_filters+r6*8]
- movhps xm15, [base+subpel_filters+r7*8]
- movhps xm10, [base+subpel_filters+r9*8]
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
vinserti128 m15, [base+subpel_filters+r10*8], 1
vinserti128 m10, [base+subpel_filters+r11*8], 1
vpbroadcastq m9, [base+subpel_filters+r13*8]
- vpbroadcastq m8, [base+subpel_filters+rX*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
psrld m14, 10
vextracti128 xm7, m14, 1
movq [rsp+64], xm14
@@ -5017,10 +5137,10 @@
pblendvb m15, m11, m5
pblendvb m10, m11, m6
vbroadcasti128 m14, [base+subpel_s_shuf8]
- PUT_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
- PUT_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
- PUT_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
- PUT_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
mov myd, mym
movu [rsp], m10
pshufb m0, m14 ; 01a 01b
@@ -5051,22 +5171,27 @@
paddd m6, m7
paddd m4, m13
paddd m4, m6
- psrad m4, 10
- packssdw m4, m4
- vpermq m4, m4, q3120
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
packuswb xm4, xm4
movq [dstq], xm4
add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
dec hd
jz .dy1_hloop_prep
- movq xm4, [srcq+r4]
- movq xm5, [srcq+r6]
- movhps xm4, [srcq+r7]
- movhps xm5, [srcq+r9]
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
vinserti128 m4, [srcq+r10], 1
vinserti128 m5, [srcq+r11], 1
vpbroadcastq m6, [srcq+r13]
- vpbroadcastq m7, [srcq+rX]
+ vpbroadcastq m7, [srcq+ rX]
add srcq, ssq
pshufb m0, m14
pshufb m1, m14
@@ -5086,9 +5211,10 @@
pblendw m3, m4, 0xaa
jmp .dy1_vloop
.dy2:
- movzx wd, word [base+put_8tap_scaled_avx2_dy2_table+wq*2]
- add wq, r12
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
+ add wq, base_reg
jmp wq
+%ifidn %1, put
.dy2_w2:
mov myd, mym
movzx t0d, t0b
@@ -5170,7 +5296,7 @@
paddd xm4, xm13
paddd xm6, xm7
paddd xm4, xm6
- psrad xm4, 10
+ psrad xm4, rndshift
packssdw xm4, xm4
packuswb xm4, xm4
pextrw [dstq+dsq*0], xm4, 0
@@ -5179,6 +5305,7 @@
sub hd, 2
jg .dy2_w2_loop
RET
+%endif
.dy2_w4:
mov myd, mym
vbroadcasti128 m7, [base+rescale_mul]
@@ -5270,18 +5397,25 @@
paddd m4, m13
paddd m6, m7
paddd m4, m6
- psrad m4, 10
+ psrad m4, rndshift
vextracti128 xm5, m4, 1
packssdw xm4, xm5
+%ifidn %1, put
packuswb xm4, xm4
movd [dstq+dsq*0], xm4
pextrd [dstq+dsq*1], xm4, 1
lea dstq, [dstq+dsq*2]
+%else
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
sub hd, 2
jg .dy2_w4_loop
- RET
+ MC_8TAP_SCALED_RET
.dy2_w8:
+%ifidn %1, put
movifnidn dsm, dsq
+%endif
shr t0d, 16
sub srcq, 3
movd xm15, t0d
@@ -5302,14 +5436,14 @@
pextrd r11d, xm7, 2
pextrd r13d, xm7, 1
pextrd rXd, xm7, 3
- movq xm15, [base+subpel_filters+r4*8]
- movq xm10, [base+subpel_filters+r6*8]
- movhps xm15, [base+subpel_filters+r7*8]
- movhps xm10, [base+subpel_filters+r9*8]
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
vinserti128 m15, [base+subpel_filters+r10*8], 1
vinserti128 m10, [base+subpel_filters+r11*8], 1
vpbroadcastq m9, [base+subpel_filters+r13*8]
- vpbroadcastq m8, [base+subpel_filters+rX*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
psrld m14, 10
vextracti128 xm7, m14, 1
movd r4d, xm14
@@ -5328,10 +5462,10 @@
pblendvb m15, m11, m5
pblendvb m10, m11, m6
vbroadcasti128 m14, [base+subpel_s_shuf8]
- PUT_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
- PUT_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
- PUT_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
- PUT_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
mov myd, mym
pshufb m0, m14 ; 01a 01b
pshufb m1, m14 ; 23a 23b
@@ -5359,25 +5493,30 @@
paddd m6, m7
paddd m4, m13
paddd m4, m6
- psrad m4, 10
- packssdw m4, m4
- vpermq m4, m4, q3120
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
packuswb xm4, xm4
movq [dstq], xm4
add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
dec hd
jz .ret
mova m0, m1
mova m1, m2
mova m2, m3
- movq xm3, [srcq+r4]
- movq xm4, [srcq+r6]
- movhps xm3, [srcq+r7]
- movhps xm4, [srcq+r9]
+ movq xm3, [srcq+ r4]
+ movq xm4, [srcq+ r6]
+ movhps xm3, [srcq+ r7]
+ movhps xm4, [srcq+ r9]
vinserti128 m3, [srcq+r10], 1
vinserti128 m4, [srcq+r11], 1
vpbroadcastq m5, [srcq+r13]
- vpbroadcastq m6, [srcq+rX]
+ vpbroadcastq m6, [srcq+ rX]
add srcq, ssq
vpblendd m3, m5, 0xc0
vpblendd m4, m6, 0xc0
@@ -5384,14 +5523,14 @@
pmaddubsw m3, m15
pmaddubsw m4, m10
phaddw m3, m4
- movq xm4, [srcq+r4]
- movq xm5, [srcq+r6]
- movhps xm4, [srcq+r7]
- movhps xm5, [srcq+r9]
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
vinserti128 m4, [srcq+r10], 1
vinserti128 m5, [srcq+r11], 1
vpbroadcastq m6, [srcq+r13]
- vpbroadcastq m7, [srcq+rX]
+ vpbroadcastq m7, [srcq+ rX]
add srcq, ssq
vpblendd m4, m6, 0xc0
vpblendd m5, m7, 0xc0
@@ -5406,18 +5545,24 @@
pmulhrsw m3, m12
jmp .dy2_w8_loop
.dy2_w16:
- mov dword [rsp+40], 1 << 1
+ mov dword [rsp+40], 2
+ movifprep tmp_stridem, 32
jmp .dy2_w_start
.dy2_w32:
- mov dword [rsp+40], 1 << 3
+ mov dword [rsp+40], 4
+ movifprep tmp_stridem, 64
jmp .dy2_w_start
.dy2_w64:
- mov dword [rsp+40], 1 << 7
+ mov dword [rsp+40], 8
+ movifprep tmp_stridem, 128
jmp .dy2_w_start
.dy2_w128:
- mov dword [rsp+40], 1 << 15
+ mov dword [rsp+40], 16
+ movifprep tmp_stridem, 256
.dy2_w_start:
+%ifidn %1, put
movifnidn dsm, dsq
+%endif
shr t0d, 16
sub srcq, 3
pmaddwd m8, [base+rescale_mul]
@@ -5424,26 +5569,26 @@
movd xm15, t0d
mov [rsp+64], t0d
mov [rsp+48], srcq
- mov [rsp+56], dstq
+ mov [rsp+56], r0q ; dstq / tmpq
%if UNIX64
- mov r6m, hd
+ mov hm, hd
%endif
- shl dword r8m, 3 ; dx*8
+ shl dword dxm, 3 ; dx*8
vpbroadcastd m15, xm15
paddd m14, m8 ; mx+dx*[0-7]
jmp .dy2_hloop
.dy2_hloop_prep:
- shr dword [rsp+40], 1
+ dec dword [rsp+40]
jz .ret
- add qword [rsp+56], 8
+ add qword [rsp+56], 8*(isprep+1)
mov hd, hm
- vpbroadcastd m8, r8m
+ vpbroadcastd m8, dxm
vpbroadcastd m10, [base+pd_0x3ff]
paddd m14, m8, [rsp]
vpbroadcastd m15, [rsp+64]
pxor m9, m9
mov srcq, [rsp+48]
- mov dstq, [rsp+56]
+ mov r0q, [rsp+56] ; dstq / tmpq
.dy2_hloop:
vpbroadcastq m11, [base+pq_0x40000000]
pand m6, m14, m10
@@ -5460,14 +5605,14 @@
pextrd r13d, xm7, 1
pextrd rXd, xm7, 3
movu [rsp], m14
- movq xm15, [base+subpel_filters+r4*8]
- movq xm10, [base+subpel_filters+r6*8]
- movhps xm15, [base+subpel_filters+r7*8]
- movhps xm10, [base+subpel_filters+r9*8]
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
vinserti128 m15, [base+subpel_filters+r10*8], 1
vinserti128 m10, [base+subpel_filters+r11*8], 1
vpbroadcastq m9, [base+subpel_filters+r13*8]
- vpbroadcastq m8, [base+subpel_filters+rX*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
psrld m14, 10
vextracti128 xm7, m14, 1
movq [rsp+32], xm14
@@ -5486,10 +5631,10 @@
pblendvb m15, m11, m5
pblendvb m10, m11, m6
vbroadcasti128 m14, [base+subpel_s_shuf8]
- PUT_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
- PUT_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
- PUT_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
- PUT_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
mov myd, mym
pshufb m0, m14 ; 01a 01b
pshufb m1, m14 ; 23a 23b
@@ -5518,25 +5663,30 @@
paddd m6, m7
paddd m4, m13
paddd m4, m6
- psrad m4, 10
- packssdw m4, m4
- vpermq m4, m4, q3120
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
packuswb xm4, xm4
movq [dstq], xm4
add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
dec hd
jz .dy2_hloop_prep
mova m0, m1
mova m1, m2
mova m2, m3
- movq xm3, [srcq+r4]
- movq xm4, [srcq+r6]
- movhps xm3, [srcq+r7]
- movhps xm4, [srcq+r9]
+ movq xm3, [srcq+ r4]
+ movq xm4, [srcq+ r6]
+ movhps xm3, [srcq+ r7]
+ movhps xm4, [srcq+ r9]
vinserti128 m3, [srcq+r10], 1
vinserti128 m4, [srcq+r11], 1
vpbroadcastq m5, [srcq+r13]
- vpbroadcastq m6, [srcq+rX]
+ vpbroadcastq m6, [srcq+ rX]
add srcq, ssq
vpblendd m3, m5, 0xc0
vpblendd m4, m6, 0xc0
@@ -5543,14 +5693,14 @@
pmaddubsw m3, m15
pmaddubsw m4, m10
phaddw m3, m4
- movq xm4, [srcq+r4]
- movq xm5, [srcq+r6]
- movhps xm4, [srcq+r7]
- movhps xm5, [srcq+r9]
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
vinserti128 m4, [srcq+r10], 1
vinserti128 m5, [srcq+r11], 1
vpbroadcastq m6, [srcq+r13]
- vpbroadcastq m7, [srcq+rX]
+ vpbroadcastq m7, [srcq+ rX]
add srcq, ssq
vpblendd m4, m6, 0xc0
vpblendd m5, m7, 0xc0
@@ -5565,7 +5715,43 @@
pmulhrsw m3, m12
jmp .dy2_vloop
.ret:
- RET
+ MC_8TAP_SCALED_RET 0
+%undef isprep
+%endmacro
+
+%if WIN64 ; per-ABI register numbers passed to DECLARE_REG_TMP for the put variant; presumably these back the t0/t1 names used in the body -- confirm against x86inc
+DECLARE_REG_TMP 6, 5
+%else
+DECLARE_REG_TMP 6, 8
+%endif
+%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR ; one FN invocation per filter-type pair; FN itself is defined outside the lines shown here
+PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_SCALED_FN sharp, SHARP, SHARP
+PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+MC_8TAP_SCALED put ; expand the shared body once as put_8tap_scaled (isprep = 0, rndshift = 10)
+
+%if WIN64 ; per-ABI register numbers passed to DECLARE_REG_TMP for the prep variant; presumably these back the t0/t1 names used in the body -- confirm against x86inc
+DECLARE_REG_TMP 5, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR ; one FN invocation per filter-type pair; FN itself is defined outside the lines shown here
+PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_SCALED_FN sharp, SHARP, SHARP
+PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+MC_8TAP_SCALED prep ; expand the shared body once as prep_8tap_scaled (isprep = 1, rndshift = 6)
%macro WARP_V 5 ; dst, 02, 46, 13, 57
; Can be done using gathers, but that's terribly slow on many CPU:s
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -49,16 +49,6 @@
decl_mc_fn(dav1d_put_bilin_avx2);
decl_mc_fn(dav1d_put_bilin_ssse3);
-decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_avx2);
-decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_avx2);
-decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_avx2);
-decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_avx2);
-decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_avx2);
-decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_avx2);
-decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_avx2);
-decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_avx2);
-decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_avx2);
-
decl_mct_fn(dav1d_prep_8tap_regular_avx512icl);
decl_mct_fn(dav1d_prep_8tap_regular_avx2);
decl_mct_fn(dav1d_prep_8tap_regular_ssse3);
@@ -100,6 +90,26 @@
decl_mct_fn(dav1d_prep_bilin_ssse3);
decl_mct_fn(dav1d_prep_bilin_sse2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_avx2);
+
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_avx2);
+
decl_avg_fn(dav1d_avg_avx512icl);
decl_avg_fn(dav1d_avg_avx2);
decl_avg_fn(dav1d_avg_ssse3);
@@ -145,6 +155,8 @@
c->mct[type] = dav1d_prep_##name##_##suffix
#define init_mc_scaled_fn(type, name, suffix) \
c->mc_scaled[type] = dav1d_put_##name##_##suffix
+#define init_mct_scaled_fn(type, name, suffix) \
+ c->mct_scaled[type] = dav1d_prep_##name##_##suffix
const unsigned flags = dav1d_get_cpu_flags();
@@ -232,16 +244,6 @@
init_mc_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
init_mc_fn(FILTER_2D_BILINEAR, bilin, avx2);
- init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
- init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
- init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
- init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
- init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2);
- init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2);
- init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
- init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
- init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
-
init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, avx2);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, avx2);
@@ -252,6 +254,26 @@
init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, avx2);
init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, avx2);
init_mct_fn(FILTER_2D_BILINEAR, bilin, avx2);
+
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
+ init_mc_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
+
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR, 8tap_scaled_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_scaled_regular_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH, 8tap_scaled_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_scaled_smooth_sharp, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_scaled_sharp_regular, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_scaled_sharp_smooth, avx2);
+ init_mct_scaled_fn(FILTER_2D_8TAP_SHARP, 8tap_scaled_sharp, avx2);
c->avg = dav1d_avg_avx2;
c->w_avg = dav1d_w_avg_avx2;
--- a/tests/checkasm/mc.c
+++ b/tests/checkasm/mc.c
@@ -107,6 +107,61 @@
report("mc");
}
+/* Generate worst case input in the topleft corner, randomize the rest */
+static void generate_mct_input(pixel *const buf, const int bitdepth_max) {
+ static const int8_t pattern[8] = { -1, 0, -1, 0, 0, -1, 0, -1 };
+ const int sign = -(rnd() & 1);
+
+ for (int y = 0; y < 135; y++)
+ for (int x = 0; x < 135; x++)
+ buf[135*y+x] = ((x | y) < 8 ? (pattern[x] ^ pattern[y] ^ sign)
+ : rnd()) & bitdepth_max;
+}
+
+static void check_mct(Dav1dMCDSPContext *const c) {
+ ALIGN_STK_64(pixel, src_buf, 135 * 135,);
+ ALIGN_STK_64(int16_t, c_tmp, 128 * 128,);
+ ALIGN_STK_64(int16_t, a_tmp, 128 * 128,);
+ const pixel *src = src_buf + 135 * 3 + 3;
+ const ptrdiff_t src_stride = 135 * sizeof(pixel);
+
+ declare_func(void, int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
+ int w, int h, int mx, int my HIGHBD_DECL_SUFFIX);
+
+ for (int filter = 0; filter < N_2D_FILTERS; filter++)
+ for (int w = 4; w <= 128; w <<= 1)
+ for (int mxy = 0; mxy < 4; mxy++)
+ if (check_func(c->mct[filter], "mct_%s_w%d_%s_%dbpc",
+ filter_names[filter], w, mxy_names[mxy], BITDEPTH))
+ for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
+ {
+ const int mx = (mxy & 1) ? rnd() % 15 + 1 : 0;
+ const int my = (mxy & 2) ? rnd() % 15 + 1 : 0;
+#if BITDEPTH == 16
+ const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+ const int bitdepth_max = 0xff;
+#endif
+ generate_mct_input(src_buf, bitdepth_max);
+
+ call_ref(c_tmp, src, src_stride, w, h,
+ mx, my HIGHBD_TAIL_SUFFIX);
+ call_new(a_tmp, src, src_stride, w, h,
+ mx, my HIGHBD_TAIL_SUFFIX);
+ checkasm_check(int16_t, c_tmp, w * sizeof(*c_tmp),
+ a_tmp, w * sizeof(*a_tmp),
+ w, h, "tmp");
+
+ if (filter == FILTER_2D_8TAP_REGULAR ||
+ filter == FILTER_2D_BILINEAR)
+ {
+ bench_new(a_tmp, src, src_stride, w, h,
+ mx, my HIGHBD_TAIL_SUFFIX);
+ }
+ }
+ report("mct");
+}
+
static void check_mc_scaled(Dav1dMCDSPContext *const c) {
ALIGN_STK_64(pixel, src_buf, 263 * 263,);
ALIGN_STK_64(pixel, c_dst, 128 * 128,);
@@ -161,47 +216,44 @@
report("mc_scaled");
}
-/* Generate worst case input in the topleft corner, randomize the rest */
-static void generate_mct_input(pixel *const buf, const int bitdepth_max) {
- static const int8_t pattern[8] = { -1, 0, -1, 0, 0, -1, 0, -1 };
- const int sign = -(rnd() & 1);
+static void check_mct_scaled(Dav1dMCDSPContext *const c) {
+ ALIGN_STK_64(pixel, src_buf, 263 * 263,);
+ ALIGN_STK_64(int16_t, c_tmp, 128 * 128,);
+ ALIGN_STK_64(int16_t, a_tmp, 128 * 128,);
+ const pixel *src = src_buf + 263 * 3 + 3;
+ const ptrdiff_t src_stride = 263 * sizeof(pixel);
+#if BITDEPTH == 16
+ const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+ const int bitdepth_max = 0xff;
+#endif
- for (int y = 0; y < 135; y++)
- for (int x = 0; x < 135; x++)
- buf[135*y+x] = ((x | y) < 8 ? (pattern[x] ^ pattern[y] ^ sign)
- : rnd()) & bitdepth_max;
-}
-
-static void check_mct(Dav1dMCDSPContext *const c) {
- ALIGN_STK_64(pixel, src_buf, 135 * 135,);
- ALIGN_STK_64(int16_t, c_tmp, 128 * 128,);
- ALIGN_STK_64(int16_t, a_tmp, 128 * 128,);
- const pixel *src = src_buf + 135 * 3 + 3;
- const ptrdiff_t src_stride = 135 * sizeof(pixel);
-
declare_func(void, int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
- int w, int h, int mx, int my HIGHBD_DECL_SUFFIX);
+ int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX);
for (int filter = 0; filter < N_2D_FILTERS; filter++)
for (int w = 4; w <= 128; w <<= 1)
- for (int mxy = 0; mxy < 4; mxy++)
- if (check_func(c->mct[filter], "mct_%s_w%d_%s_%dbpc",
- filter_names[filter], w, mxy_names[mxy], BITDEPTH))
- for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
- {
- const int mx = (mxy & 1) ? rnd() % 15 + 1 : 0;
- const int my = (mxy & 2) ? rnd() % 15 + 1 : 0;
-#if BITDEPTH == 16
- const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
-#else
- const int bitdepth_max = 0xff;
-#endif
- generate_mct_input(src_buf, bitdepth_max);
+ for (int p = 0; p < 3; ++p) {
+ if (check_func(c->mct_scaled[filter], "mct_scaled_%s_w%d%s_%dbpc",
+ filter_names[filter], w, scaled_paths[p], BITDEPTH))
+ {
+ const int h_min = imax(w / 4, 4);
+ const int h_max = imin(w * 4, 128);
+ for (int h = h_min; h <= h_max; h = mc_h_next(h)) {
+ const int mx = rnd() % 1024;
+ const int my = rnd() % 1024;
+ const int dx = rnd() % 2048 + 1;
+ const int dy = !p
+ ? rnd() % 2048 + 1
+ : p << 10; // ystep=1.0 and ystep=2.0 paths
- call_ref(c_tmp, src, src_stride, w, h,
- mx, my HIGHBD_TAIL_SUFFIX);
- call_new(a_tmp, src, src_stride, w, h,
- mx, my HIGHBD_TAIL_SUFFIX);
+ for (int k = 0; k < 263 * 263; k++)
+ src_buf[k] = rnd() & bitdepth_max;
+
+ call_ref(c_tmp, src, src_stride,
+ w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
+ call_new(a_tmp, src, src_stride,
+ w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
checkasm_check(int16_t, c_tmp, w * sizeof(*c_tmp),
a_tmp, w * sizeof(*a_tmp),
w, h, "tmp");
@@ -208,13 +260,13 @@
if (filter == FILTER_2D_8TAP_REGULAR ||
filter == FILTER_2D_BILINEAR)
- {
- bench_new(a_tmp, src, src_stride, w, h,
- mx, my HIGHBD_TAIL_SUFFIX);
- }
+ bench_new(a_tmp, src, src_stride,
+ w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
}
- report("mct");
-}
+ }
+ }
+ report("mct_scaled");
+}
static void init_tmp(Dav1dMCDSPContext *const c, pixel *const buf,
int16_t (*const tmp)[128 * 128], const int bitdepth_max)
@@ -687,8 +739,9 @@
bitfn(dav1d_mc_dsp_init)(&c);
check_mc(&c);
- check_mc_scaled(&c);
check_mct(&c);
+ check_mc_scaled(&c);
+ check_mct_scaled(&c);
check_avg(&c);
check_w_avg(&c);
check_mask(&c);