shithub: dav1d

Download patch

ref: ea74e3d513206fcdda4316f3f1303df47b890d48
parent: 07261e8c38ab709f86a279df996e9e0ec0e9c508
author: Victorien Le Couviour--Tuffet <victorien@videolan.org>
date: Thu Jun 11 09:11:55 EDT 2020

x86: Add prep_8tap_scaled AVX2 asm

mct_scaled_8tap_regular_w4_8bpc_c: 872.1
mct_scaled_8tap_regular_w4_8bpc_avx2: 125.6
mct_scaled_8tap_regular_w4_dy1_8bpc_c: 886.3
mct_scaled_8tap_regular_w4_dy1_8bpc_avx2: 84.0
mct_scaled_8tap_regular_w4_dy2_8bpc_c: 1189.1
mct_scaled_8tap_regular_w4_dy2_8bpc_avx2: 84.7

mct_scaled_8tap_regular_w8_8bpc_c: 2261.0
mct_scaled_8tap_regular_w8_8bpc_avx2: 306.2
mct_scaled_8tap_regular_w8_dy1_8bpc_c: 2189.9
mct_scaled_8tap_regular_w8_dy1_8bpc_avx2: 233.8
mct_scaled_8tap_regular_w8_dy2_8bpc_c: 3060.3
mct_scaled_8tap_regular_w8_dy2_8bpc_avx2: 282.8

mct_scaled_8tap_regular_w16_8bpc_c: 4335.3
mct_scaled_8tap_regular_w16_8bpc_avx2: 680.7
mct_scaled_8tap_regular_w16_dy1_8bpc_c: 5137.2
mct_scaled_8tap_regular_w16_dy1_8bpc_avx2: 578.6
mct_scaled_8tap_regular_w16_dy2_8bpc_c: 7878.4
mct_scaled_8tap_regular_w16_dy2_8bpc_avx2: 774.6

mct_scaled_8tap_regular_w32_8bpc_c: 17871.9
mct_scaled_8tap_regular_w32_8bpc_avx2: 2954.8
mct_scaled_8tap_regular_w32_dy1_8bpc_c: 18594.7
mct_scaled_8tap_regular_w32_dy1_8bpc_avx2: 2073.9
mct_scaled_8tap_regular_w32_dy2_8bpc_c: 28696.0
mct_scaled_8tap_regular_w32_dy2_8bpc_avx2: 2852.1

mct_scaled_8tap_regular_w64_8bpc_c: 46967.5
mct_scaled_8tap_regular_w64_8bpc_avx2: 7527.5
mct_scaled_8tap_regular_w64_dy1_8bpc_c: 45564.2
mct_scaled_8tap_regular_w64_dy1_8bpc_avx2: 5262.9
mct_scaled_8tap_regular_w64_dy2_8bpc_c: 72793.3
mct_scaled_8tap_regular_w64_dy2_8bpc_avx2: 7535.9

mct_scaled_8tap_regular_w128_8bpc_c: 111190.8
mct_scaled_8tap_regular_w128_8bpc_avx2: 19386.8
mct_scaled_8tap_regular_w128_dy1_8bpc_c: 122625.0
mct_scaled_8tap_regular_w128_dy1_8bpc_avx2: 15376.1
mct_scaled_8tap_regular_w128_dy2_8bpc_c: 197120.6
mct_scaled_8tap_regular_w128_dy2_8bpc_avx2: 21871.0

--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -244,7 +244,7 @@
 %endmacro
 
 %macro SCALED_JMP_TABLE 1-*
-    %xdefine %1_table (%%table - 2)
+    %xdefine %1_table (%%table - %2)
     %xdefine %%base mangle(private_prefix %+ _%1)
 %%table:
     %rep %0 - 1
@@ -253,7 +253,7 @@
     %endrep
     %rotate 1
 %%dy_1024:
-    %xdefine %1_dy1_table (%%dy_1024 - 2)
+    %xdefine %1_dy1_table (%%dy_1024 - %2)
     %rep %0 - 1
         dw %%base %+ .dy1_w%2 - %%base
         %rotate 1
@@ -260,7 +260,7 @@
     %endrep
     %rotate 1
 %%dy_2048:
-    %xdefine %1_dy2_table (%%dy_2048 - 2)
+    %xdefine %1_dy2_table (%%dy_2048 - %2)
     %rep %0 - 1
         dw %%base %+ .dy2_w%2 - %%base
         %rotate 1
@@ -280,6 +280,7 @@
 HV_JMP_TABLE     put,  8tap,  avx2, 3, 2, 4, 8, 16, 32, 64, 128
 HV_JMP_TABLE     prep, 8tap,  avx2, 1,    4, 8, 16, 32, 64, 128
 SCALED_JMP_TABLE put_8tap_scaled_avx2, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled_avx2,   4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE  avg_avx2,                4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE  w_avg_avx2,              4, 8, 16, 32, 64, 128
 BIDIR_JMP_TABLE  mask_avx2,               4, 8, 16, 32, 64, 128
@@ -3914,54 +3915,159 @@
     RET
 %endmacro
 
-%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+%macro movifprep 2
+ %if isprep
+    mov %1, %2
+ %endif
+%endmacro
 
-%if WIN64
-DECLARE_REG_TMP 6, 5
-%else
-DECLARE_REG_TMP 6, 8
-%endif
-PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
-PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP
-PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
-PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR
-PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH
-PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP
-PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR
-PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP
-PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH
+%macro REMAP_REG 2
+ %xdefine r%1  r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+%endmacro
 
-%if required_stack_alignment <= STACK_ALIGNMENT
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+  %xdefine r14_save r14
+  %assign %%i 14
+  %rep 14
+   %assign %%j %%i-1
+   REMAP_REG %%i, %%j
+   %assign %%i %%i-1
+  %endrep
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+  %assign %%i 1
+  %rep 13
+   %assign %%j %%i+1
+   REMAP_REG %%i, %%j
+   %assign %%i %%i+1
+  %endrep
+  %xdefine r14 r14_save
+  %undef r14_save
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+    RET
+ %if %1
+    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
+    movq               xm%1, [srcq+ r4]
+    movq               xm%2, [srcq+ r6]
+    movhps             xm%1, [srcq+ r7]
+    movhps             xm%2, [srcq+ r9]
+    vinserti128         m%1, [srcq+r10], 1
+    vinserti128         m%2, [srcq+r11], 1
+    vpbroadcastq        m%5, [srcq+r13]
+    vpbroadcastq        m%6, [srcq+ rX]
+    add                srcq, ssq
+    movq               xm%3, [srcq+ r4]
+    movq               xm%4, [srcq+ r6]
+    movhps             xm%3, [srcq+ r7]
+    movhps             xm%4, [srcq+ r9]
+    vinserti128         m%3, [srcq+r10], 1
+    vinserti128         m%4, [srcq+r11], 1
+    vpbroadcastq        m%7, [srcq+r13]
+    vpbroadcastq        m%8, [srcq+ rX]
+    add                srcq, ssq
+    vpblendd            m%1, m%5, 0xc0
+    vpblendd            m%2, m%6, 0xc0
+    vpblendd            m%3, m%7, 0xc0
+    vpblendd            m%4, m%8, 0xc0
+    pmaddubsw           m%1, m15
+    pmaddubsw           m%2, m10
+    pmaddubsw           m%3, m15
+    pmaddubsw           m%4, m10
+    phaddw              m%1, m%2
+    phaddw              m%3, m%4
+    phaddw              m%1, m%3
+    pmulhrsw            m%1, m12
+%endmacro
+
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isprep 0
+ %if required_stack_alignment <= STACK_ALIGNMENT
 cglobal put_8tap_scaled, 4, 15, 16, 96, dst, ds, src, ss, w, h, mx, my, dx, dy
-%else
+ %else
 cglobal put_8tap_scaled, 4, 14, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %endif
+ %xdefine base_reg r12
+ %define rndshift 10
+%else
+ %assign isprep 1
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled, 4, 15, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
+  %xdefine tmp_stridem r14q
+ %else
+cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
+  %define tmp_stridem qword [rsp+104]
+ %endif
+ %xdefine base_reg r11
+ %define rndshift 6
 %endif
-    lea                 r12, [put_8tap_scaled_avx2]
-%define base r12-put_8tap_scaled_avx2
+    lea            base_reg, [%1_8tap_scaled_avx2]
+%define base base_reg-%1_8tap_scaled_avx2
     tzcnt                wd, wm
     vpbroadcastd         m8, dxm
+%if isprep && UNIX64
+    movd               xm14, mxd
+    vpbroadcastd        m14, xm14
+    mov                 r5d, t0d
+ DECLARE_REG_TMP 5, 7
+%else
     vpbroadcastd        m14, mxm
+%endif
     mov                 dyd, dym
-%if WIN64
+%ifidn %1, put
+ %if WIN64
     mov                 r8d, hm
- DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
- %define hm r5m
-%else
- DEFINE_ARGS dst, ds, src, ss, w, h, _, my, _, dy, ss3
- %define hm r6m
-%endif
-%if required_stack_alignment > STACK_ALIGNMENT
- %define dsm [rsp+96]
- %define rX r1
- %define rXd r1d
-%else
- %define dsm dsq
+  DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+  %define hm r5m
+  %define dxm r8m
+ %else
+  DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+  %define hm r6m
+ %endif
+ %if required_stack_alignment > STACK_ALIGNMENT
+  %define dsm [rsp+96]
+  %define rX r1
+  %define rXd r1d
+ %else
+  %define dsm dsq
+  %define rX r14
+  %define rXd r14d
+ %endif
+%else ; prep
+ %if WIN64
+    mov                 r7d, hm
+  DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+  %define hm r4m
+  %define dxm r7m
+ %else
+  DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+  %define hm [rsp+96]
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
  %define rX r14
  %define rXd r14d
 %endif
     vpbroadcastd        m10, [base+pd_0x3ff]
     vpbroadcastd        m12, [base+pw_8192]
+%ifidn %1, put
     vpbroadcastd        m13, [base+pd_512]
+%else
+    vpbroadcastd        m13, [base+pd_32]
+%endif
     pxor                 m9, m9
     lea                ss3q, [ssq*3]
     movzx               r7d, t1b
@@ -3973,9 +4079,10 @@
     je .dy1
     cmp                 dyd, 2048
     je .dy2
-    movzx                wd, word [base+put_8tap_scaled_avx2_table+wq*2]
-    add                  wq, r12
+    movzx                wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
+    add                  wq, base_reg
     jmp                  wq
+%ifidn %1, put
 .w2:
     mov                 myd, mym
     movzx               t0d, t0b
@@ -4086,6 +4193,7 @@
     punpcklwd           xm2, xm1, xm5       ; 45 56
     punpckhwd           xm4, xm1, xm5       ; 67 __
     jmp .w2_loop
+%endif
 .w4:
     mov                 myd, mym
     vbroadcasti128       m7, [base+rescale_mul]
@@ -4175,11 +4283,16 @@
     paddd               xm6, xm7
     paddd               xm4, xm13
     paddd               xm4, xm6
-    psrad               xm4, 10
+    psrad               xm4, rndshift
     packssdw            xm4, xm4
+%ifidn %1, put
     packuswb            xm4, xm4
     movd             [dstq], xm4
     add                dstq, dsq
+%else
+    movq             [tmpq], xm4
+    add                tmpq, 8
+%endif
     dec                  hd
     jz .ret
     add                 myd, dyd
@@ -4222,7 +4335,9 @@
     lea                srcq, [srcq+ssq*2]
     jmp .w4_loop
 .w8:
+%ifidn %1, put
     movifnidn           dsm, dsq
+%endif
     shr                 t0d, 16
     sub                srcq, 3
     movd               xm15, t0d
@@ -4269,42 +4384,10 @@
     pblendvb            m15, m11, m5
     pblendvb            m10, m11, m6
     vbroadcasti128      m14, [base+subpel_s_shuf8]
-%macro PUT_8TAP_SCALED_H 8 ; dst, tmp[0-6]
-    movq               xm%1, [srcq+r4]
-    movq               xm%2, [srcq+r6]
-    movhps             xm%1, [srcq+r7]
-    movhps             xm%2, [srcq+r9]
-    vinserti128         m%1, [srcq+r10], 1
-    vinserti128         m%2, [srcq+r11], 1
-    vpbroadcastq        m%5, [srcq+r13]
-    vpbroadcastq        m%6, [srcq+rX]
-    add                srcq, ssq
-    movq               xm%3, [srcq+r4]
-    movq               xm%4, [srcq+r6]
-    movhps             xm%3, [srcq+r7]
-    movhps             xm%4, [srcq+r9]
-    vinserti128         m%3, [srcq+r10], 1
-    vinserti128         m%4, [srcq+r11], 1
-    vpbroadcastq        m%7, [srcq+r13]
-    vpbroadcastq        m%8, [srcq+rX]
-    add                srcq, ssq
-    vpblendd            m%1, m%5, 0xc0
-    vpblendd            m%2, m%6, 0xc0
-    vpblendd            m%3, m%7, 0xc0
-    vpblendd            m%4, m%8, 0xc0
-    pmaddubsw           m%1, m15
-    pmaddubsw           m%2, m10
-    pmaddubsw           m%3, m15
-    pmaddubsw           m%4, m10
-    phaddw              m%1, m%2
-    phaddw              m%3, m%4
-    phaddw              m%1, m%3
-    pmulhrsw            m%1, m12
-%endmacro
-    PUT_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
-    PUT_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
-    PUT_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
-    PUT_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
+    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
     mov                 myd, mym
     mov                 dyd, dym
     pshufb               m0, m14    ; 01a 01b
@@ -4335,12 +4418,17 @@
     paddd                m6, m7
     paddd                m4, m13
     paddd                m4, m6
-    psrad                m4, 10
-    packssdw             m4, m4
-    vpermq               m4, m4, q3120
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
     packuswb            xm4, xm4
     movq             [dstq], xm4
     add                dstq, dsm
+%else
+    mova             [tmpq], xm4
+    add                tmpq, 16
+%endif
     dec                  hd
     jz .ret
     add                 myd, dyd
@@ -4354,11 +4442,11 @@
     mov                 r9d, [rsp+12]
     jz .w8_skip_line
     vpbroadcastq         m6, [srcq+r13]
-    vpbroadcastq         m7, [srcq+rX]
-    movq                xm4, [srcq+r4]
-    movq                xm5, [srcq+r6]
-    movhps              xm4, [srcq+r7]
-    movhps              xm5, [srcq+r9]
+    vpbroadcastq         m7, [srcq+ rX]
+    movq                xm4, [srcq+ r4]
+    movq                xm5, [srcq+ r6]
+    movhps              xm4, [srcq+ r7]
+    movhps              xm5, [srcq+ r9]
     vinserti128          m4, [srcq+r10], 1
     vinserti128          m5, [srcq+r11], 1
     add                srcq, ssq
@@ -4386,22 +4474,22 @@
     mova                 m1, m2
     mova                 m2, m3
     vpbroadcastq         m7, [srcq+r13]
-    vpbroadcastq         m8, [srcq+rX]
-    movq                xm3, [srcq+r4]
-    movq                xm4, [srcq+r6]
-    movhps              xm3, [srcq+r7]
-    movhps              xm4, [srcq+r9]
+    vpbroadcastq         m8, [srcq+ rX]
+    movq                xm3, [srcq+ r4]
+    movq                xm4, [srcq+ r6]
+    movhps              xm3, [srcq+ r7]
+    movhps              xm4, [srcq+ r9]
     vinserti128          m3, [srcq+r10], 1
     vinserti128          m4, [srcq+r11], 1
     add                srcq, ssq
-    movq                xm5, [srcq+r4]
-    movq                xm6, [srcq+r6]
-    movhps              xm5, [srcq+r7]
-    movhps              xm6, [srcq+r9]
+    movq                xm5, [srcq+ r4]
+    movq                xm6, [srcq+ r6]
+    movhps              xm5, [srcq+ r7]
+    movhps              xm6, [srcq+ r9]
     vinserti128          m5, [srcq+r10], 1
     vinserti128          m6, [srcq+r11], 1
     vpbroadcastq         m9, [srcq+r13]
-    vpbroadcastq        m11, [srcq+rX]
+    vpbroadcastq        m11, [srcq+ rX]
     add                srcq, ssq
     mov                 myd, [rsp+16]
     mov                 dyd, dym
@@ -4423,18 +4511,24 @@
     pmulhrsw             m3, m12
     jmp .w8_loop
 .w16:
-    mov      dword [rsp+48], 1 << 1
+    mov      dword [rsp+48], 2
+    movifprep   tmp_stridem, 32
     jmp .w_start
 .w32:
-    mov      dword [rsp+48], 1 << 3
+    mov      dword [rsp+48], 4
+    movifprep   tmp_stridem, 64
     jmp .w_start
 .w64:
-    mov      dword [rsp+48], 1 << 7
+    mov      dword [rsp+48], 8
+    movifprep   tmp_stridem, 128
     jmp .w_start
 .w128:
-    mov      dword [rsp+48], 1 << 15
+    mov      dword [rsp+48], 16
+    movifprep   tmp_stridem, 256
 .w_start:
+%ifidn %1, put
     movifnidn           dsm, dsq
+%endif
     shr                 t0d, 16
     sub                srcq, 3
     pmaddwd              m8, [base+rescale_mul]
@@ -4441,26 +4535,26 @@
     movd               xm15, t0d
     mov            [rsp+72], t0d
     mov            [rsp+56], srcq
-    mov            [rsp+64], dstq
+    mov            [rsp+64], r0q ; dstq / tmpq
 %if UNIX64
     mov                  hm, hd
 %endif
-    shl           dword r8m, 3 ; dx*8
+    shl           dword dxm, 3 ; dx*8
     vpbroadcastd        m15, xm15
     paddd               m14, m8 ; mx+dx*[0-7]
     jmp .hloop
 .hloop_prep:
-    shr      dword [rsp+48], 1
+    dec      dword [rsp+48]
     jz .ret
-    add      qword [rsp+64], 8
+    add      qword [rsp+64], 8*(isprep+1)
     mov                  hd, hm
-    vpbroadcastd         m8, r8m
+    vpbroadcastd         m8, dxm
     vpbroadcastd        m10, [base+pd_0x3ff]
     paddd               m14, m8, [rsp+16]
     vpbroadcastd        m15, [rsp+72]
     pxor                 m9, m9
     mov                srcq, [rsp+56]
-    mov                dstq, [rsp+64]
+    mov                 r0q, [rsp+64] ; dstq / tmpq
 .hloop:
     vpbroadcastq        m11, [base+pq_0x40000000]
     pand                 m6, m14, m10
@@ -4477,14 +4571,14 @@
     pextrd             r13d, xm7, 1
     pextrd              rXd, xm7, 3
     movu           [rsp+16], m14
-    movq               xm15, [base+subpel_filters+r4*8]
-    movq               xm10, [base+subpel_filters+r6*8]
-    movhps             xm15, [base+subpel_filters+r7*8]
-    movhps             xm10, [base+subpel_filters+r9*8]
+    movq               xm15, [base+subpel_filters+ r4*8]
+    movq               xm10, [base+subpel_filters+ r6*8]
+    movhps             xm15, [base+subpel_filters+ r7*8]
+    movhps             xm10, [base+subpel_filters+ r9*8]
     vinserti128         m15, [base+subpel_filters+r10*8], 1
     vinserti128         m10, [base+subpel_filters+r11*8], 1
     vpbroadcastq         m9, [base+subpel_filters+r13*8]
-    vpbroadcastq         m8, [base+subpel_filters+rX*8]
+    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
     psrld               m14, 10
     vextracti128        xm7, m14, 1
     mova              [rsp], xm14
@@ -4503,10 +4597,10 @@
     pblendvb            m15, m11, m5
     pblendvb            m10, m11, m6
     vbroadcasti128      m14, [base+subpel_s_shuf8]
-    PUT_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
-    PUT_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
-    PUT_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
-    PUT_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
+    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
     mov                 myd, mym
     mov                 dyd, dym
     pshufb               m0, m14    ; 01a 01b
@@ -4537,12 +4631,17 @@
     paddd                m6, m7
     paddd                m4, m13
     paddd                m4, m6
-    psrad                m4, 10
-    packssdw             m4, m4
-    vpermq               m4, m4, q3120
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
     packuswb            xm4, xm4
     movq             [dstq], xm4
     add                dstq, dsm
+%else
+    mova             [tmpq], xm4
+    add                tmpq, tmp_stridem
+%endif
     dec                  hd
     jz .hloop_prep
     add                 myd, dyd
@@ -4556,11 +4655,11 @@
     mov                 r9d, [rsp+12]
     jz .skip_line
     vpbroadcastq         m6, [srcq+r13]
-    vpbroadcastq         m7, [srcq+rX]
-    movq                xm4, [srcq+r4]
-    movq                xm5, [srcq+r6]
-    movhps              xm4, [srcq+r7]
-    movhps              xm5, [srcq+r9]
+    vpbroadcastq         m7, [srcq+ rX]
+    movq                xm4, [srcq+ r4]
+    movq                xm5, [srcq+ r6]
+    movhps              xm4, [srcq+ r7]
+    movhps              xm5, [srcq+ r9]
     vinserti128          m4, [srcq+r10], 1
     vinserti128          m5, [srcq+r11], 1
     add                srcq, ssq
@@ -4588,22 +4687,22 @@
     mova                 m1, m2
     mova                 m2, m3
     vpbroadcastq         m7, [srcq+r13]
-    vpbroadcastq         m8, [srcq+rX]
-    movq                xm3, [srcq+r4]
-    movq                xm4, [srcq+r6]
-    movhps              xm3, [srcq+r7]
-    movhps              xm4, [srcq+r9]
+    vpbroadcastq         m8, [srcq+ rX]
+    movq                xm3, [srcq+ r4]
+    movq                xm4, [srcq+ r6]
+    movhps              xm3, [srcq+ r7]
+    movhps              xm4, [srcq+ r9]
     vinserti128          m3, [srcq+r10], 1
     vinserti128          m4, [srcq+r11], 1
     add                srcq, ssq
-    movq                xm5, [srcq+r4]
-    movq                xm6, [srcq+r6]
-    movhps              xm5, [srcq+r7]
-    movhps              xm6, [srcq+r9]
+    movq                xm5, [srcq+ r4]
+    movq                xm6, [srcq+ r6]
+    movhps              xm5, [srcq+ r7]
+    movhps              xm6, [srcq+ r9]
     vinserti128          m5, [srcq+r10], 1
     vinserti128          m6, [srcq+r11], 1
     vpbroadcastq         m9, [srcq+r13]
-    vpbroadcastq        m11, [srcq+rX]
+    vpbroadcastq        m11, [srcq+ rX]
     add                srcq, ssq
     mov                 myd, [rsp+52]
     mov                 dyd, dym
@@ -4625,9 +4724,10 @@
     pmulhrsw             m3, m12
     jmp .vloop
 .dy1:
-    movzx                wd, word [base+put_8tap_scaled_avx2_dy1_table+wq*2]
-    add                  wq, r12
+    movzx                wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
+    add                  wq, base_reg
     jmp                  wq
+%ifidn %1, put
 .dy1_w2:
     mov                 myd, mym
     movzx               t0d, t0b
@@ -4706,7 +4806,7 @@
     mova                xm4, xm1
     paddd               xm5, xm6
     paddd               xm5, xm7
-    psrad               xm5, 10
+    psrad               xm5, rndshift
     packssdw            xm5, xm5
     packuswb            xm5, xm5
     pextrw     [dstq+dsq*0], xm5, 0
@@ -4715,6 +4815,7 @@
     sub                  hd, 2
     jg .dy1_w2_loop
     RET
+%endif
 .dy1_w4:
     mov                 myd, mym
     vbroadcasti128       m7, [base+rescale_mul]
@@ -4810,19 +4911,27 @@
     pmaddwd              m6, m2, m10
     paddd                m4, m5
     paddd                m4, m6
-    psrad                m4, 10
+    psrad                m4, rndshift
     vextracti128        xm5, m4, 1
     packssdw            xm4, xm5
+%ifidn %1, put
     packuswb            xm4, xm4
     pshuflw             xm4, xm4, q3120
     movd       [dstq+dsq*0], xm4
     pextrd     [dstq+dsq*1], xm4, 1
     lea                dstq, [dstq+dsq*2]
+%else
+    pshufd              xm4, xm4, q3120
+    mova             [tmpq], xm4
+    add                tmpq, 16
+%endif
     sub                  hd, 2
     jg .dy1_w4_loop
-    RET
+    MC_8TAP_SCALED_RET
 .dy1_w8:
+%ifidn %1, put
     movifnidn           dsm, dsq
+%endif
     shr                 t0d, 16
     sub                srcq, 3
     movd               xm15, t0d
@@ -4843,14 +4952,14 @@
     pextrd             r11d, xm7, 2
     pextrd             r13d, xm7, 1
     pextrd              rXd, xm7, 3
-    movq               xm15, [base+subpel_filters+r4*8]
-    movq               xm10, [base+subpel_filters+r6*8]
-    movhps             xm15, [base+subpel_filters+r7*8]
-    movhps             xm10, [base+subpel_filters+r9*8]
+    movq               xm15, [base+subpel_filters+ r4*8]
+    movq               xm10, [base+subpel_filters+ r6*8]
+    movhps             xm15, [base+subpel_filters+ r7*8]
+    movhps             xm10, [base+subpel_filters+ r9*8]
     vinserti128         m15, [base+subpel_filters+r10*8], 1
     vinserti128         m10, [base+subpel_filters+r11*8], 1
     vpbroadcastq         m9, [base+subpel_filters+r13*8]
-    vpbroadcastq         m8, [base+subpel_filters+rX*8]
+    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
     psrld               m14, 10
     vextracti128        xm7, m14, 1
     movd                r4d, xm14
@@ -4869,10 +4978,10 @@
     pblendvb            m15, m11, m5
     pblendvb            m10, m11, m6
     vbroadcasti128      m14, [base+subpel_s_shuf8]
-    PUT_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
-    PUT_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
-    PUT_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
-    PUT_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
+    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
     mov                 myd, mym
     movu              [rsp], m10
     pshufb               m0, m14    ; 01a 01b
@@ -4902,22 +5011,27 @@
     paddd                m6, m7
     paddd                m4, m13
     paddd                m4, m6
-    psrad                m4, 10
-    packssdw             m4, m4
-    vpermq               m4, m4, q3120
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
     packuswb            xm4, xm4
     movq             [dstq], xm4
     add                dstq, dsm
+%else
+    mova             [tmpq], xm4
+    add                tmpq, 16
+%endif
     dec                  hd
     jz .ret
-    movq                xm4, [srcq+r4]
-    movq                xm5, [srcq+r6]
-    movhps              xm4, [srcq+r7]
-    movhps              xm5, [srcq+r9]
+    movq                xm4, [srcq+ r4]
+    movq                xm5, [srcq+ r6]
+    movhps              xm4, [srcq+ r7]
+    movhps              xm5, [srcq+ r9]
     vinserti128          m4, [srcq+r10], 1
     vinserti128          m5, [srcq+r11], 1
     vpbroadcastq         m6, [srcq+r13]
-    vpbroadcastq         m7, [srcq+rX]
+    vpbroadcastq         m7, [srcq+ rX]
     add                srcq, ssq
     pshufb               m0, m14
     pshufb               m1, m14
@@ -4937,18 +5051,24 @@
     pblendw              m3, m4, 0xaa
     jmp .dy1_w8_loop
 .dy1_w16:
-    mov      dword [rsp+72], 1 << 1
+    mov      dword [rsp+72], 2
+    movifprep   tmp_stridem, 32
     jmp .dy1_w_start
 .dy1_w32:
-    mov      dword [rsp+72], 1 << 3
+    mov      dword [rsp+72], 4
+    movifprep   tmp_stridem, 64
     jmp .dy1_w_start
 .dy1_w64:
-    mov      dword [rsp+72], 1 << 7
+    mov      dword [rsp+72], 8
+    movifprep   tmp_stridem, 128
     jmp .dy1_w_start
 .dy1_w128:
-    mov      dword [rsp+72], 1 << 15
+    mov      dword [rsp+72], 16
+    movifprep   tmp_stridem, 256
 .dy1_w_start:
+%ifidn %1, put
     movifnidn           dsm, dsq
+%endif
     shr                 t0d, 16
     sub                srcq, 3
     pmaddwd              m8, [base+rescale_mul]
@@ -4955,26 +5075,26 @@
     movd               xm15, t0d
     mov            [rsp+76], t0d
     mov            [rsp+80], srcq
-    mov            [rsp+88], dstq
+    mov            [rsp+88], r0q ; dstq / tmpq
 %if UNIX64
     mov                  hm, hd
 %endif
-    shl           dword r8m, 3 ; dx*8
+    shl           dword dxm, 3 ; dx*8
     vpbroadcastd        m15, xm15
     paddd               m14, m8 ; mx+dx*[0-7]
     jmp .dy1_hloop
 .dy1_hloop_prep:
-    shr      dword [rsp+72], 1
+    dec      dword [rsp+72]
     jz .ret
-    add      qword [rsp+88], 8
+    add      qword [rsp+88], 8*(isprep+1)
     mov                  hd, hm
-    vpbroadcastd         m8, r8m
+    vpbroadcastd         m8, dxm
     vpbroadcastd        m10, [base+pd_0x3ff]
     paddd               m14, m8, [rsp+32]
     vpbroadcastd        m15, [rsp+76]
     pxor                 m9, m9
     mov                srcq, [rsp+80]
-    mov                dstq, [rsp+88]
+    mov                 r0q, [rsp+88] ; dstq / tmpq
 .dy1_hloop:
     vpbroadcastq        m11, [base+pq_0x40000000]
     pand                 m6, m14, m10
@@ -4991,14 +5111,14 @@
     pextrd             r13d, xm7, 1
     pextrd              rXd, xm7, 3
     movu           [rsp+32], m14
-    movq               xm15, [base+subpel_filters+r4*8]
-    movq               xm10, [base+subpel_filters+r6*8]
-    movhps             xm15, [base+subpel_filters+r7*8]
-    movhps             xm10, [base+subpel_filters+r9*8]
+    movq               xm15, [base+subpel_filters+ r4*8]
+    movq               xm10, [base+subpel_filters+ r6*8]
+    movhps             xm15, [base+subpel_filters+ r7*8]
+    movhps             xm10, [base+subpel_filters+ r9*8]
     vinserti128         m15, [base+subpel_filters+r10*8], 1
     vinserti128         m10, [base+subpel_filters+r11*8], 1
     vpbroadcastq         m9, [base+subpel_filters+r13*8]
-    vpbroadcastq         m8, [base+subpel_filters+rX*8]
+    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
     psrld               m14, 10
     vextracti128        xm7, m14, 1
     movq           [rsp+64], xm14
@@ -5017,10 +5137,10 @@
     pblendvb            m15, m11, m5
     pblendvb            m10, m11, m6
     vbroadcasti128      m14, [base+subpel_s_shuf8]
-    PUT_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
-    PUT_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
-    PUT_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
-    PUT_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
+    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
     mov                 myd, mym
     movu              [rsp], m10
     pshufb               m0, m14    ; 01a 01b
@@ -5051,22 +5171,27 @@
     paddd                m6, m7
     paddd                m4, m13
     paddd                m4, m6
-    psrad                m4, 10
-    packssdw             m4, m4
-    vpermq               m4, m4, q3120
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
     packuswb            xm4, xm4
     movq             [dstq], xm4
     add                dstq, dsm
+%else
+    mova             [tmpq], xm4
+    add                tmpq, tmp_stridem
+%endif
     dec                  hd
     jz .dy1_hloop_prep
-    movq                xm4, [srcq+r4]
-    movq                xm5, [srcq+r6]
-    movhps              xm4, [srcq+r7]
-    movhps              xm5, [srcq+r9]
+    movq                xm4, [srcq+ r4]
+    movq                xm5, [srcq+ r6]
+    movhps              xm4, [srcq+ r7]
+    movhps              xm5, [srcq+ r9]
     vinserti128          m4, [srcq+r10], 1
     vinserti128          m5, [srcq+r11], 1
     vpbroadcastq         m6, [srcq+r13]
-    vpbroadcastq         m7, [srcq+rX]
+    vpbroadcastq         m7, [srcq+ rX]
     add                srcq, ssq
     pshufb               m0, m14
     pshufb               m1, m14
@@ -5086,9 +5211,10 @@
     pblendw              m3, m4, 0xaa
     jmp .dy1_vloop
 .dy2:
-    movzx                wd, word [base+put_8tap_scaled_avx2_dy2_table+wq*2]
-    add                  wq, r12
+    movzx                wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
+    add                  wq, base_reg
     jmp                  wq
+%ifidn %1, put
 .dy2_w2:
     mov                 myd, mym
     movzx               t0d, t0b
@@ -5170,7 +5296,7 @@
     paddd               xm4, xm13
     paddd               xm6, xm7
     paddd               xm4, xm6
-    psrad               xm4, 10
+    psrad               xm4, rndshift
     packssdw            xm4, xm4
     packuswb            xm4, xm4
     pextrw     [dstq+dsq*0], xm4, 0
@@ -5179,6 +5305,7 @@
     sub                  hd, 2
     jg .dy2_w2_loop
     RET
+%endif
 .dy2_w4:
     mov                 myd, mym
     vbroadcasti128       m7, [base+rescale_mul]
@@ -5270,18 +5397,25 @@
     paddd                m4, m13
     paddd                m6, m7
     paddd                m4, m6
-    psrad                m4, 10
+    psrad                m4, rndshift
     vextracti128        xm5, m4, 1
     packssdw            xm4, xm5
+%ifidn %1, put
     packuswb            xm4, xm4
     movd       [dstq+dsq*0], xm4
     pextrd     [dstq+dsq*1], xm4, 1
     lea                dstq, [dstq+dsq*2]
+%else
+    mova             [tmpq], xm4
+    add                tmpq, 16
+%endif
     sub                  hd, 2
     jg .dy2_w4_loop
-    RET
+    MC_8TAP_SCALED_RET
 .dy2_w8:
+%ifidn %1, put
     movifnidn           dsm, dsq
+%endif
     shr                 t0d, 16
     sub                srcq, 3
     movd               xm15, t0d
@@ -5302,14 +5436,14 @@
     pextrd             r11d, xm7, 2
     pextrd             r13d, xm7, 1
     pextrd              rXd, xm7, 3
-    movq               xm15, [base+subpel_filters+r4*8]
-    movq               xm10, [base+subpel_filters+r6*8]
-    movhps             xm15, [base+subpel_filters+r7*8]
-    movhps             xm10, [base+subpel_filters+r9*8]
+    movq               xm15, [base+subpel_filters+ r4*8]
+    movq               xm10, [base+subpel_filters+ r6*8]
+    movhps             xm15, [base+subpel_filters+ r7*8]
+    movhps             xm10, [base+subpel_filters+ r9*8]
     vinserti128         m15, [base+subpel_filters+r10*8], 1
     vinserti128         m10, [base+subpel_filters+r11*8], 1
     vpbroadcastq         m9, [base+subpel_filters+r13*8]
-    vpbroadcastq         m8, [base+subpel_filters+rX*8]
+    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
     psrld               m14, 10
     vextracti128        xm7, m14, 1
     movd                r4d, xm14
@@ -5328,10 +5462,10 @@
     pblendvb            m15, m11, m5
     pblendvb            m10, m11, m6
     vbroadcasti128      m14, [base+subpel_s_shuf8]
-    PUT_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
-    PUT_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
-    PUT_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
-    PUT_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
+    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
     mov                 myd, mym
     pshufb               m0, m14    ; 01a 01b
     pshufb               m1, m14    ; 23a 23b
@@ -5359,25 +5493,30 @@
     paddd                m6, m7
     paddd                m4, m13
     paddd                m4, m6
-    psrad                m4, 10
-    packssdw             m4, m4
-    vpermq               m4, m4, q3120
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
     packuswb            xm4, xm4
     movq             [dstq], xm4
     add                dstq, dsm
+%else
+    mova             [tmpq], xm4
+    add                tmpq, 16
+%endif
     dec                  hd
     jz .ret
     mova                 m0, m1
     mova                 m1, m2
     mova                 m2, m3
-    movq                xm3, [srcq+r4]
-    movq                xm4, [srcq+r6]
-    movhps              xm3, [srcq+r7]
-    movhps              xm4, [srcq+r9]
+    movq                xm3, [srcq+ r4]
+    movq                xm4, [srcq+ r6]
+    movhps              xm3, [srcq+ r7]
+    movhps              xm4, [srcq+ r9]
     vinserti128          m3, [srcq+r10], 1
     vinserti128          m4, [srcq+r11], 1
     vpbroadcastq         m5, [srcq+r13]
-    vpbroadcastq         m6, [srcq+rX]
+    vpbroadcastq         m6, [srcq+ rX]
     add                srcq, ssq
     vpblendd             m3, m5, 0xc0
     vpblendd             m4, m6, 0xc0
@@ -5384,14 +5523,14 @@
     pmaddubsw            m3, m15
     pmaddubsw            m4, m10
     phaddw               m3, m4
-    movq                xm4, [srcq+r4]
-    movq                xm5, [srcq+r6]
-    movhps              xm4, [srcq+r7]
-    movhps              xm5, [srcq+r9]
+    movq                xm4, [srcq+ r4]
+    movq                xm5, [srcq+ r6]
+    movhps              xm4, [srcq+ r7]
+    movhps              xm5, [srcq+ r9]
     vinserti128          m4, [srcq+r10], 1
     vinserti128          m5, [srcq+r11], 1
     vpbroadcastq         m6, [srcq+r13]
-    vpbroadcastq         m7, [srcq+rX]
+    vpbroadcastq         m7, [srcq+ rX]
     add                srcq, ssq
     vpblendd             m4, m6, 0xc0
     vpblendd             m5, m7, 0xc0
@@ -5406,18 +5545,24 @@
     pmulhrsw             m3, m12
     jmp .dy2_w8_loop
 .dy2_w16:
-    mov      dword [rsp+40], 1 << 1
+    mov      dword [rsp+40], 2
+    movifprep   tmp_stridem, 32
     jmp .dy2_w_start
 .dy2_w32:
-    mov      dword [rsp+40], 1 << 3
+    mov      dword [rsp+40], 4
+    movifprep   tmp_stridem, 64
     jmp .dy2_w_start
 .dy2_w64:
-    mov      dword [rsp+40], 1 << 7
+    mov      dword [rsp+40], 8
+    movifprep   tmp_stridem, 128
     jmp .dy2_w_start
 .dy2_w128:
-    mov      dword [rsp+40], 1 << 15
+    mov      dword [rsp+40], 16
+    movifprep   tmp_stridem, 256
 .dy2_w_start:
+%ifidn %1, put
     movifnidn           dsm, dsq
+%endif
     shr                 t0d, 16
     sub                srcq, 3
     pmaddwd              m8, [base+rescale_mul]
@@ -5424,26 +5569,26 @@
     movd               xm15, t0d
     mov            [rsp+64], t0d
     mov            [rsp+48], srcq
-    mov            [rsp+56], dstq
+    mov            [rsp+56], r0q ; dstq / tmpq
 %if UNIX64
-    mov                 r6m, hd
+    mov                  hm, hd
 %endif
-    shl           dword r8m, 3 ; dx*8
+    shl           dword dxm, 3 ; dx*8
     vpbroadcastd        m15, xm15
     paddd               m14, m8 ; mx+dx*[0-7]
     jmp .dy2_hloop
 .dy2_hloop_prep:
-    shr      dword [rsp+40], 1
+    dec      dword [rsp+40]
     jz .ret
-    add      qword [rsp+56], 8
+    add      qword [rsp+56], 8*(isprep+1)
     mov                  hd, hm
-    vpbroadcastd         m8, r8m
+    vpbroadcastd         m8, dxm
     vpbroadcastd        m10, [base+pd_0x3ff]
     paddd               m14, m8, [rsp]
     vpbroadcastd        m15, [rsp+64]
     pxor                 m9, m9
     mov                srcq, [rsp+48]
-    mov                dstq, [rsp+56]
+    mov                 r0q, [rsp+56] ; dstq / tmpq
 .dy2_hloop:
     vpbroadcastq        m11, [base+pq_0x40000000]
     pand                 m6, m14, m10
@@ -5460,14 +5605,14 @@
     pextrd             r13d, xm7, 1
     pextrd              rXd, xm7, 3
     movu              [rsp], m14
-    movq               xm15, [base+subpel_filters+r4*8]
-    movq               xm10, [base+subpel_filters+r6*8]
-    movhps             xm15, [base+subpel_filters+r7*8]
-    movhps             xm10, [base+subpel_filters+r9*8]
+    movq               xm15, [base+subpel_filters+ r4*8]
+    movq               xm10, [base+subpel_filters+ r6*8]
+    movhps             xm15, [base+subpel_filters+ r7*8]
+    movhps             xm10, [base+subpel_filters+ r9*8]
     vinserti128         m15, [base+subpel_filters+r10*8], 1
     vinserti128         m10, [base+subpel_filters+r11*8], 1
     vpbroadcastq         m9, [base+subpel_filters+r13*8]
-    vpbroadcastq         m8, [base+subpel_filters+rX*8]
+    vpbroadcastq         m8, [base+subpel_filters+ rX*8]
     psrld               m14, 10
     vextracti128        xm7, m14, 1
     movq           [rsp+32], xm14
@@ -5486,10 +5631,10 @@
     pblendvb            m15, m11, m5
     pblendvb            m10, m11, m6
     vbroadcasti128      m14, [base+subpel_s_shuf8]
-    PUT_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
-    PUT_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
-    PUT_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
-    PUT_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+    MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7  ; 0a 1a 0b 1b
+    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8  ; 2a 3a 2b 3b
+    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9  ; 4a 5a 4b 5b
+    MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
     mov                 myd, mym
     pshufb               m0, m14    ; 01a 01b
     pshufb               m1, m14    ; 23a 23b
@@ -5518,25 +5663,30 @@
     paddd                m6, m7
     paddd                m4, m13
     paddd                m4, m6
-    psrad                m4, 10
-    packssdw             m4, m4
-    vpermq               m4, m4, q3120
+    psrad                m4, rndshift
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+%ifidn %1, put
     packuswb            xm4, xm4
     movq             [dstq], xm4
     add                dstq, dsm
+%else
+    mova             [tmpq], xm4
+    add                tmpq, tmp_stridem
+%endif
     dec                  hd
     jz .dy2_hloop_prep
     mova                 m0, m1
     mova                 m1, m2
     mova                 m2, m3
-    movq                xm3, [srcq+r4]
-    movq                xm4, [srcq+r6]
-    movhps              xm3, [srcq+r7]
-    movhps              xm4, [srcq+r9]
+    movq                xm3, [srcq+ r4]
+    movq                xm4, [srcq+ r6]
+    movhps              xm3, [srcq+ r7]
+    movhps              xm4, [srcq+ r9]
     vinserti128          m3, [srcq+r10], 1
     vinserti128          m4, [srcq+r11], 1
     vpbroadcastq         m5, [srcq+r13]
-    vpbroadcastq         m6, [srcq+rX]
+    vpbroadcastq         m6, [srcq+ rX]
     add                srcq, ssq
     vpblendd             m3, m5, 0xc0
     vpblendd             m4, m6, 0xc0
@@ -5543,14 +5693,14 @@
     pmaddubsw            m3, m15
     pmaddubsw            m4, m10
     phaddw               m3, m4
-    movq                xm4, [srcq+r4]
-    movq                xm5, [srcq+r6]
-    movhps              xm4, [srcq+r7]
-    movhps              xm5, [srcq+r9]
+    movq                xm4, [srcq+ r4]
+    movq                xm5, [srcq+ r6]
+    movhps              xm4, [srcq+ r7]
+    movhps              xm5, [srcq+ r9]
     vinserti128          m4, [srcq+r10], 1
     vinserti128          m5, [srcq+r11], 1
     vpbroadcastq         m6, [srcq+r13]
-    vpbroadcastq         m7, [srcq+rX]
+    vpbroadcastq         m7, [srcq+ rX]
     add                srcq, ssq
     vpblendd             m4, m6, 0xc0
     vpblendd             m5, m7, 0xc0
@@ -5565,7 +5715,43 @@
     pmulhrsw             m3, m12
     jmp .dy2_vloop
 .ret:
-    RET
+    MC_8TAP_SCALED_RET 0
+%undef isprep
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%else
+DECLARE_REG_TMP 6, 8
+%endif
+%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+PUT_8TAP_SCALED_FN regular,        REGULAR, REGULAR
+PUT_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR
+PUT_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH
+PUT_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP
+PUT_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR
+PUT_8TAP_SCALED_FN sharp,          SHARP,   SHARP
+PUT_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+PREP_8TAP_SCALED_FN regular,        REGULAR, REGULAR
+PREP_8TAP_SCALED_FN regular_sharp,  REGULAR, SHARP
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH,  REGULAR
+PREP_8TAP_SCALED_FN smooth,         SMOOTH,  SMOOTH
+PREP_8TAP_SCALED_FN smooth_sharp,   SMOOTH,  SHARP
+PREP_8TAP_SCALED_FN sharp_regular,  SHARP,   REGULAR
+PREP_8TAP_SCALED_FN sharp,          SHARP,   SHARP
+PREP_8TAP_SCALED_FN sharp_smooth,   SHARP,   SMOOTH
+MC_8TAP_SCALED prep
 
 %macro WARP_V 5 ; dst, 02, 46, 13, 57
     ; Can be done using gathers, but that's terribly slow on many CPU:s
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -49,16 +49,6 @@
 decl_mc_fn(dav1d_put_bilin_avx2);
 decl_mc_fn(dav1d_put_bilin_ssse3);
 
-decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_avx2);
-decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_avx2);
-decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_avx2);
-decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_avx2);
-decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_avx2);
-decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_avx2);
-decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_avx2);
-decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_avx2);
-decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_avx2);
-
 decl_mct_fn(dav1d_prep_8tap_regular_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_regular_avx2);
 decl_mct_fn(dav1d_prep_8tap_regular_ssse3);
@@ -100,6 +90,26 @@
 decl_mct_fn(dav1d_prep_bilin_ssse3);
 decl_mct_fn(dav1d_prep_bilin_sse2);
 
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_regular_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_smooth_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_regular_avx2);
+decl_mc_scaled_fn(dav1d_put_8tap_scaled_sharp_smooth_avx2);
+
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_regular_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_smooth_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_regular_avx2);
+decl_mct_scaled_fn(dav1d_prep_8tap_scaled_sharp_smooth_avx2);
+
 decl_avg_fn(dav1d_avg_avx512icl);
 decl_avg_fn(dav1d_avg_avx2);
 decl_avg_fn(dav1d_avg_ssse3);
@@ -145,6 +155,8 @@
     c->mct[type] = dav1d_prep_##name##_##suffix
 #define init_mc_scaled_fn(type, name, suffix) \
     c->mc_scaled[type] = dav1d_put_##name##_##suffix
+#define init_mct_scaled_fn(type, name, suffix) \
+    c->mct_scaled[type] = dav1d_prep_##name##_##suffix
 
     const unsigned flags = dav1d_get_cpu_flags();
 
@@ -232,16 +244,6 @@
     init_mc_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          avx2);
     init_mc_fn(FILTER_2D_BILINEAR,            bilin,               avx2);
 
-    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR,        8tap_scaled_regular,        avx2);
-    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
-    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_scaled_regular_sharp,  avx2);
-    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
-    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH,         8tap_scaled_smooth,         avx2);
-    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_scaled_smooth_sharp,   avx2);
-    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_scaled_sharp_regular,  avx2);
-    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   avx2);
-    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          avx2);
-
     init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        avx2);
     init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx2);
     init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  avx2);
@@ -252,6 +254,26 @@
     init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   avx2);
     init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          avx2);
     init_mct_fn(FILTER_2D_BILINEAR,            bilin,               avx2);
+
+    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR,        8tap_scaled_regular,        avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_scaled_regular_sharp,  avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH,         8tap_scaled_smooth,         avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_scaled_smooth_sharp,   avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_scaled_sharp_regular,  avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   avx2);
+    init_mc_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          avx2);
+
+    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR,        8tap_scaled_regular,        avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_scaled_regular_smooth, avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_scaled_regular_sharp,  avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_scaled_smooth_regular, avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH,         8tap_scaled_smooth,         avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_scaled_smooth_sharp,   avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_scaled_sharp_regular,  avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_scaled_sharp_smooth,   avx2);
+    init_mct_scaled_fn(FILTER_2D_8TAP_SHARP,          8tap_scaled_sharp,          avx2);
 
     c->avg = dav1d_avg_avx2;
     c->w_avg = dav1d_w_avg_avx2;
--- a/tests/checkasm/mc.c
+++ b/tests/checkasm/mc.c
@@ -107,6 +107,61 @@
     report("mc");
 }
 
+/* Generate worst case input in the topleft corner, randomize the rest */
+static void generate_mct_input(pixel *const buf, const int bitdepth_max) {
+    static const int8_t pattern[8] = { -1,  0, -1,  0,  0, -1,  0, -1 };
+    const int sign = -(rnd() & 1);
+
+    for (int y = 0; y < 135; y++)
+        for (int x = 0; x < 135; x++)
+            buf[135*y+x] = ((x | y) < 8 ? (pattern[x] ^ pattern[y] ^ sign)
+                                        : rnd()) & bitdepth_max;
+}
+
+static void check_mct(Dav1dMCDSPContext *const c) {
+    ALIGN_STK_64(pixel, src_buf, 135 * 135,);
+    ALIGN_STK_64(int16_t, c_tmp, 128 * 128,);
+    ALIGN_STK_64(int16_t, a_tmp, 128 * 128,);
+    const pixel *src = src_buf + 135 * 3 + 3;
+    const ptrdiff_t src_stride = 135 * sizeof(pixel);
+
+    declare_func(void, int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
+                 int w, int h, int mx, int my HIGHBD_DECL_SUFFIX);
+
+    for (int filter = 0; filter < N_2D_FILTERS; filter++)
+        for (int w = 4; w <= 128; w <<= 1)
+            for (int mxy = 0; mxy < 4; mxy++)
+                if (check_func(c->mct[filter], "mct_%s_w%d_%s_%dbpc",
+                    filter_names[filter], w, mxy_names[mxy], BITDEPTH))
+                    for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
+                    {
+                        const int mx = (mxy & 1) ? rnd() % 15 + 1 : 0;
+                        const int my = (mxy & 2) ? rnd() % 15 + 1 : 0;
+#if BITDEPTH == 16
+                        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+                        const int bitdepth_max = 0xff;
+#endif
+                        generate_mct_input(src_buf, bitdepth_max);
+
+                        call_ref(c_tmp, src, src_stride, w, h,
+                                 mx, my HIGHBD_TAIL_SUFFIX);
+                        call_new(a_tmp, src, src_stride, w, h,
+                                 mx, my HIGHBD_TAIL_SUFFIX);
+                        checkasm_check(int16_t, c_tmp, w * sizeof(*c_tmp),
+                                                a_tmp, w * sizeof(*a_tmp),
+                                                w, h, "tmp");
+
+                        if (filter == FILTER_2D_8TAP_REGULAR ||
+                            filter == FILTER_2D_BILINEAR)
+                        {
+                            bench_new(a_tmp, src, src_stride, w, h,
+                                      mx, my HIGHBD_TAIL_SUFFIX);
+                        }
+                    }
+    report("mct");
+}
+
 static void check_mc_scaled(Dav1dMCDSPContext *const c) {
     ALIGN_STK_64(pixel, src_buf, 263 * 263,);
     ALIGN_STK_64(pixel, c_dst,   128 * 128,);
@@ -161,47 +216,44 @@
     report("mc_scaled");
 }
 
-/* Generate worst case input in the topleft corner, randomize the rest */
-static void generate_mct_input(pixel *const buf, const int bitdepth_max) {
-    static const int8_t pattern[8] = { -1,  0, -1,  0,  0, -1,  0, -1 };
-    const int sign = -(rnd() & 1);
+static void check_mct_scaled(Dav1dMCDSPContext *const c) {
+    ALIGN_STK_64(pixel, src_buf, 263 * 263,);
+    ALIGN_STK_64(int16_t, c_tmp,   128 * 128,);
+    ALIGN_STK_64(int16_t, a_tmp,   128 * 128,);
+    const pixel *src = src_buf + 263 * 3 + 3;
+    const ptrdiff_t src_stride = 263 * sizeof(pixel);
+#if BITDEPTH == 16
+    const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
+#else
+    const int bitdepth_max = 0xff;
+#endif
 
-    for (int y = 0; y < 135; y++)
-        for (int x = 0; x < 135; x++)
-            buf[135*y+x] = ((x | y) < 8 ? (pattern[x] ^ pattern[y] ^ sign)
-                                        : rnd()) & bitdepth_max;
-}
-
-static void check_mct(Dav1dMCDSPContext *const c) {
-    ALIGN_STK_64(pixel, src_buf, 135 * 135,);
-    ALIGN_STK_64(int16_t, c_tmp, 128 * 128,);
-    ALIGN_STK_64(int16_t, a_tmp, 128 * 128,);
-    const pixel *src = src_buf + 135 * 3 + 3;
-    const ptrdiff_t src_stride = 135 * sizeof(pixel);
-
     declare_func(void, int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
-                 int w, int h, int mx, int my HIGHBD_DECL_SUFFIX);
+                 int w, int h, int mx, int my, int dx, int dy HIGHBD_DECL_SUFFIX);
 
     for (int filter = 0; filter < N_2D_FILTERS; filter++)
         for (int w = 4; w <= 128; w <<= 1)
-            for (int mxy = 0; mxy < 4; mxy++)
-                if (check_func(c->mct[filter], "mct_%s_w%d_%s_%dbpc",
-                    filter_names[filter], w, mxy_names[mxy], BITDEPTH))
-                    for (int h = imax(w / 4, 4); h <= imin(w * 4, 128); h <<= 1)
-                    {
-                        const int mx = (mxy & 1) ? rnd() % 15 + 1 : 0;
-                        const int my = (mxy & 2) ? rnd() % 15 + 1 : 0;
-#if BITDEPTH == 16
-                        const int bitdepth_max = rnd() & 1 ? 0x3ff : 0xfff;
-#else
-                        const int bitdepth_max = 0xff;
-#endif
-                        generate_mct_input(src_buf, bitdepth_max);
+            for (int p = 0; p < 3; ++p) {
+                if (check_func(c->mct_scaled[filter], "mct_scaled_%s_w%d%s_%dbpc",
+                               filter_names[filter], w, scaled_paths[p], BITDEPTH))
+                {
+                    const int h_min = imax(w / 4, 4);
+                    const int h_max = imin(w * 4, 128);
+                    for (int h = h_min; h <= h_max; h = mc_h_next(h)) {
+                        const int mx = rnd() % 1024;
+                        const int my = rnd() % 1024;
+                        const int dx = rnd() % 2048 + 1;
+                        const int dy = !p
+                            ? rnd() % 2048 + 1
+                            : p << 10; // ystep=1.0 and ystep=2.0 paths
 
-                        call_ref(c_tmp, src, src_stride, w, h,
-                                 mx, my HIGHBD_TAIL_SUFFIX);
-                        call_new(a_tmp, src, src_stride, w, h,
-                                 mx, my HIGHBD_TAIL_SUFFIX);
+                        for (int k = 0; k < 263 * 263; k++)
+                            src_buf[k] = rnd() & bitdepth_max;
+
+                        call_ref(c_tmp, src, src_stride,
+                                 w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
+                        call_new(a_tmp, src, src_stride,
+                                 w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
                         checkasm_check(int16_t, c_tmp, w * sizeof(*c_tmp),
                                                 a_tmp, w * sizeof(*a_tmp),
                                                 w, h, "tmp");
@@ -208,13 +260,13 @@
 
                         if (filter == FILTER_2D_8TAP_REGULAR ||
                             filter == FILTER_2D_BILINEAR)
-                        {
-                            bench_new(a_tmp, src, src_stride, w, h,
-                                      mx, my HIGHBD_TAIL_SUFFIX);
-                        }
+                            bench_new(a_tmp, src, src_stride,
+                                      w, h, mx, my, dx, dy HIGHBD_TAIL_SUFFIX);
                     }
-    report("mct");
-}
+                }
+            }
+    report("mct_scaled");
+}
 
 static void init_tmp(Dav1dMCDSPContext *const c, pixel *const buf,
                      int16_t (*const tmp)[128 * 128], const int bitdepth_max)
@@ -687,8 +739,9 @@
     bitfn(dav1d_mc_dsp_init)(&c);
 
     check_mc(&c);
-    check_mc_scaled(&c);
     check_mct(&c);
+    check_mc_scaled(&c);
+    check_mct_scaled(&c);
     check_avg(&c);
     check_w_avg(&c);
     check_mask(&c);