shithub: dav1d

ref: e706fac9cf35d50fc19f0c6ef699e07f0fabe160
parent: b83cb9643bc647f6ef633330b52cb68882338067
author: Victorien Le Couviour--Tuffet <victorien@videolan.org>
date: Wed Dec 11 19:25:35 EST 2019

x86: add prep_8tap AVX512 asm
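
For context, the new code implements the "prep" (mct) variant of the 8-tap subpel filter: it writes intermediate-precision 16-bit predictions into a temporary buffer instead of final pixels. Below is a minimal scalar sketch of the 8bpc horizontal-only case, assuming the rounding implied by the asm (a pd_2 bias plus psraw by 2 on AVX-512, pmulhrsw with pw_8192 on AVX2, both a rounding right-shift by 2); the function and parameter names are illustrative, not dav1d's, and this sketch is not part of the patch.

#include <stddef.h>
#include <stdint.h>

/* Illustrative reference for the 8bpc horizontal-only prep pass: an 8-tap
 * convolution kept at intermediate precision via a rounding right-shift
 * by 2 (matching pd_2 + psraw 2 / pmulhrsw with pw_8192 in the asm).
 * Hypothetical names; assumes arithmetic right shift of negative values. */
static void prep_8tap_h_ref(int16_t *tmp, const uint8_t *src,
                            const ptrdiff_t stride, const int w,
                            const int h, const int8_t filter[8])
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            int sum = 0;
            for (int k = 0; k < 8; k++)      /* taps span src[x-3..x+4] */
                sum += filter[k] * src[x + k - 3];
            tmp[x] = (int16_t)((sum + 2) >> 2);
        }
        src += stride;
        tmp += w;   /* prep output is packed with stride == w */
    }
}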

--- a/src/x86/mc.asm
+++ b/src/x86/mc.asm
@@ -46,18 +46,6 @@
             db 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
             db 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0, 64,  0
 
-warp_8x8_shufA: db  0,  2,  4,  6,  1,  3,  5,  7,  1,  3,  5,  7,  2,  4,  6,  8
-                db  4,  6,  8, 10,  5,  7,  9, 11,  5,  7,  9, 11,  6,  8, 10, 12
-warp_8x8_shufB: db  2,  4,  6,  8,  3,  5,  7,  9,  3,  5,  7,  9,  4,  6,  8, 10
-                db  6,  8, 10, 12,  7,  9, 11, 13,  7,  9, 11, 13,  8, 10, 12, 14
-subpel_h_shuf4: db  0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
-                db  2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
-subpel_h_shufA: db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
-subpel_h_shufB: db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
-subpel_h_shufC: db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-bilin_h_shuf4:  db  1,  0,  2,  1,  3,  2,  4,  3,  9,  8, 10,  9, 11, 10, 12, 11
-bilin_h_shuf8:  db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
-bilin_v_shuf4:  db  4,  0,  5,  1,  6,  2,  7,  3,  8,  4,  9,  5, 10,  6, 11,  7
 bilin_h_perm16: db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
                 db  9,  8, 10,  9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
                 db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39
@@ -79,6 +67,50 @@
                 db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
                 db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31
 bilin_v_perm64: dq  0,  4,  1,  5,  2,  6,  3,  7
+spel_h_perm16a: db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
+                db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+                db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
+                db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
+spel_h_perm16b: db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
+                db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
+                db 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 41, 39, 40, 41, 42
+                db 44, 45, 46, 47, 45, 46, 47, 48, 46, 47, 48, 49, 47, 48, 49, 50
+spel_h_perm16c: db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+                db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+                db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
+                db 48, 49, 50, 51, 49, 50, 51, 52, 50, 51, 52, 53, 51, 52, 53, 54
+spel_h_perm32a: db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
+                db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+                db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+                db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
+spel_h_perm32b: db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
+                db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
+                db 20, 21, 22, 23, 21, 22, 23, 24, 22, 23, 24, 25, 23, 24, 25, 26
+                db 28, 29, 30, 31, 29, 30, 31, 32, 30, 31, 32, 33, 31, 32, 33, 34
+spel_h_perm32c: db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+                db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+                db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
+                db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
+spel_hv_perm4a: db  8,  9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23
+                db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31
+spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39
+                db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47
+                db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55
+                db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63
+
+warp_8x8_shufA: db  0,  2,  4,  6,  1,  3,  5,  7,  1,  3,  5,  7,  2,  4,  6,  8
+                db  4,  6,  8, 10,  5,  7,  9, 11,  5,  7,  9, 11,  6,  8, 10, 12
+warp_8x8_shufB: db  2,  4,  6,  8,  3,  5,  7,  9,  3,  5,  7,  9,  4,  6,  8, 10
+                db  6,  8, 10, 12,  7,  9, 11, 13,  7,  9, 11, 13,  8, 10, 12, 14
+subpel_h_shuf4: db  0,  1,  2,  3,  1,  2,  3,  4,  8,  9, 10, 11,  9, 10, 11, 12
+                db  2,  3,  4,  5,  3,  4,  5,  6, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_h_shufA: db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
+subpel_h_shufB: db  4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
+subpel_h_shufC: db  8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_v_shuf4: db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
+bilin_h_shuf4:  db  1,  0,  2,  1,  3,  2,  4,  3,  9,  8, 10,  9, 11, 10, 12, 11
+bilin_h_shuf8:  db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
+bilin_v_shuf4:  db  4,  0,  5,  1,  6,  2,  7,  3,  8,  4,  9,  5, 10,  6, 11,  7
 deint_shuf4:    db  0,  4,  1,  5,  2,  6,  3,  7,  4,  8,  5,  9,  6, 10,  7, 11
 blend_shuf:     db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
 
@@ -90,6 +122,7 @@
 pw_2048: times 2 dw 2048
 pw_6903: times 2 dw 6903
 pw_8192: times 2 dw 8192
+pd_2:     dd 2
 pd_32:    dd 32
 pd_512:   dd 512
 pd_32768: dd 32768
@@ -170,6 +203,7 @@
 
 HV_JMP_TABLE put,  8tap,  avx2,      3, 2, 4, 8, 16, 32, 64, 128
 HV_JMP_TABLE prep, 8tap,  avx2,      1,    4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap,  avx512icl, 7,    4, 8, 16, 32, 64, 128
 HV_JMP_TABLE put,  bilin, avx2,      7, 2, 4, 8, 16, 32, 64, 128
 HV_JMP_TABLE prep, bilin, avx2,      7,    4, 8, 16, 32, 64, 128
 HV_JMP_TABLE prep, bilin, avx512icl, 7,    4, 8, 16, 32, 64, 128
@@ -1852,12 +1886,6 @@
     RET
 %endmacro
 
-INIT_ZMM  avx512icl
-PREP_BILIN
-
-INIT_YMM avx2
-PREP_BILIN
-
 ; int8_t subpel_filters[5][15][8]
 %assign FILTER_REGULAR (0*15 << 16) | 3*15
 %assign FILTER_SMOOTH  (1*15 << 16) | 4*15
@@ -2525,11 +2553,89 @@
     jg .hv_w8_loop0
     RET
 
-%if WIN64
-DECLARE_REG_TMP 6, 4
-%else
-DECLARE_REG_TMP 6, 7
-%endif
+%macro PREP_8TAP_H 0
+ %if cpuflag(avx512)
+    vpermb              m10, m5, m0
+    vpermb              m11, m5, m1
+    vpermb              m12, m6, m0
+    vpermb              m13, m6, m1
+    vpermb              m14, m7, m0
+    vpermb              m15, m7, m1
+    mova                 m0, m4
+    mova                 m2, m4
+    mova                 m1, m4
+    mova                 m3, m4
+    vpdpbusd             m0, m10, m8
+    vpdpbusd             m2, m12, m8
+    vpdpbusd             m1, m11, m8
+    vpdpbusd             m3, m13, m8
+    vpdpbusd             m0, m12, m9
+    vpdpbusd             m2, m14, m9
+    vpdpbusd             m1, m13, m9
+    vpdpbusd             m3, m15, m9
+    packssdw             m0, m2
+    packssdw             m1, m3
+    psraw                m0, 2
+    psraw                m1, 2
+    mova          [tmpq+ 0], m0
+    mova          [tmpq+64], m1
+ %else
+    pshufb               m1, m0, m5
+    pshufb               m2, m0, m6
+    pshufb               m3, m0, m7
+    pmaddubsw            m1, m8
+    pmaddubsw            m0, m2, m8
+    pmaddubsw            m2, m9
+    pmaddubsw            m3, m9
+    paddw                m1, m2
+    paddw                m0, m3
+    phaddw               m0, m1, m0
+    pmulhrsw             m0, m4
+ %endif
+%endmacro
+
+%macro PREP_8TAP_V_W4 5 ; round, weights
+    movd                xm0, [srcq+strideq*0]
+    vpbroadcastd        ym1, [srcq+strideq*2]
+    vpbroadcastd        xm2, [srcq+strideq*1]
+    vpbroadcastd        ym3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpblendd            ym1, ym1, ym0, 0x01 ; 0 2 2 _   2 _ _ _
+    vpblendd            ym3, ym3, ym2, 0x03 ; 1 1 3 3   3 3 _ _
+    vpbroadcastd        ym0, [srcq+strideq*0]
+    vpbroadcastd        ym2, [srcq+strideq*1]
+    vpblendd            ym1, ym1, ym0, 0x68 ; 0 2 2 4   2 4 4 _
+    vpbroadcastd        ym0, [srcq+strideq*2]
+    vbroadcasti128      ym5, [deint_shuf4]
+    vpblendd            ym3, ym3, ym2, 0xc0 ; 1 1 3 3   3 3 5 5
+    vpblendd            ym2, ym3, ym1, 0x55 ; 0 1 2 3   2 3 4 5
+    vpblendd            ym3, ym3, ym1, 0xaa ; 1 2 3 4   3 4 5 _
+    punpcklbw           ym1, ym2, ym3       ; 01  12    23  34
+    vpblendd            ym3, ym3, ym0, 0x80 ; 1 2 3 4   3 4 5 6
+    punpckhbw           ym2, ym3           ; 23  34    45  56
+.v_w4_loop:
+    pinsrd              xm0, [srcq+stride3q ], 1
+    lea                srcq, [srcq+strideq*4]
+    vpbroadcastd        ym3, [srcq+strideq*0]
+    vpbroadcastd        ym4, [srcq+strideq*1]
+    vpblendd            ym3, ym3, ym4, 0x20 ; _ _ 8 _   8 9 _ _
+    vpblendd            ym3, ym3, ym0, 0x03 ; 6 7 8 _   8 9 _ _
+    vpbroadcastd        ym0, [srcq+strideq*2]
+    vpblendd            ym3, ym3, ym0, 0x40 ; 6 7 8 _   8 9 a _
+    pshufb              ym3, ym5           ; 67  78    89  9a
+    pmaddubsw           ym4, ym1, ym%2
+    vperm2i128          ym1, ym2, ym3, 0x21 ; 45  56    67  78
+    pmaddubsw           ym2, ym%3
+    paddw               ym4, ym2
+    mova                ym2, ym3
+    pmaddubsw           ym3, ym%5
+    paddw               ym3, ym4
+    pmaddubsw           ym4, ym1, ym%4
+    paddw               ym3, ym4
+    pmulhrsw            ym3, ym%1
+    mova             [tmpq], ym3
+%endmacro
+
 %macro PREP_8TAP_FN 3 ; type, type_h, type_v
 cglobal prep_8tap_%1
     mov                 t0d, FILTER_%2
@@ -2539,6 +2645,12 @@
 %endif
 %endmacro
 
+%macro PREP_8TAP 0
+ %if WIN64
+  DECLARE_REG_TMP 6, 4
+ %else
+  DECLARE_REG_TMP 6, 7
+ %endif
 PREP_8TAP_FN regular,        REGULAR, REGULAR
 PREP_8TAP_FN regular_sharp,  REGULAR, SHARP
 PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
@@ -2554,7 +2666,7 @@
     add                 mxd, t0d ; 8tap_h, mx, 4tap_h
     imul                myd, mym, 0x010101
     add                 myd, t1d ; 8tap_v, my, 4tap_v
-    lea                  r7, [prep_avx2]
+    lea                  r7, [prep%+SUFFIX]
     movsxd               wq, wm
     movifnidn            hd, hm
     test                mxd, 0xf00
@@ -2572,27 +2684,53 @@
 .h:
     test                myd, 0xf00
     jnz .hv
-    vbroadcasti128       m5, [subpel_h_shufA]
+%if cpuflag(avx512)
+    vpbroadcastd         m4, [pd_2]
+%else
     vpbroadcastd         m4, [pw_8192]
+    vbroadcasti128       m5, [subpel_h_shufA]
+%endif
     WIN64_SPILL_XMM      10
     cmp                  wd, 4
     je .h_w4
     tzcnt                wd, wd
+%if notcpuflag(avx512)
     vbroadcasti128       m6, [subpel_h_shufB]
     vbroadcasti128       m7, [subpel_h_shufC]
+%endif
     shr                 mxd, 16
     sub                srcq, 3
     movzx                wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
-    vpbroadcastd         m8, [r7+mxq*8+subpel_filters-prep_avx2+0]
-    vpbroadcastd         m9, [r7+mxq*8+subpel_filters-prep_avx2+4]
+    vpbroadcastd         m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
+    vpbroadcastd         m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
     add                  wq, r7
     jmp                  wq
 .h_w4:
+%if cpuflag(avx512)
+    mov                 r3d, 0x4
+    kmovb                k1, r3d
+    vbroadcasti128      ym5, [subpel_h_shufA]
+%endif
     movzx               mxd, mxb
     dec                srcq
-    vpbroadcastd         m3, [r7+mxq*8+subpel_filters-prep_avx2+2]
+    vpbroadcastd        ym6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
     lea            stride3q, [strideq*3]
 .h_w4_loop:
+%if cpuflag(avx512icl)
+    mova                ym0, ym4
+    mova                ym1, ym4
+    movq                xm2, [srcq+strideq*0]
+    movq                xm3, [srcq+strideq*1]
+    vpbroadcastq    ym2{k1}, [srcq+strideq*2]
+    vpbroadcastq    ym3{k1}, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    pshufb              ym2, ym5
+    pshufb              ym3, ym5
+    vpdpbusd            ym0, ym2, ym6
+    vpdpbusd            ym1, ym3, ym6
+    packssdw            ym0, ym1
+    psraw               ym0, 2
+%else
     movq                xm0, [srcq+strideq*0]
     vpbroadcastq         m2, [srcq+strideq*2]
     movq                xm1, [srcq+strideq*1]
@@ -2602,42 +2740,69 @@
     vpblendd             m1, m1, m2, 0xf0
     pshufb               m0, m5
     pshufb               m1, m5
-    pmaddubsw            m0, m3
-    pmaddubsw            m1, m3
+    pmaddubsw            m0, m6
+    pmaddubsw            m1, m6
     phaddw               m0, m1
     pmulhrsw             m0, m4
-    mova             [tmpq], m0
+%endif
+    mova             [tmpq], ym0
     add                tmpq, 32
     sub                  hd, 4
     jg .h_w4_loop
     RET
 .h_w8:
-%macro PREP_8TAP_H 0
-    pshufb               m1, m0, m6
-    pshufb               m2, m0, m7
-    pshufb               m0, m5
-    pmaddubsw            m3, m1, m8
-    pmaddubsw            m1, m9
-    pmaddubsw            m2, m9
-    pmaddubsw            m0, m8
-    paddw                m2, m3
-    paddw                m0, m1
-    phaddw               m0, m2
-    pmulhrsw             m0, m4
-%endmacro
-    movu                xm0,     [srcq+strideq*0]
-    vinserti128          m0, m0, [srcq+strideq*1], 1
-    lea                srcq,     [srcq+strideq*2]
+%if cpuflag(avx512)
+    vbroadcasti128       m5, [subpel_h_shufA]
+    vbroadcasti128       m6, [subpel_h_shufB]
+    vbroadcasti128       m7, [subpel_h_shufC]
+    lea            stride3q, [strideq*3]
+%endif
+.h_w8_loop:
+    movu                xm0, [srcq+strideq*0]
+    vinserti128         ym0, [srcq+strideq*1], 1
+%if cpuflag(avx512)
+    vinserti128          m0, [srcq+strideq*2], 2
+    vinserti128          m0, [srcq+stride3q ], 3
+%endif
+    lea                srcq, [srcq+strideq*(mmsize/(8*2))]
+%if cpuflag(avx512icl)
+    mova                m10, m4
+    mova                m11, m4
+    pshufb               m1, m0, m5
+    pshufb               m2, m0, m6
+    pshufb               m3, m0, m7
+    vpdpbusd            m10, m1, m8
+    vpdpbusd            m11, m2, m8
+    vpdpbusd            m10, m2, m9
+    vpdpbusd            m11, m3, m9
+    packssdw            m10, m11
+    psraw                m0, m10, 2
+%else
     PREP_8TAP_H
+%endif
     mova             [tmpq], m0
-    add                tmpq, 32
-    sub                  hd, 2
-    jg .h_w8
+    add                tmpq, mmsize
+    sub                  hd, mmsize/(8*2)
+    jg .h_w8_loop
     RET
 .h_w16:
-    movu                xm0,     [srcq+strideq*0+8*0]
-    vinserti128          m0, m0, [srcq+strideq*0+8*1], 1
+%if cpuflag(avx512icl)
+    mova                 m5, [spel_h_perm16a]
+    mova                 m6, [spel_h_perm16b]
+    mova                 m7, [spel_h_perm16c]
+    lea            stride3q, [strideq*3]
+.h_w16_loop:
+    movu                ym0, [srcq+strideq*0]
+    movu                ym1, [srcq+strideq*2]
+    vinserti32x8         m0, [srcq+strideq*1], 1
+    vinserti32x8         m1, [srcq+stride3q ], 1
+    lea                srcq, [srcq+strideq*4]
     PREP_8TAP_H
+%else
+.h_w16_loop:
+    movu                xm0, [srcq+strideq*0+8*0]
+    vinserti128          m0, [srcq+strideq*0+8*1], 1
+    PREP_8TAP_H
     mova        [tmpq+32*0], m0
     movu                xm0,     [srcq+strideq*1+8*0]
     vinserti128          m0, m0, [srcq+strideq*1+8*1], 1
@@ -2644,32 +2809,67 @@
     lea                srcq, [srcq+strideq*2]
     PREP_8TAP_H
     mova        [tmpq+32*1], m0
-    add                tmpq, 64
-    sub                  hd, 2
-    jg .h_w16
+%endif
+    add                tmpq, mmsize*2
+    sub                  hd, mmsize*2/(16*2)
+    jg .h_w16_loop
     RET
 .h_w32:
+%if cpuflag(avx512icl)
+    mova                 m5, [spel_h_perm32a]
+    mova                 m6, [spel_h_perm32b]
+    mova                 m7, [spel_h_perm32c]
+.h_w32_loop:
+    movu                 m0, [srcq+strideq*0]
+    movu                 m1, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    PREP_8TAP_H
+    add                tmpq, 64*2
+    sub                  hd, 2
+    jg .h_w32_loop
+    RET
+%else
     xor                 r6d, r6d
     jmp .h_start
+%endif
 .h_w64:
+%if cpuflag(avx512)
+    xor                 r6d, r6d
+%else
     mov                  r6, -32*1
+%endif
     jmp .h_start
 .h_w128:
+%if cpuflag(avx512)
+    mov                  r6, -64*1
+%else
     mov                  r6, -32*3
+%endif
 .h_start:
+%if cpuflag(avx512)
+    mova                 m5, [spel_h_perm32a]
+    mova                 m6, [spel_h_perm32b]
+    mova                 m7, [spel_h_perm32c]
+%endif
     sub                srcq, r6
     mov                  r5, r6
 .h_loop:
-    movu                xm0,     [srcq+r6+8*0]
-    vinserti128          m0, m0, [srcq+r6+8*1], 1
+%if cpuflag(avx512icl)
+    movu                 m0, [srcq+r6+32*0]
+    movu                 m1, [srcq+r6+32*1]
     PREP_8TAP_H
+%else
+    movu                xm0, [srcq+r6+8*0]
+    vinserti128         ym0, [srcq+r6+8*1], 1
+    PREP_8TAP_H
     mova        [tmpq+32*0], m0
-    movu                xm0,     [srcq+r6+8*2]
-    vinserti128          m0, m0, [srcq+r6+8*3], 1
+    movu                xm0, [srcq+r6+8*2]
+    vinserti128         ym0, [srcq+r6+8*3], 1
     PREP_8TAP_H
     mova        [tmpq+32*1], m0
-    add                tmpq, 64
-    add                  r6, 32
+%endif
+    add                tmpq, mmsize*2
+    add                  r6, mmsize
     jle .h_loop
     add                srcq, strideq
     mov                  r6, r5
@@ -2684,62 +2884,89 @@
     cmp                  hd, 4   ; a separate 4-tap code path for (4|8|16)x4
     cmove               myd, mxd ; had a negligible effect on performance.
     ; TODO: Would a 6-tap code path be worth it?
+%if cpuflag(avx512)
+    tzcnt                wd, wd
+    movzx                wd, word [r7+wq*2+table_offset(prep, _8tap_v)]
+    add                  wq, r7
+%endif
+    lea                 myq, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+    lea            stride3q, [strideq*3]
+    sub                srcq, stride3q
     vpbroadcastd         m7, [pw_8192]
-    lea                 myq, [r7+myq*8+subpel_filters-prep_avx2]
     vpbroadcastw         m8, [myq+0]
     vpbroadcastw         m9, [myq+2]
     vpbroadcastw        m10, [myq+4]
     vpbroadcastw        m11, [myq+6]
-    lea            stride3q, [strideq*3]
-    sub                srcq, stride3q
+%if cpuflag(avx512)
+    jmp                  wq
+%else
     cmp                  wd, 8
     jg .v_w16
     je .v_w8
+%endif
 .v_w4:
-    movd                xm0, [srcq+strideq*0]
-    vpbroadcastd         m1, [srcq+strideq*2]
-    vpbroadcastd        xm2, [srcq+strideq*1]
-    vpbroadcastd         m3, [srcq+stride3q ]
-    lea                srcq, [srcq+strideq*4]
-    vpblendd             m1, m1, m0, 0x01 ; 0 2 2 _   2 _ _ _
-    vpblendd             m3, m3, m2, 0x03 ; 1 1 3 3   3 3 _ _
-    vpbroadcastd         m0, [srcq+strideq*0]
-    vpbroadcastd         m2, [srcq+strideq*1]
-    vpblendd             m1, m1, m0, 0x68 ; 0 2 2 4   2 4 4 _
-    vpbroadcastd         m0, [srcq+strideq*2]
-    vbroadcasti128       m6, [deint_shuf4]
-    vpblendd             m3, m3, m2, 0xc0 ; 1 1 3 3   3 3 5 5
-    vpblendd             m2, m3, m1, 0x55 ; 0 1 2 3   2 3 4 5
-    vpblendd             m3, m3, m1, 0xaa ; 1 2 3 4   3 4 5 _
-    punpcklbw            m1, m2, m3       ; 01  12    23  34
-    vpblendd             m3, m3, m0, 0x80 ; 1 2 3 4   3 4 5 6
-    punpckhbw            m2, m3           ; 23  34    45  56
-.v_w4_loop:
-    pinsrd              xm0, [srcq+stride3q ], 1
-    lea                srcq, [srcq+strideq*4]
-    vpbroadcastd         m3, [srcq+strideq*0]
-    vpbroadcastd         m4, [srcq+strideq*1]
-    vpblendd             m3, m3, m4, 0x20 ; _ _ 8 _   8 9 _ _
-    vpblendd             m3, m3, m0, 0x03 ; 6 7 8 _   8 9 _ _
-    vpbroadcastd         m0, [srcq+strideq*2]
-    vpblendd             m3, m3, m0, 0x40 ; 6 7 8 _   8 9 a _
-    pshufb               m3, m6           ; 67  78    89  9a
-    pmaddubsw            m4, m1, m8
-    vperm2i128           m1, m2, m3, 0x21 ; 45  56    67  78
-    pmaddubsw            m2, m9
-    paddw                m4, m2
-    mova                 m2, m3
-    pmaddubsw            m3, m11
-    paddw                m3, m4
-    pmaddubsw            m4, m1, m10
-    paddw                m3, m4
-    pmulhrsw             m3, m7
-    mova             [tmpq], m3
+%if cpuflag(avx512)
+    AVX512_MM_PERMUTATION
+    PREP_8TAP_V_W4 23, 24, 25, 26, 27
+    AVX512_MM_PERMUTATION
+%else
+    PREP_8TAP_V_W4 7, 8, 9, 10, 11
+%endif
     add                tmpq, 32
     sub                  hd, 4
     jg .v_w4_loop
+%if cpuflag(avx512)
+    vzeroupper
+%endif
     RET
 .v_w8:
+%if cpuflag(avx512)
+    mov                 r3d, 0xf044
+    kmovw                k1, r3d
+    kshiftrw             k2, k1, 8
+    movq                xm0, [srcq+strideq*0]
+    vpbroadcastq        ym1, [srcq+strideq*1]
+    vpbroadcastq         m2, [srcq+strideq*2]
+    vpbroadcastq         m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpbroadcastq         m4, [srcq+strideq*0]
+    vpbroadcastq         m5, [srcq+strideq*1]
+    vpbroadcastq         m6, [srcq+strideq*2]
+    vmovdqa64       ym0{k1}, ym1
+    vmovdqa64       ym1{k1}, ym2
+    vmovdqa64        m2{k1}, m3
+    vmovdqa64        m3{k1}, m4
+    vmovdqa64        m4{k1}, m5
+    vmovdqa64        m5{k1}, m6
+    punpcklbw           ym0, ym1 ; 01 12 __ __
+    punpcklbw            m2, m3  ; 23 34 23 34
+    punpcklbw            m4, m5  ; 45 56 45 56
+    vmovdqa64        m0{k2}, m2  ; 01 12 23 34
+    vmovdqa64        m2{k2}, m4  ; 23 34 45 56
+.v_w8_loop:
+    vpbroadcastq         m1, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpbroadcastq         m3, [srcq+strideq*0]
+    vpbroadcastq         m5, [srcq+strideq*1]
+    pmaddubsw           m14, m0, m8
+    pmaddubsw           m15, m2, m9
+    vpblendmq        m0{k1}, m6, m1
+    vpblendmq        m2{k1}, m1, m3
+    vpbroadcastq         m6, [srcq+strideq*2]
+    paddw               m14, m15
+    punpcklbw            m2, m0, m2   ; 67 78 67 78
+    vpblendmq       m12{k1}, m3, m5
+    vpblendmq       m13{k1}, m5, m6
+    vpblendmq        m0{k2}, m4, m2   ; 45 56 67 78
+    punpcklbw            m4, m12, m13 ; 89 9a 89 9a
+    vmovdqa64        m2{k2}, m4       ; 67 78 89 9a
+    pmaddubsw           m12, m0, m10
+    pmaddubsw           m13, m2, m11
+    paddw               m14, m12
+    paddw               m14, m13
+    pmulhrsw            m14, m7
+    mova             [tmpq], m14
+%else
     movq                xm1, [srcq+strideq*0]
     vpbroadcastq         m4, [srcq+strideq*1]
     vpbroadcastq         m2, [srcq+strideq*2]
@@ -2788,11 +3015,73 @@
     pmulhrsw             m6, m7
     mova        [tmpq+32*0], m5
     mova        [tmpq+32*1], m6
+%endif
     add                tmpq, 32*2
     sub                  hd, 4
     jg .v_w8_loop
     RET
 .v_w16:
+%if cpuflag(avx512)
+    mov                 r3d, 0xf0
+    kmovb                k1, r3d
+    vbroadcasti128       m0, [srcq+strideq*0]
+    vbroadcasti128       m1, [srcq+strideq*1]
+    vbroadcasti128       m2, [srcq+strideq*2]
+    vbroadcasti128       m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vbroadcasti128       m4, [srcq+strideq*0]
+    vbroadcasti128       m5, [srcq+strideq*1]
+    vbroadcasti128       m6, [srcq+strideq*2]
+    vmovdqa64        m0{k1}, m1
+    vmovdqa64        m1{k1}, m2
+    vmovdqa64        m2{k1}, m3
+    vmovdqa64        m3{k1}, m4
+    vmovdqa64        m4{k1}, m5
+    vmovdqa64        m5{k1}, m6
+    shufpd               m0, m2, 0xcc ; 0a_2a 0b_2b 1a_3a 1b_3b
+    shufpd               m1, m3, 0xcc ; 1a_3a 1b_3b 2a_4a 2b_4b
+    shufpd               m4, m4, 0x44 ; 4a_-- 4b_-- 5a_-- 5b_--
+    shufpd               m5, m5, 0x44 ; 5a_-- 5b_-- 6a_-- 6b_--
+    punpckhbw            m2, m0, m1   ;  23a   23b   34a   34b
+    punpcklbw            m0, m1       ;  01a   01b   12a   12b
+    punpcklbw            m4, m5       ;  45a   45b   56a   56b
+.v_w16_loop:
+    vbroadcasti128       m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vbroadcasti128       m5, [srcq+strideq*0]
+    vpblendmq        m1{k1}, m6, m3
+    vmovdqa64        m3{k1}, m5
+    pmaddubsw           m12, m0, m8
+    pmaddubsw           m13, m2, m8
+    pmaddubsw           m14, m2, m9
+    pmaddubsw           m15, m4, m9
+    pmaddubsw            m0, m4, m10
+    vbroadcasti128       m2, [srcq+strideq*1]
+    vbroadcasti128       m6, [srcq+strideq*2]
+    paddw               m12, m14
+    paddw               m13, m15
+    paddw               m12, m0
+    vmovdqa64        m5{k1}, m2
+    vmovdqa64        m2{k1}, m6
+    mova                 m0, m4
+    shufpd               m1, m5, 0xcc ; 6a_8a 6b_8b 7a_9a 7b_9b
+    shufpd               m3, m2, 0xcc ; 7a_9a 7b_9b 8a_Aa 8b_Ab
+    punpcklbw            m2, m1, m3   ;  67a   67b   78a   78b
+    punpckhbw            m4, m1, m3   ;  89a   89b   9Aa   9Ab
+    pmaddubsw           m14, m2, m10
+    pmaddubsw           m15, m2, m11
+    paddw               m13, m14
+    paddw               m12, m15
+    pmaddubsw           m14, m4, m11
+    paddw               m13, m14
+    pmulhrsw            m12, m7
+    pmulhrsw            m13, m7
+    mova          [tmpq+ 0], m12
+    mova          [tmpq+64], m13
+    add                tmpq, 64*2
+    sub                  hd, 4
+    jg .v_w16_loop
+%else
     lea                 r6d, [wq-16]
     mov                  r5, tmpq
     mov                  r7, srcq
@@ -2852,7 +3141,7 @@
     lea                tmpq, [tmpq+wq*4]
     sub                  hd, 2
     jg .v_w16_loop
-    mov                  hb, r6b
+    movzx                hd, r6b
     add                  r5, 32
     add                  r7, 16
     mov                tmpq, r5
@@ -2859,32 +3148,287 @@
     mov                srcq, r7
     sub                 r6d, 1<<8
     jg .v_w16_loop0
+%endif
     RET
+%if cpuflag(avx512)
+.v_w32:
+    mova                m18, [bilin_v_perm64]
+    movu                ym0, [srcq+strideq*0]
+    movu                ym1, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    movu                ym2, [srcq+strideq*0]
+    movu                ym3, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    movu                ym4, [srcq+strideq*0]
+    movu                ym5, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    movu                ym6, [srcq+strideq*0]
+    vpermq               m0, m18, m0
+    vpermq               m1, m18, m1
+    vpermq               m2, m18, m2
+    vpermq               m3, m18, m3
+    vpermq               m4, m18, m4
+    vpermq               m5, m18, m5
+    vpermq               m6, m18, m6
+    punpcklbw            m0, m1
+    punpcklbw            m1, m2
+    punpcklbw            m2, m3
+    punpcklbw            m3, m4
+    punpcklbw            m4, m5
+    punpcklbw            m5, m6
+.v_w32_loop:
+    movu               ym12, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    movu               ym13, [srcq+strideq*0]
+    pmaddubsw           m14, m0, m8
+    pmaddubsw           m16, m2, m9
+    pmaddubsw           m15, m1, m8
+    pmaddubsw           m17, m3, m9
+    mova                 m0, m2
+    mova                 m1, m3
+    vpermq              m12, m18, m12
+    vpermq              m13, m18, m13
+    paddw               m14, m16
+    paddw               m15, m17
+    pmaddubsw           m16, m4, m10
+    pmaddubsw           m17, m5, m10
+    punpcklbw            m6, m12
+    punpcklbw           m12, m13
+    mova                 m2, m4
+    mova                 m3, m5
+    paddw               m14, m16
+    paddw               m15, m17
+    pmaddubsw           m16, m6, m11
+    pmaddubsw           m17, m12, m11
+    mova                 m4, m6
+    mova                 m5, m12
+    paddw               m14, m16
+    paddw               m15, m17
+    pmulhrsw            m14, m7
+    pmulhrsw            m15, m7
+    mova                 m6, m13
+    mova          [tmpq+ 0], m14
+    mova          [tmpq+64], m15
+    add                tmpq, 64*2
+    sub                  hd, 2
+    jg .v_w32_loop
+    vzeroupper
+    RET
+.v_w64:
+    mov                 r6d, hd
+    mov                  wd, 64
+    jmp .v_start
+.v_w128:
+    lea                 r6d, [(1<<8)+hq]
+    mov                  wd, 128
+.v_start:
+    WIN64_SPILL_XMM      27
+    mova                m26, [bilin_v_perm64]
+    mov                  r5, tmpq
+    mov                  r7, srcq
+.v_loop0:
+    vpermq               m0, m26, [srcq+strideq*0]
+    vpermq               m1, m26, [srcq+strideq*1]
+    lea                srcq,      [srcq+strideq*2]
+    vpermq               m2, m26, [srcq+strideq*0]
+    vpermq               m3, m26, [srcq+strideq*1]
+    lea                srcq,      [srcq+strideq*2]
+    vpermq               m4, m26, [srcq+strideq*0]
+    vpermq               m5, m26, [srcq+strideq*1]
+    lea                srcq,      [srcq+strideq*2]
+    vpermq               m6, m26, [srcq+strideq*0]
+    punpckhbw           m12, m0, m1
+    punpcklbw            m0, m1
+    punpckhbw           m13, m1, m2
+    punpcklbw            m1, m2
+    punpckhbw           m14, m2, m3
+    punpcklbw            m2, m3
+    punpckhbw           m15, m3, m4
+    punpcklbw            m3, m4
+    punpckhbw           m16, m4, m5
+    punpcklbw            m4, m5
+    punpckhbw           m17, m5, m6
+    punpcklbw            m5, m6
+.v_loop:
+    vpermq              m18, m26, [srcq+strideq*1]
+    lea                srcq,      [srcq+strideq*2]
+    vpermq              m19, m26, [srcq+strideq*0]
+    pmaddubsw           m20, m0, m8
+    pmaddubsw           m21, m12, m8
+    pmaddubsw           m22, m1, m8
+    pmaddubsw           m23, m13, m8
+    mova                 m0, m2
+    mova                m12, m14
+    mova                 m1, m3
+    mova                m13, m15
+    pmaddubsw            m2, m9
+    pmaddubsw           m14, m9
+    pmaddubsw            m3, m9
+    pmaddubsw           m15, m9
+    punpckhbw           m24, m6, m18
+    punpcklbw            m6, m18
+    paddw               m20, m2
+    paddw               m21, m14
+    paddw               m22, m3
+    paddw               m23, m15
+    mova                 m2, m4
+    mova                m14, m16
+    mova                 m3, m5
+    mova                m15, m17
+    pmaddubsw            m4, m10
+    pmaddubsw           m16, m10
+    pmaddubsw            m5, m10
+    pmaddubsw           m17, m10
+    punpckhbw           m25, m18, m19
+    punpcklbw           m18, m19
+    paddw               m20, m4
+    paddw               m21, m16
+    paddw               m22, m5
+    paddw               m23, m17
+    mova                 m4, m6
+    mova                m16, m24
+    mova                 m5, m18
+    mova                m17, m25
+    pmaddubsw            m6, m11
+    pmaddubsw           m24, m11
+    pmaddubsw           m18, m11
+    pmaddubsw           m25, m11
+    paddw               m20, m6
+    paddw               m21, m24
+    paddw               m22, m18
+    paddw               m23, m25
+    pmulhrsw            m20, m7
+    pmulhrsw            m21, m7
+    pmulhrsw            m22, m7
+    pmulhrsw            m23, m7
+    mova                 m6, m19
+    mova     [tmpq+wq*0+ 0], m20
+    mova     [tmpq+wq*0+64], m21
+    mova     [tmpq+wq*2+ 0], m22
+    mova     [tmpq+wq*2+64], m23
+    lea                tmpq, [tmpq+wq*4]
+    sub                  hd, 2
+    jg .v_loop
+    movzx                hd, r6b
+    add                  r5, 64*2
+    add                  r7, 64
+    mov                tmpq, r5
+    mov                srcq, r7
+    sub                 r6d, 1<<8
+    jg .v_loop0
+%endif
+    RET
 .hv:
     %assign stack_offset stack_offset - stack_size_padded
+    %assign stack_size_padded 0
     WIN64_SPILL_XMM      16
     cmp                  wd, 4
-    jg .hv_w8
+    je .hv_w4
+    shr                 mxd, 16
+    sub                srcq, 3
+    vpbroadcastd        m10, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
+    vpbroadcastd        m11, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
+    movzx               mxd, myb
+    shr                 myd, 16
+    cmp                  hd, 4
+    cmove               myd, mxd
+%if cpuflag(avx512)
+    tzcnt                wd, wd
+    vpbroadcastd         m8, [pd_2]
+    movzx                wd, word [r7+wq*2+table_offset(prep, _8tap_hv)]
+    vpbroadcastd         m9, [pd_32]
+    add                  wq, r7
+%endif
+    vpbroadcastq         m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+    lea            stride3q, [strideq*3]
+    sub                srcq, stride3q
+    punpcklbw            m0, m0
+    psraw                m0, 8 ; sign-extend
+    pshufd              m12, m0, q0000
+    pshufd              m13, m0, q1111
+    pshufd              m14, m0, q2222
+    pshufd              m15, m0, q3333
+%if cpuflag(avx512)
+    jmp                  wq
+%else
+    jmp .hv_w8
+%endif
+.hv_w4:
     movzx               mxd, mxb
     dec                srcq
-    mova                 m7, [subpel_h_shuf4]
-    vpbroadcastd         m8, [r7+mxq*8+subpel_filters-prep_avx2+2]
-    pmovzxbd             m9, [deint_shuf4]
+    vpbroadcastd         m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
     movzx               mxd, myb
     shr                 myd, 16
     cmp                  hd, 4
     cmove               myd, mxd
-    vpbroadcastq         m0, [r7+myq*8+subpel_filters-prep_avx2]
+    vpbroadcastq         m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
     lea            stride3q, [strideq*3]
     sub                srcq, stride3q
+%if cpuflag(avx512)
+    mov                 r3d, 0x04
+    kmovb                k1, r3d
+    kshiftlb             k2, k1, 2
+    kshiftlb             k3, k1, 4
+    vpbroadcastd        m10, [pd_2]
+    vbroadcasti128      m16, [subpel_h_shufA]
+%else
+    mova                 m7, [subpel_h_shuf4]
+    pmovzxbd             m9, [deint_shuf4]
+    vpbroadcastd        m10, [pw_8192]
+%endif
     punpcklbw            m0, m0
     psraw                m0, 8 ; sign-extend
-    vpbroadcastd        m10, [pw_8192]
     vpbroadcastd        m11, [pd_32]
     pshufd              m12, m0, q0000
     pshufd              m13, m0, q1111
     pshufd              m14, m0, q2222
     pshufd              m15, m0, q3333
+%if cpuflag(avx512icl)
+    movq                xm3, [srcq+strideq*0]
+    vpbroadcastq        ym2, [srcq+strideq*1]
+    vpbroadcastq    ym3{k1}, [srcq+strideq*2]
+    vpbroadcastq     m2{k2}, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vpbroadcastq     m3{k2}, [srcq+strideq*0]
+    vpbroadcastq     m2{k3}, [srcq+strideq*1]
+    vpbroadcastq     m3{k3}, [srcq+strideq*2]
+    mova                m17, [spel_hv_perm4a]
+    movu                m18, [spel_hv_perm4b]
+    mova                 m0, m10
+    mova                 m1, m10
+    pshufb               m2, m16
+    pshufb               m3, m16
+    vpdpbusd             m0, m2, m8
+    vpdpbusd             m1, m3, m8
+    packssdw             m0, m1        ; _ 0  1 2  3 4  5 6
+    psraw                m0, 2
+    vpermb               m1, m17, m0   ; 01 12 23 34
+    vpermb               m2, m18, m0   ; 23 34 45 56
+.hv_w4_loop:
+    movq                xm3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    movq                xm4, [srcq+strideq*0]
+    vpbroadcastq    ym3{k1}, [srcq+strideq*1]
+    vpbroadcastq    ym4{k1}, [srcq+strideq*2]
+    mova                ym5, ym10
+    mova                ym6, ym10
+    pshufb              ym3, ym16
+    pshufb              ym4, ym16
+    vpdpbusd            ym5, ym3, ym8
+    vpdpbusd            ym6, ym4, ym8
+    mova                 m7, m11
+    packssdw            ym5, ym6       ; 7 8  9 a  _ _  _ _
+    psraw               ym5, 2
+    valignq              m0, m5, m0, 4 ; _ 4  5 6  7 8  9 a
+    vpdpwssd             m7, m1, m12
+    vpdpwssd             m7, m2, m13
+    vpermb               m1, m17, m0   ; 45 56 67 78
+    vpermb               m2, m18, m0   ; 67 78 89 9a
+    vpdpwssd             m7, m1, m14
+    vpdpwssd             m7, m2, m15
+    psrad                m7, 6
+    vpmovdw          [tmpq], m7
+%else
     vpbroadcastq         m2, [srcq+strideq*0]
     vpbroadcastq         m4, [srcq+strideq*1]
     vpbroadcastq         m0, [srcq+strideq*2]
@@ -2896,19 +3440,19 @@
     vpblendd             m2, m2, m4, 0xcc ; 0 1
     vpblendd             m0, m0, m5, 0xcc ; 2 3
     vpblendd             m3, m3, m6, 0xcc ; 4 5
-    pshufb               m2, m7
-    pshufb               m0, m7
-    pshufb               m3, m7
-    pshufb               m1, m7
+    pshufb               m2, m7 ; 00 01 10 11  02 03 12 13
+    pshufb               m0, m7 ; 20 21 30 31  22 23 32 33
+    pshufb               m3, m7 ; 40 41 50 51  42 43 52 53
+    pshufb               m1, m7 ; 60 61 60 61  62 63 62 63
     pmaddubsw            m2, m8
     pmaddubsw            m0, m8
     pmaddubsw            m3, m8
     pmaddubsw            m1, m8
-    phaddw               m2, m0
-    phaddw               m3, m1
+    phaddw               m2, m0 ; 0a 1a 2a 3a  0b 1b 2b 3b
+    phaddw               m3, m1 ; 4a 5a 6a __  4b 5b 6b __
     pmulhrsw             m2, m10
     pmulhrsw             m3, m10
-    palignr              m4, m3, m2, 4
+    palignr              m4, m3, m2, 4 ; 1a 2a 3a 4a  1b 2b 3b 4b
     punpcklwd            m1, m2, m4  ; 01 12
     punpckhwd            m2, m4      ; 23 34
     pshufd               m0, m3, q2121
@@ -2953,28 +3497,100 @@
     packssdw             m5, m6
     vpermd               m5, m9, m5
     mova             [tmpq], m5
+%endif
     add                tmpq, 32
     sub                  hd, 4
     jg .hv_w4_loop
+%if cpuflag(avx512)
+    vzeroupper
+%endif
     RET
 .hv_w8:
-    shr                 mxd, 16
-    sub                srcq, 3
-    vpbroadcastd        m10, [r7+mxq*8+subpel_filters-prep_avx2+0]
-    vpbroadcastd        m11, [r7+mxq*8+subpel_filters-prep_avx2+4]
-    movzx               mxd, myb
-    shr                 myd, 16
-    cmp                  hd, 4
-    cmove               myd, mxd
-    vpbroadcastq         m0, [r7+myq*8+subpel_filters-prep_avx2]
-    lea            stride3q, [strideq*3]
-    sub                srcq, stride3q
-    punpcklbw            m0, m0
-    psraw                m0, 8 ; sign-extend
-    pshufd              m12, m0, q0000
-    pshufd              m13, m0, q1111
-    pshufd              m14, m0, q2222
-    pshufd              m15, m0, q3333
+%if cpuflag(avx512icl)
+    WIN64_SPILL_XMM      24
+    vbroadcasti128      m16, [subpel_h_shufA]
+    vbroadcasti128      m17, [subpel_h_shufB]
+    vbroadcasti128      m18, [subpel_h_shufC]
+    vinserti128         ym0, [srcq+strideq*0], 1
+    vinserti128          m0, [srcq+strideq*1], 2
+    vinserti128          m0, [srcq+strideq*2], 3
+    movu                xm1, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vinserti128         ym1, [srcq+strideq*0], 1
+    vinserti128          m1, [srcq+strideq*1], 2
+    vinserti128          m1, [srcq+strideq*2], 3
+    mova                 m2, m8
+    mova                 m4, m8
+    mova                 m3, m8
+    mova                 m5, m8
+    pshufb              m20, m0, m16
+    pshufb              m21, m0, m17
+    pshufb              m22, m0, m18
+    pshufb              m23, m1, m16
+    pshufb               m6, m1, m17
+    pshufb               m7, m1, m18
+    vpdpbusd             m2, m20, m10
+    vpdpbusd             m4, m21, m10
+    vpdpbusd             m2, m21, m11
+    vpdpbusd             m4, m22, m11
+    vpdpbusd             m3, m23, m10
+    vpdpbusd             m5,  m6, m10
+    vpdpbusd             m3,  m6, m11
+    vpdpbusd             m5,  m7, m11
+    packssdw             m2, m4
+    packssdw             m3, m5
+    psraw                m2, 2          ; _ 0 1 2
+    psraw                m3, 2          ; 3 4 5 6
+    valignq              m0, m3, m2, 2  ; 0 1 2 3
+    valignq              m1, m3, m2, 4  ; 1 2 3 4
+    valignq              m2, m3, m2, 6  ; 2 3 4 5
+    punpcklwd            m4, m0, m1 ; 01a 12a 23a 34a
+    punpckhwd            m5, m0, m1 ; 01b 12b 23b 34b
+    punpcklwd            m6, m2, m3 ; 23a 34a 45a 56a
+    punpckhwd            m7, m2, m3 ; 23b 34b 45b 56b
+.hv_w8_loop:
+    movu               xm19, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    vinserti128        ym19, [srcq+strideq*0], 1
+    vinserti128         m19, [srcq+strideq*1], 2
+    vinserti128         m19, [srcq+strideq*2], 3
+    mova                m20, m9
+    mova                m21, m9
+    mova                m22, m8
+    mova                m23, m8
+    vpdpwssd            m20, m4, m12
+    vpdpwssd            m21, m5, m12
+    vpdpwssd            m20, m6, m13
+    vpdpwssd            m21, m7, m13
+    pshufb               m0, m19, m16
+    pshufb               m1, m19, m17
+    pshufb               m2, m19, m18
+    vpdpbusd            m22, m0, m10
+    vpdpbusd            m23, m1, m10
+    vpdpbusd            m22, m1, m11
+    vpdpbusd            m23, m2, m11
+    packssdw            m22, m23
+    psraw               m22, 2          ; 7 8 9 A
+    valignq              m0, m22, m3, 2 ; 4 5 6 7
+    valignq              m1, m22, m3, 4 ; 5 6 7 8
+    valignq              m2, m22, m3, 6 ; 6 7 8 9
+    mova                 m3, m22
+    punpcklwd            m4, m0, m1 ; 45a 56a 67a 78a
+    punpckhwd            m5, m0, m1 ; 45b 56b 67b 78b
+    punpcklwd            m6, m2, m3 ; 67a 78a 89a 9Aa
+    punpckhwd            m7, m2, m3 ; 67b 78b 89b 9Ab
+    vpdpwssd            m20, m4, m14
+    vpdpwssd            m21, m5, m14
+    vpdpwssd            m20, m6, m15
+    vpdpwssd            m21, m7, m15
+    psrad               m20, 6
+    psrad               m21, 6
+    packssdw            m20, m21
+    mova             [tmpq], m20
+    add                tmpq, 64
+    sub                  hd, 4
+    jg .hv_w8_loop
+%else
     lea                 r6d, [wq-8]
     mov                  r5, tmpq
     mov                  r7, srcq
@@ -3060,7 +3676,7 @@
     lea                tmpq, [tmpq+wq*4]
     sub                  hd, 2
     jg .hv_w8_loop
-    mov                  hb, r6b
+    movzx                hd, r6b
     add                  r5, 16
     add                  r7, 8
     mov                tmpq, r5
@@ -3067,7 +3683,143 @@
     mov                srcq, r7
     sub                 r6d, 1<<8
     jg .hv_w8_loop0
+%endif
     RET
+%if cpuflag(avx512icl)
+.hv_w16:
+    mov                  wd, 16*2
+    jmp .hv_start
+.hv_w32:
+    mov                  wd, 32*2
+    jmp .hv_start
+.hv_w64:
+    mov                  wd, 64*2
+    jmp .hv_start
+.hv_w128:
+    mov                  wd, 128*2
+.hv_start:
+    WIN64_SPILL_XMM      31
+    mova                m16, [spel_h_perm16a]
+    mova                m17, [spel_h_perm16b]
+    mova                m18, [spel_h_perm16c]
+    lea                 r6d, [wq*8-16*2*8+hq]
+    mov                  r5, tmpq
+    mov                  r7, srcq
+.hv_loop0:
+    movu                ym0, [srcq+strideq*0]
+    vinserti32x8         m0, [srcq+strideq*1], 1
+    lea                srcq, [srcq+strideq*2]
+    movu                ym1, [srcq+strideq*0]
+    vinserti32x8         m1, [srcq+strideq*1], 1
+    lea                srcq, [srcq+strideq*2]
+    movu                ym2, [srcq+strideq*0]
+    vinserti32x8         m2, [srcq+strideq*1], 1
+    lea                srcq, [srcq+strideq*2]
+    movu                ym3, [srcq+strideq*0]
+    mova                 m4, m8
+    mova                 m5, m8
+    mova                 m6, m8
+    mova                 m7, m8
+    vpermb              m19, m16, m0
+    vpermb              m20, m17, m0
+    vpermb              m21, m18, m0
+    vpermb              m22, m16, m1
+    vpermb              m23, m17, m1
+    vpermb              m24, m18, m1
+    vpermb              m25, m16, m2
+    vpermb              m26, m17, m2
+    vpermb              m27, m18, m2
+    vpermb             ym28, ym16, ym3
+    vpermb             ym29, ym17, ym3
+    vpermb             ym30, ym18, ym3
+    mova                 m0, m8
+    mova                 m1, m8
+    mova                ym2, ym8
+    mova                ym3, ym8
+    vpdpbusd             m4, m19, m10
+    vpdpbusd             m5, m20, m10
+    vpdpbusd             m6, m22, m10
+    vpdpbusd             m7, m23, m10
+    vpdpbusd             m0, m25, m10
+    vpdpbusd             m1, m26, m10
+    vpdpbusd            ym2, ym28, ym10
+    vpdpbusd            ym3, ym29, ym10
+    vpdpbusd             m4, m20, m11
+    vpdpbusd             m5, m21, m11
+    vpdpbusd             m6, m23, m11
+    vpdpbusd             m7, m24, m11
+    vpdpbusd             m0, m26, m11
+    vpdpbusd             m1, m27, m11
+    vpdpbusd            ym2, ym29, ym11
+    vpdpbusd            ym3, ym30, ym11
+    packssdw             m4, m5
+    packssdw             m6, m7
+    packssdw             m0, m1
+    packssdw            ym2, ym3
+    psraw                m4, 2             ; 0a 0b 1a 1b
+    psraw                m6, 2             ; 2a 2b 3a 3b
+    psraw                m0, 2             ; 4a 4b 5a 5b
+    psraw               ym2, 2             ; 6a 6b __ __
+    vshufi32x4           m5, m4, m6, q1032 ; 1a 1b 2a 2b
+    vshufi32x4           m7, m6, m0, q1032 ; 3a 3b 4a 4b
+    vshufi32x4           m1, m0, m2, q1032 ; 5a 5b 6a 6b
+    punpcklwd            m2, m4, m5 ; 01a 01c 12a 12c
+    punpckhwd            m3, m4, m5 ; 01b 01d 12b 12d
+    punpcklwd            m4, m6, m7 ; 23a 23c 34a 34c
+    punpckhwd            m5, m6, m7 ; 23b 23d 34b 34d
+    punpcklwd            m6, m0, m1 ; 45a 45c 56a 56c
+    punpckhwd            m7, m0, m1 ; 45b 45d 56b 56d
+.hv_loop:
+    movu               ym19, [srcq+strideq*1]
+    lea                srcq, [srcq+strideq*2]
+    vinserti32x8        m19, [srcq+strideq*0], 1
+    mova                m20, m9
+    mova                m21, m9
+    mova                m22, m8
+    mova                m23, m8
+    vpdpwssd            m20, m2, m12
+    vpdpwssd            m21, m3, m12
+    vpdpwssd            m20, m4, m13
+    vpdpwssd            m21, m5, m13
+    vpermb              m24, m16, m19
+    vpermb              m25, m17, m19
+    vpermb              m26, m18, m19
+    vpdpbusd            m22, m24, m10
+    vpdpbusd            m23, m25, m10
+    vpdpbusd            m22, m25, m11
+    vpdpbusd            m23, m26, m11
+    packssdw            m22, m23
+    psraw               m22, 2              ; 7a 7b 8a 8b
+    vshufi32x4           m0, m1, m22, q1032 ; 6a 6b 7a 7b
+    mova                 m2, m4
+    mova                 m3, m5
+    mova                 m1, m22
+    mova                 m4, m6
+    mova                 m5, m7
+    punpcklwd            m6, m0, m1 ; 67a 67c 78a 78c
+    punpckhwd            m7, m0, m1 ; 67b 67d 78b 78d
+    vpdpwssd            m20, m4, m14
+    vpdpwssd            m21, m5, m14
+    vpdpwssd            m20, m6, m15
+    vpdpwssd            m21, m7, m15
+    psrad               m20, 6
+    psrad               m21, 6
+    packssdw            m20, m21
+    mova          [tmpq+wq*0], ym20
+    vextracti32x8 [tmpq+wq*1], m20, 1
+    lea                tmpq, [tmpq+wq*2]
+    sub                  hd, 2
+    jg .hv_loop
+    movzx                hd, r6b
+    add                  r5, 32
+    add                  r7, 16
+    mov                tmpq, r5
+    mov                srcq, r7
+    sub                 r6d, 1<<8
+    jg .hv_loop0
+%endif
+    RET
+%endmacro
 
 %macro WARP_V 5 ; dst, 02, 46, 13, 57
     ; Can be done using gathers, but that's terribly slow on many CPUs
@@ -4568,5 +5320,13 @@
 
 .end:
     RET
+
+INIT_YMM avx2
+PREP_BILIN
+PREP_8TAP
+
+INIT_ZMM avx512icl
+PREP_BILIN
+PREP_8TAP
 
 %endif ; ARCH_X86_64
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -49,22 +49,31 @@
 decl_mc_fn(dav1d_put_bilin_avx2);
 decl_mc_fn(dav1d_put_bilin_ssse3);
 
+decl_mct_fn(dav1d_prep_8tap_regular_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_regular_avx2);
 decl_mct_fn(dav1d_prep_8tap_regular_ssse3);
+decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2);
 decl_mct_fn(dav1d_prep_8tap_regular_smooth_ssse3);
+decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2);
 decl_mct_fn(dav1d_prep_8tap_regular_sharp_ssse3);
+decl_mct_fn(dav1d_prep_8tap_smooth_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_smooth_avx2);
 decl_mct_fn(dav1d_prep_8tap_smooth_ssse3);
+decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2);
 decl_mct_fn(dav1d_prep_8tap_smooth_regular_ssse3);
+decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2);
 decl_mct_fn(dav1d_prep_8tap_smooth_sharp_ssse3);
+decl_mct_fn(dav1d_prep_8tap_sharp_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_sharp_avx2);
 decl_mct_fn(dav1d_prep_8tap_sharp_ssse3);
+decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2);
 decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3);
+decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx512icl);
 decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2);
 decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3);
 decl_mct_fn(dav1d_prep_bilin_avx512icl);
@@ -209,6 +218,15 @@
         return;
 
 #if BITDEPTH == 8 && ARCH_X86_64
+    init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        avx512icl);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, avx512icl);
+    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  avx512icl);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, avx512icl);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         avx512icl);
+    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   avx512icl);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  avx512icl);
+    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   avx512icl);
+    init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          avx512icl);
     init_mct_fn(FILTER_2D_BILINEAR,            bilin,               avx512icl);
 #endif
 }