shithub: dav1d

ref: d6770f93447fa6fa5a95e15cb567657684410d79
parent: f816d5cf2d1742cbb3527d622701d134c409e051
author: Henrik Gramner <gramner@twoorioles.com>
date: Mon Dec 3 15:42:49 EST 2018

Add ipred_z1 AVX2 asm
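
For context, a rough scalar model of what this asm vectorizes (simplified from
the Z1 directional-prediction reference, without the upsampling/edge-filter
preprocessing; names here are illustrative, not the exact dav1d C code):

    #include <stddef.h>
    #include <stdint.h>

    /* Each output pixel interpolates two top-edge samples selected by a 6.6
     * fixed-point position that advances by dx per row; once the position
     * passes max_base_x the rest of the row is padded with top[max_base_x]. */
    static void z1_sketch(uint8_t *dst, ptrdiff_t stride, const uint8_t *top,
                          int w, int h, int dx, int max_base_x)
    {
        for (int y = 0, xpos = dx; y < h; y++, xpos += dx, dst += stride) {
            const int frac = xpos & 0x3e;            /* cf. pw_62 below */
            for (int x = 0, base = xpos >> 6; x < w; x++, base++) {
                if (base >= max_base_x) { dst[x] = top[max_base_x]; continue; }
                const int v = top[base] * (64 - frac) + top[base + 1] * frac;
                dst[x] = (v + 32) >> 6;              /* pmaddubsw + pmulhrsw */
            }
        }
    }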

--- a/src/x86/ipred.asm
+++ b/src/x86/ipred.asm
@@ -57,6 +57,21 @@
      18,  16,  15,  13,  12,  10,   9,   8, \
       7,   6,   6,   5,   5,   4,   4,   4
 
+; Note that the order of (some of) the following z constants matters
+z_filter_wh:  db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
+              db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
+z_filter_k:   db  0, 16,  0, 16,  0, 20,  0, 20,  8, 16,  8, 16
+              db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16
+              db  0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  8,  0
+z_filter_s:   db  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7
+              db  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15
+z_filter_t0:  db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
+z_filter_t1:  db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
+z_upsample:   db  1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7
+z_shuf_w4:    db  0,  1,  1,  2,  2,  3,  3,  4,  8,  9,  9, 10, 10, 11, 11, 12
+z_base_inc:   dw  0*64,  1*64,  2*64,  3*64,  4*64,  5*64,  6*64,  7*64
+              dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64
+
 ; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5
 filter_shuf1: db 10,  4, 10,  4, 37,  6,  5,  6,103,  9,  7,  9, 72, -1,  8, -1
               db 16,  4,  0,  4, 53,  6,  5,  6,119, 11,  7, 11, 95, -1, 15, -1
@@ -67,6 +82,7 @@
 ipred_h_shuf: db  7,  7,  7,  7,  3,  3,  3,  3,  5,  5,  5,  5,  1,  1,  1,  1
               db  6,  6,  6,  6,  2,  2,  2,  2,  4,  4,  4,  4,  0,  0,  0,  0
 
+pb_0to15:
 cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1
                         db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
                         ; w=8, w_pad=1 as well as second half of previous one
@@ -81,12 +97,21 @@
 
 pb_1:   times 4 db 1
 pb_2:   times 4 db 2
+pb_8:   times 4 db 8
+pb_12:  times 4 db 12
+pb_14:  times 4 db 14
+pb_15:  times 4 db 15
+pb_31:  times 4 db 31
 pb_128: times 4 db 128
 pw_1:   times 2 dw 1
 pw_8:   times 2 dw 8
+pw_62:  times 2 dw 62
+pw_64:  times 2 dw 64
 pw_128: times 2 dw 128
 pw_255: times 2 dw 255
+pw_512: times 2 dw 512
 
+pb_36_m4:    times 2 db  36,   -4
 pb_127_m127: times 2 db 127, -127
 
 %macro JMP_TABLE 3-*
@@ -111,6 +136,7 @@
                                 s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
 JMP_TABLE ipred_dc_left,  avx2, h4, h8, h16, h32, h64
 JMP_TABLE ipred_h,        avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1,       avx2, w4, w8, w16, w32, w64
 JMP_TABLE ipred_cfl,      avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
                                 s4-8*4, s8-8*4, s16-8*4, s32-8*4
 JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32
@@ -117,6 +143,7 @@
 JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3
 JMP_TABLE pal_pred,       avx2, w4, w8, w16, w32, w64
 
+cextern dr_intra_derivative
 cextern filter_intra_taps
 
 SECTION .text
@@ -1258,6 +1285,849 @@
     sub                 tlq, hq
     sub                  r3, hq
     ret
+
+cglobal ipred_z1, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
+    %assign org_stack_offset stack_offset
+    lea                  r6, [ipred_z1_avx2_table]
+    tzcnt                wd, wm
+    movifnidn        angled, anglem
+    movifnidn            hd, hm
+    lea                  r7, [dr_intra_derivative]
+    inc                 tlq
+    movsxd               wq, [r6+wq*4]
+    add                  wq, r6
+    movzx               dxd, angleb
+    add              angled, 165 ; ~90
+    movzx               dxd, word [r7+dxq*2]
+    xor              angled, 0x4ff ; d = 90 - angle
+    vpbroadcastd         m3, [pw_512]
+    vpbroadcastd         m4, [pw_62]
+    vpbroadcastd         m5, [pw_64]
+    jmp                  wq
+.w4:
+    cmp              angleb, 40
+    jae .w4_no_upsample
+    lea                 r3d, [angleq-1024]
+    sar                 r3d, 7
+    add                 r3d, hd
+    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
+    ALLOC_STACK         -32, 8
+    mova                xm1, [tlq-1]
+    pshufb              xm0, xm1, [z_upsample]
+    vpbroadcastd        xm2, [pb_8]
+    pminub              xm2, [z_filter_s+6]
+    pshufb              xm1, xm2
+    vpbroadcastd        xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse
+    add                 dxd, dxd        ; pw_512 (which is already in m3)
+    pmaddubsw           xm0, xm2        ; for rounding instead of pw_2048
+    pextrd         [rsp+16], xm1, 3 ; top[max_base_x]
+    pmaddubsw           xm1, xm2
+    movd                xm7, dxd
+    mov                 r3d, dxd ; xpos
+    vpbroadcastw         m7, xm7
+    paddw               xm1, xm0
+    movq                xm0, [tlq]
+    pmulhrsw            xm1, xm3
+    pslldq               m6, m7, 8
+    paddw               xm2, xm7, xm7
+    lea                  r2, [strideq*3]
+    paddw                m6, m7
+    packuswb            xm1, xm1
+    paddw                m6, m2 ; xpos2 xpos3 xpos0 xpos1
+    punpcklbw           xm0, xm1
+    psllw                m7, 2
+    mova              [rsp], xm0
+.w4_upsample_loop:
+    lea                 r5d, [r3+dxq]
+    shr                 r3d, 6 ; base0
+    vpbroadcastq         m1, [rsp+r3]
+    lea                 r3d, [r5+dxq]
+    shr                 r5d, 6 ; base1
+    vpbroadcastq         m2, [rsp+r5]
+    lea                 r5d, [r3+dxq]
+    shr                 r3d, 6 ; base2
+    movq                xm0, [rsp+r3]
+    lea                 r3d, [r5+dxq]
+    shr                 r5d, 6 ; base3
+    movhps              xm0, [rsp+r5]
+    vpblendd             m1, m2, 0xc0
+    pand                 m2, m4, m6 ; frac << 1
+    vpblendd             m0, m1, 0xf0
+    psubw                m1, m5, m2 ; (32 - frac) << 1
+    psllw                m2, 8
+    por                  m1, m2     ; (32-frac, frac) << 1
+    pmaddubsw            m0, m1
+    paddw                m6, m7     ; xpos += dx
+    pmulhrsw             m0, m3
+    packuswb             m0, m0
+    vextracti128        xm1, m0, 1
+    movd   [dstq+strideq*2], xm0
+    pextrd [dstq+r2       ], xm0, 1
+    movd   [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w4_upsample_loop
+    RET
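
A quick arithmetic note on the pb_36_m4/pw_512 trick used in the upsample path
above (a sketch, not dav1d code): the taps 36/-4 are the edge-upsample taps
9/-1 scaled by 4, and pmulhrsw against pw_512 performs a rounded shift by 6,
so the pair lands on the usual (-1, 9, 9, -1)/16 filter:

    /* s0..s3 are four consecutive edge samples (sketch only) */
    static inline int upsample_sketch(int s0, int s1, int s2, int s3)
    {
        const int x = 36 * (s1 + s2) - 4 * (s0 + s3); /* pmaddubsw + paddw */
        return (x + 32) >> 6;  /* pmulhrsw with pw_512; equals
                                * (9*(s1+s2) - (s0+s3) + 8) >> 4,
                                * with packuswb clamping to [0,255] */
    }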
+ALIGN function_align
+.filter_strength: ; w4/w8/w16
+    ; The C version uses a lot of branches, but we can do all the comparisons
+    ; in parallel and use popcnt to get the final filter strength value.
+    movd                xm0, maxbased
+    movd                xm2, angled
+    lea                  r3, [z_filter_t0]
+    shr              angled, 8 ; is_sm << 1
+    vpbroadcastb         m0, xm0
+    vpbroadcastb         m2, xm2
+    pcmpeqb              m1, m0, [r3-z_filter_t0+z_filter_wh]
+    pand                 m1, m2
+    mova                xm2, [r3+angleq*8] ; upper ymm half zero in both cases
+    pcmpgtb              m1, m2
+    pmovmskb            r5d, m1
+    popcnt              r5d, r5d ; sets ZF which can be used by caller
+    ret
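
To illustrate the comment above, a scalar model of the same trick (bucket
boundaries and thresholds here are made up; the real data is z_filter_wh and
z_filter_t0/t1): test every (block-size bucket, angle threshold) pair at once
and let the number of passing tests be the strength.

    /* illustrative only -- not the real dav1d tables */
    static int filter_strength_sketch(int wh, int angle, int is_sm)
    {
        static const int bucket_max[3]   = { 8, 16, 64 };
        static const int thresh[2][3][3] = {
            { { 56, 999, 999 }, { 40, 999, 999 }, {  8,  16,  32 } }, /* !is_sm */
            { { 40,  64, 999 }, { 20,  48, 999 }, {  4, 999, 999 } }, /* is_sm  */
        };
        int strength = 0;
        for (int b = 0; b < 3; b++)              /* pcmpeqb vs z_filter_wh    */
            if (wh <= bucket_max[b] && (b == 0 || wh > bucket_max[b - 1]))
                for (int k = 0; k < 3; k++)      /* pcmpgtb, pmovmskb, popcnt */
                    strength += angle > thresh[is_sm][b][k];
        return strength;                         /* 0..3 */
    }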
+.w4_no_upsample:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -16, 11
+    mov            maxbased, 7
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .w4_main
+    lea            maxbased, [hq+3]
+    call .filter_strength
+    mov            maxbased, 7
+    jz .w4_main ; filter_strength == 0
+    lea                  r3, [z_filter_k-4]
+    vpbroadcastd         m7, [pb_8]
+    vbroadcasti128       m2, [tlq-1]
+    pminub               m1, m7, [r3-z_filter_k+z_filter_s+4]
+    vpbroadcastd         m8, [r3+r5*4+12*0]
+    pminub               m7, [r3-z_filter_k+z_filter_s+12]
+    vpbroadcastd         m9, [r3+r5*4+12*1]
+    vpbroadcastd        m10, [r3+r5*4+12*2]
+    pshufb               m0, m2, m1
+    shufps               m1, m7, q2121
+    pmaddubsw            m0, m8
+    pshufb               m1, m2, m1
+    pmaddubsw            m1, m9
+    pshufb               m2, m7
+    pmaddubsw            m2, m10
+    paddw                m0, m1
+    paddw                m0, m2
+    pmulhrsw             m0, m3
+    mov                 r3d, 9
+    mov                 tlq, rsp
+    cmp                  hd, 4
+    cmova          maxbased, r3d
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    mova              [tlq], xm0
+.w4_main:
+    movd                xm6, dxd
+    vpbroadcastq         m0, [z_base_inc] ; base_inc << 6
+    vpbroadcastb         m7, [tlq+maxbaseq]
+    shl            maxbased, 6
+    vpbroadcastw         m6, xm6
+    mov                 r3d, dxd ; xpos
+    movd                xm9, maxbased
+    vpbroadcastw         m9, xm9
+    vbroadcasti128       m8, [z_shuf_w4]
+    psrlw                m7, 8  ; top[max_base_x]
+    paddw               m10, m6, m6
+    psubw                m9, m0 ; max_base_x
+    vpblendd             m6, m10, 0xcc
+    mova                xm0, xm10
+    paddw                m6, m0 ; xpos2 xpos3 xpos0 xpos1
+    paddw               m10, m10
+.w4_loop:
+    lea                 r5d, [r3+dxq]
+    shr                 r3d, 6 ; base0
+    vpbroadcastq         m1, [tlq+r3]
+    lea                 r3d, [r5+dxq]
+    shr                 r5d, 6 ; base1
+    vpbroadcastq         m2, [tlq+r5]
+    lea                 r5d, [r3+dxq]
+    shr                 r3d, 6 ; base2
+    movq                xm0, [tlq+r3]
+    lea                 r3d, [r5+dxq]
+    shr                 r5d, 6 ; base3
+    movhps              xm0, [tlq+r5]
+    vpblendd             m1, m2, 0xc0
+    pand                 m2, m4, m6 ; frac << 1
+    vpblendd             m0, m1, 0xf0
+    psubw                m1, m5, m2 ; (32 - frac) << 1
+    psllw                m2, 8
+    pshufb               m0, m8
+    por                  m1, m2     ; (32-frac, frac) << 1
+    pmaddubsw            m0, m1
+    pcmpgtw              m1, m9, m6 ; base < max_base_x
+    pmulhrsw             m0, m3
+    paddsw               m6, m10    ; xpos += dx
+    lea                  r5, [dstq+strideq*2]
+    vpblendvb            m0, m7, m0, m1
+    packuswb             m0, m0
+    vextracti128        xm1, m0, 1
+    movd   [r5  +strideq*0], xm0
+    pextrd [r5  +strideq*1], xm0, 1
+    movd   [dstq+strideq*0], xm1
+    pextrd [dstq+strideq*1], xm1, 1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jz .w4_end
+    cmp                 r3d, maxbased
+    jb .w4_loop
+    packuswb            xm7, xm7
+    lea                  r6, [strideq*3]
+.w4_end_loop:
+    movd   [dstq+strideq*0], xm7
+    movd   [dstq+strideq*1], xm7
+    movd   [dstq+strideq*2], xm7
+    movd   [dstq+r6       ], xm7
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w4_end_loop
+.w4_end:
+    RET
+ALIGN function_align
+.w8:
+    lea                 r3d, [angleq+216]
+    mov                 r3b, hb
+    cmp                 r3d, 8
+    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -32, 8
+    movu                xm2, [z_filter_s+6]
+    mova                xm0, [tlq-1]
+    movd                xm6, hd
+    vinserti128          m0, [tlq+7], 1
+    vpbroadcastb        xm6, xm6
+    vbroadcasti128       m1, [z_upsample]
+    pminub              xm6, xm2
+    vpbroadcastd         m7, [pb_36_m4]
+    vinserti128          m2, xm6, 1
+    add                 dxd, dxd
+    pshufb               m1, m0, m1
+    pshufb               m2, m0, m2
+    movd                xm6, dxd
+    pmaddubsw            m1, m7
+    pmaddubsw            m2, m7
+    vpbroadcastw         m6, xm6
+    mov                 r3d, dxd
+    psrldq               m0, 1
+    lea                  r2, [strideq*3]
+    paddw                m7, m6, m6
+    paddw                m1, m2
+    vpblendd             m6, m7, 0xf0
+    pmulhrsw             m1, m3
+    pslldq               m2, m7, 8
+    paddw                m7, m7
+    paddw                m6, m2
+    packuswb             m1, m1
+    punpcklbw            m0, m1
+    mova              [rsp], m0
+.w8_upsample_loop:
+    lea                 r5d, [r3+dxq]
+    shr                 r3d, 6 ; base0
+    movu                xm0, [rsp+r3]
+    lea                 r3d, [r5+dxq]
+    shr                 r5d, 6 ; base1
+    vinserti128          m0, [rsp+r5], 1
+    lea                 r5d, [r3+dxq]
+    shr                 r3d, 6 ; base2
+    pand                 m1, m4, m6
+    psubw                m2, m5, m1
+    psllw                m1, 8
+    por                  m2, m1
+    punpcklqdq           m1, m2, m2 ; frac0 frac1
+    pmaddubsw            m0, m1
+    movu                xm1, [rsp+r3]
+    lea                 r3d, [r5+dxq]
+    shr                 r5d, 6 ; base3
+    vinserti128          m1, [rsp+r5], 1
+    punpckhqdq           m2, m2 ; frac2 frac3
+    pmaddubsw            m1, m2
+    pmulhrsw             m0, m3
+    paddw                m6, m7
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    vextracti128        xm1, m0, 1
+    movq   [dstq+strideq*0], xm0
+    movhps [dstq+strideq*2], xm0
+    movq   [dstq+strideq*1], xm1
+    movhps [dstq+r2       ], xm1
+    lea                dstq, [dstq+strideq*4]
+    sub                  hd, 4
+    jg .w8_upsample_loop
+    RET
+.w8_no_intra_edge_filter:
+    mov                 r3d, 15
+    cmp                  hd, 8
+    cmova          maxbased, r3d
+    jmp .w8_main
+.w8_no_upsample:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -32, 10
+    lea            maxbased, [hq+7]
+    test             angled, 0x400
+    jnz .w8_no_intra_edge_filter
+    call .filter_strength
+    vpbroadcastd        xm6, [pb_15]
+    pminub              xm6, xm0 ; imin(h, 8) + 7
+    movd           maxbased, xm6
+    movzx          maxbased, maxbaseb
+    jz .w8_main ; filter_strength == 0
+    lea                  r3, [z_filter_k-4]
+    movu                xm2, [tlq]
+    pminub              xm1, xm6, [r3-z_filter_k+z_filter_s+18]
+    vinserti128          m2, [tlq-1], 1
+    vinserti128          m1, [r3-z_filter_k+z_filter_s+ 4], 1
+    vpbroadcastd         m7, [r3+r5*4+12*0]
+    pminub              xm6, [r3-z_filter_k+z_filter_s+26]
+    vinserti128          m6, [r3-z_filter_k+z_filter_s+12], 1
+    pshufb               m0, m2, m1
+    pmaddubsw            m0, m7
+    vpbroadcastd         m7, [r3+r5*4+12*1]
+    movzx               r3d, byte [tlq+15]
+    shufps               m1, m6, q2121
+    pshufb               m1, m2, m1
+    pmaddubsw            m1, m7
+    paddw                m0, m1
+    sub                 r5d, 3
+    jnz .w8_3tap
+    ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one,
+    ; which also results in an awkward edge case where out[w*2] is
+    ; slightly different from out[max_base_x] when h > w.
+    vpbroadcastd         m7, [z_filter_k+4*8]
+    movzx               r2d, byte [tlq+14]
+    pshufb               m2, m6
+    pmaddubsw            m2, m7
+    sub                 r2d, r3d
+    lea                 r2d, [r2+r3*8+4]
+    shr                 r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3
+    mov            [rsp+16], r2b
+    paddw                m0, m2
+.w8_3tap:
+    pmulhrsw             m0, m3
+    sar                 r5d, 1
+    mov                 tlq, rsp
+    add                 r5d, 17 ; w*2 + (filter_strength == 3)
+    cmp                  hd, 8
+    cmova          maxbased, r5d
+    mov            [tlq+r5], r3b
+    vextracti128        xm1, m0, 1
+    packuswb            xm1, xm0
+    mova              [tlq], xm1
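
In scalar terms, the extra padding sample computed in r2d above (see the 5-tap
edge-case comment) is a 1:7 blend of the last two unfiltered top pixels; a
sketch of that one expression, not a copy of the C reference:

    static inline int pad_sample_sketch(const unsigned char *tl, int w)
    {
        /* out[w*2] when filter_strength == 3 and h > w */
        return (tl[2 * w - 2] + 7 * tl[2 * w - 1] + 4) >> 3;
    }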
+.w8_main:
+    movd                xm2, dxd
+    vbroadcasti128       m0, [z_base_inc]
+    vpbroadcastw         m2, xm2
+    vpbroadcastb         m7, [tlq+maxbaseq]
+    shl            maxbased, 6
+    movd                xm9, maxbased
+    vbroadcasti128       m8, [z_filter_s+2]
+    vpbroadcastw         m9, xm9
+    psrlw                m7, 8
+    psubw                m9, m0
+    mov                 r3d, dxd
+    paddw                m6, m2, m2
+    vpblendd             m2, m6, 0xf0
+.w8_loop:
+    lea                 r5d, [r3+dxq]
+    shr                 r3d, 6
+    pand                 m0, m4, m2
+    psubw                m1, m5, m0
+    psllw                m0, 8
+    por                  m1, m0
+    movu                xm0, [tlq+r3]
+    lea                 r3d, [r5+dxq]
+    shr                 r5d, 6 ; base1
+    vinserti128          m0, [tlq+r5], 1
+    pshufb               m0, m8
+    pmaddubsw            m0, m1
+    pcmpgtw              m1, m9, m2
+    paddsw               m2, m6
+    pmulhrsw             m0, m3
+    vpblendvb            m0, m7, m0, m1
+    vextracti128        xm1, m0, 1
+    packuswb            xm0, xm1
+    movq   [dstq+strideq*0], xm0
+    movhps [dstq+strideq*1], xm0
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jz .w8_end
+    cmp                 r3d, maxbased
+    jb .w8_loop
+    packuswb            xm7, xm7
+.w8_end_loop:
+    movq   [dstq+strideq*0], xm7
+    movq   [dstq+strideq*1], xm7
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w8_end_loop
+.w8_end:
+    RET
+.w16_no_intra_edge_filter:
+    mov                 r3d, 31
+    cmp                  hd, 16
+    cmova          maxbased, r3d
+    jmp .w16_main
+ALIGN function_align
+.w16:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -64, 12
+    lea            maxbased, [hq+15]
+    test             angled, 0x400
+    jnz .w16_no_intra_edge_filter
+    call .filter_strength
+    vpbroadcastd         m1, [pb_31]
+    pminub               m0, m1 ; imin(h, 16) + 15
+    movd           maxbased, xm0
+    movzx          maxbased, maxbaseb
+    jz .w16_main ; filter_strength == 0
+    lea                  r3, [z_filter_k-4]
+    vpbroadcastd         m1, [pb_12]
+    vpbroadcastd        m11, [pb_15]
+    vbroadcasti128       m6, [r3-z_filter_k+z_filter_s+12]
+    vinserti128          m2, m6, [r3-z_filter_k+z_filter_s+4], 0
+    vinserti128          m6, [r3-z_filter_k+z_filter_s+20], 1
+    mova               xm10, [tlq-1]
+    vinserti128         m10, [tlq+3], 1
+    vpbroadcastd         m9, [r3+r5*4+12*0]
+    vbroadcasti128       m7, [r3-z_filter_k+z_filter_s+18]
+    vinserti128          m8, m7, [r3-z_filter_k+z_filter_s+10], 0
+    vinserti128          m7, [r3-z_filter_k+z_filter_s+26], 1
+    psubw                m0, m1
+    pminub               m0, m11 ; imin(h+3, 15)
+    movu               xm11, [tlq+12]
+    vinserti128         m11, [tlq+16], 1
+    pminub               m8, m0
+    pminub               m7, m0
+    pshufb               m0, m10, m2
+    shufps               m2, m6, q2121
+    pmaddubsw            m0, m9
+    pshufb               m1, m11, m8
+    shufps               m8, m7, q2121
+    pmaddubsw            m1, m9
+    vpbroadcastd         m9, [r3+r5*4+12*1]
+    movzx               r3d, byte [tlq+31]
+    pshufb               m2, m10, m2
+    pmaddubsw            m2, m9
+    pshufb               m8, m11, m8
+    pmaddubsw            m8, m9
+    paddw                m0, m2
+    paddw                m1, m8
+    sub                 r5d, 3
+    jnz .w16_3tap
+    vpbroadcastd         m9, [z_filter_k+4*8]
+    movzx               r2d, byte [tlq+30]
+    pshufb              m10, m6
+    pmaddubsw           m10, m9
+    pshufb              m11, m7
+    pmaddubsw           m11, m9
+    sub                 r2d, r3d
+    lea                 r2d, [r2+r3*8+4]
+    shr                 r2d, 3
+    mov            [rsp+32], r2b
+    paddw                m0, m10
+    paddw                m1, m11
+.w16_3tap:
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    sar                 r5d, 1
+    mov                 tlq, rsp
+    add                 r5d, 33
+    cmp                  hd, 16
+    cmova          maxbased, r5d
+    mov            [tlq+r5], r3b
+    packuswb             m0, m1
+    vpermq               m0, m0, q3120
+    mova              [tlq], m0
+.w16_main:
+    movd                xm6, dxd
+    vbroadcasti128       m0, [z_base_inc]
+    vpbroadcastb         m7, [tlq+maxbaseq]
+    shl            maxbased, 6
+    vpbroadcastw         m6, xm6
+    movd                xm9, maxbased
+    vbroadcasti128       m8, [z_filter_s+2]
+    vpbroadcastw         m9, xm9
+    mov                 r3d, dxd
+    psubw                m9, m0
+    paddw               m11, m6, m6
+    psubw               m10, m9, m3 ; 64*8
+    vpblendd             m6, m11, 0xf0
+.w16_loop:
+    lea                 r5d, [r3+dxq]
+    shr                 r3d, 6 ; base0
+    pand                 m1, m4, m6
+    psubw                m2, m5, m1
+    psllw                m1, 8
+    por                  m2, m1
+    movu                xm0, [tlq+r3+0]
+    movu                xm1, [tlq+r3+8]
+    lea                 r3d, [r5+dxq]
+    shr                 r5d, 6 ; base1
+    vinserti128          m0, [tlq+r5+0], 1
+    vinserti128          m1, [tlq+r5+8], 1
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    pcmpgtw              m1, m9, m6
+    pcmpgtw              m2, m10, m6
+    packsswb             m1, m2
+    paddsw               m6, m11
+    vpblendvb            m0, m7, m0, m1
+    mova         [dstq+strideq*0], xm0
+    vextracti128 [dstq+strideq*1], m0, 1
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jz .w16_end
+    cmp                 r3d, maxbased
+    jb .w16_loop
+.w16_end_loop:
+    mova   [dstq+strideq*0], xm7
+    mova   [dstq+strideq*1], xm7
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w16_end_loop
+.w16_end:
+    RET
+ALIGN function_align
+.w32:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK         -96, 15
+    lea                 r3d, [hq+31]
+    mov            maxbased, 63
+    cmp                  hd, 32
+    cmovb          maxbased, r3d
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .w32_main
+    vbroadcasti128       m0, [pb_0to15]
+    sub                 r3d, 29 ; h+2
+    movu               xm13, [tlq+29]    ; 32-39
+    movd                xm1, r3d
+    movu               xm14, [tlq+37]    ; 40-47
+    sub                 r3d, 8 ; h-6
+    vinserti128         m14, [tlq+51], 1 ; 56-63
+    vpbroadcastb        xm1, xm1
+    mova               xm11, [tlq- 1]    ;  0- 7
+    vinserti128         m11, [tlq+13], 1 ; 16-23
+    movd                xm2, r3d
+    movu               xm12, [tlq+ 5]    ;  8-15
+    vinserti128         m12, [tlq+19], 1 ; 24-31
+    pminub              xm1, xm0 ; clip 32x8
+    mova                 m7, [z_filter_s+0]
+    pshufb             xm13, xm1
+    vpbroadcastd         m1, [pb_12]
+    vpbroadcastb        xm2, xm2
+    vinserti128         m13, [tlq+43], 1 ; 48-55
+    vinserti128          m8, m7, [z_filter_s+4], 1
+    vpblendd             m2, m1, 0xf0
+    vinserti128          m7, [z_filter_s+12], 0
+    pminub               m2, m0 ; clip 32x16 and 32x(32|64)
+    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
+    pshufb              m14, m2
+    pshufb               m0, m11, m8
+    shufps               m8, m7, q1021
+    pmaddubsw            m0, m9
+    pshufb               m2, m12, m8
+    pmaddubsw            m2, m9
+    pshufb               m1, m13, m8
+    pmaddubsw            m1, m9
+    pshufb               m6, m14, m8
+    pmaddubsw            m6, m9
+    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
+    pshufb              m10, m11, m8
+    shufps               m8, m7, q2121
+    pmaddubsw           m10, m9
+    paddw                m0, m10
+    pshufb              m10, m12, m8
+    pmaddubsw           m10, m9
+    paddw                m2, m10
+    pshufb              m10, m13, m8
+    pmaddubsw           m10, m9
+    paddw                m1, m10
+    pshufb              m10, m14, m8
+    pmaddubsw           m10, m9
+    paddw                m6, m10
+    vpbroadcastd         m9, [z_filter_k+4*2+12*2]
+    pshufb              m11, m8
+    pmaddubsw           m11, m9
+    pshufb              m12, m7
+    pmaddubsw           m12, m9
+    movzx               r3d, byte [tlq+63]
+    movzx               r2d, byte [tlq+62]
+    paddw                m0, m11
+    paddw                m2, m12
+    pshufb              m13, m7
+    pmaddubsw           m13, m9
+    pshufb              m14, m7
+    pmaddubsw           m14, m9
+    paddw                m1, m13
+    paddw                m6, m14
+    sub                 r2d, r3d
+    lea                 r2d, [r2+r3*8+4] ; edge case for 32x64
+    pmulhrsw             m0, m3
+    pmulhrsw             m2, m3
+    pmulhrsw             m1, m3
+    pmulhrsw             m6, m3
+    shr                 r2d, 3
+    mov            [rsp+64], r2b
+    mov                 tlq, rsp
+    mov            [tlq+65], r3b
+    mov                 r3d, 65
+    cmp                  hd, 32
+    cmova          maxbased, r3d
+    packuswb             m0, m2
+    packuswb             m1, m6
+    mova           [tlq+ 0], m0
+    mova           [tlq+32], m1
+.w32_main:
+    movd                xm6, dxd
+    vpbroadcastb         m7, [tlq+maxbaseq]
+    shl            maxbased, 6
+    vpbroadcastw         m6, xm6
+    movd                xm9, maxbased
+    vbroadcasti128       m8, [z_filter_s+2]
+    vpbroadcastw         m9, xm9
+    mov                 r3d, dxd
+    psubw                m9, [z_base_inc]
+    mova                m11, m6
+    psubw               m10, m9, m3 ; 64*8
+.w32_loop:
+    mov                 r5d, r3d
+    shr                 r5d, 6
+    pand                 m1, m4, m6
+    psubw                m2, m5, m1
+    psllw                m1, 8
+    por                  m2, m1
+    movu                 m0, [tlq+r5+0]
+    movu                 m1, [tlq+r5+8]
+    add                 r3d, dxd
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m2
+    pmaddubsw            m1, m2
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    pcmpgtw              m1, m9, m6
+    pcmpgtw              m2, m10, m6
+    packsswb             m1, m2
+    paddsw               m6, m11
+    vpblendvb            m0, m7, m0, m1
+    mova             [dstq], m0
+    add                dstq, strideq
+    dec                  hd
+    jz .w32_end
+    cmp                 r3d, maxbased
+    jb .w32_loop
+    test                 hb, 1
+    jz .w32_end_loop
+    mova             [dstq], m7
+    add                dstq, strideq
+    dec                  hd
+    jz .w32_end
+.w32_end_loop:
+    mova   [dstq+strideq*0], m7
+    mova   [dstq+strideq*1], m7
+    lea                dstq, [dstq+strideq*2]
+    sub                  hd, 2
+    jg .w32_end_loop
+.w32_end:
+    RET
+ALIGN function_align
+.w64:
+    %assign stack_offset org_stack_offset
+    ALLOC_STACK        -128, 16
+    lea            maxbased, [hq+63]
+    test             angled, 0x400 ; !enable_intra_edge_filter
+    jnz .w64_main
+    mova               xm11, [tlq- 1]    ;  0- 7
+    vinserti128         m11, [tlq+13], 1 ; 16-23
+    movu               xm12, [tlq+ 5]    ;  8-15
+    vinserti128         m12, [tlq+19], 1 ; 24-31
+    mova                 m7, [z_filter_s+0]
+    vinserti128          m8, m7, [z_filter_s+4], 1
+    vinserti128          m7, [z_filter_s+12], 0
+    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
+    movu               xm13, [tlq+29]    ; 32-39
+    vinserti128         m13, [tlq+43], 1 ; 48-55
+    movu               xm14, [tlq+37]    ; 40-47
+    vinserti128         m14, [tlq+51], 1 ; 56-63
+    pshufb               m0, m11, m8
+    shufps               m8, m7, q1021
+    pmaddubsw            m0, m9
+    pshufb               m2, m12, m8
+    pmaddubsw            m2, m9
+    pshufb               m1, m13, m8
+    pmaddubsw            m1, m9
+    pshufb               m6, m14, m8
+    pmaddubsw            m6, m9
+    vpbroadcastd         m9, [z_filter_k+4*2+12*1]
+    pshufb              m10, m11, m8
+    shufps              m15, m8, m7, q2121
+    pmaddubsw           m10, m9
+    paddw                m0, m10
+    pshufb              m10, m12, m15
+    pmaddubsw           m10, m9
+    paddw                m2, m10
+    pshufb              m10, m13, m15
+    pmaddubsw           m10, m9
+    paddw                m1, m10
+    pshufb              m10, m14, m15
+    pmaddubsw           m10, m9
+    paddw                m6, m10
+    vpbroadcastd        m10, [z_filter_k+4*2+12*2]
+    pshufb              m11, m15
+    pmaddubsw           m11, m10
+    pshufb              m12, m7
+    pmaddubsw           m12, m10
+    pshufb              m13, m7
+    pmaddubsw           m13, m10
+    pshufb              m14, m7
+    pmaddubsw           m14, m10
+    paddw                m0, m11
+    paddw                m2, m12
+    paddw                m1, m13
+    paddw                m6, m14
+    movu               xm11, [tlq+ 61]    ;  64- 71
+    vinserti128         m11, [tlq+ 75], 1 ;  80- 87
+    movu               xm12, [tlq+ 69]    ;  72- 79
+    vinserti128         m12, [tlq+ 83], 1 ;  88- 95
+    movu               xm13, [tlq+ 93]    ;  96-103
+    vinserti128         m13, [tlq+107], 1 ; 112-119
+    movu               xm14, [tlq+101]    ; 104-111
+    vinserti128         m14, [tlq+115], 1 ; 120-127
+    pmulhrsw             m0, m3
+    pmulhrsw             m2, m3
+    pmulhrsw             m1, m3
+    pmulhrsw             m6, m3
+    lea                 r3d, [hq-20]
+    mov                 tlq, rsp
+    packuswb             m0, m2
+    packuswb             m1, m6
+    vpbroadcastd        xm2, [pb_14]
+    vbroadcasti128       m6, [pb_0to15]
+    mova         [tlq+32*0], m0
+    mova         [tlq+32*1], m1
+    movd                xm0, r3d
+    vpbroadcastd         m1, [pb_12]
+    vpbroadcastb         m0, xm0
+    paddb                m0, m2
+    pminub               m0, m6 ; clip 64x16 and 64x32
+    pshufb              m12, m0
+    pminub               m1, m6 ; clip 64x64
+    pshufb              m14, m1
+    pshufb               m0, m11, m7
+    pmaddubsw            m0, m10
+    pshufb               m2, m12, m7
+    pmaddubsw            m2, m10
+    pshufb               m1, m13, m7
+    pmaddubsw            m1, m10
+    pshufb               m6, m14, m7
+    pmaddubsw            m6, m10
+    pshufb               m7, m11, m15
+    pmaddubsw            m7, m9
+    pshufb              m10, m12, m15
+    pmaddubsw           m10, m9
+    paddw                m0, m7
+    pshufb               m7, m13, m15
+    pmaddubsw            m7, m9
+    paddw                m2, m10
+    pshufb              m10, m14, m15
+    pmaddubsw           m10, m9
+    paddw                m1, m7
+    paddw                m6, m10
+    vpbroadcastd         m9, [z_filter_k+4*2+12*0]
+    pshufb              m11, m8
+    pmaddubsw           m11, m9
+    pshufb              m12, m8
+    pmaddubsw           m12, m9
+    pshufb              m13, m8
+    pmaddubsw           m13, m9
+    pshufb              m14, m8
+    pmaddubsw           m14, m9
+    paddw                m0, m11
+    paddw                m2, m12
+    paddw                m1, m13
+    paddw                m6, m14
+    pmulhrsw             m0, m3
+    pmulhrsw             m2, m3
+    pmulhrsw             m1, m3
+    pmulhrsw             m6, m3
+    packuswb             m0, m2
+    packuswb             m1, m6
+    mova         [tlq+32*2], m0
+    mova         [tlq+32*3], m1
+.w64_main:
+    movd                xm6, dxd
+    vpbroadcastb         m7, [tlq+maxbaseq]
+    shl            maxbased, 6
+    vpbroadcastw         m6, xm6
+    movd               xm10, maxbased
+    vbroadcasti128       m8, [z_filter_s+2]
+    mov                 r3d, dxd
+    vpbroadcastw        m10, xm10
+    psllw                m0, m3, 2   ; 64*32
+    psubw               m10, [z_base_inc]
+    mova                m14, m6
+    psubw               m11, m10, m3 ; 64*8
+    psubw               m12, m10, m0
+    psubw               m13, m11, m0
+.w64_loop:
+    mov                 r5d, r3d
+    shr                 r5d, 6
+    movu                 m0, [tlq+r5+ 0]
+    movu                 m1, [tlq+r5+ 8]
+    pand                 m2, m4, m6
+    psubw                m9, m5, m2
+    psllw                m2, 8
+    por                  m9, m2
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m9
+    pmaddubsw            m1, m9
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    packuswb             m0, m1
+    pcmpgtw              m1, m10, m6
+    pcmpgtw              m2, m11, m6
+    packsswb             m1, m2
+    vpblendvb            m2, m7, m0, m1
+    movu                 m0, [tlq+r5+32]
+    movu                 m1, [tlq+r5+40]
+    add                 r3d, dxd
+    mova          [dstq+ 0], m2
+    pshufb               m0, m8
+    pshufb               m1, m8
+    pmaddubsw            m0, m9
+    pmaddubsw            m1, m9
+    pcmpgtw              m9, m12, m6
+    pcmpgtw              m2, m13, m6
+    pmulhrsw             m0, m3
+    pmulhrsw             m1, m3
+    paddsw               m6, m14
+    packsswb             m9, m2
+    packuswb             m0, m1
+    vpblendvb            m0, m7, m0, m9
+    mova          [dstq+32], m0
+    add                dstq, strideq
+    dec                  hd
+    jz .w64_end
+    cmp                 r3d, maxbased
+    jb .w64_loop
+.w64_end_loop:
+    mova          [dstq+ 0], m7
+    mova          [dstq+32], m7
+    add                dstq, strideq
+    dec                  hd
+    jg .w64_end_loop
+.w64_end:
+    RET
 
 %macro FILTER_XMM 4 ; dst, src, tmp, shuf
 %ifnum %4
--- a/src/x86/ipred_init_tmpl.c
+++ b/src/x86/ipred_init_tmpl.c
@@ -38,6 +38,7 @@
 decl_angular_ipred_fn(dav1d_ipred_smooth_avx2);
 decl_angular_ipred_fn(dav1d_ipred_smooth_v_avx2);
 decl_angular_ipred_fn(dav1d_ipred_smooth_h_avx2);
+decl_angular_ipred_fn(dav1d_ipred_z1_avx2);
 decl_angular_ipred_fn(dav1d_ipred_filter_avx2);
 
 decl_cfl_pred_fn(dav1d_ipred_cfl_avx2);
@@ -65,6 +66,7 @@
     c->intra_pred[SMOOTH_PRED]   = dav1d_ipred_smooth_avx2;
     c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_avx2;
     c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_avx2;
+    c->intra_pred[Z1_PRED]       = dav1d_ipred_z1_avx2;
     c->intra_pred[FILTER_PRED]   = dav1d_ipred_filter_avx2;
 
     c->cfl_pred[DC_PRED]      = dav1d_ipred_cfl_avx2;
--- a/tests/checkasm/ipred.c
+++ b/tests/checkasm/ipred.c
@@ -84,7 +84,8 @@
 
                     int a = 0;
                     if (mode >= Z1_PRED && mode <= Z3_PRED) /* angle */
-                        a = 90 * (mode - Z1_PRED) + z_angles[rand() % 27];
+                        a = (90 * (mode - Z1_PRED) + z_angles[rand() % 27]) |
+                            (rand() & 0x600);
                     else if (mode == FILTER_PRED) /* filter_idx */
                         a = (rand() % 5) | (rand() & ~511);
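
The added "| (rand() & 0x600)" randomizes the two flag bits packed into the
angle argument, which the new asm consumes (presumably bit 9 is the
smooth-filter hint read via "shr angled, 8" and bit 10 is
enable_intra_edge_filter checked via "test angled, 0x400"); a small sketch of
that assumed layout, not dav1d code:

    static void unpack_angle_arg(int a, int *angle, int *is_sm, int *edge_filter)
    {
        *angle       = a & 0x1ff;        /* actual prediction angle  */
        *is_sm       = (a >> 9) & 1;     /* assumed smooth-filter bit */
        *edge_filter = (a >> 10) & 1;    /* assumed edge-filter bit   */
    }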