ref: d6770f93447fa6fa5a95e15cb567657684410d79
parent: f816d5cf2d1742cbb3527d622701d134c409e051
author: Henrik Gramner <gramner@twoorioles.com>
date: Mon Dec 3 15:42:49 EST 2018
Add ipred_z1 AVX2 asm
--- a/src/x86/ipred.asm
+++ b/src/x86/ipred.asm
@@ -57,6 +57,21 @@
18, 16, 15, 13, 12, 10, 9, 8, \
7, 6, 6, 5, 5, 4, 4, 4
+; Note that the order of (some of) the following z constants matters
+z_filter_wh: db 7, 7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39 ; compared byte-wise (pcmpeqb) against the broadcast maxbase in .filter_strength
+ db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1 ; NOTE(review): 28 bytes in total; a 32-byte compare would also cover the first 4 bytes of z_filter_k - confirm
+z_filter_k: db 0, 16, 0, 16, 0, 20, 0, 20, 8, 16, 8, 16 ; filter tap pairs, read as dwords at [z_filter_k-4+strength*4+12*n]
+ db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16 ; n = 1
+ db 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 8, 0 ; n = 2 (only non-zero for the last entry, which the code calls the 5-tap filter)
+z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7 ; pshufb index pairs; the code reads this table at several byte offsets
+ db 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15
+z_filter_t0: db 55,127, 39,127, 39,127, 7, 15, 31, 7, 15, 31, 0, 3, 31, 0 ; thresholds for pcmpgtb in .filter_strength (selected when is_sm == 0)
+z_filter_t1: db 39, 63, 19, 47, 19, 47, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0 ; must directly follow z_filter_t0: addressed as z_filter_t0 + 16 when is_sm is set
+z_upsample: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7 ; pshufb mask giving byte pairs (p[i+1], p[i]) for the edge upsampler
+z_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12 ; pshufb mask giving adjacent pairs for two 4-pixel rows per 128-bit lane
+z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64 ; column index << 6, subtracted from (max_base_x << 6)
+ dw 16*64, 17*64, 18*64, 19*64, 20*64, 21*64, 22*64, 23*64 ; columns 16-23; only read by the full 32-byte loads in .w32_main/.w64_main
+
; vpermd indices in bits 4..6 of filter_shuf1: 0, 2, 6, 4, 1, 3, 7, 5
filter_shuf1: db 10, 4, 10, 4, 37, 6, 5, 6,103, 9, 7, 9, 72, -1, 8, -1
db 16, 4, 0, 4, 53, 6, 5, 6,119, 11, 7, 11, 95, -1, 15, -1
@@ -67,6 +82,7 @@
ipred_h_shuf: db 7, 7, 7, 7, 3, 3, 3, 3, 5, 5, 5, 5, 1, 1, 1, 1
db 6, 6, 6, 6, 2, 2, 2, 2, 4, 4, 4, 4, 0, 0, 0, 0
+pb_0to15: ; alias for the bytes 0..15 that follow; ipred_z1 (.w32/.w64) loads it as an index vector for pminub clipping
cfl_ac_w16_pad_shuffle: ; w=16, w_pad=1
db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
; w=8, w_pad=1 as well as second half of previous one
@@ -81,12 +97,21 @@
pb_1: times 4 db 1
pb_2: times 4 db 2
+pb_8: times 4 db 8 ; each pb_N is 4 identical bytes, loaded with vpbroadcastd as a byte-broadcast constant
+pb_12: times 4 db 12 ; used by the ipred_z1 .w16/.w32/.w64 edge-filter clipping
+pb_14: times 4 db 14 ; used by the ipred_z1 .w64 edge-filter clipping
+pb_15: times 4 db 15 ; used by ipred_z1 .w8/.w16 as an upper bound for pminub
+pb_31: times 4 db 31 ; used by ipred_z1 .w16 as an upper bound for pminub
pb_128: times 4 db 128
pw_1: times 2 dw 1
pw_8: times 2 dw 8
+pw_62: times 2 dw 62
+pw_64: times 2 dw 64
pw_128: times 2 dw 128
pw_255: times 2 dw 255
+pw_512: times 2 dw 512
+pb_36_m4: times 2 db 36, -4
pb_127_m127: times 2 db 127, -127
%macro JMP_TABLE 3-*
@@ -111,6 +136,7 @@
s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left, avx2, h4, h8, h16, h32, h64
JMP_TABLE ipred_h, avx2, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1, avx2, w4, w8, w16, w32, w64 ; per-width dispatch table; ipred_z1 indexes it with tzcnt(w) (JMP_TABLE macro body is not visible here)
JMP_TABLE ipred_cfl, avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left, avx2, h4, h8, h16, h32
@@ -117,6 +143,7 @@
JMP_TABLE ipred_cfl_ac_420, avx2, w16_pad1, w16_pad2, w16_pad3
JMP_TABLE pal_pred, avx2, w4, w8, w16, w32, w64
+cextern dr_intra_derivative ; external table; ipred_z1 reads it as 16-bit entries indexed by the low byte of angle
cextern filter_intra_taps
SECTION .text
@@ -1258,6 +1285,849 @@
sub tlq, hq
sub r3, hq
ret
+
+cglobal ipred_z1, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase ; Z1 directional intra prediction: common entry, dispatches on block width. NOTE(review): cglobal is an x86inc macro defined outside this view
+ %assign org_stack_offset stack_offset
+ lea r6, [ipred_z1_avx2_table] ; NOTE(review): r6/r7 presumably alias dx/maxbase under cglobal's naming, so both pointers are consumed before those names are written - confirm
+ tzcnt wd, wm ; table index = number of trailing zero bits of w
+ movifnidn angled, anglem
+ movifnidn hd, hm
+ lea r7, [dr_intra_derivative]
+ inc tlq ; tlq[-1] is now the byte tl originally pointed at
+ movsxd wq, [r6+wq*4] ; signed 32-bit table entry ...
+ add wq, r6 ; ... relative to the table address
+ movzx dxd, angleb ; low byte of angle is the index into dr_intra_derivative
+ add angled, 165 ; ~90 (165 == 255 - 90; completed by the xor with 0xff below)
+ movzx dxd, word [r7+dxq*2] ; dx = 16-bit table entry
+ xor angled, 0x4ff ; d = 90 - angle (low byte); also inverts bit 10, so 0x400 set now means !enable_intra_edge_filter
+ vpbroadcastd m3, [pw_512] ; pmulhrsw by 512 computes (x + 32) >> 6
+ vpbroadcastd m4, [pw_62] ; mask that extracts frac << 1 from xpos
+ vpbroadcastd m5, [pw_64] ; 64 - (frac << 1) gives (32 - frac) << 1
+ jmp wq
+.w4: ; width 4: take the 2x edge-upsampling path when the checks below allow it
+ cmp angleb, 40 ; low byte of angle holds d = 90 - angle
+ jae .w4_no_upsample
+ lea r3d, [angleq-1024]
+ sar r3d, 7
+ add r3d, hd
+ jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
+ ALLOC_STACK -32, 8
+ mova xm1, [tlq-1] ; 16 edge bytes starting at tlq[-1] (aligned load)
+ pshufb xm0, xm1, [z_upsample] ; byte pairs (p[i+1], p[i])
+ vpbroadcastd xm2, [pb_8]
+ pminub xm2, [z_filter_s+6] ; pair indices (i+2, i+3), clamped to at most 8
+ pshufb xm1, xm2
+ vpbroadcastd xm2, [pb_36_m4] ; upshifted by 2 to be able to reuse
+ add dxd, dxd ; pw_512 (which is already in m3)
+ pmaddubsw xm0, xm2 ; for rounding instead of pw_2048
+ pextrd [rsp+16], xm1, 3 ; top[max_base_x]
+ pmaddubsw xm1, xm2
+ movd xm7, dxd
+ mov r3d, dxd ; xpos
+ vpbroadcastw m7, xm7
+ paddw xm1, xm0 ; sum of both tap pairs
+ movq xm0, [tlq]
+ pmulhrsw xm1, xm3 ; (x + 32) >> 6
+ pslldq m6, m7, 8 ; per lane: (0, dx) in the low/high qword
+ paddw xm2, xm7, xm7 ; 2*dx in the low lane; the xmm write leaves the upper lane of m2 zero
+ lea r2, [strideq*3] ; r2 is reused as stride*3
+ paddw m6, m7 ; per lane: (dx, 2*dx)
+ packuswb xm1, xm1
+ paddw m6, m2 ; xpos2 xpos3 xpos0 xpos1
+ punpcklbw xm0, xm1 ; interleave loaded and interpolated bytes
+ psllw m7, 2 ; xpos step per iteration = 4*dx (4 rows per iteration)
+ mova [rsp], xm0 ; upsampled edge lives at [rsp]
+.w4_upsample_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ vpbroadcastq m1, [rsp+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vpbroadcastq m2, [rsp+r5]
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base2
+ movq xm0, [rsp+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base3
+ movhps xm0, [rsp+r5]
+ vpblendd m1, m2, 0xc0 ; upper lane of m1 = base0 | base1
+ pand m2, m4, m6 ; frac << 1
+ vpblendd m0, m1, 0xf0 ; m0 = base2 base3 | base0 base1
+ psubw m1, m5, m2 ; (32 - frac) << 1
+ psllw m2, 8
+ por m1, m2 ; (32-frac, frac) << 1
+ pmaddubsw m0, m1
+ paddw m6, m7 ; xpos += dx
+ pmulhrsw m0, m3
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1 ; xm1 = rows 0/1, xm0 = rows 2/3
+ movd [dstq+strideq*2], xm0
+ pextrd [dstq+r2 ], xm0, 1
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_upsample_loop
+ RET
+ALIGN function_align
+.filter_strength: ; w4/w8/w16
+ ; The C version uses a lot of branches, but we can do all the comparisons
+ ; in parallel and use popcnt to get the final filter strength value.
+ movd xm0, maxbased ; in: maxbased (set by the caller), angled; out: r5d = strength, ZF
+ movd xm2, angled
+ lea r3, [z_filter_t0]
+ shr angled, 8 ; is_sm << 1
+ vpbroadcastb m0, xm0 ; callers reuse this broadcast of maxbase in m0 afterwards
+ vpbroadcastb m2, xm2 ; low byte of angle (d) in every byte
+ pcmpeqb m1, m0, [r3-z_filter_t0+z_filter_wh] ; 0xff in the byte positions whose table entry equals maxbase
+ pand m1, m2 ; d in the matching positions, 0 elsewhere
+ mova xm2, [r3+angleq*8] ; upper ymm half zero in both cases
+ pcmpgtb m1, m2 ; signed byte compare: d > threshold
+ pmovmskb r5d, m1
+ popcnt r5d, r5d ; sets ZF which can be used by caller
+ ret
+.w4_no_upsample: ; width 4 without upsampling: optionally smooth the edge, then predict
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -16, 11
+ mov maxbased, 7
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w4_main
+ lea maxbased, [hq+3] ; value that .filter_strength looks up in z_filter_wh
+ call .filter_strength
+ mov maxbased, 7 ; mov leaves ZF from popcnt intact for the jz below
+ jz .w4_main ; filter_strength == 0
+ lea r3, [z_filter_k-4] ; r3+strength*4 then addresses the tap pair for this strength
+ vpbroadcastd m7, [pb_8]
+ vbroadcasti128 m2, [tlq-1] ; edge bytes starting at tlq[-1], in both lanes
+ pminub m1, m7, [r3-z_filter_k+z_filter_s+4] ; = [z_filter_s+0], indices clamped to 8
+ vpbroadcastd m8, [r3+r5*4+12*0]
+ pminub m7, [r3-z_filter_k+z_filter_s+12] ; = [z_filter_s+8], indices clamped to 8
+ vpbroadcastd m9, [r3+r5*4+12*1]
+ vpbroadcastd m10, [r3+r5*4+12*2]
+ pshufb m0, m2, m1
+ shufps m1, m7, q2121
+ pmaddubsw m0, m8
+ pshufb m1, m2, m1
+ pmaddubsw m1, m9
+ pshufb m2, m7
+ pmaddubsw m2, m10
+ paddw m0, m1
+ paddw m0, m2 ; sum of the three tap-pair products
+ pmulhrsw m0, m3 ; (x + 32) >> 6
+ mov r3d, 9
+ mov tlq, rsp ; the filtered edge is read from the stack from here on
+ cmp hd, 4
+ cmova maxbased, r3d ; maxbase = h > 4 ? 9 : 7
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ mova [tlq], xm0
+.w4_main:
+ movd xm6, dxd
+ vpbroadcastq m0, [z_base_inc] ; base_inc << 6
+ vpbroadcastb m7, [tlq+maxbaseq]
+ shl maxbased, 6 ; maxbase is kept in xpos units (<< 6) from here on
+ vpbroadcastw m6, xm6
+ mov r3d, dxd ; xpos
+ movd xm9, maxbased
+ vpbroadcastw m9, xm9
+ vbroadcasti128 m8, [z_shuf_w4]
+ psrlw m7, 8 ; top[max_base_x]
+ paddw m10, m6, m6 ; 2*dx
+ psubw m9, m0 ; max_base_x
+ vpblendd m6, m10, 0xcc ; per lane: (dx, 2*dx)
+ mova xm0, xm10 ; 2*dx in the low lane; the xmm write leaves the upper lane of m0 zero
+ paddw m6, m0 ; xpos2 xpos3 xpos0 xpos1
+ paddw m10, m10 ; xpos step per iteration = 4*dx
+.w4_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ vpbroadcastq m1, [tlq+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vpbroadcastq m2, [tlq+r5]
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base2
+ movq xm0, [tlq+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base3
+ movhps xm0, [tlq+r5]
+ vpblendd m1, m2, 0xc0
+ pand m2, m4, m6 ; frac << 1
+ vpblendd m0, m1, 0xf0 ; m0 = base2 base3 | base0 base1
+ psubw m1, m5, m2 ; (32 - frac) << 1
+ psllw m2, 8
+ pshufb m0, m8 ; adjacent byte pairs (p[i], p[i+1])
+ por m1, m2 ; (32-frac, frac) << 1
+ pmaddubsw m0, m1
+ pcmpgtw m1, m9, m6 ; base < max_base_x
+ pmulhrsw m0, m3
+ paddsw m6, m10 ; xpos += dx
+ lea r5, [dstq+strideq*2]
+ vpblendvb m0, m7, m0, m1 ; keep the interpolated value where the mask is set, else top[max_base_x]
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1 ; xm1 = rows 0/1, xm0 = rows 2/3
+ movd [r5 +strideq*0], xm0
+ pextrd [r5 +strideq*1], xm0, 1
+ movd [dstq+strideq*0], xm1
+ pextrd [dstq+strideq*1], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jz .w4_end
+ cmp r3d, maxbased ; next xpos against (max_base_x << 6)
+ jb .w4_loop
+ packuswb xm7, xm7 ; words -> bytes for the constant fill below
+ lea r6, [strideq*3]
+.w4_end_loop: ; remaining rows are filled with top[max_base_x]
+ movd [dstq+strideq*0], xm7
+ movd [dstq+strideq*1], xm7
+ movd [dstq+strideq*2], xm7
+ movd [dstq+r6 ], xm7
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w4_end_loop
+.w4_end:
+ RET
+ALIGN function_align
+.w8: ; width 8: take the 2x edge-upsampling path when the check below allows it
+ lea r3d, [angleq+216] ; d + 216 carries out of the low byte exactly when d >= 40
+ mov r3b, hb ; low byte replaced by h; any flag/carry bit above it makes the value exceed 8
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -32, 8
+ movu xm2, [z_filter_s+6] ; pair indices (i+2, i+3)
+ mova xm0, [tlq-1]
+ movd xm6, hd
+ vinserti128 m0, [tlq+7], 1 ; upper lane: edge bytes starting at tlq[7]
+ vpbroadcastb xm6, xm6
+ vbroadcasti128 m1, [z_upsample]
+ pminub xm6, xm2 ; same indices, clamped to at most h
+ vpbroadcastd m7, [pb_36_m4]
+ vinserti128 m2, xm6, 1 ; clamped indices are used for the upper lane only
+ add dxd, dxd ; dx is doubled for the upsampled edge
+ pshufb m1, m0, m1
+ pshufb m2, m0, m2
+ movd xm6, dxd
+ pmaddubsw m1, m7
+ pmaddubsw m2, m7
+ vpbroadcastw m6, xm6
+ mov r3d, dxd ; xpos
+ psrldq m0, 1 ; drop the leading byte of each lane
+ lea r2, [strideq*3] ; r2 is reused as stride*3
+ paddw m7, m6, m6 ; 2*dx
+ paddw m1, m2 ; sum of both tap pairs
+ vpblendd m6, m7, 0xf0 ; dx | 2*dx
+ pmulhrsw m1, m3 ; (x + 32) >> 6
+ pslldq m2, m7, 8 ; per lane: (0, 2*dx)
+ paddw m7, m7 ; xpos step per iteration = 4*dx
+ paddw m6, m2 ; xpos0 xpos2 | xpos1 xpos3
+ packuswb m1, m1
+ punpcklbw m0, m1 ; interleave loaded and interpolated bytes
+ mova [rsp], m0 ; upsampled edge lives at [rsp]
+.w8_upsample_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ movu xm0, [rsp+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vinserti128 m0, [rsp+r5], 1
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base2
+ pand m1, m4, m6 ; frac << 1
+ psubw m2, m5, m1 ; (32 - frac) << 1
+ psllw m1, 8
+ por m2, m1 ; (32-frac, frac) << 1
+ punpcklqdq m1, m2, m2 ; frac0 frac1
+ pmaddubsw m0, m1
+ movu xm1, [rsp+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base3
+ vinserti128 m1, [rsp+r5], 1
+ punpckhqdq m2, m2 ; frac2 frac3
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ paddw m6, m7 ; xpos += dx
+ pmulhrsw m1, m3
+ packuswb m0, m1 ; row0 row2 | row1 row3
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*2], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+r2 ], xm1
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w8_upsample_loop
+ RET
+.w8_no_intra_edge_filter: ; edge filter disabled: only clamp maxbase, then predict from the unfiltered edge
+ mov r3d, 15
+ cmp hd, 8
+ cmova maxbased, r3d ; maxbase = h > 8 ? 15 : h + 7
+ jmp .w8_main
+.w8_no_upsample: ; width 8 without upsampling: optionally smooth the edge, then predict
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -32, 10
+ lea maxbased, [hq+7] ; value that .filter_strength looks up in z_filter_wh
+ test angled, 0x400
+ jnz .w8_no_intra_edge_filter
+ call .filter_strength
+ vpbroadcastd xm6, [pb_15] ; the vector ops, movd and movzx below leave ZF from popcnt intact
+ pminub xm6, xm0 ; imin(h, 8) + 7
+ movd maxbased, xm6
+ movzx maxbased, maxbaseb
+ jz .w8_main ; filter_strength == 0
+ lea r3, [z_filter_k-4] ; r3+strength*4 then addresses the tap pair for this strength
+ movu xm2, [tlq]
+ pminub xm1, xm6, [r3-z_filter_k+z_filter_s+18] ; = [z_filter_s+14], indices clamped to maxbase
+ vinserti128 m2, [tlq-1], 1 ; m2 = edge from tlq[0] | edge from tlq[-1]
+ vinserti128 m1, [r3-z_filter_k+z_filter_s+ 4], 1 ; = [z_filter_s+0]
+ vpbroadcastd m7, [r3+r5*4+12*0]
+ pminub xm6, [r3-z_filter_k+z_filter_s+26] ; = [z_filter_s+22], indices clamped to maxbase
+ vinserti128 m6, [r3-z_filter_k+z_filter_s+12], 1 ; = [z_filter_s+8]
+ pshufb m0, m2, m1
+ pmaddubsw m0, m7
+ vpbroadcastd m7, [r3+r5*4+12*1]
+ movzx r3d, byte [tlq+15] ; read from the unfiltered edge before tlq is redirected to the stack
+ shufps m1, m6, q2121
+ pshufb m1, m2, m1
+ pmaddubsw m1, m7
+ paddw m0, m1
+ sub r5d, 3 ; r5d = strength - 3; ZF set when strength == 3
+ jnz .w8_3tap
+ ; filter_strength == 3 uses a 5-tap filter instead of a 3-tap one,
+ ; which also results in an awkward edge case where out[w*2] is
+ ; slightly different from out[max_base_x] when h > w.
+ vpbroadcastd m7, [z_filter_k+4*8] ; third tap pair of the strength-3 entry (= z_filter_k+4*2+12*2)
+ movzx r2d, byte [tlq+14]
+ pshufb m2, m6
+ pmaddubsw m2, m7
+ sub r2d, r3d
+ lea r2d, [r2+r3*8+4]
+ shr r2d, 3 ; (tlq[w*2-2] + tlq[w*2-1]*7 + 4) >> 3
+ mov [rsp+16], r2b
+ paddw m0, m2
+.w8_3tap:
+ pmulhrsw m0, m3 ; (x + 32) >> 6
+ sar r5d, 1 ; 0 for strength 3, -1 for strengths 1 and 2
+ mov tlq, rsp ; the filtered edge is read from the stack from here on
+ add r5d, 17 ; w*2 + (filter_strength == 3)
+ cmp hd, 8
+ cmova maxbased, r5d
+ mov [tlq+r5], r3b ; unfiltered tlq[15] is stored at index r5 of the stack copy
+ vextracti128 xm1, m0, 1
+ packuswb xm1, xm0 ; the upper lane holds the first 8 pixels, so it is packed into the low half
+ mova [tlq], xm1
+.w8_main:
+ movd xm2, dxd
+ vbroadcasti128 m0, [z_base_inc]
+ vpbroadcastw m2, xm2
+ vpbroadcastb m7, [tlq+maxbaseq]
+ shl maxbased, 6 ; maxbase is kept in xpos units (<< 6) from here on
+ movd xm9, maxbased
+ vbroadcasti128 m8, [z_filter_s+2] ; adjacent byte pairs (p[i], p[i+1])
+ vpbroadcastw m9, xm9
+ psrlw m7, 8 ; top[max_base_x] as words
+ psubw m9, m0 ; per-column limit: (max_base_x - column) << 6
+ mov r3d, dxd ; xpos
+ paddw m6, m2, m2 ; xpos step per iteration = 2*dx
+ vpblendd m2, m6, 0xf0 ; xpos0 | xpos1
+.w8_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ pand m0, m4, m2 ; frac << 1
+ psubw m1, m5, m0 ; (32 - frac) << 1
+ psllw m0, 8
+ por m1, m0 ; (32-frac, frac) << 1
+ movu xm0, [tlq+r3]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vinserti128 m0, [tlq+r5], 1
+ pshufb m0, m8
+ pmaddubsw m0, m1
+ pcmpgtw m1, m9, m2 ; base < max_base_x
+ paddsw m2, m6 ; xpos += dx (saturating add)
+ pmulhrsw m0, m3
+ vpblendvb m0, m7, m0, m1 ; keep the interpolated value where the mask is set, else top[max_base_x]
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1 ; row0 | row1
+ movq [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xm0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jz .w8_end
+ cmp r3d, maxbased ; next xpos against (max_base_x << 6)
+ jb .w8_loop
+ packuswb xm7, xm7 ; words -> bytes for the constant fill below
+.w8_end_loop: ; remaining rows are filled with top[max_base_x]
+ movq [dstq+strideq*0], xm7
+ movq [dstq+strideq*1], xm7
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_end_loop
+.w8_end:
+ RET
+.w16_no_intra_edge_filter: ; edge filter disabled: only clamp maxbase, then predict from the unfiltered edge
+ mov r3d, 31
+ cmp hd, 16
+ cmova maxbased, r3d ; maxbase = h > 16 ? 31 : h + 15
+ jmp .w16_main
+ALIGN function_align
+.w16: ; width 16: optionally smooth the edge, then predict two rows per iteration
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -64, 12
+ lea maxbased, [hq+15] ; value that .filter_strength looks up in z_filter_wh
+ test angled, 0x400
+ jnz .w16_no_intra_edge_filter
+ call .filter_strength
+ vpbroadcastd m1, [pb_31] ; the vector ops, movd and movzx below leave ZF from popcnt intact
+ pminub m0, m1 ; imin(h, 16) + 15
+ movd maxbased, xm0
+ movzx maxbased, maxbaseb
+ jz .w16_main ; filter_strength == 0
+ lea r3, [z_filter_k-4] ; r3+strength*4 then addresses the tap pair for this strength
+ vpbroadcastd m1, [pb_12]
+ vpbroadcastd m11, [pb_15]
+ vbroadcasti128 m6, [r3-z_filter_k+z_filter_s+12] ; = [z_filter_s+8]
+ vinserti128 m2, m6, [r3-z_filter_k+z_filter_s+4], 0 ; = [z_filter_s+0]
+ vinserti128 m6, [r3-z_filter_k+z_filter_s+20], 1 ; = [z_filter_s+16]
+ mova xm10, [tlq-1]
+ vinserti128 m10, [tlq+3], 1 ; m10 = edge from tlq[-1] | edge from tlq[3]
+ vpbroadcastd m9, [r3+r5*4+12*0]
+ vbroadcasti128 m7, [r3-z_filter_k+z_filter_s+18] ; = [z_filter_s+14]
+ vinserti128 m8, m7, [r3-z_filter_k+z_filter_s+10], 0 ; = [z_filter_s+6]
+ vinserti128 m7, [r3-z_filter_k+z_filter_s+26], 1 ; = [z_filter_s+22]
+ psubw m0, m1 ; every byte of m0 is >= 12 here, so no borrow crosses byte boundaries
+ pminub m0, m11 ; imin(h+3, 15)
+ movu xm11, [tlq+12]
+ vinserti128 m11, [tlq+16], 1 ; m11 = edge from tlq[12] | edge from tlq[16]
+ pminub m8, m0 ; clamp the shuffle indices used on m11
+ pminub m7, m0
+ pshufb m0, m10, m2
+ shufps m2, m6, q2121
+ pmaddubsw m0, m9
+ pshufb m1, m11, m8
+ shufps m8, m7, q2121
+ pmaddubsw m1, m9
+ vpbroadcastd m9, [r3+r5*4+12*1]
+ movzx r3d, byte [tlq+31] ; read from the unfiltered edge before tlq is redirected to the stack
+ pshufb m2, m10, m2
+ pmaddubsw m2, m9
+ pshufb m8, m11, m8
+ pmaddubsw m8, m9
+ paddw m0, m2
+ paddw m1, m8
+ sub r5d, 3 ; r5d = strength - 3; ZF set when strength == 3
+ jnz .w16_3tap
+ vpbroadcastd m9, [z_filter_k+4*8] ; third tap pair of the strength-3 entry (= z_filter_k+4*2+12*2)
+ movzx r2d, byte [tlq+30]
+ pshufb m10, m6
+ pmaddubsw m10, m9
+ pshufb m11, m7
+ pmaddubsw m11, m9
+ sub r2d, r3d
+ lea r2d, [r2+r3*8+4]
+ shr r2d, 3 ; (tlq[30] + tlq[31]*7 + 4) >> 3
+ mov [rsp+32], r2b
+ paddw m0, m10
+ paddw m1, m11
+.w16_3tap:
+ pmulhrsw m0, m3 ; (x + 32) >> 6
+ pmulhrsw m1, m3
+ sar r5d, 1 ; 0 for strength 3, -1 for strengths 1 and 2
+ mov tlq, rsp ; the filtered edge is read from the stack from here on
+ add r5d, 33 ; 32 or 33 (33 only for strength 3)
+ cmp hd, 16
+ cmova maxbased, r5d
+ mov [tlq+r5], r3b ; unfiltered tlq[31] is stored at index r5 of the stack copy
+ packuswb m0, m1
+ vpermq m0, m0, q3120 ; undo the per-lane interleave from packuswb
+ mova [tlq], m0
+.w16_main:
+ movd xm6, dxd
+ vbroadcasti128 m0, [z_base_inc]
+ vpbroadcastb m7, [tlq+maxbaseq] ; top[max_base_x] in every byte
+ shl maxbased, 6 ; maxbase is kept in xpos units (<< 6) from here on
+ vpbroadcastw m6, xm6
+ movd xm9, maxbased
+ vbroadcasti128 m8, [z_filter_s+2] ; adjacent byte pairs (p[i], p[i+1])
+ vpbroadcastw m9, xm9
+ mov r3d, dxd ; xpos
+ psubw m9, m0 ; limits for columns 0-7
+ paddw m11, m6, m6 ; xpos step per iteration = 2*dx
+ psubw m10, m9, m3 ; 64*8
+ vpblendd m6, m11, 0xf0 ; xpos0 | xpos1
+.w16_loop:
+ lea r5d, [r3+dxq]
+ shr r3d, 6 ; base0
+ pand m1, m4, m6 ; frac << 1
+ psubw m2, m5, m1 ; (32 - frac) << 1
+ psllw m1, 8
+ por m2, m1 ; (32-frac, frac) << 1
+ movu xm0, [tlq+r3+0]
+ movu xm1, [tlq+r3+8]
+ lea r3d, [r5+dxq]
+ shr r5d, 6 ; base1
+ vinserti128 m0, [tlq+r5+0], 1
+ vinserti128 m1, [tlq+r5+8], 1
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1 ; row0 (16 pixels) | row1 (16 pixels)
+ pcmpgtw m1, m9, m6 ; base < max_base_x, columns 0-7
+ pcmpgtw m2, m10, m6 ; same test for columns 8-15
+ packsswb m1, m2 ; word masks -> byte masks
+ paddsw m6, m11 ; xpos += dx (saturating add)
+ vpblendvb m0, m7, m0, m1 ; keep the interpolated value where the mask is set, else top[max_base_x]
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jz .w16_end
+ cmp r3d, maxbased ; next xpos against (max_base_x << 6)
+ jb .w16_loop
+.w16_end_loop: ; remaining rows are filled with top[max_base_x]
+ mova [dstq+strideq*0], xm7
+ mova [dstq+strideq*1], xm7
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w16_end_loop
+.w16_end:
+ RET
+ALIGN function_align
+.w32: ; width 32: optionally smooth the edge (always with the strength-3 taps), then predict one row per iteration
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -96, 15
+ lea r3d, [hq+31]
+ mov maxbased, 63
+ cmp hd, 32
+ cmovb maxbased, r3d ; maxbase = h < 32 ? h + 31 : 63
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w32_main
+ vbroadcasti128 m0, [pb_0to15] ; bytes 0..15 in each lane, clamped with pminub below
+ sub r3d, 29 ; h+2
+ movu xm13, [tlq+29] ; 32-39
+ movd xm1, r3d
+ movu xm14, [tlq+37] ; 40-47
+ sub r3d, 8 ; h-6
+ vinserti128 m14, [tlq+51], 1 ; 56-63
+ vpbroadcastb xm1, xm1
+ mova xm11, [tlq- 1] ; 0- 7
+ vinserti128 m11, [tlq+13], 1 ; 16-23
+ movd xm2, r3d
+ movu xm12, [tlq+ 5] ; 8-15
+ vinserti128 m12, [tlq+19], 1 ; 24-31
+ pminub xm1, xm0 ; clip 32x8
+ mova m7, [z_filter_s+0]
+ pshufb xm13, xm1
+ vpbroadcastd m1, [pb_12]
+ vpbroadcastb xm2, xm2
+ vinserti128 m13, [tlq+43], 1 ; 48-55
+ vinserti128 m8, m7, [z_filter_s+4], 1
+ vpblendd m2, m1, 0xf0 ; low lane: h-6, high lane: 12
+ vinserti128 m7, [z_filter_s+12], 0
+ pminub m2, m0 ; clip 32x16 and 32x(32|64)
+ vpbroadcastd m9, [z_filter_k+4*2+12*0] ; first tap pair of the strength-3 entry
+ pshufb m14, m2
+ pshufb m0, m11, m8
+ shufps m8, m7, q1021
+ pmaddubsw m0, m9
+ pshufb m2, m12, m8
+ pmaddubsw m2, m9
+ pshufb m1, m13, m8
+ pmaddubsw m1, m9
+ pshufb m6, m14, m8
+ pmaddubsw m6, m9
+ vpbroadcastd m9, [z_filter_k+4*2+12*1] ; second tap pair
+ pshufb m10, m11, m8
+ shufps m8, m7, q2121
+ pmaddubsw m10, m9
+ paddw m0, m10
+ pshufb m10, m12, m8
+ pmaddubsw m10, m9
+ paddw m2, m10
+ pshufb m10, m13, m8
+ pmaddubsw m10, m9
+ paddw m1, m10
+ pshufb m10, m14, m8
+ pmaddubsw m10, m9
+ paddw m6, m10
+ vpbroadcastd m9, [z_filter_k+4*2+12*2] ; third tap pair
+ pshufb m11, m8
+ pmaddubsw m11, m9
+ pshufb m12, m7
+ pmaddubsw m12, m9
+ movzx r3d, byte [tlq+63] ; read from the unfiltered edge before tlq is redirected to the stack
+ movzx r2d, byte [tlq+62]
+ paddw m0, m11
+ paddw m2, m12
+ pshufb m13, m7
+ pmaddubsw m13, m9
+ pshufb m14, m7
+ pmaddubsw m14, m9
+ paddw m1, m13
+ paddw m6, m14
+ sub r2d, r3d
+ lea r2d, [r2+r3*8+4] ; edge case for 32x64
+ pmulhrsw m0, m3 ; (x + 32) >> 6
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ shr r2d, 3 ; (tlq[62] + tlq[63]*7 + 4) >> 3
+ mov [rsp+64], r2b
+ mov tlq, rsp ; the filtered edge is read from the stack from here on
+ mov [tlq+65], r3b ; unfiltered tlq[63] is stored at index 65 of the stack copy
+ mov r3d, 65
+ cmp hd, 32
+ cmova maxbased, r3d ; maxbase = 65 when h > 32
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [tlq+ 0], m0
+ mova [tlq+32], m1
+.w32_main:
+ movd xm6, dxd
+ vpbroadcastb m7, [tlq+maxbaseq] ; top[max_base_x] in every byte
+ shl maxbased, 6 ; maxbase is kept in xpos units (<< 6) from here on
+ vpbroadcastw m6, xm6 ; xpos, same in every word (one row per iteration)
+ movd xm9, maxbased
+ vbroadcasti128 m8, [z_filter_s+2] ; adjacent byte pairs (p[i], p[i+1])
+ vpbroadcastw m9, xm9
+ mov r3d, dxd ; xpos
+ psubw m9, [z_base_inc] ; limits for columns 0-7 | 16-23
+ mova m11, m6 ; xpos step per iteration = dx
+ psubw m10, m9, m3 ; 64*8
+.w32_loop:
+ mov r5d, r3d
+ shr r5d, 6 ; base
+ pand m1, m4, m6 ; frac << 1
+ psubw m2, m5, m1 ; (32 - frac) << 1
+ psllw m1, 8
+ por m2, m1 ; (32-frac, frac) << 1
+ movu m0, [tlq+r5+0]
+ movu m1, [tlq+r5+8]
+ add r3d, dxd
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m2
+ pmaddubsw m1, m2
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1 ; columns 0-15 | 16-31
+ pcmpgtw m1, m9, m6 ; base < max_base_x, columns 0-7 | 16-23
+ pcmpgtw m2, m10, m6 ; same test for columns 8-15 | 24-31
+ packsswb m1, m2 ; word masks -> byte masks
+ paddsw m6, m11 ; xpos += dx (saturating add)
+ vpblendvb m0, m7, m0, m1 ; keep the interpolated value where the mask is set, else top[max_base_x]
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jz .w32_end
+ cmp r3d, maxbased ; next xpos against (max_base_x << 6)
+ jb .w32_loop
+ test hb, 1 ; odd number of rows left: write one row so the fill loop can go two at a time
+ jz .w32_end_loop
+ mova [dstq], m7
+ add dstq, strideq
+ dec hd
+ jz .w32_end
+.w32_end_loop: ; remaining rows are filled with top[max_base_x]
+ mova [dstq+strideq*0], m7
+ mova [dstq+strideq*1], m7
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_end_loop
+.w32_end:
+ RET
+ALIGN function_align
+.w64: ; width 64: optionally smooth the edge (always with the strength-3 taps), then predict one row per iteration
+ %assign stack_offset org_stack_offset
+ ALLOC_STACK -128, 16
+ lea maxbased, [hq+63]
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w64_main
+ mova xm11, [tlq- 1] ; 0- 7
+ vinserti128 m11, [tlq+13], 1 ; 16-23
+ movu xm12, [tlq+ 5] ; 8-15
+ vinserti128 m12, [tlq+19], 1 ; 24-31
+ mova m7, [z_filter_s+0]
+ vinserti128 m8, m7, [z_filter_s+4], 1
+ vinserti128 m7, [z_filter_s+12], 0
+ vpbroadcastd m9, [z_filter_k+4*2+12*0] ; first tap pair of the strength-3 entry
+ movu xm13, [tlq+29] ; 32-39
+ vinserti128 m13, [tlq+43], 1 ; 48-55
+ movu xm14, [tlq+37] ; 40-47
+ vinserti128 m14, [tlq+51], 1 ; 56-63
+ pshufb m0, m11, m8
+ shufps m8, m7, q1021
+ pmaddubsw m0, m9
+ pshufb m2, m12, m8
+ pmaddubsw m2, m9
+ pshufb m1, m13, m8
+ pmaddubsw m1, m9
+ pshufb m6, m14, m8
+ pmaddubsw m6, m9
+ vpbroadcastd m9, [z_filter_k+4*2+12*1] ; second tap pair
+ pshufb m10, m11, m8
+ shufps m15, m8, m7, q2121 ; kept in m15: reused for the second 64 edge bytes below
+ pmaddubsw m10, m9
+ paddw m0, m10
+ pshufb m10, m12, m15
+ pmaddubsw m10, m9
+ paddw m2, m10
+ pshufb m10, m13, m15
+ pmaddubsw m10, m9
+ paddw m1, m10
+ pshufb m10, m14, m15
+ pmaddubsw m10, m9
+ paddw m6, m10
+ vpbroadcastd m10, [z_filter_k+4*2+12*2] ; third tap pair
+ pshufb m11, m15
+ pmaddubsw m11, m10
+ pshufb m12, m7
+ pmaddubsw m12, m10
+ pshufb m13, m7
+ pmaddubsw m13, m10
+ pshufb m14, m7
+ pmaddubsw m14, m10
+ paddw m0, m11
+ paddw m2, m12
+ paddw m1, m13
+ paddw m6, m14
+ movu xm11, [tlq+ 61] ; 64- 71
+ vinserti128 m11, [tlq+ 75], 1 ; 80- 87
+ movu xm12, [tlq+ 69] ; 72- 79
+ vinserti128 m12, [tlq+ 83], 1 ; 88- 95
+ movu xm13, [tlq+ 93] ; 96-103
+ vinserti128 m13, [tlq+107], 1 ; 112-119
+ movu xm14, [tlq+101] ; 104-111
+ vinserti128 m14, [tlq+115], 1 ; 120-127
+ pmulhrsw m0, m3 ; (x + 32) >> 6
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ lea r3d, [hq-20]
+ mov tlq, rsp ; all unfiltered edge bytes are in registers by now; the filtered edge is read from the stack from here on
+ packuswb m0, m2
+ packuswb m1, m6
+ vpbroadcastd xm2, [pb_14] ; xmm write: the upper lane of m2 stays zero
+ vbroadcasti128 m6, [pb_0to15]
+ mova [tlq+32*0], m0 ; filtered edge bytes 0-31
+ mova [tlq+32*1], m1 ; filtered edge bytes 32-63
+ movd xm0, r3d
+ vpbroadcastd m1, [pb_12]
+ vpbroadcastb m0, xm0
+ paddb m0, m2 ; low lane: h-6, high lane: h-20
+ pminub m0, m6 ; clip 64x16 and 64x32
+ pshufb m12, m0
+ pminub m1, m6 ; clip 64x64
+ pshufb m14, m1
+ pshufb m0, m11, m7
+ pmaddubsw m0, m10
+ pshufb m2, m12, m7
+ pmaddubsw m2, m10
+ pshufb m1, m13, m7
+ pmaddubsw m1, m10
+ pshufb m6, m14, m7
+ pmaddubsw m6, m10
+ pshufb m7, m11, m15
+ pmaddubsw m7, m9
+ pshufb m10, m12, m15
+ pmaddubsw m10, m9
+ paddw m0, m7
+ pshufb m7, m13, m15
+ pmaddubsw m7, m9
+ paddw m2, m10
+ pshufb m10, m14, m15
+ pmaddubsw m10, m9
+ paddw m1, m7
+ paddw m6, m10
+ vpbroadcastd m9, [z_filter_k+4*2+12*0] ; reload the first tap pair
+ pshufb m11, m8
+ pmaddubsw m11, m9
+ pshufb m12, m8
+ pmaddubsw m12, m9
+ pshufb m13, m8
+ pmaddubsw m13, m9
+ pshufb m14, m8
+ pmaddubsw m14, m9
+ paddw m0, m11
+ paddw m2, m12
+ paddw m1, m13
+ paddw m6, m14
+ pmulhrsw m0, m3
+ pmulhrsw m2, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ packuswb m0, m2
+ packuswb m1, m6
+ mova [tlq+32*2], m0 ; filtered edge bytes 64-95
+ mova [tlq+32*3], m1 ; filtered edge bytes 96-127
+.w64_main:
+ movd xm6, dxd
+ vpbroadcastb m7, [tlq+maxbaseq] ; top[max_base_x] in every byte
+ shl maxbased, 6 ; maxbase is kept in xpos units (<< 6) from here on
+ vpbroadcastw m6, xm6 ; xpos, same in every word (one row per iteration)
+ movd xm10, maxbased
+ vbroadcasti128 m8, [z_filter_s+2] ; adjacent byte pairs (p[i], p[i+1])
+ mov r3d, dxd ; xpos
+ vpbroadcastw m10, xm10
+ psllw m0, m3, 2 ; 64*32
+ psubw m10, [z_base_inc] ; limits for columns 0-7 | 16-23
+ mova m14, m6 ; xpos step per iteration = dx
+ psubw m11, m10, m3 ; 64*8
+ psubw m12, m10, m0 ; limits for columns 32-39 | 48-55
+ psubw m13, m11, m0 ; limits for columns 40-47 | 56-63
+.w64_loop:
+ mov r5d, r3d
+ shr r5d, 6 ; base
+ movu m0, [tlq+r5+ 0]
+ movu m1, [tlq+r5+ 8]
+ pand m2, m4, m6 ; frac << 1
+ psubw m9, m5, m2 ; (32 - frac) << 1
+ psllw m2, 8
+ por m9, m2 ; (32-frac, frac) << 1
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m9
+ pmaddubsw m1, m9
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1 ; columns 0-15 | 16-31
+ pcmpgtw m1, m10, m6 ; base < max_base_x
+ pcmpgtw m2, m11, m6
+ packsswb m1, m2 ; word masks -> byte masks
+ vpblendvb m2, m7, m0, m1 ; keep the interpolated value where the mask is set, else top[max_base_x]
+ movu m0, [tlq+r5+32]
+ movu m1, [tlq+r5+40]
+ add r3d, dxd
+ mova [dstq+ 0], m2
+ pshufb m0, m8
+ pshufb m1, m8
+ pmaddubsw m0, m9
+ pmaddubsw m1, m9
+ pcmpgtw m9, m12, m6 ; m9 (weights) is no longer needed, reuse it for the mask
+ pcmpgtw m2, m13, m6
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ paddsw m6, m14 ; xpos += dx (saturating add)
+ packsswb m9, m2
+ packuswb m0, m1 ; columns 32-47 | 48-63
+ vpblendvb m0, m7, m0, m9
+ mova [dstq+32], m0
+ add dstq, strideq
+ dec hd
+ jz .w64_end
+ cmp r3d, maxbased ; next xpos against (max_base_x << 6)
+ jb .w64_loop
+.w64_end_loop: ; remaining rows are filled with top[max_base_x]
+ mova [dstq+ 0], m7
+ mova [dstq+32], m7
+ add dstq, strideq
+ dec hd
+ jg .w64_end_loop
+.w64_end:
+ RET
%macro FILTER_XMM 4 ; dst, src, tmp, shuf
%ifnum %4
--- a/src/x86/ipred_init_tmpl.c
+++ b/src/x86/ipred_init_tmpl.c
@@ -38,6 +38,7 @@
decl_angular_ipred_fn(dav1d_ipred_smooth_avx2);
decl_angular_ipred_fn(dav1d_ipred_smooth_v_avx2);
decl_angular_ipred_fn(dav1d_ipred_smooth_h_avx2);
+decl_angular_ipred_fn(dav1d_ipred_z1_avx2);
decl_angular_ipred_fn(dav1d_ipred_filter_avx2);
decl_cfl_pred_fn(dav1d_ipred_cfl_avx2);
@@ -65,6 +66,7 @@
c->intra_pred[SMOOTH_PRED] = dav1d_ipred_smooth_avx2;
c->intra_pred[SMOOTH_V_PRED] = dav1d_ipred_smooth_v_avx2;
c->intra_pred[SMOOTH_H_PRED] = dav1d_ipred_smooth_h_avx2;
+ c->intra_pred[Z1_PRED] = dav1d_ipred_z1_avx2;
c->intra_pred[FILTER_PRED] = dav1d_ipred_filter_avx2;
c->cfl_pred[DC_PRED] = dav1d_ipred_cfl_avx2;
--- a/tests/checkasm/ipred.c
+++ b/tests/checkasm/ipred.c
@@ -84,7 +84,8 @@
int a = 0;
if (mode >= Z1_PRED && mode <= Z3_PRED) /* angle */
- a = 90 * (mode - Z1_PRED) + z_angles[rand() % 27];
+ a = (90 * (mode - Z1_PRED) + z_angles[rand() % 27]) |
+ (rand() & 0x600);
else if (mode == FILTER_PRED) /* filter_idx */
a = (rand() % 5) | (rand() & ~511);