ref: 604d93c5f77d3e250a27d7d93570b72c5e0c2df5
parent: 95068df6a6597106973031df62bf52c695561361
author: Victorien Le Couviour--Tuffet <victorien@videolan.org>
date: Tue Apr 7 11:51:36 EDT 2020
x86: Split AVX2 / AVX-512 CDEF into dedicated files
--- a/src/meson.build
+++ b/src/meson.build
@@ -175,7 +175,8 @@
if dav1d_bitdepths.contains('8')
libdav1d_sources_asm += files(
- 'x86/cdef.asm',
+ 'x86/cdef_avx512.asm',
+ 'x86/cdef_avx2.asm',
'x86/film_grain.asm',
'x86/ipred.asm',
'x86/itx.asm',
--- a/src/x86/cdef.asm
+++ /dev/null
@@ -1,2633 +1,0 @@
-; Copyright © 2018, VideoLAN and dav1d authors
-; Copyright © 2018, Two Orioles, LLC
-; All rights reserved.
-;
-; Redistribution and use in source and binary forms, with or without
-; modification, are permitted provided that the following conditions are met:
-;
-; 1. Redistributions of source code must retain the above copyright notice, this
-; list of conditions and the following disclaimer.
-;
-; 2. Redistributions in binary form must reproduce the above copyright notice,
-; this list of conditions and the following disclaimer in the documentation
-; and/or other materials provided with the distribution.
-;
-; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-%include "ext/x86/x86inc.asm"
-
-%if ARCH_X86_64
-
-%macro DUP4 1-*
- %rep %0
- times 4 db %1
- %rotate 1
- %endrep
-%endmacro
-
-%macro DIRS 16 ; cdef_directions[]
- %rep 4 + 16 + 4 ; 6 7 0 1 2 3 4 5 6 7 0 1
- ; masking away unused bits allows us to use a single vpaddd {1to16}
- ; instruction instead of having to do vpbroadcastd + paddb
- db %13 & 0x3f, -%13 & 0x3f
- %rotate 1
- %endrep
-%endmacro
-
-%macro JMP_TABLE 2-*
- %xdefine %1_jmptable %%table
- %xdefine %%base mangle(private_prefix %+ _%1_avx2)
- %%table:
- %rep %0 - 1
- dd %%base %+ .%2 - %%table
- %rotate 1
- %endrep
-%endmacro
-
-%macro CDEF_FILTER_JMP_TABLE 1
-JMP_TABLE cdef_filter_%1, \
- d6k0, d6k1, d7k0, d7k1, \
- d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
- d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
- d0k0, d0k1, d1k0, d1k1
-%endmacro
-
-SECTION_RODATA 64
-
-lut_perm_4x4: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
- db 16, 17, 0, 1, 2, 3, 4, 5, 18, 19, 8, 9, 10, 11, 12, 13
- db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37
- db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57
-lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
- db 96, 97, 0, 1, 2, 3, 4, 5, 98, 99, 8, 9, 10, 11, 12, 13
-lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29
- db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45
- db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61
- db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95
-pd_01234567: dd 0, 1, 2, 3, 4, 5, 6, 7
-lut_perm_8x8a: db 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
- db -1, -1, 34, 35, 36, 37, 38, 39, -1, -1, 50, 51, 52, 53, 54, 55
- db -1, -1, 66, 67, 68, 69, 70, 71, -1, -1, 82, 83, 84, 85, 86, 87
- db 96, 97, 98, 99,100,101,102,103,112,113,114,115,116,117,118,119
-lut_perm_8x8b: db 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 22, 23, 24, 25, 26, 27
- db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59
- db 68, 69, 70, 71, 72, 73, 74, 75, 84, 85, 86, 87, 88, 89, 90, 91
- db 100,101,102,103,104,105,106,107,116,117,118,119,120,121,122,123
-edge_mask: dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001
- dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011
- dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101
- dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111
- dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001
- dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011
- dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101
- dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111
-px_idx: DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45
-cdef_dirs: DIRS -7,-14, 1, -6, 1, 2, 1, 10, 9, 18, 8, 17, 8, 16, 8, 15
-gf_shr: dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0
- dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2
- dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4
- dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6
- times 16 db 0 ; realign (introduced by cdef_dirs)
-end_perm_w8clip:db 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30
- db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62
- db 1, 5, 9, 13, 3, 7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31
- db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63
-end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
- db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
-pri_tap: db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4
-sec_tap: db 32, 32, 16, 16
-pd_268435568: dd 268435568
-blend_4x4: dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00
- dd 0x80, 0x00, 0x00
-blend_4x8_0: dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
-blend_4x8_1: dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
- dd 0x00, 0x00
-blend_4x8_2: dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
- dd 0x0000
-blend_4x8_3: dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
- dd 0x0000, 0x0000
-blend_8x8_0: dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
-blend_8x8_1: dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
-pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6
-div_table: dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105
-shufw_6543210x:db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
-shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
-pw_128: times 2 dw 128
-pw_2048: times 2 dw 2048
-tap_table: ; masks for 8 bit shifts
- db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
- ; weights
- db 4, 2, 3, 3, 2, 1
- db -1 * 16 + 1, -2 * 16 + 2
- db 0 * 16 + 1, -1 * 16 + 2
- db 0 * 16 + 1, 0 * 16 + 2
- db 0 * 16 + 1, 1 * 16 + 2
- db 1 * 16 + 1, 2 * 16 + 2
- db 1 * 16 + 0, 2 * 16 + 1
- db 1 * 16 + 0, 2 * 16 + 0
- db 1 * 16 + 0, 2 * 16 - 1
- ; the last 6 are repeats of the first 6 so we don't need to & 7
- db -1 * 16 + 1, -2 * 16 + 2
- db 0 * 16 + 1, -1 * 16 + 2
- db 0 * 16 + 1, 0 * 16 + 2
- db 0 * 16 + 1, 1 * 16 + 2
- db 1 * 16 + 1, 2 * 16 + 2
- db 1 * 16 + 0, 2 * 16 + 1
-
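tap_table above packs three things. The first eight bytes are the AND masks used to emulate a per-byte right shift: SSE/AVX2 have no packed 8-bit shift, so the filter shifts whole 16-bit lanes with psrlw and masks off the bits that spill over from the neighbouring byte. The next six bytes are the primary tap weights for even and odd strengths ({4,2}, {3,3}) and the secondary taps ({2,1}). The rest is cdef_directions encoded as y*16 + x word offsets into the 32-byte-stride intermediate buffer, with the last six entries repeating the first six so that dir+2 indexing never needs the & 7 wrap. A scalar model of the shift trick, with illustrative names that are not part of dav1d:

    #include <stdint.h>

    /* Emulating "byte >> shift" with a word shift plus a mask from the
     * top of tap_table: psrlw shifts whole 16-bit lanes, so bits from
     * the byte above leak into the low byte; ANDing with 0xFF >> shift
     * (the stored mask) removes them again. */
    static uint8_t shr8(uint8_t lo, uint8_t hi, int shift /* 0..7 */)
    {
        uint16_t lane = (uint16_t)(hi << 8) | lo;   /* two packed bytes */
        return (uint8_t)(lane >> shift) & (0xFF >> shift);
    }
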
-CDEF_FILTER_JMP_TABLE 4x4
-CDEF_FILTER_JMP_TABLE 4x8
-CDEF_FILTER_JMP_TABLE 8x8
-
-SECTION .text
-
-%macro PREP_REGS 2 ; w, h
- ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
- mov dird, r6m
- lea tableq, [cdef_filter_%1x%2_jmptable]
- lea dirq, [tableq+dirq*2*4]
-%if %1 == 4
- %if %2 == 4
- DEFINE_ARGS dst, stride, left, top, pri, sec, \
- table, dir, dirjmp, dst4, stride3, k
- %else
- DEFINE_ARGS dst, stride, left, top, pri, sec, \
- table, dir, dirjmp, dst4, dst8, stride3, k
- lea dst8q, [dstq+strideq*8]
- %endif
-%else
- DEFINE_ARGS dst, stride, h, top1, pri, sec, \
- table, dir, dirjmp, top2, dst4, stride3, k
- mov hq, -8
- lea top1q, [top1q+strideq*0]
- lea top2q, [top1q+strideq*1]
-%endif
- lea dst4q, [dstq+strideq*4]
-%if %1 == 4
- lea stride3q, [strideq*3]
-%endif
-%endmacro
-
-%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max
- mov kd, 1
- pxor m15, m15 ; sum
-%if %2 == 8
- pxor m12, m12
- %if %1 == 4
- movd xm4, [dstq +strideq*0]
- movd xm6, [dstq +strideq*1]
- movd xm5, [dstq +strideq*2]
- movd xm7, [dstq +stride3q ]
- vinserti128 m4, [dst4q+strideq*0], 1
- vinserti128 m6, [dst4q+strideq*1], 1
- vinserti128 m5, [dst4q+strideq*2], 1
- vinserti128 m7, [dst4q+stride3q ], 1
- punpckldq m4, m6
- punpckldq m5, m7
- %else
- movq xm4, [dstq+strideq*0]
- movq xm5, [dstq+strideq*1]
- vinserti128 m4, [dstq+strideq*2], 1
- vinserti128 m5, [dstq+stride3q ], 1
- %endif
- punpcklqdq m4, m5
-%else
- movd xm4, [dstq+strideq*0]
- movd xm5, [dstq+strideq*1]
- vinserti128 m4, [dstq+strideq*2], 1
- vinserti128 m5, [dstq+stride3q ], 1
- punpckldq m4, m5
-%endif
-%if %3 == 1
-    mova          m7, m4 ; max
-    mova          m8, m4 ; min
-%endif
-%endmacro
-
-%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
- ; mul_tap, w, h, clip
- ; load p0/p1
- movsxd dirjmpq, [dirq+kq*4+%1*2*4]
- add dirjmpq, tableq
- call dirjmpq
-
-%if %8 == 1
- pmaxub m7, m5
- pminub m8, m5
- pmaxub m7, m6
- pminub m8, m6
-%endif
-
- ; accumulate sum[m15] over p0/p1
-%if %7 == 4
- punpcklbw m5, m6
- punpcklbw m6, m4, m4
- psubusb m9, m5, m6
- psubusb m5, m6, m5
- por m9, m5 ; abs_diff_p01(p01 - px)
- pcmpeqb m5, m9
- por m5, %5
- psignb m6, %5, m5
- psrlw m5, m9, %2 ; emulate 8-bit shift
- pand m5, %3
- psubusb m5, %4, m5
- pminub m5, m9
- pmaddubsw m5, m6
- paddw m15, m5
-%else
- psubusb m9, m5, m4
- psubusb m5, m4, m5
- psubusb m11, m6, m4
- psubusb m6, m4, m6
- por m9, m5 ; abs_diff_p0(p0 - px)
- por m11, m6 ; abs_diff_p1(p1 - px)
- pcmpeqb m5, m9
- pcmpeqb m6, m11
- punpckhbw m10, m9, m11
- punpcklbw m9, m11
- por m5, %5
- por m11, m6, %5
- punpckhbw m6, m5, m11
- punpcklbw m5, m11
- psignb m11, %5, m6
- psrlw m6, m10, %2 ; emulate 8-bit shift
- pand m6, %3
- psubusb m6, %4, m6
- pminub m6, m10
- pmaddubsw m6, m11
- paddw m12, m6
- psignb m11, %5, m5
- psrlw m5, m9, %2 ; emulate 8-bit shift
- pand m5, %3
- psubusb m5, %4, m5
- pminub m5, m9
- pmaddubsw m5, m11
- paddw m15, m5
-%endif
-%endmacro
-
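ACCUMULATE_TAP_BYTE above is the byte-domain form of CDEF's constrain step: for each of the two pixels sampled at +off/-off it adds tap * constrain(p - px) to the running sum, with psubusb/por producing |diff|, psrlw/pand performing the emulated 8-bit shift, psubusb/pminub doing the clamp, and psignb reapplying the sign before pmaddubsw folds the tap weight in. A hedged scalar sketch of that per-tap contribution, mirroring the comments rather than any exported API:

    #include <stdlib.h>

    /* What one tap of ACCUMULATE_TAP_BYTE/WORD contributes before the
     * tap weight is applied (sketch, illustrative naming). */
    static int constrain(int diff, int strength, int shift)
    {
        int adiff = abs(diff);
        int lim = strength - (adiff >> shift);   /* psubusb saturation */
        if (lim < 0)
            lim = 0;
        int c = adiff < lim ? adiff : lim;       /* pminub             */
        return diff < 0 ? -c : c;                /* sign via psignb    */
    }
    /* sum += pri_tap[k] * constrain(p - px, pri_strength, pri_shift)
     * along dir, plus the same with sec_tap/sec_strength along dir+2
     * and dir-2. */
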
-%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
-%if %2 == 4
- %if %5 == 1
- punpcklbw m4, %3
- %endif
- pcmpgtw %3, m15
- paddw m15, %3
- pmulhrsw m15, %4
- %if %5 == 0
- packsswb m15, m15
- paddb m4, m15
- %else
- paddw m4, m15
- packuswb m4, m4 ; clip px in [0x0,0xff]
- pminub m4, m7
- pmaxub m4, m8
- %endif
- vextracti128 xm5, m4, 1
- movd [dstq+strideq*0], xm4
- movd [dstq+strideq*2], xm5
- pextrd [dstq+strideq*1], xm4, 1
- pextrd [dstq+stride3q ], xm5, 1
-%else
- pcmpgtw m6, %3, m12
- pcmpgtw m5, %3, m15
- paddw m12, m6
- paddw m15, m5
- %if %5 == 1
- punpckhbw m5, m4, %3
- punpcklbw m4, %3
- %endif
- pmulhrsw m12, %4
- pmulhrsw m15, %4
- %if %5 == 0
- packsswb m15, m12
- paddb m4, m15
- %else
- paddw m5, m12
- paddw m4, m15
- packuswb m4, m5 ; clip px in [0x0,0xff]
- pminub m4, m7
- pmaxub m4, m8
- %endif
- vextracti128 xm5, m4, 1
- %if %1 == 4
- movd [dstq +strideq*0], xm4
- movd [dst4q+strideq*0], xm5
- pextrd [dstq +strideq*1], xm4, 1
- pextrd [dst4q+strideq*1], xm5, 1
- pextrd [dstq +strideq*2], xm4, 2
- pextrd [dst4q+strideq*2], xm5, 2
- pextrd [dstq +stride3q ], xm4, 3
- pextrd [dst4q+stride3q ], xm5, 3
- %else
- movq [dstq+strideq*0], xm4
- movq [dstq+strideq*2], xm5
- movhps [dstq+strideq*1], xm4
- movhps [dstq+stride3q ], xm5
- %endif
-%endif
-%endmacro
-
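ADJUST_PIXEL (and BORDER_ADJUST_PIXEL further down) turn the accumulated sum into the output pixel. pmulhrsw against pw_2048 computes (x*2048 + 16384) >> 15, which is exactly (x + 8) >> 4, and the pcmpgtw/paddw pair first subtracts one from negative sums, reproducing CDEF's round-toward-zero bias; the clip variants additionally clamp the result to the min/max of px and every tap that was read. A one-line sketch of the arithmetic, assuming nothing beyond those identities:

    /* Final per-pixel rounding (sketch, not a dav1d function). */
    static int cdef_apply(int px, int sum)
    {
        return px + ((sum - (sum < 0) + 8) >> 4);
    }
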
-%macro BORDER_PREP_REGS 2 ; w, h
- ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
- mov dird, r6m
- lea dirq, [tableq+dirq*2+14]
-%if %1*%2*2/mmsize > 1
- %if %1 == 4
- DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k
- %else
- DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k
- %endif
- mov hd, %1*%2*2/mmsize
-%else
- DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k
-%endif
- lea stkq, [px]
- pxor m11, m11
-%endmacro
-
-%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
- mov kd, 1
-%if %1 == 4
- movq xm4, [stkq+32*0]
- movhps xm4, [stkq+32*1]
- movq xm5, [stkq+32*2]
- movhps xm5, [stkq+32*3]
- vinserti128 m4, xm5, 1
-%else
- mova xm4, [stkq+32*0] ; px
- vinserti128 m4, [stkq+32*1], 1
-%endif
- pxor m15, m15 ; sum
-%if %3 == 1
- mova m7, m4 ; max
- mova m8, m4 ; min
-%endif
-%endmacro
-
-%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
- ; mul_tap, w, clip
- ; load p0/p1
- movsx offq, byte [dirq+kq+%1] ; off1
-%if %6 == 4
- movq xm5, [stkq+offq*2+32*0] ; p0
- movq xm6, [stkq+offq*2+32*2]
- movhps xm5, [stkq+offq*2+32*1]
- movhps xm6, [stkq+offq*2+32*3]
- vinserti128 m5, xm6, 1
-%else
- movu xm5, [stkq+offq*2+32*0] ; p0
- vinserti128 m5, [stkq+offq*2+32*1], 1
-%endif
- neg offq ; -off1
-%if %6 == 4
- movq xm6, [stkq+offq*2+32*0] ; p1
- movq xm9, [stkq+offq*2+32*2]
- movhps xm6, [stkq+offq*2+32*1]
- movhps xm9, [stkq+offq*2+32*3]
- vinserti128 m6, xm9, 1
-%else
- movu xm6, [stkq+offq*2+32*0] ; p1
- vinserti128 m6, [stkq+offq*2+32*1], 1
-%endif
-%if %7 == 1
-    ; out of bounds values are set to a value that is both a large unsigned
- ; value and a negative signed value.
- ; use signed max and unsigned min to remove them
- pmaxsw m7, m5 ; max after p0
- pminuw m8, m5 ; min after p0
- pmaxsw m7, m6 ; max after p1
- pminuw m8, m6 ; min after p1
-%endif
-
- ; accumulate sum[m15] over p0/p1
- ; calculate difference before converting
- psubw m5, m4 ; diff_p0(p0 - px)
- psubw m6, m4 ; diff_p1(p1 - px)
-
- ; convert to 8-bits with signed saturation
- ; saturating to large diffs has no impact on the results
- packsswb m5, m6
-
- ; group into pairs so we can accumulate using maddubsw
- pshufb m5, m12
- pabsb m9, m5
- psignb m10, %5, m5
- psrlw m5, m9, %2 ; emulate 8-bit shift
- pand m5, %3
- psubusb m5, %4, m5
-
- ; use unsigned min since abs diff can equal 0x80
- pminub m5, m9
- pmaddubsw m5, m10
- paddw m15, m5
-%endmacro
-
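The border path works on 16-bit words because unavailable neighbours are filled with 0x8000 (m14 is built as 0x8000 in .border_block below and stored at every missing position). As a signed word that is the smallest possible value, and as an unsigned word it is far above any valid pixel, so the pmaxsw/pminuw pair above never lets it into the clip bounds; and the resulting |diff| is large enough that strength - (|diff| >> shift) saturates to zero, so such taps add nothing to the sum. A small sketch of the two clamp selects, assuming only that behaviour:

    #include <stdint.h>

    /* Why 0x8000 is a safe "missing pixel" marker (illustrative names). */
    static int16_t clip_hi(int16_t a, int16_t b)     /* pmaxsw: 0x8000 is INT16_MIN, never wins */
    { return a > b ? a : b; }
    static uint16_t clip_lo(uint16_t a, uint16_t b)  /* pminuw: 0x8000 is 32768, never the min  */
    { return a < b ? a : b; }
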
-%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip
- pcmpgtw m9, m11, m15
- paddw m15, m9
- pmulhrsw m15, %2
- paddw m4, m15
-%if %3 == 1
- pminsw m4, m7
- pmaxsw m4, m8
-%endif
- packuswb m4, m4
- vextracti128 xm5, m4, 1
-%if %1 == 4
- movd [dstq+strideq*0], xm4
- pextrd [dstq+strideq*1], xm4, 1
- movd [dstq+strideq*2], xm5
- pextrd [dstq+stride3q], xm5, 1
-%else
- movq [dstq+strideq*0], xm4
- movq [dstq+strideq*1], xm5
-%endif
-%endmacro
-
-%macro CDEF_FILTER 2 ; w, h
-INIT_YMM avx2
-cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \
- pri, sec, dir, damping, edge
-%assign stack_offset_entry stack_offset
- mov edged, edgem
- cmp edged, 0xf
- jne .border_block
-
- PUSH r9
- PUSH r10
- PUSH r11
-%if %2 == 4
- %assign regs_used 12
- %if STACK_ALIGNMENT < 32
- PUSH r%+regs_used
- %assign regs_used regs_used+1
- %endif
- ALLOC_STACK 0x60, 16
- pmovzxbw xm0, [leftq+1]
- vpermq m0, m0, q0110
- psrldq m1, m0, 4
- vpalignr m2, m0, m0, 12
- movu [rsp+0x10], m0
- movu [rsp+0x28], m1
- movu [rsp+0x40], m2
-%elif %1 == 4
- PUSH r12
- %assign regs_used 13
- %if STACK_ALIGNMENT < 32
- PUSH r%+regs_used
- %assign regs_used regs_used+1
- %endif
- ALLOC_STACK 8*2+%1*%2*1, 16
- pmovzxwd m0, [leftq]
- mova [rsp+0x10], m0
-%else
- PUSH r12
- PUSH r13
- %assign regs_used 14
- %if STACK_ALIGNMENT < 32
- PUSH r%+regs_used
- %assign regs_used regs_used+1
- %endif
- ALLOC_STACK 8*2+%1*%2*2+32, 16
- lea r11, [strideq*3]
- movu xm4, [dstq+strideq*2]
- pmovzxwq m0, [leftq+0]
- pmovzxwq m1, [leftq+8]
- vinserti128 m4, [dstq+r11], 1
- pmovzxbd m2, [leftq+1]
- pmovzxbd m3, [leftq+9]
- mova [rsp+0x10], m0
- mova [rsp+0x30], m1
- mova [rsp+0x50], m2
- mova [rsp+0x70], m3
- mova [rsp+0x90], m4
-%endif
-
- DEFINE_ARGS dst, stride, left, top, pri, secdmp, zero, pridmp, damping
- mov dampingd, r7m
- xor zerod, zerod
- movifnidn prid, prim
- sub dampingd, 31
- movifnidn secdmpd, secdmpm
- or prid, 0
- jz .sec_only
- movd xm0, prid
- lzcnt pridmpd, prid
- add pridmpd, dampingd
- cmovs pridmpd, zerod
- mov [rsp+0], pridmpq ; pri_shift
- or secdmpd, 0
- jz .pri_only
- movd xm1, secdmpd
- lzcnt secdmpd, secdmpd
- add secdmpd, dampingd
- cmovs secdmpd, zerod
- mov [rsp+8], secdmpq ; sec_shift
-
- DEFINE_ARGS dst, stride, left, top, pri, secdmp, table, pridmp
- lea tableq, [tap_table]
- vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
- vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
-
- ; pri/sec_taps[k] [4 total]
- DEFINE_ARGS dst, stride, left, top, pri, sec, table, dir
- vpbroadcastb m0, xm0 ; pri_strength
- vpbroadcastb m1, xm1 ; sec_strength
- and prid, 1
- lea priq, [tableq+priq*2+8] ; pri_taps
- lea secq, [tableq+12] ; sec_taps
-
- PREP_REGS %1, %2
-%if %1*%2 > mmsize
-.v_loop:
-%endif
- LOAD_BLOCK %1, %2, 1
-.k_loop:
- vpbroadcastb m2, [priq+kq] ; pri_taps
- vpbroadcastb m3, [secq+kq] ; sec_taps
- ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0
- ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2
- ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2
- dec kq
- jge .k_loop
-
- vpbroadcastd m10, [pw_2048]
- pxor m9, m9
- ADJUST_PIXEL %1, %2, m9, m10, 1
-%if %1*%2 > mmsize
- mov dstq, dst4q
- lea top1q, [rsp+0x90]
- lea top2q, [rsp+0xA0]
- lea dst4q, [dst4q+strideq*4]
- add hq, 4
- jl .v_loop
-%endif
- RET
-
-.pri_only:
- DEFINE_ARGS dst, stride, left, top, pri, _, table, pridmp
- lea tableq, [tap_table]
- vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
- ; pri/sec_taps[k] [4 total]
- DEFINE_ARGS dst, stride, left, top, pri, _, table, dir
- vpbroadcastb m0, xm0 ; pri_strength
- and prid, 1
- lea priq, [tableq+priq*2+8] ; pri_taps
- PREP_REGS %1, %2
- vpbroadcastd m3, [pw_2048]
- pxor m1, m1
-%if %1*%2 > mmsize
-.pri_v_loop:
-%endif
- LOAD_BLOCK %1, %2
-.pri_k_loop:
- vpbroadcastb m2, [priq+kq] ; pri_taps
- ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
- dec kq
- jge .pri_k_loop
- ADJUST_PIXEL %1, %2, m1, m3
-%if %1*%2 > mmsize
- mov dstq, dst4q
- lea top1q, [rsp+0x90]
- lea top2q, [rsp+0xA0]
- lea dst4q, [dst4q+strideq*4]
- add hq, 4
- jl .pri_v_loop
-%endif
- RET
-
-.sec_only:
- DEFINE_ARGS dst, stride, left, top, _, secdmp, zero, _, damping
- movd xm1, secdmpd
- lzcnt secdmpd, secdmpd
- add secdmpd, dampingd
- cmovs secdmpd, zerod
- mov [rsp+8], secdmpq ; sec_shift
- DEFINE_ARGS dst, stride, left, top, _, secdmp, table
- lea tableq, [tap_table]
- vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
- ; pri/sec_taps[k] [4 total]
- DEFINE_ARGS dst, stride, left, top, _, sec, table, dir
- vpbroadcastb m1, xm1 ; sec_strength
- lea secq, [tableq+12] ; sec_taps
- PREP_REGS %1, %2
- vpbroadcastd m2, [pw_2048]
- pxor m0, m0
-%if %1*%2 > mmsize
-.sec_v_loop:
-%endif
- LOAD_BLOCK %1, %2
-.sec_k_loop:
- vpbroadcastb m3, [secq+kq] ; sec_taps
- ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
- ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
- dec kq
- jge .sec_k_loop
- ADJUST_PIXEL %1, %2, m0, m2
-%if %1*%2 > mmsize
- mov dstq, dst4q
- lea top1q, [rsp+0x90]
- lea top2q, [rsp+0xA0]
- lea dst4q, [dst4q+strideq*4]
- add hq, 4
- jl .sec_v_loop
-%endif
- RET
-
-.d0k0:
-%if %1 == 4
- %if %2 == 4
- vpbroadcastq m6, [dstq+strideq*1-1]
- vpbroadcastq m10, [dstq+strideq*2-1]
- movd xm5, [topq+strideq*1+1]
- movd xm9, [dstq+strideq*0+1]
- psrldq m11, m6, 2
- psrldq m12, m10, 2
- vinserti128 m6, [dstq+stride3q -1], 1
- vinserti128 m10, [dstq+strideq*4-1], 1
- vpblendd m5, m11, 0x10
- vpblendd m9, m12, 0x10
- movu m11, [blend_4x4+16]
- punpckldq m6, m10
- punpckldq m5, m9
- vpblendvb m6, [rsp+gprsize+0x28], m11
- %else
- movd xm5, [topq +strideq*1+1]
- movq xm6, [dstq +strideq*1-1]
- movq xm10, [dstq +stride3q -1]
- movq xm11, [dst4q+strideq*1-1]
- pinsrd xm5, [dstq +strideq*0+1], 1
- movhps xm6, [dstq +strideq*2-1]
- movhps xm10, [dst4q+strideq*0-1]
- movhps xm11, [dst4q+strideq*2-1]
- psrldq xm9, xm6, 2
- shufps xm5, xm9, q2010 ; -1 +0 +1 +2
- shufps xm6, xm10, q2020 ; +1 +2 +3 +4
- psrldq xm9, xm11, 2
- psrldq xm10, 2
- shufps xm10, xm9, q2020 ; +3 +4 +5 +6
- movd xm9, [dst4q+stride3q -1]
- pinsrd xm9, [dst4q+strideq*4-1], 1
- shufps xm11, xm9, q1020 ; +5 +6 +7 +8
- pmovzxbw m9, [leftq+3]
- vinserti128 m6, xm11, 1
- movu m11, [blend_4x8_0+4]
- vinserti128 m5, xm10, 1
- vpblendvb m6, m9, m11
- %endif
-%else
- lea r13, [blend_8x8_0+16]
- movq xm5, [top2q +1]
- vbroadcasti128 m10, [dstq+strideq*1-1]
- vbroadcasti128 m11, [dstq+strideq*2-1]
- movhps xm5, [dstq+strideq*0+1]
- vinserti128 m6, m10, [dstq+stride3q -1], 1
- vinserti128 m9, m11, [dstq+strideq*4-1], 1
- psrldq m10, 2
- psrldq m11, 2
- punpcklqdq m6, m9
- movu m9, [r13+hq*2*1+16*1]
- punpcklqdq m10, m11
- vpblendd m5, m10, 0xF0
- vpblendvb m6, [rsp+gprsize+80+hq*8+64+8*1], m9
-%endif
- ret
-.d1k0:
-.d2k0:
-.d3k0:
-%if %1 == 4
- %if %2 == 4
- movq xm6, [dstq+strideq*0-1]
- movq xm9, [dstq+strideq*1-1]
- vinserti128 m6, [dstq+strideq*2-1], 1
- vinserti128 m9, [dstq+stride3q -1], 1
- movu m11, [rsp+gprsize+0x10]
- pcmpeqd m12, m12
- psrldq m5, m6, 2
- psrldq m10, m9, 2
- psrld m12, 24
- punpckldq m6, m9
- punpckldq m5, m10
- vpblendvb m6, m11, m12
- %else
- movq xm6, [dstq +strideq*0-1]
- movq xm9, [dstq +strideq*2-1]
- movhps xm6, [dstq +strideq*1-1]
- movhps xm9, [dstq +stride3q -1]
- movq xm10, [dst4q+strideq*0-1]
- movhps xm10, [dst4q+strideq*1-1]
- psrldq xm5, xm6, 2
- psrldq xm11, xm9, 2
- shufps xm5, xm11, q2020
- movq xm11, [dst4q+strideq*2-1]
- movhps xm11, [dst4q+stride3q -1]
- shufps xm6, xm9, q2020
- shufps xm9, xm10, xm11, q2020
- vinserti128 m6, xm9, 1
- pmovzxbw m9, [leftq+1]
- psrldq xm10, 2
- psrldq xm11, 2
- shufps xm10, xm11, q2020
- vpbroadcastd m11, [blend_4x8_0+4]
- vinserti128 m5, xm10, 1
- vpblendvb m6, m9, m11
- %endif
-%else
- movu xm5, [dstq+strideq*0-1]
- movu xm9, [dstq+strideq*1-1]
- vinserti128 m5, [dstq+strideq*2-1], 1
- vinserti128 m9, [dstq+stride3q -1], 1
- mova m10, [blend_8x8_0+16]
- punpcklqdq m6, m5, m9
- vpblendvb m6, [rsp+gprsize+80+hq*8+64], m10
- psrldq m5, 2
- psrldq m9, 2
- punpcklqdq m5, m9
-%endif
- ret
-.d4k0:
-%if %1 == 4
- %if %2 == 4
- vpbroadcastq m10, [dstq+strideq*1-1]
- vpbroadcastq m11, [dstq+strideq*2-1]
- movd xm6, [topq+strideq*1-1]
- movd xm9, [dstq+strideq*0-1]
- psrldq m5, m10, 2
- psrldq m12, m11, 2
- vpblendd m6, m10, 0x10
- vpblendd m9, m11, 0x10
- movu m10, [blend_4x4]
- vinserti128 m5, [dstq+stride3q +1], 1
- vinserti128 m12, [dstq+strideq*4+1], 1
- punpckldq m6, m9
- punpckldq m5, m12
- vpblendvb m6, [rsp+gprsize+0x40], m10
- %else
- movd xm6, [topq +strideq*1-1]
- movq xm9, [dstq +strideq*1-1]
- movq xm10, [dstq +stride3q -1]
- movq xm11, [dst4q+strideq*1-1]
- pinsrd xm6, [dstq +strideq*0-1], 1
- movhps xm9, [dstq +strideq*2-1]
- movhps xm10, [dst4q+strideq*0-1]
- movhps xm11, [dst4q+strideq*2-1]
- psrldq xm5, xm9, 2
- shufps xm6, xm9, q2010
- psrldq xm9, xm10, 2
- shufps xm5, xm9, q2020
- shufps xm10, xm11, q2020
- movd xm9, [dst4q+stride3q +1]
- vinserti128 m6, xm10, 1
- pinsrd xm9, [dst4q+strideq*4+1], 1
- psrldq xm11, 2
- pmovzxbw m10, [leftq-1]
- shufps xm11, xm9, q1020
- movu m9, [blend_4x8_0]
- vinserti128 m5, xm11, 1
- vpblendvb m6, m10, m9
- %endif
-%else
- lea r13, [blend_8x8_0+8]
- movq xm6, [top2q -1]
- vbroadcasti128 m5, [dstq+strideq*1-1]
- vbroadcasti128 m9, [dstq+strideq*2-1]
- movhps xm6, [dstq+strideq*0-1]
- movu m11, [r13+hq*2*1+16*1]
- punpcklqdq m10, m5, m9
- vinserti128 m5, [dstq+stride3q -1], 1
- vinserti128 m9, [dstq+strideq*4-1], 1
- vpblendd m6, m10, 0xF0
- vpblendvb m6, [rsp+gprsize+80+hq*8+64-8*1], m11
- psrldq m5, 2
- psrldq m9, 2
- punpcklqdq m5, m9
-%endif
- ret
-.d5k0:
-.d6k0:
-.d7k0:
-%if %1 == 4
- %if %2 == 4
- movd xm6, [topq+strideq*1 ]
- vpbroadcastd m5, [dstq+strideq*1 ]
- vpbroadcastd m9, [dstq+strideq*2 ]
- vpblendd xm6, [dstq+strideq*0-4], 0x2
- vpblendd m5, m9, 0x22
- vpblendd m6, m5, 0x30
- vinserti128 m5, [dstq+stride3q ], 1
- vpblendd m5, [dstq+strideq*4-20], 0x20
- %else
- movd xm6, [topq +strideq*1]
- movd xm5, [dstq +strideq*1]
- movd xm9, [dstq +stride3q ]
- movd xm10, [dst4q+strideq*1]
- movd xm11, [dst4q+stride3q ]
- pinsrd xm6, [dstq +strideq*0], 1
- pinsrd xm5, [dstq +strideq*2], 1
- pinsrd xm9, [dst4q+strideq*0], 1
- pinsrd xm10, [dst4q+strideq*2], 1
- pinsrd xm11, [dst4q+strideq*4], 1
- punpcklqdq xm6, xm5
- punpcklqdq xm5, xm9
- punpcklqdq xm9, xm10
- punpcklqdq xm10, xm11
- vinserti128 m6, xm9, 1
- vinserti128 m5, xm10, 1
- %endif
-%else
- movq xm6, [top2q ]
- movq xm5, [dstq+strideq*1]
- movq xm9, [dstq+stride3q ]
- movhps xm6, [dstq+strideq*0]
- movhps xm5, [dstq+strideq*2]
- movhps xm9, [dstq+strideq*4]
- vinserti128 m6, xm5, 1
- vinserti128 m5, xm9, 1
-%endif
- ret
-.d0k1:
-%if %1 == 4
- %if %2 == 4
- movd xm6, [dstq +strideq*2-2]
- movd xm9, [dstq +stride3q -2]
- movd xm5, [topq +strideq*0+2]
- movd xm10, [topq +strideq*1+2]
- pinsrw xm6, [leftq+4], 0
- pinsrw xm9, [leftq+6], 0
- vinserti128 m5, [dstq +strideq*0+2], 1
- vinserti128 m10, [dstq +strideq*1+2], 1
- vinserti128 m6, [dst4q+strideq*0-2], 1
- vinserti128 m9, [dst4q+strideq*1-2], 1
- punpckldq m5, m10
- punpckldq m6, m9
- %else
- movq xm6, [dstq +strideq*2-2]
- movd xm10, [dst4q+strideq*2-2]
- movd xm5, [topq +strideq*0+2]
- movq xm9, [dst4q+strideq*0-2]
- movhps xm6, [dstq +stride3q -2]
- pinsrw xm10, [dst4q+stride3q ], 3
- pinsrd xm5, [topq +strideq*1+2], 1
- movhps xm9, [dst4q+strideq*1-2]
- pinsrd xm10, [dst8q+strideq*0-2], 2
- pinsrd xm5, [dstq +strideq*0+2], 2
- pinsrd xm10, [dst8q+strideq*1-2], 3
- pinsrd xm5, [dstq +strideq*1+2], 3
- shufps xm11, xm6, xm9, q3131
- shufps xm6, xm9, q2020
- movu m9, [blend_4x8_3+8]
- vinserti128 m6, xm10, 1
- vinserti128 m5, xm11, 1
- vpblendvb m6, [rsp+gprsize+16+8], m9
- %endif
-%else
- lea r13, [blend_8x8_1+16]
- movq xm6, [dstq +strideq*2-2]
- movq xm9, [dstq +stride3q -2]
- movq xm5, [top1q +2]
- movq xm10, [top2q +2]
- movu m11, [r13+hq*2*2+16*2]
- vinserti128 m6, [dst4q+strideq*0-2], 1
- vinserti128 m9, [dst4q+strideq*1-2], 1
- vinserti128 m5, [dstq +strideq*0+2], 1
- vinserti128 m10, [dstq +strideq*1+2], 1
- punpcklqdq m6, m9
- punpcklqdq m5, m10
- vpblendvb m6, [rsp+gprsize+16+hq*8+64+8*2], m11
-%endif
- ret
-.d1k1:
-%if %1 == 4
- %if %2 == 4
- vpbroadcastq m6, [dstq+strideq*1-2]
- vpbroadcastq m9, [dstq+strideq*2-2]
- movd xm5, [topq+strideq*1+2]
- movd xm10, [dstq+strideq*0+2]
- psrldq m11, m6, 4
- psrldq m12, m9, 4
- vpblendd m5, m11, 0x10
- movq xm11, [leftq+2]
- vinserti128 m6, [dstq+stride3q -2], 1
- punpckldq xm11, xm11
- vpblendd m10, m12, 0x10
- pcmpeqd m12, m12
- pmovzxwd m11, xm11
- psrld m12, 16
- punpckldq m6, m9
- vpbroadcastd m9, [dstq+strideq*4-2]
- vpblendvb m6, m11, m12
- punpckldq m5, m10
- vpblendd m6, m9, 0x20
- %else
- movd xm5, [topq +strideq*1+2]
- movq xm6, [dstq +strideq*1-2]
- movq xm9, [dstq +stride3q -2]
- movq xm10, [dst4q+strideq*1-2]
- movd xm11, [dst4q+stride3q -2]
- pinsrd xm5, [dstq +strideq*0+2], 1
- movhps xm6, [dstq +strideq*2-2]
- movhps xm9, [dst4q+strideq*0-2]
- movhps xm10, [dst4q+strideq*2-2]
- pinsrd xm11, [dst4q+strideq*4-2], 1
- shufps xm5, xm6, q3110
- shufps xm6, xm9, q2020
- shufps xm9, xm10, q3131
- shufps xm10, xm11, q1020
- movu m11, [blend_4x8_2+4]
- vinserti128 m6, xm10, 1
- vinserti128 m5, xm9, 1
- vpblendvb m6, [rsp+gprsize+16+4], m11
- %endif
-%else
- lea r13, [blend_8x8_1+16]
- movq xm5, [top2q +2]
- vbroadcasti128 m6, [dstq+strideq*1-2]
- vbroadcasti128 m9, [dstq+strideq*2-2]
- movhps xm5, [dstq+strideq*0+2]
- shufps m10, m6, m9, q2121
- vinserti128 m6, [dstq+stride3q -2], 1
- vinserti128 m9, [dstq+strideq*4-2], 1
- movu m11, [r13+hq*2*1+16*1]
- vpblendd m5, m10, 0xF0
- punpcklqdq m6, m9
- vpblendvb m6, [rsp+gprsize+16+hq*8+64+8*1], m11
-%endif
- ret
-.d2k1:
-%if %1 == 4
- %if %2 == 4
- movq xm11, [leftq]
- movq xm6, [dstq+strideq*0-2]
- movq xm9, [dstq+strideq*1-2]
- vinserti128 m6, [dstq+strideq*2-2], 1
- vinserti128 m9, [dstq+stride3q -2], 1
- punpckldq xm11, xm11
- psrldq m5, m6, 4
- psrldq m10, m9, 4
- pmovzxwd m11, xm11
- punpckldq m6, m9
- punpckldq m5, m10
- pblendw m6, m11, 0x05
- %else
- movq xm5, [dstq +strideq*0-2]
- movq xm9, [dstq +strideq*2-2]
- movq xm10, [dst4q+strideq*0-2]
- movq xm11, [dst4q+strideq*2-2]
- movhps xm5, [dstq +strideq*1-2]
- movhps xm9, [dstq +stride3q -2]
- movhps xm10, [dst4q+strideq*1-2]
- movhps xm11, [dst4q+stride3q -2]
- shufps xm6, xm5, xm9, q2020
- shufps xm5, xm9, q3131
- shufps xm9, xm10, xm11, q2020
- shufps xm10, xm11, q3131
- pmovzxwd m11, [leftq]
- vinserti128 m6, xm9, 1
- vinserti128 m5, xm10, 1
- pblendw m6, m11, 0x55
- %endif
-%else
- mova m11, [rsp+gprsize+16+hq*8+64]
- movu xm5, [dstq+strideq*0-2]
- movu xm9, [dstq+strideq*1-2]
- vinserti128 m5, [dstq+strideq*2-2], 1
- vinserti128 m9, [dstq+stride3q -2], 1
- shufps m6, m5, m9, q1010
- shufps m5, m9, q2121
- pblendw m6, m11, 0x11
-%endif
- ret
-.d3k1:
-%if %1 == 4
- %if %2 == 4
- vpbroadcastq m11, [dstq+strideq*1-2]
- vpbroadcastq m12, [dstq+strideq*2-2]
- movd xm6, [topq+strideq*1-2]
- movd xm9, [dstq+strideq*0-2]
- pblendw m11, [leftq-16+2], 0x01
- pblendw m12, [leftq-16+4], 0x01
- pinsrw xm9, [leftq- 0+0], 0
- psrldq m5, m11, 4
- psrldq m10, m12, 4
- vinserti128 m5, [dstq+stride3q +2], 1
- vinserti128 m10, [dstq+strideq*4+2], 1
- vpblendd m6, m11, 0x10
- vpblendd m9, m12, 0x10
- punpckldq m6, m9
- punpckldq m5, m10
- %else
- movd xm6, [topq +strideq*1-2]
- movq xm5, [dstq +strideq*1-2]
- movq xm9, [dstq +stride3q -2]
- movq xm10, [dst4q+strideq*1-2]
- movd xm11, [dst4q+stride3q +2]
- pinsrw xm6, [dstq +strideq*0 ], 3
- movhps xm5, [dstq +strideq*2-2]
- movhps xm9, [dst4q+strideq*0-2]
- movhps xm10, [dst4q+strideq*2-2]
- pinsrd xm11, [dst4q+strideq*4+2], 1
- shufps xm6, xm5, q2010
- shufps xm5, xm9, q3131
- shufps xm9, xm10, q2020
- shufps xm10, xm11, q1031
- movu m11, [blend_4x8_2]
- vinserti128 m6, xm9, 1
- vinserti128 m5, xm10, 1
- vpblendvb m6, [rsp+gprsize+16-4], m11
- %endif
-%else
- lea r13, [blend_8x8_1+8]
- movq xm6, [top2q -2]
- vbroadcasti128 m5, [dstq+strideq*1-2]
- vbroadcasti128 m10, [dstq+strideq*2-2]
- movhps xm6, [dstq+strideq*0-2]
- punpcklqdq m9, m5, m10
- vinserti128 m5, [dstq+stride3q -2], 1
- vinserti128 m10, [dstq+strideq*4-2], 1
- movu m11, [r13+hq*2*1+16*1]
- vpblendd m6, m9, 0xF0
- shufps m5, m10, q2121
- vpblendvb m6, [rsp+gprsize+16+hq*8+64-8*1], m11
-%endif
- ret
-.d4k1:
-%if %1 == 4
- %if %2 == 4
- vinserti128 m6, [dstq +strideq*0-2], 1
- vinserti128 m9, [dstq +strideq*1-2], 1
- movd xm5, [dstq +strideq*2+2]
- movd xm10, [dstq +stride3q +2]
- pblendw m6, [leftq-16+0], 0x01
- pblendw m9, [leftq-16+2], 0x01
- vinserti128 m5, [dst4q+strideq*0+2], 1
- vinserti128 m10, [dst4q+strideq*1+2], 1
- vpblendd m6, [topq +strideq*0-2], 0x01
- vpblendd m9, [topq +strideq*1-2], 0x01
- punpckldq m5, m10
- punpckldq m6, m9
- %else
- movd xm6, [topq +strideq*0-2]
- movq xm5, [dstq +strideq*2-2]
- movq xm9, [dst4q+strideq*0-2]
- movd xm10, [dst4q+strideq*2+2]
- pinsrd xm6, [topq +strideq*1-2], 1
- movhps xm5, [dstq +stride3q -2]
- movhps xm9, [dst4q+strideq*1-2]
- pinsrd xm10, [dst4q+stride3q +2], 1
- pinsrd xm6, [dstq +strideq*0-2], 2
- pinsrd xm10, [dst8q+strideq*0+2], 2
- pinsrd xm6, [dstq +strideq*1-2], 3
- pinsrd xm10, [dst8q+strideq*1+2], 3
- shufps xm11, xm5, xm9, q2020
- shufps xm5, xm9, q3131
- movu m9, [blend_4x8_3]
- vinserti128 m6, xm11, 1
- vinserti128 m5, xm10, 1
- vpblendvb m6, [rsp+gprsize+16-8], m9
- %endif
-%else
- lea r13, [blend_8x8_1]
- movu m11, [r13+hq*2*2+16*2]
- movq xm6, [top1q -2]
- movq xm9, [top2q -2]
- movq xm5, [dstq +strideq*2+2]
- movq xm10, [dstq +stride3q +2]
- vinserti128 m6, [dstq +strideq*0-2], 1
- vinserti128 m9, [dstq +strideq*1-2], 1
- vinserti128 m5, [dst4q+strideq*0+2], 1
- vinserti128 m10, [dst4q+strideq*1+2], 1
- punpcklqdq m6, m9
- vpblendvb m6, [rsp+gprsize+16+hq*8+64-8*2], m11
- punpcklqdq m5, m10
-%endif
- ret
-.d5k1:
-%if %1 == 4
- %if %2 == 4
- movd xm6, [topq +strideq*0-1]
- movd xm9, [topq +strideq*1-1]
- movd xm5, [dstq +strideq*2+1]
- movd xm10, [dstq +stride3q +1]
- pcmpeqd m12, m12
- pmovzxbw m11, [leftq-8+1]
- psrld m12, 24
- vinserti128 m6, [dstq +strideq*0-1], 1
- vinserti128 m9, [dstq +strideq*1-1], 1
- vinserti128 m5, [dst4q+strideq*0+1], 1
- vinserti128 m10, [dst4q+strideq*1+1], 1
- punpckldq m6, m9
- pxor m9, m9
- vpblendd m12, m9, 0x0F
- punpckldq m5, m10
- vpblendvb m6, m11, m12
- %else
- movd xm6, [topq +strideq*0-1]
- movq xm5, [dstq +strideq*2-1]
- movq xm9, [dst4q+strideq*0-1]
- movd xm10, [dst4q+strideq*2+1]
- pinsrd xm6, [topq +strideq*1-1], 1
- movhps xm5, [dstq +stride3q -1]
- movhps xm9, [dst4q+strideq*1-1]
- pinsrd xm10, [dst4q+stride3q +1], 1
- pinsrd xm6, [dstq +strideq*0-1], 2
- pinsrd xm10, [dst8q+strideq*0+1], 2
- pinsrd xm6, [dstq +strideq*1-1], 3
- pinsrd xm10, [dst8q+strideq*1+1], 3
- shufps xm11, xm5, xm9, q2020
- vinserti128 m6, xm11, 1
- pmovzxbw m11, [leftq-3]
- psrldq xm5, 2
- psrldq xm9, 2
- shufps xm5, xm9, q2020
- movu m9, [blend_4x8_1]
- vinserti128 m5, xm10, 1
- vpblendvb m6, m11, m9
- %endif
-%else
- lea r13, [blend_8x8_0]
- movu m11, [r13+hq*2*2+16*2]
- movq xm6, [top1q -1]
- movq xm9, [top2q -1]
- movq xm5, [dstq +strideq*2+1]
- movq xm10, [dstq +stride3q +1]
- vinserti128 m6, [dstq +strideq*0-1], 1
- vinserti128 m9, [dstq +strideq*1-1], 1
- vinserti128 m5, [dst4q+strideq*0+1], 1
- vinserti128 m10, [dst4q+strideq*1+1], 1
- punpcklqdq m6, m9
- punpcklqdq m5, m10
- vpblendvb m6, [rsp+gprsize+80+hq*8+64-8*2], m11
-%endif
- ret
-.d6k1:
-%if %1 == 4
- %if %2 == 4
- movd xm6, [topq +strideq*0]
- movd xm9, [topq +strideq*1]
- movd xm5, [dstq +strideq*2]
- movd xm10, [dstq +stride3q ]
- vinserti128 m6, [dstq +strideq*0], 1
- vinserti128 m9, [dstq +strideq*1], 1
- vinserti128 m5, [dst4q+strideq*0], 1
- vinserti128 m10, [dst4q+strideq*1], 1
- punpckldq m6, m9
- punpckldq m5, m10
- %else
- movd xm5, [dstq +strideq*2]
- movd xm6, [topq +strideq*0]
- movd xm9, [dst4q+strideq*2]
- pinsrd xm5, [dstq +stride3q ], 1
- pinsrd xm6, [topq +strideq*1], 1
- pinsrd xm9, [dst4q+stride3q ], 1
- pinsrd xm5, [dst4q+strideq*0], 2
- pinsrd xm6, [dstq +strideq*0], 2
- pinsrd xm9, [dst8q+strideq*0], 2
- pinsrd xm5, [dst4q+strideq*1], 3
- pinsrd xm6, [dstq +strideq*1], 3
- pinsrd xm9, [dst8q+strideq*1], 3
- vinserti128 m6, xm5, 1
- vinserti128 m5, xm9, 1
- %endif
-%else
- movq xm5, [dstq +strideq*2]
- movq xm9, [dst4q+strideq*0]
- movq xm6, [top1q ]
- movq xm10, [dstq +strideq*0]
- movhps xm5, [dstq +stride3q ]
- movhps xm9, [dst4q+strideq*1]
- movhps xm6, [top2q ]
- movhps xm10, [dstq +strideq*1]
- vinserti128 m5, xm9, 1
- vinserti128 m6, xm10, 1
-%endif
- ret
-.d7k1:
-%if %1 == 4
- %if %2 == 4
- movd xm5, [dstq +strideq*2-1]
- movd xm9, [dstq +stride3q -1]
- movd xm6, [topq +strideq*0+1]
- movd xm10, [topq +strideq*1+1]
- pinsrb xm5, [leftq+ 5], 0
- pinsrb xm9, [leftq+ 7], 0
- vinserti128 m6, [dstq +strideq*0+1], 1
- vinserti128 m10, [dstq +strideq*1+1], 1
- vinserti128 m5, [dst4q+strideq*0-1], 1
- vinserti128 m9, [dst4q+strideq*1-1], 1
- punpckldq m6, m10
- punpckldq m5, m9
- %else
- movd xm6, [topq +strideq*0+1]
- movq xm9, [dstq +strideq*2-1]
- movq xm10, [dst4q+strideq*0-1]
- movd xm11, [dst4q+strideq*2-1]
- pinsrd xm6, [topq +strideq*1+1], 1
- movhps xm9, [dstq +stride3q -1]
- movhps xm10, [dst4q+strideq*1-1]
- pinsrd xm11, [dst4q+stride3q -1], 1
- pinsrd xm6, [dstq +strideq*0+1], 2
- pinsrd xm11, [dst8q+strideq*0-1], 2
- pinsrd xm6, [dstq +strideq*1+1], 3
- pinsrd xm11, [dst8q+strideq*1-1], 3
- shufps xm5, xm9, xm10, q2020
- vinserti128 m5, xm11, 1
- pmovzxbw m11, [leftq+5]
- psrldq xm9, 2
- psrldq xm10, 2
- shufps xm9, xm10, q2020
- movu m10, [blend_4x8_1+8]
- vinserti128 m6, xm9, 1
- vpblendvb m5, m11, m10
- %endif
-%else
- lea r13, [blend_8x8_0+16]
- movq xm5, [dstq +strideq*2-1]
- movq xm9, [dst4q+strideq*0-1]
- movq xm6, [top1q +1]
- movq xm10, [dstq +strideq*0+1]
- movhps xm5, [dstq +stride3q -1]
- movhps xm9, [dst4q+strideq*1-1]
- movhps xm6, [top2q +1]
- movhps xm10, [dstq +strideq*1+1]
- movu m11, [r13+hq*2*2+16*2]
- vinserti128 m5, xm9, 1
- vinserti128 m6, xm10, 1
- vpblendvb m5, [rsp+gprsize+80+hq*8+64+8*2], m11
-%endif
- ret
-
-.border_block:
- DEFINE_ARGS dst, stride, left, top, pri, sec, stride3, dst4, edge
-%define rstk rsp
-%assign stack_offset stack_offset_entry
-%if %1 == 4 && %2 == 8
- PUSH r9
- %assign regs_used 10
-%else
- %assign regs_used 9
-%endif
-%if STACK_ALIGNMENT < 32
- PUSH r%+regs_used
- %assign regs_used regs_used+1
-%endif
- ALLOC_STACK 2*16+(%2+4)*32, 16
-%define px rsp+2*16+2*32
-
- pcmpeqw m14, m14
- psllw m14, 15 ; 0x8000
-
- ; prepare pixel buffers - body/right
-%if %1 == 4
- INIT_XMM avx2
-%endif
-%if %2 == 8
- lea dst4q, [dstq+strideq*4]
-%endif
- lea stride3q, [strideq*3]
- test edgeb, 2 ; have_right
- jz .no_right
- pmovzxbw m1, [dstq+strideq*0]
- pmovzxbw m2, [dstq+strideq*1]
- pmovzxbw m3, [dstq+strideq*2]
- pmovzxbw m4, [dstq+stride3q]
- mova [px+0*32], m1
- mova [px+1*32], m2
- mova [px+2*32], m3
- mova [px+3*32], m4
-%if %2 == 8
- pmovzxbw m1, [dst4q+strideq*0]
- pmovzxbw m2, [dst4q+strideq*1]
- pmovzxbw m3, [dst4q+strideq*2]
- pmovzxbw m4, [dst4q+stride3q]
- mova [px+4*32], m1
- mova [px+5*32], m2
- mova [px+6*32], m3
- mova [px+7*32], m4
-%endif
- jmp .body_done
-.no_right:
-%if %1 == 4
- movd xm1, [dstq+strideq*0]
- movd xm2, [dstq+strideq*1]
- movd xm3, [dstq+strideq*2]
- movd xm4, [dstq+stride3q]
- pmovzxbw xm1, xm1
- pmovzxbw xm2, xm2
- pmovzxbw xm3, xm3
- pmovzxbw xm4, xm4
- movq [px+0*32], xm1
- movq [px+1*32], xm2
- movq [px+2*32], xm3
- movq [px+3*32], xm4
-%else
- pmovzxbw xm1, [dstq+strideq*0]
- pmovzxbw xm2, [dstq+strideq*1]
- pmovzxbw xm3, [dstq+strideq*2]
- pmovzxbw xm4, [dstq+stride3q]
- mova [px+0*32], xm1
- mova [px+1*32], xm2
- mova [px+2*32], xm3
- mova [px+3*32], xm4
-%endif
- movd [px+0*32+%1*2], xm14
- movd [px+1*32+%1*2], xm14
- movd [px+2*32+%1*2], xm14
- movd [px+3*32+%1*2], xm14
-%if %2 == 8
- %if %1 == 4
- movd xm1, [dst4q+strideq*0]
- movd xm2, [dst4q+strideq*1]
- movd xm3, [dst4q+strideq*2]
- movd xm4, [dst4q+stride3q]
- pmovzxbw xm1, xm1
- pmovzxbw xm2, xm2
- pmovzxbw xm3, xm3
- pmovzxbw xm4, xm4
- movq [px+4*32], xm1
- movq [px+5*32], xm2
- movq [px+6*32], xm3
- movq [px+7*32], xm4
- %else
- pmovzxbw xm1, [dst4q+strideq*0]
- pmovzxbw xm2, [dst4q+strideq*1]
- pmovzxbw xm3, [dst4q+strideq*2]
- pmovzxbw xm4, [dst4q+stride3q]
- mova [px+4*32], xm1
- mova [px+5*32], xm2
- mova [px+6*32], xm3
- mova [px+7*32], xm4
- %endif
- movd [px+4*32+%1*2], xm14
- movd [px+5*32+%1*2], xm14
- movd [px+6*32+%1*2], xm14
- movd [px+7*32+%1*2], xm14
-%endif
-.body_done:
-
- ; top
- test edgeb, 4 ; have_top
- jz .no_top
- test edgeb, 1 ; have_left
- jz .top_no_left
- test edgeb, 2 ; have_right
- jz .top_no_right
- pmovzxbw m1, [topq+strideq*0-(%1/2)]
- pmovzxbw m2, [topq+strideq*1-(%1/2)]
- movu [px-2*32-%1], m1
- movu [px-1*32-%1], m2
- jmp .top_done
-.top_no_right:
- pmovzxbw m1, [topq+strideq*0-%1]
- pmovzxbw m2, [topq+strideq*1-%1]
- movu [px-2*32-%1*2], m1
- movu [px-1*32-%1*2], m2
- movd [px-2*32+%1*2], xm14
- movd [px-1*32+%1*2], xm14
- jmp .top_done
-.top_no_left:
- test edgeb, 2 ; have_right
- jz .top_no_left_right
- pmovzxbw m1, [topq+strideq*0]
- pmovzxbw m2, [topq+strideq*1]
- mova [px-2*32+0], m1
- mova [px-1*32+0], m2
- movd [px-2*32-4], xm14
- movd [px-1*32-4], xm14
- jmp .top_done
-.top_no_left_right:
-%if %1 == 4
- movd xm1, [topq+strideq*0]
- pinsrd xm1, [topq+strideq*1], 1
- pmovzxbw xm1, xm1
- movq [px-2*32+0], xm1
- movhps [px-1*32+0], xm1
-%else
- pmovzxbw xm1, [topq+strideq*0]
- pmovzxbw xm2, [topq+strideq*1]
- mova [px-2*32+0], xm1
- mova [px-1*32+0], xm2
-%endif
- movd [px-2*32-4], xm14
- movd [px-1*32-4], xm14
- movd [px-2*32+%1*2], xm14
- movd [px-1*32+%1*2], xm14
- jmp .top_done
-.no_top:
- movu [px-2*32-%1], m14
- movu [px-1*32-%1], m14
-.top_done:
-
- ; left
- test edgeb, 1 ; have_left
- jz .no_left
- pmovzxbw xm1, [leftq+ 0]
-%if %2 == 8
- pmovzxbw xm2, [leftq+ 8]
-%endif
- movd [px+0*32-4], xm1
- pextrd [px+1*32-4], xm1, 1
- pextrd [px+2*32-4], xm1, 2
- pextrd [px+3*32-4], xm1, 3
-%if %2 == 8
- movd [px+4*32-4], xm2
- pextrd [px+5*32-4], xm2, 1
- pextrd [px+6*32-4], xm2, 2
- pextrd [px+7*32-4], xm2, 3
-%endif
- jmp .left_done
-.no_left:
- movd [px+0*32-4], xm14
- movd [px+1*32-4], xm14
- movd [px+2*32-4], xm14
- movd [px+3*32-4], xm14
-%if %2 == 8
- movd [px+4*32-4], xm14
- movd [px+5*32-4], xm14
- movd [px+6*32-4], xm14
- movd [px+7*32-4], xm14
-%endif
-.left_done:
-
- ; bottom
- DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge
- test edgeb, 8 ; have_bottom
- jz .no_bottom
- lea dst8q, [dstq+%2*strideq]
- test edgeb, 1 ; have_left
- jz .bottom_no_left
- test edgeb, 2 ; have_right
- jz .bottom_no_right
- pmovzxbw m1, [dst8q-(%1/2)]
- pmovzxbw m2, [dst8q+strideq-(%1/2)]
- movu [px+(%2+0)*32-%1], m1
- movu [px+(%2+1)*32-%1], m2
- jmp .bottom_done
-.bottom_no_right:
- pmovzxbw m1, [dst8q-%1]
- pmovzxbw m2, [dst8q+strideq-%1]
- movu [px+(%2+0)*32-%1*2], m1
- movu [px+(%2+1)*32-%1*2], m2
-%if %1 == 8
- movd [px+(%2-1)*32+%1*2], xm14 ; overwritten by previous movu
-%endif
- movd [px+(%2+0)*32+%1*2], xm14
- movd [px+(%2+1)*32+%1*2], xm14
- jmp .bottom_done
-.bottom_no_left:
- test edgeb, 2 ; have_right
- jz .bottom_no_left_right
- pmovzxbw m1, [dst8q]
- pmovzxbw m2, [dst8q+strideq]
- mova [px+(%2+0)*32+0], m1
- mova [px+(%2+1)*32+0], m2
- movd [px+(%2+0)*32-4], xm14
- movd [px+(%2+1)*32-4], xm14
- jmp .bottom_done
-.bottom_no_left_right:
-%if %1 == 4
- movd xm1, [dst8q]
- pinsrd xm1, [dst8q+strideq], 1
- pmovzxbw xm1, xm1
- movq [px+(%2+0)*32+0], xm1
- movhps [px+(%2+1)*32+0], xm1
-%else
- pmovzxbw xm1, [dst8q]
- pmovzxbw xm2, [dst8q+strideq]
- mova [px+(%2+0)*32+0], xm1
- mova [px+(%2+1)*32+0], xm2
-%endif
- movd [px+(%2+0)*32-4], xm14
- movd [px+(%2+1)*32-4], xm14
- movd [px+(%2+0)*32+%1*2], xm14
- movd [px+(%2+1)*32+%1*2], xm14
- jmp .bottom_done
-.no_bottom:
- movu [px+(%2+0)*32-%1], m14
- movu [px+(%2+1)*32-%1], m14
-.bottom_done:
-
- ; actual filter
- INIT_YMM avx2
- DEFINE_ARGS dst, stride, pridmp, damping, pri, secdmp, stride3, zero
-%undef edged
- ; register to shuffle values into after packing
- vbroadcasti128 m12, [shufb_lohi]
-
- mov dampingd, r7m
- xor zerod, zerod
- movifnidn prid, prim
- sub dampingd, 31
- movifnidn secdmpd, secdmpm
- or prid, 0
- jz .border_sec_only
- movd xm0, prid
- lzcnt pridmpd, prid
- add pridmpd, dampingd
- cmovs pridmpd, zerod
- mov [rsp+0], pridmpq ; pri_shift
- or secdmpd, 0
- jz .border_pri_only
- movd xm1, secdmpd
- lzcnt secdmpd, secdmpd
- add secdmpd, dampingd
- cmovs secdmpd, zerod
- mov [rsp+8], secdmpq ; sec_shift
-
- DEFINE_ARGS dst, stride, pridmp, table, pri, secdmp, stride3
- lea tableq, [tap_table]
- vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
- vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
-
- ; pri/sec_taps[k] [4 total]
- DEFINE_ARGS dst, stride, dir, table, pri, sec, stride3
- vpbroadcastb m0, xm0 ; pri_strength
- vpbroadcastb m1, xm1 ; sec_strength
- and prid, 1
- lea priq, [tableq+priq*2+8] ; pri_taps
- lea secq, [tableq+12] ; sec_taps
-
- BORDER_PREP_REGS %1, %2
-%if %1*%2*2/mmsize > 1
-.border_v_loop:
-%endif
- BORDER_LOAD_BLOCK %1, %2, 1
-.border_k_loop:
- vpbroadcastb m2, [priq+kq] ; pri_taps
- vpbroadcastb m3, [secq+kq] ; sec_taps
- ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1
- ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1
- ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1
- dec kq
- jge .border_k_loop
-
- vpbroadcastd m10, [pw_2048]
- BORDER_ADJUST_PIXEL %1, m10, 1
-%if %1*%2*2/mmsize > 1
- %define vloop_lines (mmsize/(%1*2))
- lea dstq, [dstq+strideq*vloop_lines]
- add stkq, 32*vloop_lines
- dec hd
- jg .border_v_loop
-%endif
- RET
-
-.border_pri_only:
- DEFINE_ARGS dst, stride, pridmp, table, pri, _, stride3
- lea tableq, [tap_table]
- vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
- DEFINE_ARGS dst, stride, dir, table, pri, _, stride3
- vpbroadcastb m0, xm0 ; pri_strength
- and prid, 1
- lea priq, [tableq+priq*2+8] ; pri_taps
- BORDER_PREP_REGS %1, %2
- vpbroadcastd m1, [pw_2048]
-%if %1*%2*2/mmsize > 1
-.border_pri_v_loop:
-%endif
- BORDER_LOAD_BLOCK %1, %2
-.border_pri_k_loop:
- vpbroadcastb m2, [priq+kq] ; pri_taps
- ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
- dec kq
- jge .border_pri_k_loop
- BORDER_ADJUST_PIXEL %1, m1
-%if %1*%2*2/mmsize > 1
- %define vloop_lines (mmsize/(%1*2))
- lea dstq, [dstq+strideq*vloop_lines]
- add stkq, 32*vloop_lines
- dec hd
- jg .border_pri_v_loop
-%endif
- RET
-
-.border_sec_only:
- DEFINE_ARGS dst, stride, _, damping, _, secdmp, stride3, zero
- movd xm1, secdmpd
- lzcnt secdmpd, secdmpd
- add secdmpd, dampingd
- cmovs secdmpd, zerod
- mov [rsp+8], secdmpq ; sec_shift
- DEFINE_ARGS dst, stride, _, table, _, secdmp, stride3
- lea tableq, [tap_table]
- vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
- DEFINE_ARGS dst, stride, dir, table, _, sec, stride3
- vpbroadcastb m1, xm1 ; sec_strength
- lea secq, [tableq+12] ; sec_taps
- BORDER_PREP_REGS %1, %2
- vpbroadcastd m0, [pw_2048]
-%if %1*%2*2/mmsize > 1
-.border_sec_v_loop:
-%endif
- BORDER_LOAD_BLOCK %1, %2
-.border_sec_k_loop:
- vpbroadcastb m3, [secq+kq] ; sec_taps
- ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
- ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
- dec kq
- jge .border_sec_k_loop
- BORDER_ADJUST_PIXEL %1, m0
-%if %1*%2*2/mmsize > 1
- %define vloop_lines (mmsize/(%1*2))
- lea dstq, [dstq+strideq*vloop_lines]
- add stkq, 32*vloop_lines
- dec hd
- jg .border_sec_v_loop
-%endif
- RET
-%endmacro
-
-CDEF_FILTER 8, 8
-CDEF_FILTER 4, 8
-CDEF_FILTER 4, 4
-
-INIT_YMM avx2
-cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
- lea stride3q, [strideq*3]
- movq xm0, [srcq+strideq*0]
- movq xm1, [srcq+strideq*1]
- movq xm2, [srcq+strideq*2]
- movq xm3, [srcq+stride3q]
- lea srcq, [srcq+strideq*4]
- vpbroadcastq m4, [srcq+strideq*0]
- vpbroadcastq m5, [srcq+strideq*1]
- vpbroadcastq m6, [srcq+strideq*2]
- vpbroadcastq m7, [srcq+stride3q]
- vpbroadcastd m8, [pw_128]
- pxor m9, m9
-
- vpblendd m0, m0, m7, 0xf0
- vpblendd m1, m1, m6, 0xf0
- vpblendd m2, m2, m5, 0xf0
- vpblendd m3, m3, m4, 0xf0
-
- punpcklbw m0, m9
- punpcklbw m1, m9
- punpcklbw m2, m9
- punpcklbw m3, m9
-
- psubw m0, m8
- psubw m1, m8
- psubw m2, m8
- psubw m3, m8
-
- ; shuffle registers to generate partial_sum_diag[0-1] together
- vpermq m7, m0, q1032
- vpermq m6, m1, q1032
- vpermq m5, m2, q1032
- vpermq m4, m3, q1032
-
- ; start with partial_sum_hv[0-1]
- paddw m8, m0, m1
- paddw m9, m2, m3
- phaddw m10, m0, m1
- phaddw m11, m2, m3
- paddw m8, m9
- phaddw m10, m11
- vextracti128 xm9, m8, 1
- vextracti128 xm11, m10, 1
- paddw xm8, xm9 ; partial_sum_hv[1]
- phaddw xm10, xm11 ; partial_sum_hv[0]
- vinserti128 m8, xm10, 1
- vpbroadcastd m9, [div_table+44]
- pmaddwd m8, m8
- pmulld m8, m9 ; cost6[2a-d] | cost2[a-d]
-
- ; create aggregates [lower half]:
- ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+
- ; m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0
- ; m10= m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+
- ; m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x
- ; and [upper half]:
- ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+
- ; m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567
- ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+
- ; m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx
- ; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd
-
- pslldq m9, m1, 2
- psrldq m10, m1, 14
- pslldq m11, m2, 4
- psrldq m12, m2, 12
- pslldq m13, m3, 6
- psrldq m14, m3, 10
- paddw m9, m11
- paddw m10, m12
- paddw m9, m13
- paddw m10, m14
- pslldq m11, m4, 8
- psrldq m12, m4, 8
- pslldq m13, m5, 10
- psrldq m14, m5, 6
- paddw m9, m11
- paddw m10, m12
- paddw m9, m13
- paddw m10, m14
- pslldq m11, m6, 12
- psrldq m12, m6, 4
- pslldq m13, m7, 14
- psrldq m14, m7, 2
- paddw m9, m11
- paddw m10, m12
- paddw m9, m13
- paddw m10, m14 ; partial_sum_diag[0/1][8-14,zero]
- vbroadcasti128 m14, [shufw_6543210x]
- vbroadcasti128 m13, [div_table+16]
- vbroadcasti128 m12, [div_table+0]
- paddw m9, m0 ; partial_sum_diag[0/1][0-7]
- pshufb m10, m14
- punpckhwd m11, m9, m10
- punpcklwd m9, m10
- pmaddwd m11, m11
- pmaddwd m9, m9
- pmulld m11, m13
- pmulld m9, m12
- paddd m9, m11 ; cost0[a-d] | cost4[a-d]
-
- ; merge horizontally and vertically for partial_sum_alt[0-3]
- paddw m10, m0, m1
- paddw m11, m2, m3
- paddw m12, m4, m5
- paddw m13, m6, m7
- phaddw m0, m4
- phaddw m1, m5
- phaddw m2, m6
- phaddw m3, m7
-
- ; create aggregates [lower half]:
- ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234
- ; m11= m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx
- ; and [upper half]:
- ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
- ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
- ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd
-
- pslldq m4, m11, 2
- psrldq m11, 14
- pslldq m5, m12, 4
- psrldq m12, 12
- pslldq m6, m13, 6
- psrldq m13, 10
- paddw m4, m10
- paddw m11, m12
- vpbroadcastd m12, [div_table+44]
- paddw m5, m6
- paddw m11, m13 ; partial_sum_alt[3/2] right
- vbroadcasti128 m13, [div_table+32]
- paddw m4, m5 ; partial_sum_alt[3/2] left
- pshuflw m5, m11, q3012
- punpckhwd m6, m11, m4
- punpcklwd m4, m5
- pmaddwd m6, m6
- pmaddwd m4, m4
- pmulld m6, m12
- pmulld m4, m13
- paddd m4, m6 ; cost7[a-d] | cost5[a-d]
-
- ; create aggregates [lower half]:
- ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234
- ; m1 = m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx
- ; and [upper half]:
- ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
- ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
- ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd
-
- pslldq m5, m1, 2
- psrldq m1, 14
- pslldq m6, m2, 4
- psrldq m2, 12
- pslldq m7, m3, 6
- psrldq m3, 10
- paddw m5, m0
- paddw m1, m2
- paddw m6, m7
- paddw m1, m3 ; partial_sum_alt[0/1] right
- paddw m5, m6 ; partial_sum_alt[0/1] left
- pshuflw m0, m1, q3012
- punpckhwd m1, m5
- punpcklwd m5, m0
- pmaddwd m1, m1
- pmaddwd m5, m5
- pmulld m1, m12
- pmulld m5, m13
- paddd m5, m1 ; cost1[a-d] | cost3[a-d]
-
- mova xm0, [pd_47130256+ 16]
- mova m1, [pd_47130256]
- phaddd m9, m8
- phaddd m5, m4
- phaddd m9, m5
- vpermd m0, m9 ; cost[0-3]
- vpermd m1, m9 ; cost[4-7] | cost[0-3]
-
- ; now find the best cost
- pmaxsd xm2, xm0, xm1
- pshufd xm3, xm2, q1032
- pmaxsd xm2, xm3
- pshufd xm3, xm2, q2301
- pmaxsd xm2, xm3 ; best cost
-
- ; find the idx using minpos
- ; make everything other than the best cost negative via subtraction
- ; find the min of unsigned 16-bit ints to sort out the negative values
- psubd xm4, xm1, xm2
- psubd xm3, xm0, xm2
- packssdw xm3, xm4
- phminposuw xm3, xm3
-
- ; convert idx to 32-bits
- psrld xm3, 16
- movd eax, xm3
-
- ; get idx^4 complement
- vpermd m3, m1
- psubd xm2, xm3
- psrld xm2, 10
- movd [varq], xm2
- RET
-
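cdef_dir above follows the usual CDEF direction search: it forms the eight directional partial sums, squares them with pmaddwd, weights every bucket by the matching div_table entry (840/n for a bucket of n samples), and reduces to one 32-bit cost per direction. The best cost is found with pmaxsd reductions; its index is recovered by subtracting the best cost from all eight (so only the winner stays non-negative), packing to words, and letting phminposuw return the position of the zero. The variance output is the gap to the orthogonal direction, shifted down by 10 bits. A hedged sketch of that final selection step (the reference implementation lives in dav1d's C code; the names here are illustrative):

    #include <stdint.h>

    /* Selection step at the end of cdef_dir: cost[] already holds the
     * weighted sums of squared directional projections. */
    static int pick_dir(const uint32_t cost[8], unsigned *var)
    {
        int best = 0;
        for (int d = 1; d < 8; d++)
            if (cost[d] > cost[best])
                best = d;
        *var = (cost[best] - cost[best ^ 4]) >> 10;  /* the idx^4 complement */
        return best;
    }
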
-%if WIN64
-DECLARE_REG_TMP 5, 6
-%else
-DECLARE_REG_TMP 8, 5
-%endif
-
-; lut:
-; t0 t1 t2 t3 t4 t5 t6 t7
-; T0 T1 T2 T3 T4 T5 T6 T7
-; L0 L1 00 01 02 03 04 05
-; L2 L3 10 11 12 13 14 15
-; L4 L5 20 21 22 23 24 25
-; L6 L7 30 31 32 33 34 35
-; 4e 4f 40 41 42 43 44 45
-; 5e 5f 50 51 52 53 54 55
-
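The diagram above is the byte layout the AVX-512 4x4 kernel assembles in a single zmm register: the 4x4 block plus the two-pixel apron CDEF taps can reach (top rows t/T, left bytes L*, the two columns to its right, and the two rows below it, loaded only when have_bottom is set to avoid reading past the buffer). lut_perm_4x4 is the index vector for vpermi2b, which gathers from two 64-byte sources at once: indices below 64 select from the first source, indices 64 and up from the second, so the raw loads never need a separate shuffling pass. A scalar model of that two-source permute, assuming only the index convention just described:

    #include <stdint.h>

    /* Two-source byte gather in the spirit of vpermi2b (sketch). */
    static void permi2b(uint8_t dst[64], const uint8_t idx[64],
                        const uint8_t a[64], const uint8_t b[64])
    {
        for (int i = 0; i < 64; i++)
            dst[i] = (idx[i] & 64) ? b[idx[i] & 63] : a[idx[i] & 63];
    }
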
-%if HAVE_AVX512ICL
-
-INIT_ZMM avx512icl
-cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge
-%define base r7-edge_mask
- movq xmm0, [dstq+strideq*0]
- movhps xmm0, [dstq+strideq*1]
- lea r7, [edge_mask]
- movq xmm1, [topq+strideq*0-2]
- movhps xmm1, [topq+strideq*1-2]
- mov r6d, edgem
- vinserti32x4 ym0, ymm0, [leftq], 1
- lea r2, [strideq*3]
- vinserti32x4 ym1, ymm1, [dstq+strideq*2], 1
- mova m5, [base+lut_perm_4x4]
- vinserti32x4 m0, [dstq+r2], 2
- test r6b, 0x08 ; avoid buffer overread
- jz .main
- lea r3, [dstq+strideq*4-4]
- vinserti32x4 m1, [r3+strideq*0], 2
- vinserti32x4 m0, [r3+strideq*1], 3
-.main:
- movifnidn prid, prim
- mov t0d, dirm
- mova m3, [base+px_idx]
- mov r3d, dampingm
- vpermi2b m5, m0, m1 ; lut
- vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
- pxor m7, m7
- lea r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8
- vpermb m6, m3, m5 ; px
- cmp r6d, 0x0f
- jne .mask_edges ; mask edges only if required
- test prid, prid
- jz .sec_only
- vpaddd m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
- vpermb m1, m1, m5 ; k0p0 k0p1 k1p0 k1p1
-%macro CDEF_FILTER_4x4_PRI 0
- vpcmpub k1, m6, m1, 6 ; px > pN
- psubb m2, m1, m6
- lzcnt r6d, prid
- vpsubb m2{k1}, m6, m1 ; abs(diff)
- vpbroadcastb m4, prid
- and prid, 1
- vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift
- movifnidn t1d, secm
- vpbroadcastd m10, [base+pri_tap+priq*4]
- vpsubb m10{k1}, m7, m10 ; apply_sign(pri_tap)
- psubusb m4, m9 ; imax(0, pri_strength - (abs(diff) >> shift)))
- pminub m2, m4
- vpdpbusd m0, m2, m10 ; sum
-%endmacro
- CDEF_FILTER_4x4_PRI
- test t1d, t1d ; sec
- jz .end_no_clip
- call .sec
-.end_clip:
- pminub m4, m6, m1
- pmaxub m1, m6
- pminub m5, m2, m3
- pmaxub m2, m3
- pminub m4, m5
- pmaxub m2, m1
- psrldq m1, m4, 2
- psrldq m3, m2, 2
- pminub m1, m4
- vpcmpw k1, m0, m7, 1
- vpshldd m6, m0, 8
- pmaxub m2, m3
- pslldq m3, m1, 1
- psubw m7, m0
- paddusw m0, m6 ; clip >0xff
- vpsubusw m0{k1}, m6, m7 ; clip <0x00
- pslldq m4, m2, 1
- pminub m1, m3
- pmaxub m2, m4
- pmaxub m0, m1
- pminub m0, m2
- jmp .end
-.sec_only:
- movifnidn t1d, secm
- call .sec
-.end_no_clip:
- vpshldd m6, m0, 8 ; (px << 8) + ((sum > -8) << 4)
- paddw m0, m6 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
-.end:
- mova xm1, [base+end_perm]
- vpermb m0, m1, m0 ; output in bits 8-15 of each dword
- movd [dstq+strideq*0], xm0
- pextrd [dstq+strideq*1], xm0, 1
- pextrd [dstq+strideq*2], xm0, 2
- pextrd [dstq+r2 ], xm0, 3
- RET
-.mask_edges_sec_only:
- movifnidn t1d, secm
- call .mask_edges_sec
- jmp .end_no_clip
-ALIGN function_align
-.mask_edges:
- vpbroadcastq m8, [base+edge_mask+r6*8]
- test prid, prid
- jz .mask_edges_sec_only
- vpaddd m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16}
- vpshufbitqmb k1, m8, m2 ; index in-range
- mova m1, m6
- vpermb m1{k1}, m2, m5
- CDEF_FILTER_4x4_PRI
- test t1d, t1d
- jz .end_no_clip
- call .mask_edges_sec
- jmp .end_clip
-.mask_edges_sec:
- vpaddd m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16}
- vpaddd m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16}
- vpshufbitqmb k1, m8, m4
- mova m2, m6
- vpermb m2{k1}, m4, m5
- vpshufbitqmb k1, m8, m9
- mova m3, m6
- vpermb m3{k1}, m9, m5
- jmp .sec_main
-ALIGN function_align
-.sec:
- vpaddd m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
- vpaddd m3, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
- vpermb m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1
- vpermb m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3
-.sec_main:
- vpbroadcastd m8, [base+sec_tap]
- vpcmpub k1, m6, m2, 6
- psubb m4, m2, m6
- vpbroadcastb m12, t1d
- lzcnt t1d, t1d
- vpsubb m4{k1}, m6, m2
- vpcmpub k2, m6, m3, 6
- vpbroadcastq m11, [r3+t1*8]
- gf2p8affineqb m10, m4, m11, 0
- psubb m5, m3, m6
- mova m9, m8
- vpsubb m8{k1}, m7, m8
- psubusb m10, m12, m10
- vpsubb m5{k2}, m6, m3
- pminub m4, m10
- vpdpbusd m0, m4, m8
- gf2p8affineqb m11, m5, m11, 0
- vpsubb m9{k2}, m7, m9
- psubusb m12, m11
- pminub m5, m12
- vpdpbusd m0, m5, m9
- ret
-
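Two AVX-512 tricks in the 4x4 kernel above are worth spelling out. The per-byte ">> shift" that the AVX2 code emulates with psrlw+pand is done here with vgf2p8affineqb: each qword in gf_shr is an 8x8 bit matrix, and the affine transform by such a matrix is a plain right shift (0x0102040810204080 is the identity, each following entry drops one row, giving >> 1 through >> 6). The per-tap products are then accumulated with vpdpbusd into dword sums that start at (1 << 28) + (7 << 4), which is what lets .end_no_clip finish the rounding with just the vpshldd/paddw pair. A scalar model of the affine shift, under one reading of the bit ordering (output bit i is the parity of matrix byte 7-i ANDed with the input byte), which at least agrees with the identity and shift entries in gf_shr:

    #include <stdint.h>

    /* Scalar model of gf2p8affineqb with imm8 = 0 (sketch). */
    static uint8_t gf_affine(uint8_t x, uint64_t matrix)
    {
        uint8_t r = 0;
        for (int i = 0; i < 8; i++) {
            uint8_t row = (uint8_t)(matrix >> (8 * (7 - i)));
            r |= (uint8_t)(__builtin_parity(row & x) << i);
        }
        return r;
    }
    /* gf_affine(x, 0x0102040810204080) == x,
     * gf_affine(x, 0x0204081020408000) == x >> 1, and so on. */
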
-DECLARE_REG_TMP 2, 7
-
-; lut top lut bottom
-; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25
-; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35
-; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45
-; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55
-; L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65
-; L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75
-; L8 L9 40 41 42 43 44 45 8e 8f 80 81 82 83 84 85
-; La Lb 50 51 52 53 54 55 9e 9f 90 91 92 93 94 95
-
-cglobal cdef_filter_4x8, 4, 9, 22, dst, stride, left, top, \
- pri, sec, dir, damping, edge
-%define base r8-edge_mask
- vpbroadcastd ym21, strided
- mov r6d, edgem
- lea r8, [edge_mask]
- movq xm1, [topq+strideq*0-2]
- pmulld ym21, [base+pd_01234567]
- kxnorb k1, k1, k1
- movq xm2, [topq+strideq*1-2]
- vpgatherdq m0{k1}, [dstq+ym21] ; +0+1 +2+3 +4+5 +6+7
- mova m14, [base+lut_perm_4x8a]
- movu m15, [base+lut_perm_4x8b]
- test r6b, 0x08 ; avoid buffer overread
- jz .main
- lea r7, [dstq+strideq*8-2]
- vinserti32x4 ym1, [r7+strideq*0], 1
- vinserti32x4 ym2, [r7+strideq*1], 1
-.main:
- punpcklqdq ym1, ym2
- vinserti32x4 m1, [leftq], 2 ; -2-1 +8+9 left ____
- movifnidn prid, prim
- mov t0d, dirm
- mova m16, [base+px_idx]
- mov r3d, dampingm
- vpermi2b m14, m0, m1 ; lut top
- vpermi2b m15, m0, m1 ; lut bottom
- vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
- pxor m20, m20
- lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
- vpermb m2, m16, m14 ; pxt
- vpermb m3, m16, m15 ; pxb
- mova m1, m0
- cmp r6b, 0x0f
- jne .mask_edges ; mask edges only if required
- test prid, prid
- jz .sec_only
- vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
- vpermb m4, m6, m14 ; pNt k0p0 k0p1 k1p0 k1p1
- vpermb m5, m6, m15 ; pNb
-%macro CDEF_FILTER_4x8_PRI 0
- vpcmpub k1, m2, m4, 6 ; pxt > pNt
- vpcmpub k2, m3, m5, 6 ; pxb > pNb
- psubb m6, m4, m2
- psubb m7, m5, m3
- lzcnt r6d, prid
- vpsubb m6{k1}, m2, m4 ; abs(diff_top)
- vpsubb m7{k2}, m3, m5 ; abs(diff_bottom)
- vpbroadcastb m13, prid
- vpbroadcastq m9, [r3+r6*8]
- and prid, 1
- vpbroadcastd m11, [base+pri_tap+priq*4]
- vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift
- vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift
- mova m10, m11
- movifnidn t1d, secm
- vpsubb m10{k1}, m20, m11 ; apply_sign(pri_tap_top)
- vpsubb m11{k2}, m20, m11 ; apply_sign(pri_tap_bottom)
- psubusb m12, m13, m8 ; imax(0, pri_strength - (abs(dt) >> shift)))
- psubusb m13, m13, m9 ; imax(0, pri_strength - (abs(db) >> shift)))
- pminub m6, m12
- pminub m7, m13
- vpdpbusd m0, m6, m10 ; sum top
- vpdpbusd m1, m7, m11 ; sum bottom
-%endmacro
- CDEF_FILTER_4x8_PRI
- test t1d, t1d ; sec
- jz .end_no_clip
- call .sec
-.end_clip:
- pminub m10, m4, m2
- pminub m12, m6, m8
- pminub m11, m5, m3
- pminub m13, m7, m9
- pmaxub m4, m2
- pmaxub m6, m8
- pmaxub m5, m3
- pmaxub m7, m9
- pminub m10, m12
- pminub m11, m13
- pmaxub m4, m6
- pmaxub m5, m7
- mov r2d, 0xAAAAAAAA
- kmovd k1, r2d
- kxnorb k2, k2, k2 ; hw lw
- vpshrdd m12, m0, m1, 16 ; m1lw m0hw
- vpshrdd m6, m10, m11, 16 ; m11lw m10hw
- vpshrdd m8, m4, m5, 16 ; m5lw m4hw
- vpblendmw m7{k1}, m10, m11 ; m11hw m10lw
- vpblendmw m9{k1}, m4, m5 ; m5hw m4lw
- vpblendmw m4{k1}, m0, m12 ; m1lw m0lw
- vpblendmw m5{k1}, m12, m1 ; m1hw m0hw
- vpshrdd m2, m3, 16
- pminub m6, m7
- pmaxub m8, m9
- mova ym14, [base+end_perm]
- vpcmpw k1, m4, m20, 1
- vpshldw m2, m5, 8
- pslldq m7, m6, 1
- pslldq m9, m8, 1
- psubw m5, m20, m4
- paddusw m0, m4, m2 ; clip >0xff
- pminub m6, m7
- pmaxub m8, m9
- psubusw m0{k1}, m2, m5 ; clip <0x00
- pmaxub m0, m6
- pminub m0, m8
- vpermb m0, m14, m0
- vpscatterdd [dstq+ym21]{k2}, ym0
- RET
-.sec_only:
- movifnidn t1d, secm
- call .sec
-.end_no_clip:
- mova ym4, [base+end_perm]
- kxnorb k1, k1, k1
- vpshldd m2, m0, 8 ; (px << 8) + ((sum > -8) << 4)
- vpshldd m3, m1, 8
- paddw m0, m2 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
- paddw m1, m3
- pslld m0, 16
- vpshrdd m0, m1, 16
- vpermb m0, m4, m0 ; output in bits 8-15 of each word
- vpscatterdd [dstq+ym21]{k1}, ym0
- RET
-.mask_edges_sec_only:
- movifnidn t1d, secm
- call .mask_edges_sec
- jmp .end_no_clip
-ALIGN function_align
-.mask_edges:
- mov t1d, r6d
- or r6d, 8 ; top 4x4 has bottom
- or t1d, 4 ; bottom 4x4 has top
- vpbroadcastq m17, [base+edge_mask+r6*8]
- vpbroadcastq m18, [base+edge_mask+t1*8]
- test prid, prid
- jz .mask_edges_sec_only
- vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16}
- vpshufbitqmb k1, m17, m6 ; index in-range
- vpshufbitqmb k2, m18, m6
- mova m4, m2
- mova m5, m3
- vpermb m4{k1}, m6, m14
- vpermb m5{k2}, m6, m15
- CDEF_FILTER_4x8_PRI
- test t1d, t1d
- jz .end_no_clip
- call .mask_edges_sec
- jmp .end_clip
-.mask_edges_sec:
- vpaddd m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16}
- vpaddd m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16}
- vpshufbitqmb k1, m17, m10
- vpshufbitqmb k2, m18, m10
- vpshufbitqmb k3, m17, m11
- vpshufbitqmb k4, m18, m11
- mova m6, m2
- mova m7, m3
- mova m8, m2
- mova m9, m3
- vpermb m6{k1}, m10, m14
- vpermb m7{k2}, m10, m15
- vpermb m8{k3}, m11, m14
- vpermb m9{k4}, m11, m15
- jmp .sec_main
-ALIGN function_align
-.sec:
- vpaddd m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
- vpaddd m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
- vpermb m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1
- vpermb m7, m8, m15 ; pNb
- vpermb m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3
- vpermb m9, m9, m15 ; pNb
-.sec_main:
- vpbroadcastb m18, t1d
- lzcnt t1d, t1d
- vpcmpub k1, m2, m6, 6
- vpcmpub k2, m3, m7, 6
- vpcmpub k3, m2, m8, 6
- vpcmpub k4, m3, m9, 6
- vpbroadcastq m17, [r3+t1*8]
- psubb m10, m6, m2
- psubb m11, m7, m3
- psubb m12, m8, m2
- psubb m13, m9, m3
- vpsubb m10{k1}, m2, m6 ; abs(dt0)
- vpsubb m11{k2}, m3, m7 ; abs(db0)
- vpsubb m12{k3}, m2, m8 ; abs(dt1)
- vpsubb m13{k4}, m3, m9 ; abs(db1)
- vpbroadcastd m19, [base+sec_tap]
- gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift
- gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift
- gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift
- gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift
- psubusb m14, m18, m14 ; imax(0, sec_strength - (abs(dt0) >> shift)))
- psubusb m15, m18, m15 ; imax(0, sec_strength - (abs(db0) >> shift)))
- psubusb m16, m18, m16 ; imax(0, sec_strength - (abs(dt1) >> shift)))
- psubusb m17, m18, m17 ; imax(0, sec_strength - (abs(db1) >> shift)))
- pminub m10, m14
- pminub m11, m15
- pminub m12, m16
- pminub m13, m17
- mova m14, m19
- mova m15, m19
- mova m16, m19
- vpsubb m14{k1}, m20, m19 ; apply_sign(sec_tap_top_0)
- vpsubb m15{k2}, m20, m19 ; apply_sign(sec_tap_bottom_0)
- vpsubb m16{k3}, m20, m19 ; apply_sign(sec_tap_top_1)
- vpsubb m19{k4}, m20, m19 ; apply_sign(sec_tap_bottom_1)
- vpdpbusd m0, m10, m14
- vpdpbusd m1, m11, m15
- vpdpbusd m0, m12, m16
- vpdpbusd m1, m13, m19
- ret
-
-; lut tl lut tr
-; t0 t1 t2 t3 t4 t5 t6 t7 t6 t7 t8 t9 ta tb tc td
-; T0 T1 T2 T3 T4 T5 T6 T7 T6 T7 T8 T9 TA TB TC TD
-; L0 L1 00 01 02 03 04 05 04 05 06 07 08 09 0a 0b
-; L2 L3 10 11 12 13 14 15 14 15 16 17 18 19 1a 1b
-; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b
-; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b
-; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b
-; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b
-; lut bl lut br
-; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b
-; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b
-; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b
-; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b
-; Lc Ld 60 61 62 63 64 65 64 65 66 67 68 69 6a 6b
-; Le Lf 70 71 72 73 74 75 74 75 76 77 78 79 7a 7b
-; 8e 8f 80 81 82 83 84 85 84 85 86 87 88 89 8a 8b
-; 9e 9f 90 91 92 93 94 95 94 95 96 97 98 99 9a 9b
-
-cglobal cdef_filter_8x8, 4, 11, 32, 4*64, dst, stride, left, top, \
- pri, sec, dir, damping, edge
-%define base r8-edge_mask
- mov r6d, edgem
- lea r10, [dstq+strideq*4-2]
- movu xmm0, [topq+strideq*0-2]
- movu xmm1, [dstq+strideq*2-2]
- movu xmm2, [r10 +strideq*2 ]
- lea r8, [edge_mask]
- lea r9, [strideq*3]
- pmovzxwq m10, [leftq-4]
- vinserti32x4 ym0, ymm0, [topq+strideq*1-2], 1
- vinserti32x4 ym1, ymm1, [dstq+r9 -2], 1
- vinserti32x4 ym2, ymm2, [r10 +r9 ], 1
- lea r7, [r10 +strideq*4 ]
- pmovzxwq m11, [leftq+4]
- vinserti32x4 m0, [dstq+strideq*0-2], 2
- vinserti32x4 m1, [r10 +strideq*0 ], 2
- mova m12, [base+lut_perm_8x8a]
- movu m13, [base+lut_perm_8x8b]
- vinserti32x4 m0, [dstq+strideq*1-2], 3
- vinserti32x4 m1, [r10 +strideq*1 ], 3
- test r6b, 0x08 ; avoid buffer overread
- jz .main
- vinserti32x4 m2, [r7 +strideq*0], 2
- vinserti32x4 m2, [r7 +strideq*1], 3
-.main:
- mov t1d, 0x11111100
- mova m14, m12
- mova m15, m13
- kmovd k1, t1d
- kshiftrd k2, k1, 8
- movifnidn prid, prim
- mov t0d, dirm
- mova m30, [base+px_idx]
- mov r3d, dampingm
- vpermi2b m12, m0, m1 ; lut tl
- vpermi2b m14, m1, m2 ; lut bl
- vpermi2b m13, m0, m1 ; lut tr
- vpermi2b m15, m1, m2 ; lut br
- vpblendmw m12{k1}, m12, m10
- vpblendmw m14{k2}, m14, m11
- vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
- pxor m31, m31
- lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
- vpermb m4, m30, m12 ; pxtl
- vpermb m5, m30, m13 ; pxtr
- vpermb m6, m30, m14 ; pxbl
- vpermb m7, m30, m15 ; pxbr
- mova m1, m0
- mova m2, m0
- mova m3, m0
- cmp r6b, 0x0f
- jne .mask_edges ; mask edges only if required
- test prid, prid
- jz .sec_only
- vpaddd m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
- vpermb m8, m11, m12 ; pNtl k0p0 k0p1 k1p0 k1p1
- vpermb m9, m11, m13 ; pNtr
- vpermb m10, m11, m14 ; pNbl
- vpermb m11, m11, m15 ; pNbr
-%macro CDEF_FILTER_8x8_PRI 0
- vpcmpub k1, m4, m8, 6 ; pxtl > pNtl
- vpcmpub k2, m5, m9, 6 ; pxtr > pNtr
- vpcmpub k3, m6, m10, 6 ; pxbl > pNbl
- vpcmpub k4, m7, m11, 6 ; pxbr > pNbr
- psubb m16, m8, m4
- psubb m17, m9, m5
- psubb m18, m10, m6
- psubb m19, m11, m7
- lzcnt r6d, prid
- vpsubb m16{k1}, m4, m8 ; abs(diff_tl)
- vpsubb m17{k2}, m5, m9 ; abs(diff_tr)
- vpsubb m18{k3}, m6, m10 ; abs(diff_bl)
- vpsubb m19{k4}, m7, m11 ; abs(diff_br)
- vpbroadcastq m28, [r3+r6*8]
- vpbroadcastb m29, prid
- and prid, 1
- vpbroadcastd m27, [base+pri_tap+priq*4]
- vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift
- vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift
- vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift
- vgf2p8affineqb m23, m19, m28, 0 ; abs(dbl) >> shift
- mova m24, m27
- mova m25, m27
- mova m26, m27
- movifnidn t1d, secm
- vpsubb m24{k1}, m31, m27 ; apply_sign(pri_tap_tl)
- vpsubb m25{k2}, m31, m27 ; apply_sign(pri_tap_tr)
- vpsubb m26{k3}, m31, m27 ; apply_sign(pri_tap_tl)
- vpsubb m27{k4}, m31, m27 ; apply_sign(pri_tap_tr)
- psubusb m20, m29, m20 ; imax(0, pri_strength - (abs(dtl) >> shift)))
- psubusb m21, m29, m21 ; imax(0, pri_strength - (abs(dtr) >> shift)))
- psubusb m22, m29, m22 ; imax(0, pri_strength - (abs(dbl) >> shift)))
- psubusb m23, m29, m23 ; imax(0, pri_strength - (abs(dbr) >> shift)))
- pminub m16, m20
- pminub m17, m21
- pminub m18, m22
- pminub m19, m23
- vpdpbusd m0, m16, m24 ; sum tl
- vpdpbusd m1, m17, m25 ; sum tr
- vpdpbusd m2, m18, m26 ; sum bl
- vpdpbusd m3, m19, m27 ; sum br
-%endmacro
- CDEF_FILTER_8x8_PRI
- test t1d, t1d ; sec
- jz .end_no_clip
- call .sec
-.end_clip:
- pminub m20, m8, m4
- pminub m24, m12, m16
- pminub m21, m9, m5
- pminub m25, m13, m17
- pminub m22, m10, m6
- pminub m26, m14, m18
- pminub m23, m11, m7
- pminub m27, m15, m19
- pmaxub m8, m4
- pmaxub m12, m16
- pmaxub m9, m5
- pmaxub m13, m17
- pmaxub m10, m6
- pmaxub m14, m18
- pmaxub m11, m7
- pmaxub m15, m19
- pminub m20, m24
- pminub m21, m25
- pminub m22, m26
- pminub m23, m27
- pmaxub m8, m12
- pmaxub m9, m13
- pmaxub m10, m14
- pmaxub m11, m15
- mov r2d, 0xAAAAAAAA
- kmovd k1, r2d
- vpshrdd m24, m0, m1, 16
- vpshrdd m25, m2, m3, 16
- vpshrdd m12, m20, m21, 16
- vpshrdd m14, m22, m23, 16
- vpshrdd m16, m8, m9, 16
- vpshrdd m18, m10, m11, 16
- vpblendmw m13{k1}, m20, m21
- vpblendmw m15{k1}, m22, m23
- vpblendmw m17{k1}, m8, m9
- vpblendmw m19{k1}, m10, m11
- vpblendmw m20{k1}, m0, m24
- vpblendmw m21{k1}, m24, m1
- vpblendmw m22{k1}, m2, m25
- vpblendmw m23{k1}, m25, m3
- vpshrdd m4, m5, 16
- vpshrdd m6, m7, 16
- pminub m12, m13
- pminub m14, m15
- pmaxub m16, m17
- pmaxub m18, m19
- mova m8, [base+end_perm_w8clip]
- vpcmpw k2, m20, m31, 1
- vpcmpw k3, m22, m31, 1
- vpshldw m4, m21, 8
- vpshldw m6, m23, 8
- kunpckdq k1, k1, k1
- kxnorb k4, k4, k4
- vpshrdw m11, m12, m14, 8
- vpshrdw m15, m16, m18, 8
- vpblendmb m13{k1}, m12, m14
- vpblendmb m17{k1}, m16, m18
- psubw m21, m31, m20
- psubw m23, m31, m22
- paddusw m0, m20, m4 ; clip >0xff
- paddusw m1, m22, m6
- pminub m11, m13
- pmaxub m15, m17
- psubusw m0{k2}, m4, m21 ; clip <0x00
- psubusw m1{k3}, m6, m23
- psrlw m0, 8
- vmovdqu8 m0{k1}, m1
- pmaxub m0, m11
- pminub m0, m15
- vpermb m0, m8, m0
- add r10, 2
- vextracti32x4 xm1, m0, 1
- vextracti32x4 xm2, m0, 2
- vextracti32x4 xm3, m0, 3
- movq [dstq+strideq*0], xm0
- movq [dstq+strideq*2], xm1
- movq [r10 +strideq*0], xm2
- movq [r10 +strideq*2], xm3
- movhps [dstq+strideq*1], xm0
- movhps [dstq+r9 ], xm1
- movhps [r10 +strideq*1], xm2
- movhps [r10 +r9 ], xm3
- RET
-.sec_only:
- movifnidn t1d, secm
- call .sec
-.end_no_clip:
- mova xm8, [base+end_perm]
- kxnorb k1, k1, k1
- vpshldd m4, m0, 8 ; (px << 8) + ((sum > -8) << 4)
- vpshldd m5, m1, 8
- vpshldd m6, m2, 8
- vpshldd m7, m3, 8
- paddw m0, m4 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
- paddw m1, m5
- paddw m2, m6
- paddw m3, m7
- vpermb m0, m8, m0
- vpermb m1, m8, m1
- vpermb m2, m8, m2
- vpermb m3, m8, m3
- add r10, 2
- punpckldq m4, m0, m1
- punpckhdq m0, m1
- punpckldq m5, m2, m3
- punpckhdq m2, m3
- movq [dstq+strideq*0], xm4
- movq [dstq+strideq*2], xm0
- movq [r10 +strideq*0], xm5
- movq [r10 +strideq*2], xm2
- movhps [dstq+strideq*1], xm4
- movhps [dstq+r9 ], xm0
- movhps [r10 +strideq*1], xm5
- movhps [r10 +r9 ], xm2
- RET
-.mask_edges_sec_only:
- movifnidn t1d, secm
- call .mask_edges_sec
- jmp .end_no_clip
-ALIGN function_align
-.mask_edges:
- mov t0d, r6d
- mov t1d, r6d
- or t0d, 0xA ; top-left 4x4 has bottom and right
- or t1d, 0x9 ; top-right 4x4 has bottom and left
- vpbroadcastq m26, [base+edge_mask+t0*8]
- vpbroadcastq m27, [base+edge_mask+t1*8]
- mov t1d, r6d
- or r6d, 0x6 ; bottom-left 4x4 has top and right
- or t1d, 0x5 ; bottom-right 4x4 has top and left
- vpbroadcastq m28, [base+edge_mask+r6*8]
- vpbroadcastq m29, [base+edge_mask+t1*8]
- mov t0d, dirm
- test prid, prid
- jz .mask_edges_sec_only
- vpaddd m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16}
- vpshufbitqmb k1, m26, m20 ; index in-range
- vpshufbitqmb k2, m27, m20
- vpshufbitqmb k3, m28, m20
- vpshufbitqmb k4, m29, m20
- mova m8, m4
- mova m9, m5
- mova m10, m6
- mova m11, m7
- vpermb m8{k1}, m20, m12
- vpermb m9{k2}, m20, m13
- vpermb m10{k3}, m20, m14
- vpermb m11{k4}, m20, m15
- mova [rsp+0x00], m26
- mova [rsp+0x40], m27
- mova [rsp+0x80], m28
- mova [rsp+0xC0], m29
- CDEF_FILTER_8x8_PRI
- test t1d, t1d
- jz .end_no_clip
- mova m26, [rsp+0x00]
- mova m27, [rsp+0x40]
- mova m28, [rsp+0x80]
- mova m29, [rsp+0xC0]
- call .mask_edges_sec
- jmp .end_clip
-.mask_edges_sec:
- vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16}
- vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16}
- vpshufbitqmb k1, m26, m20
- vpshufbitqmb k2, m27, m20
- vpshufbitqmb k3, m28, m20
- vpshufbitqmb k4, m29, m20
- mova m16, m4
- mova m17, m5
- mova m18, m6
- mova m19, m7
- vpermb m16{k1}, m20, m12
- vpermb m17{k2}, m20, m13
- vpermb m18{k3}, m20, m14
- vpermb m19{k4}, m20, m15
- vpshufbitqmb k1, m26, m21
- vpshufbitqmb k2, m27, m21
- vpshufbitqmb k3, m28, m21
- vpshufbitqmb k4, m29, m21
- vpermb m12, m21, m12
- vpermb m13, m21, m13
- vpermb m14, m21, m14
- vpermb m15, m21, m15
- vpblendmb m12{k1}, m4, m12
- vpblendmb m13{k2}, m5, m13
- vpblendmb m14{k3}, m6, m14
- vpblendmb m15{k4}, m7, m15
- jmp .sec_main
-ALIGN function_align
-.sec:
- vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
- vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
- vpermb m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1
- vpermb m17, m20, m13 ; pNtr
- vpermb m18, m20, m14 ; pNbl
- vpermb m19, m20, m15 ; pNbr
- vpermb m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3
- vpermb m13, m21, m13 ; pNtr
- vpermb m14, m21, m14 ; pNbl
- vpermb m15, m21, m15 ; pNbr
-.sec_main:
-%macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants
- vpcmpub k1, m4, %1, 6
- vpcmpub k2, m5, %2, 6
- vpcmpub k3, m6, %3, 6
- vpcmpub k4, m7, %4, 6
- psubb m20, %1, m4
- psubb m21, %2, m5
- psubb m22, %3, m6
- psubb m23, %4, m7
-%if %5
- vpbroadcastb m28, t1d
- lzcnt t1d, t1d
- vpbroadcastq m29, [r3+t1*8]
-%endif
- vpsubb m20{k1}, m4, %1
- vpsubb m21{k2}, m5, %2
- vpsubb m22{k3}, m6, %3
- vpsubb m23{k4}, m7, %4
- gf2p8affineqb m24, m20, m29, 0
- gf2p8affineqb m25, m21, m29, 0
- gf2p8affineqb m26, m22, m29, 0
- gf2p8affineqb m27, m23, m29, 0
-%if %5
- vpbroadcastd m30, [base+sec_tap]
-%endif
- psubusb m24, m28, m24
- psubusb m25, m28, m25
- psubusb m26, m28, m26
- psubusb m27, m28, m27
- pminub m20, m24
- pminub m21, m25
- pminub m22, m26
- pminub m23, m27
- mova m24, m30
- mova m25, m30
- mova m26, m30
- mova m27, m30
- vpsubb m24{k1}, m31, m30
- vpsubb m25{k2}, m31, m30
- vpsubb m26{k3}, m31, m30
- vpsubb m27{k4}, m31, m30
- vpdpbusd m0, m20, m24
- vpdpbusd m1, m21, m25
- vpdpbusd m2, m22, m26
- vpdpbusd m3, m23, m27
-%endmacro
- CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1
- CDEF_FILTER_8x8_SEC m12, m13, m14, m15
- ret
-
-%endif ; HAVE_AVX512ICL
-%endif ; ARCH_X86_64
--- /dev/null
+++ b/src/x86/cdef_avx2.asm
@@ -1,0 +1,1798 @@
+; Copyright © 2018, VideoLAN and dav1d authors
+; Copyright © 2018, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+%macro JMP_TABLE 2-*
+ %xdefine %1_jmptable %%table
+ %xdefine %%base mangle(private_prefix %+ _%1_avx2)
+ %%table:
+ %rep %0 - 1
+ dd %%base %+ .%2 - %%table
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro CDEF_FILTER_JMP_TABLE 1
+JMP_TABLE cdef_filter_%1, \
+ d6k0, d6k1, d7k0, d7k1, \
+ d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
+ d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
+ d0k0, d0k1, d1k0, d1k1
+%endmacro
+
+SECTION_RODATA 32
+
+pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6
+blend_4x4: dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00
+ dd 0x80, 0x00, 0x00
+blend_4x8_0: dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+blend_4x8_1: dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+ dd 0x00, 0x00
+blend_4x8_2: dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
+ dd 0x0000
+blend_4x8_3: dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
+ dd 0x0000, 0x0000
+blend_8x8_0: dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
+blend_8x8_1: dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
+div_table: dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105
+shufw_6543210x:db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
+shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+pw_128: times 2 dw 128
+pw_2048: times 2 dw 2048
+tap_table: ; masks for 8 bit shifts
+ db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
+ ; weights
+ db 4, 2, 3, 3, 2, 1
+ db -1 * 16 + 1, -2 * 16 + 2
+ db 0 * 16 + 1, -1 * 16 + 2
+ db 0 * 16 + 1, 0 * 16 + 2
+ db 0 * 16 + 1, 1 * 16 + 2
+ db 1 * 16 + 1, 2 * 16 + 2
+ db 1 * 16 + 0, 2 * 16 + 1
+ db 1 * 16 + 0, 2 * 16 + 0
+ db 1 * 16 + 0, 2 * 16 - 1
+ ; the last 6 are repeats of the first 6 so we don't need to & 7
+ db -1 * 16 + 1, -2 * 16 + 2
+ db 0 * 16 + 1, -1 * 16 + 2
+ db 0 * 16 + 1, 0 * 16 + 2
+ db 0 * 16 + 1, 1 * 16 + 2
+ db 1 * 16 + 1, 2 * 16 + 2
+ db 1 * 16 + 0, 2 * 16 + 1
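+ ; each direction entry above encodes a tap offset as y*16 + x; the
+ ; .border_block path keeps its padded pixel rows 16 words (32 bytes)
+ ; apart, so a single signed byte per (dir, k) pair addresses a tap
+ ; relative to the current pixel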
+
+CDEF_FILTER_JMP_TABLE 4x4
+CDEF_FILTER_JMP_TABLE 4x8
+CDEF_FILTER_JMP_TABLE 8x8
+
+SECTION .text
+
+%macro PREP_REGS 2 ; w, h
+ ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
+ mov dird, r6m
+ lea tableq, [cdef_filter_%1x%2_jmptable]
+ lea dirq, [tableq+dirq*2*4]
+%if %1 == 4
+ %if %2 == 4
+ DEFINE_ARGS dst, stride, left, top, pri, sec, \
+ table, dir, dirjmp, dst4, stride3, k
+ %else
+ DEFINE_ARGS dst, stride, left, top, pri, sec, \
+ table, dir, dirjmp, dst4, dst8, stride3, k
+ lea dst8q, [dstq+strideq*8]
+ %endif
+%else
+ DEFINE_ARGS dst, stride, h, top1, pri, sec, \
+ table, dir, dirjmp, top2, dst4, stride3, k
+ mov hq, -8
+ lea top1q, [top1q+strideq*0]
+ lea top2q, [top1q+strideq*1]
+%endif
+ lea dst4q, [dstq+strideq*4]
+%if %1 == 4
+ lea stride3q, [strideq*3]
+%endif
+%endmacro
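+
+; PREP_REGS indexes the per-direction jump table built by
+; CDEF_FILTER_JMP_TABLE; ACCUMULATE_TAP_BYTE then calls the .d<dir>k<k>
+; routines through it (for dir, dir+2 and dir-2) to load the two mirrored
+; neighbour pixels for tap k into m5/m6.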
+
+%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max
+ mov kd, 1
+ pxor m15, m15 ; sum
+%if %2 == 8
+ pxor m12, m12
+ %if %1 == 4
+ movd xm4, [dstq +strideq*0]
+ movd xm6, [dstq +strideq*1]
+ movd xm5, [dstq +strideq*2]
+ movd xm7, [dstq +stride3q ]
+ vinserti128 m4, [dst4q+strideq*0], 1
+ vinserti128 m6, [dst4q+strideq*1], 1
+ vinserti128 m5, [dst4q+strideq*2], 1
+ vinserti128 m7, [dst4q+stride3q ], 1
+ punpckldq m4, m6
+ punpckldq m5, m7
+ %else
+ movq xm4, [dstq+strideq*0]
+ movq xm5, [dstq+strideq*1]
+ vinserti128 m4, [dstq+strideq*2], 1
+ vinserti128 m5, [dstq+stride3q ], 1
+ %endif
+ punpcklqdq m4, m5
+%else
+ movd xm4, [dstq+strideq*0]
+ movd xm5, [dstq+strideq*1]
+ vinserti128 m4, [dstq+strideq*2], 1
+ vinserti128 m5, [dstq+stride3q ], 1
+ punpckldq m4, m5
+%endif
+%if %3 == 1
+ mova m7, m4 ; max
+ mova m8, m4 ; min
+%endif
+%endmacro
+
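+; Per tap, each byte accumulates tap * min(absdiff, max(0, strength -
+; (absdiff >> shift))), signed by (neighbour - px), into the 16-bit sums
+; (m15, plus m12 in the 8-pixel-wide case) via pmaddubsw; the shift is
+; emulated per byte with psrlw plus the shift mask from tap_table.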
+%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
+ ; mul_tap, w, h, clip
+ ; load p0/p1
+ movsxd dirjmpq, [dirq+kq*4+%1*2*4]
+ add dirjmpq, tableq
+ call dirjmpq
+
+%if %8 == 1
+ pmaxub m7, m5
+ pminub m8, m5
+ pmaxub m7, m6
+ pminub m8, m6
+%endif
+
+ ; accumulate sum[m15] over p0/p1
+%if %7 == 4
+ punpcklbw m5, m6
+ punpcklbw m6, m4, m4
+ psubusb m9, m5, m6
+ psubusb m5, m6, m5
+ por m9, m5 ; abs_diff_p01(p01 - px)
+ pcmpeqb m5, m9
+ por m5, %5
+ psignb m6, %5, m5
+ psrlw m5, m9, %2 ; emulate 8-bit shift
+ pand m5, %3
+ psubusb m5, %4, m5
+ pminub m5, m9
+ pmaddubsw m5, m6
+ paddw m15, m5
+%else
+ psubusb m9, m5, m4
+ psubusb m5, m4, m5
+ psubusb m11, m6, m4
+ psubusb m6, m4, m6
+ por m9, m5 ; abs_diff_p0(p0 - px)
+ por m11, m6 ; abs_diff_p1(p1 - px)
+ pcmpeqb m5, m9
+ pcmpeqb m6, m11
+ punpckhbw m10, m9, m11
+ punpcklbw m9, m11
+ por m5, %5
+ por m11, m6, %5
+ punpckhbw m6, m5, m11
+ punpcklbw m5, m11
+ psignb m11, %5, m6
+ psrlw m6, m10, %2 ; emulate 8-bit shift
+ pand m6, %3
+ psubusb m6, %4, m6
+ pminub m6, m10
+ pmaddubsw m6, m11
+ paddw m12, m6
+ psignb m11, %5, m5
+ psrlw m5, m9, %2 ; emulate 8-bit shift
+ pand m5, %3
+ psubusb m5, %4, m5
+ pminub m5, m9
+ pmaddubsw m5, m11
+ paddw m15, m5
+%endif
+%endmacro
+
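+; Rounding: pcmpgtw/paddw subtract 1 from negative sums, then pmulhrsw with
+; 2048 computes (sum*2048 + 0x4000) >> 15 == (sum + 8) >> 4, so the final
+; adjustment is (sum - (sum < 0) + 8) >> 4.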
+%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
+%if %2 == 4
+ %if %5 == 1
+ punpcklbw m4, %3
+ %endif
+ pcmpgtw %3, m15
+ paddw m15, %3
+ pmulhrsw m15, %4
+ %if %5 == 0
+ packsswb m15, m15
+ paddb m4, m15
+ %else
+ paddw m4, m15
+ packuswb m4, m4 ; clip px in [0x0,0xff]
+ pminub m4, m7
+ pmaxub m4, m8
+ %endif
+ vextracti128 xm5, m4, 1
+ movd [dstq+strideq*0], xm4
+ movd [dstq+strideq*2], xm5
+ pextrd [dstq+strideq*1], xm4, 1
+ pextrd [dstq+stride3q ], xm5, 1
+%else
+ pcmpgtw m6, %3, m12
+ pcmpgtw m5, %3, m15
+ paddw m12, m6
+ paddw m15, m5
+ %if %5 == 1
+ punpckhbw m5, m4, %3
+ punpcklbw m4, %3
+ %endif
+ pmulhrsw m12, %4
+ pmulhrsw m15, %4
+ %if %5 == 0
+ packsswb m15, m12
+ paddb m4, m15
+ %else
+ paddw m5, m12
+ paddw m4, m15
+ packuswb m4, m5 ; clip px in [0x0,0xff]
+ pminub m4, m7
+ pmaxub m4, m8
+ %endif
+ vextracti128 xm5, m4, 1
+ %if %1 == 4
+ movd [dstq +strideq*0], xm4
+ movd [dst4q+strideq*0], xm5
+ pextrd [dstq +strideq*1], xm4, 1
+ pextrd [dst4q+strideq*1], xm5, 1
+ pextrd [dstq +strideq*2], xm4, 2
+ pextrd [dst4q+strideq*2], xm5, 2
+ pextrd [dstq +stride3q ], xm4, 3
+ pextrd [dst4q+stride3q ], xm5, 3
+ %else
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*2], xm5
+ movhps [dstq+strideq*1], xm4
+ movhps [dstq+stride3q ], xm5
+ %endif
+%endif
+%endmacro
+
+%macro BORDER_PREP_REGS 2 ; w, h
+ ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
+ mov dird, r6m
+ lea dirq, [tableq+dirq*2+14]
+%if %1*%2*2/mmsize > 1
+ %if %1 == 4
+ DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k
+ %else
+ DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k
+ %endif
+ mov hd, %1*%2*2/mmsize
+%else
+ DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k
+%endif
+ lea stkq, [px]
+ pxor m11, m11
+%endmacro
+
+%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
+ mov kd, 1
+%if %1 == 4
+ movq xm4, [stkq+32*0]
+ movhps xm4, [stkq+32*1]
+ movq xm5, [stkq+32*2]
+ movhps xm5, [stkq+32*3]
+ vinserti128 m4, xm5, 1
+%else
+ mova xm4, [stkq+32*0] ; px
+ vinserti128 m4, [stkq+32*1], 1
+%endif
+ pxor m15, m15 ; sum
+%if %3 == 1
+ mova m7, m4 ; max
+ mova m8, m4 ; min
+%endif
+%endmacro
+
+%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
+ ; mul_tap, w, clip
+ ; load p0/p1
+ movsx offq, byte [dirq+kq+%1] ; off1
+%if %6 == 4
+ movq xm5, [stkq+offq*2+32*0] ; p0
+ movq xm6, [stkq+offq*2+32*2]
+ movhps xm5, [stkq+offq*2+32*1]
+ movhps xm6, [stkq+offq*2+32*3]
+ vinserti128 m5, xm6, 1
+%else
+ movu xm5, [stkq+offq*2+32*0] ; p0
+ vinserti128 m5, [stkq+offq*2+32*1], 1
+%endif
+ neg offq ; -off1
+%if %6 == 4
+ movq xm6, [stkq+offq*2+32*0] ; p1
+ movq xm9, [stkq+offq*2+32*2]
+ movhps xm6, [stkq+offq*2+32*1]
+ movhps xm9, [stkq+offq*2+32*3]
+ vinserti128 m6, xm9, 1
+%else
+ movu xm6, [stkq+offq*2+32*0] ; p1
+ vinserti128 m6, [stkq+offq*2+32*1], 1
+%endif
+%if %7 == 1
+ ; out of bounds values are set to a value that is both a large unsigned
+ ; value and a negative signed value.
+ ; use signed max and unsigned min to remove them
+ pmaxsw m7, m5 ; max after p0
+ pminuw m8, m5 ; min after p0
+ pmaxsw m7, m6 ; max after p1
+ pminuw m8, m6 ; min after p1
+%endif
+
+ ; accumulate sum[m15] over p0/p1
+ ; calculate difference before converting
+ psubw m5, m4 ; diff_p0(p0 - px)
+ psubw m6, m4 ; diff_p1(p1 - px)
+
+ ; convert to 8-bits with signed saturation
+ ; saturating to large diffs has no impact on the results
+ packsswb m5, m6
+
+ ; group into pairs so we can accumulate using maddubsw
+ pshufb m5, m12
+ pabsb m9, m5
+ psignb m10, %5, m5
+ psrlw m5, m9, %2 ; emulate 8-bit shift
+ pand m5, %3
+ psubusb m5, %4, m5
+
+ ; use unsigned min since abs diff can equal 0x80
+ pminub m5, m9
+ pmaddubsw m5, m10
+ paddw m15, m5
+%endmacro
+
+%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip
+ pcmpgtw m9, m11, m15
+ paddw m15, m9
+ pmulhrsw m15, %2
+ paddw m4, m15
+%if %3 == 1
+ pminsw m4, m7
+ pmaxsw m4, m8
+%endif
+ packuswb m4, m4
+ vextracti128 xm5, m4, 1
+%if %1 == 4
+ movd [dstq+strideq*0], xm4
+ pextrd [dstq+strideq*1], xm4, 1
+ movd [dstq+strideq*2], xm5
+ pextrd [dstq+stride3q], xm5, 1
+%else
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*1], xm5
+%endif
+%endmacro
+
+%macro CDEF_FILTER 2 ; w, h
+INIT_YMM avx2
+cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \
+ pri, sec, dir, damping, edge
+%assign stack_offset_entry stack_offset
+ mov edged, edgem
+ cmp edged, 0xf
+ jne .border_block
+
+ PUSH r9
+ PUSH r10
+ PUSH r11
+%if %2 == 4
+ %assign regs_used 12
+ %if STACK_ALIGNMENT < 32
+ PUSH r%+regs_used
+ %assign regs_used regs_used+1
+ %endif
+ ALLOC_STACK 0x60, 16
+ pmovzxbw xm0, [leftq+1]
+ vpermq m0, m0, q0110
+ psrldq m1, m0, 4
+ vpalignr m2, m0, m0, 12
+ movu [rsp+0x10], m0
+ movu [rsp+0x28], m1
+ movu [rsp+0x40], m2
+%elif %1 == 4
+ PUSH r12
+ %assign regs_used 13
+ %if STACK_ALIGNMENT < 32
+ PUSH r%+regs_used
+ %assign regs_used regs_used+1
+ %endif
+ ALLOC_STACK 8*2+%1*%2*1, 16
+ pmovzxwd m0, [leftq]
+ mova [rsp+0x10], m0
+%else
+ PUSH r12
+ PUSH r13
+ %assign regs_used 14
+ %if STACK_ALIGNMENT < 32
+ PUSH r%+regs_used
+ %assign regs_used regs_used+1
+ %endif
+ ALLOC_STACK 8*2+%1*%2*2+32, 16
+ lea r11, [strideq*3]
+ movu xm4, [dstq+strideq*2]
+ pmovzxwq m0, [leftq+0]
+ pmovzxwq m1, [leftq+8]
+ vinserti128 m4, [dstq+r11], 1
+ pmovzxbd m2, [leftq+1]
+ pmovzxbd m3, [leftq+9]
+ mova [rsp+0x10], m0
+ mova [rsp+0x30], m1
+ mova [rsp+0x50], m2
+ mova [rsp+0x70], m3
+ mova [rsp+0x90], m4
+%endif
+
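+ ; pri/sec shift = max(0, damping - floor(log2(strength))); damping is
+ ; biased by -31 so that adding lzcnt(strength) (== 31 - floor(log2))
+ ; yields the shift directly, with cmovs clamping negative results to 0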
+ DEFINE_ARGS dst, stride, left, top, pri, secdmp, zero, pridmp, damping
+ mov dampingd, r7m
+ xor zerod, zerod
+ movifnidn prid, prim
+ sub dampingd, 31
+ movifnidn secdmpd, secdmpm
+ or prid, 0
+ jz .sec_only
+ movd xm0, prid
+ lzcnt pridmpd, prid
+ add pridmpd, dampingd
+ cmovs pridmpd, zerod
+ mov [rsp+0], pridmpq ; pri_shift
+ or secdmpd, 0
+ jz .pri_only
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ cmovs secdmpd, zerod
+ mov [rsp+8], secdmpq ; sec_shift
+
+ DEFINE_ARGS dst, stride, left, top, pri, secdmp, table, pridmp
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, pri, sec, table, dir
+ vpbroadcastb m0, xm0 ; pri_strength
+ vpbroadcastb m1, xm1 ; sec_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ lea secq, [tableq+12] ; sec_taps
+
+ PREP_REGS %1, %2
+%if %1*%2 > mmsize
+.v_loop:
+%endif
+ LOAD_BLOCK %1, %2, 1
+.k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0
+ ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2
+ ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2
+ dec kq
+ jge .k_loop
+
+ vpbroadcastd m10, [pw_2048]
+ pxor m9, m9
+ ADJUST_PIXEL %1, %2, m9, m10, 1
+%if %1*%2 > mmsize
+ mov dstq, dst4q
+ lea top1q, [rsp+0x90]
+ lea top2q, [rsp+0xA0]
+ lea dst4q, [dst4q+strideq*4]
+ add hq, 4
+ jl .v_loop
+%endif
+ RET
+
+.pri_only:
+ DEFINE_ARGS dst, stride, left, top, pri, _, table, pridmp
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, pri, _, table, dir
+ vpbroadcastb m0, xm0 ; pri_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ PREP_REGS %1, %2
+ vpbroadcastd m3, [pw_2048]
+ pxor m1, m1
+%if %1*%2 > mmsize
+.pri_v_loop:
+%endif
+ LOAD_BLOCK %1, %2
+.pri_k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
+ dec kq
+ jge .pri_k_loop
+ ADJUST_PIXEL %1, %2, m1, m3
+%if %1*%2 > mmsize
+ mov dstq, dst4q
+ lea top1q, [rsp+0x90]
+ lea top2q, [rsp+0xA0]
+ lea dst4q, [dst4q+strideq*4]
+ add hq, 4
+ jl .pri_v_loop
+%endif
+ RET
+
+.sec_only:
+ DEFINE_ARGS dst, stride, left, top, _, secdmp, zero, _, damping
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ cmovs secdmpd, zerod
+ mov [rsp+8], secdmpq ; sec_shift
+ DEFINE_ARGS dst, stride, left, top, _, secdmp, table
+ lea tableq, [tap_table]
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, _, sec, table, dir
+ vpbroadcastb m1, xm1 ; sec_strength
+ lea secq, [tableq+12] ; sec_taps
+ PREP_REGS %1, %2
+ vpbroadcastd m2, [pw_2048]
+ pxor m0, m0
+%if %1*%2 > mmsize
+.sec_v_loop:
+%endif
+ LOAD_BLOCK %1, %2
+.sec_k_loop:
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
+ ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
+ dec kq
+ jge .sec_k_loop
+ ADJUST_PIXEL %1, %2, m0, m2
+%if %1*%2 > mmsize
+ mov dstq, dst4q
+ lea top1q, [rsp+0x90]
+ lea top2q, [rsp+0xA0]
+ lea dst4q, [dst4q+strideq*4]
+ add hq, 4
+ jl .sec_v_loop
+%endif
+ RET
+
+.d0k0:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m6, [dstq+strideq*1-1]
+ vpbroadcastq m10, [dstq+strideq*2-1]
+ movd xm5, [topq+strideq*1+1]
+ movd xm9, [dstq+strideq*0+1]
+ psrldq m11, m6, 2
+ psrldq m12, m10, 2
+ vinserti128 m6, [dstq+stride3q -1], 1
+ vinserti128 m10, [dstq+strideq*4-1], 1
+ vpblendd m5, m11, 0x10
+ vpblendd m9, m12, 0x10
+ movu m11, [blend_4x4+16]
+ punpckldq m6, m10
+ punpckldq m5, m9
+ vpblendvb m6, [rsp+gprsize+0x28], m11
+ %else
+ movd xm5, [topq +strideq*1+1]
+ movq xm6, [dstq +strideq*1-1]
+ movq xm10, [dstq +stride3q -1]
+ movq xm11, [dst4q+strideq*1-1]
+ pinsrd xm5, [dstq +strideq*0+1], 1
+ movhps xm6, [dstq +strideq*2-1]
+ movhps xm10, [dst4q+strideq*0-1]
+ movhps xm11, [dst4q+strideq*2-1]
+ psrldq xm9, xm6, 2
+ shufps xm5, xm9, q2010 ; -1 +0 +1 +2
+ shufps xm6, xm10, q2020 ; +1 +2 +3 +4
+ psrldq xm9, xm11, 2
+ psrldq xm10, 2
+ shufps xm10, xm9, q2020 ; +3 +4 +5 +6
+ movd xm9, [dst4q+stride3q -1]
+ pinsrd xm9, [dst4q+strideq*4-1], 1
+ shufps xm11, xm9, q1020 ; +5 +6 +7 +8
+ pmovzxbw m9, [leftq+3]
+ vinserti128 m6, xm11, 1
+ movu m11, [blend_4x8_0+4]
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, m9, m11
+ %endif
+%else
+ lea r13, [blend_8x8_0+16]
+ movq xm5, [top2q +1]
+ vbroadcasti128 m10, [dstq+strideq*1-1]
+ vbroadcasti128 m11, [dstq+strideq*2-1]
+ movhps xm5, [dstq+strideq*0+1]
+ vinserti128 m6, m10, [dstq+stride3q -1], 1
+ vinserti128 m9, m11, [dstq+strideq*4-1], 1
+ psrldq m10, 2
+ psrldq m11, 2
+ punpcklqdq m6, m9
+ movu m9, [r13+hq*2*1+16*1]
+ punpcklqdq m10, m11
+ vpblendd m5, m10, 0xF0
+ vpblendvb m6, [rsp+gprsize+80+hq*8+64+8*1], m9
+%endif
+ ret
+.d1k0:
+.d2k0:
+.d3k0:
+%if %1 == 4
+ %if %2 == 4
+ movq xm6, [dstq+strideq*0-1]
+ movq xm9, [dstq+strideq*1-1]
+ vinserti128 m6, [dstq+strideq*2-1], 1
+ vinserti128 m9, [dstq+stride3q -1], 1
+ movu m11, [rsp+gprsize+0x10]
+ pcmpeqd m12, m12
+ psrldq m5, m6, 2
+ psrldq m10, m9, 2
+ psrld m12, 24
+ punpckldq m6, m9
+ punpckldq m5, m10
+ vpblendvb m6, m11, m12
+ %else
+ movq xm6, [dstq +strideq*0-1]
+ movq xm9, [dstq +strideq*2-1]
+ movhps xm6, [dstq +strideq*1-1]
+ movhps xm9, [dstq +stride3q -1]
+ movq xm10, [dst4q+strideq*0-1]
+ movhps xm10, [dst4q+strideq*1-1]
+ psrldq xm5, xm6, 2
+ psrldq xm11, xm9, 2
+ shufps xm5, xm11, q2020
+ movq xm11, [dst4q+strideq*2-1]
+ movhps xm11, [dst4q+stride3q -1]
+ shufps xm6, xm9, q2020
+ shufps xm9, xm10, xm11, q2020
+ vinserti128 m6, xm9, 1
+ pmovzxbw m9, [leftq+1]
+ psrldq xm10, 2
+ psrldq xm11, 2
+ shufps xm10, xm11, q2020
+ vpbroadcastd m11, [blend_4x8_0+4]
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, m9, m11
+ %endif
+%else
+ movu xm5, [dstq+strideq*0-1]
+ movu xm9, [dstq+strideq*1-1]
+ vinserti128 m5, [dstq+strideq*2-1], 1
+ vinserti128 m9, [dstq+stride3q -1], 1
+ movu m10, [blend_8x8_0+16]
+ punpcklqdq m6, m5, m9
+ vpblendvb m6, [rsp+gprsize+80+hq*8+64], m10
+ psrldq m5, 2
+ psrldq m9, 2
+ punpcklqdq m5, m9
+%endif
+ ret
+.d4k0:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m10, [dstq+strideq*1-1]
+ vpbroadcastq m11, [dstq+strideq*2-1]
+ movd xm6, [topq+strideq*1-1]
+ movd xm9, [dstq+strideq*0-1]
+ psrldq m5, m10, 2
+ psrldq m12, m11, 2
+ vpblendd m6, m10, 0x10
+ vpblendd m9, m11, 0x10
+ movu m10, [blend_4x4]
+ vinserti128 m5, [dstq+stride3q +1], 1
+ vinserti128 m12, [dstq+strideq*4+1], 1
+ punpckldq m6, m9
+ punpckldq m5, m12
+ vpblendvb m6, [rsp+gprsize+0x40], m10
+ %else
+ movd xm6, [topq +strideq*1-1]
+ movq xm9, [dstq +strideq*1-1]
+ movq xm10, [dstq +stride3q -1]
+ movq xm11, [dst4q+strideq*1-1]
+ pinsrd xm6, [dstq +strideq*0-1], 1
+ movhps xm9, [dstq +strideq*2-1]
+ movhps xm10, [dst4q+strideq*0-1]
+ movhps xm11, [dst4q+strideq*2-1]
+ psrldq xm5, xm9, 2
+ shufps xm6, xm9, q2010
+ psrldq xm9, xm10, 2
+ shufps xm5, xm9, q2020
+ shufps xm10, xm11, q2020
+ movd xm9, [dst4q+stride3q +1]
+ vinserti128 m6, xm10, 1
+ pinsrd xm9, [dst4q+strideq*4+1], 1
+ psrldq xm11, 2
+ pmovzxbw m10, [leftq-1]
+ shufps xm11, xm9, q1020
+ movu m9, [blend_4x8_0]
+ vinserti128 m5, xm11, 1
+ vpblendvb m6, m10, m9
+ %endif
+%else
+ lea r13, [blend_8x8_0+8]
+ movq xm6, [top2q -1]
+ vbroadcasti128 m5, [dstq+strideq*1-1]
+ vbroadcasti128 m9, [dstq+strideq*2-1]
+ movhps xm6, [dstq+strideq*0-1]
+ movu m11, [r13+hq*2*1+16*1]
+ punpcklqdq m10, m5, m9
+ vinserti128 m5, [dstq+stride3q -1], 1
+ vinserti128 m9, [dstq+strideq*4-1], 1
+ vpblendd m6, m10, 0xF0
+ vpblendvb m6, [rsp+gprsize+80+hq*8+64-8*1], m11
+ psrldq m5, 2
+ psrldq m9, 2
+ punpcklqdq m5, m9
+%endif
+ ret
+.d5k0:
+.d6k0:
+.d7k0:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [topq+strideq*1 ]
+ vpbroadcastd m5, [dstq+strideq*1 ]
+ vpbroadcastd m9, [dstq+strideq*2 ]
+ vpblendd xm6, [dstq+strideq*0-4], 0x2
+ vpblendd m5, m9, 0x22
+ vpblendd m6, m5, 0x30
+ vinserti128 m5, [dstq+stride3q ], 1
+ vpblendd m5, [dstq+strideq*4-20], 0x20
+ %else
+ movd xm6, [topq +strideq*1]
+ movd xm5, [dstq +strideq*1]
+ movd xm9, [dstq +stride3q ]
+ movd xm10, [dst4q+strideq*1]
+ movd xm11, [dst4q+stride3q ]
+ pinsrd xm6, [dstq +strideq*0], 1
+ pinsrd xm5, [dstq +strideq*2], 1
+ pinsrd xm9, [dst4q+strideq*0], 1
+ pinsrd xm10, [dst4q+strideq*2], 1
+ pinsrd xm11, [dst4q+strideq*4], 1
+ punpcklqdq xm6, xm5
+ punpcklqdq xm5, xm9
+ punpcklqdq xm9, xm10
+ punpcklqdq xm10, xm11
+ vinserti128 m6, xm9, 1
+ vinserti128 m5, xm10, 1
+ %endif
+%else
+ movq xm6, [top2q ]
+ movq xm5, [dstq+strideq*1]
+ movq xm9, [dstq+stride3q ]
+ movhps xm6, [dstq+strideq*0]
+ movhps xm5, [dstq+strideq*2]
+ movhps xm9, [dstq+strideq*4]
+ vinserti128 m6, xm5, 1
+ vinserti128 m5, xm9, 1
+%endif
+ ret
+.d0k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [dstq +strideq*2-2]
+ movd xm9, [dstq +stride3q -2]
+ movd xm5, [topq +strideq*0+2]
+ movd xm10, [topq +strideq*1+2]
+ pinsrw xm6, [leftq+4], 0
+ pinsrw xm9, [leftq+6], 0
+ vinserti128 m5, [dstq +strideq*0+2], 1
+ vinserti128 m10, [dstq +strideq*1+2], 1
+ vinserti128 m6, [dst4q+strideq*0-2], 1
+ vinserti128 m9, [dst4q+strideq*1-2], 1
+ punpckldq m5, m10
+ punpckldq m6, m9
+ %else
+ movq xm6, [dstq +strideq*2-2]
+ movd xm10, [dst4q+strideq*2-2]
+ movd xm5, [topq +strideq*0+2]
+ movq xm9, [dst4q+strideq*0-2]
+ movhps xm6, [dstq +stride3q -2]
+ pinsrw xm10, [dst4q+stride3q ], 3
+ pinsrd xm5, [topq +strideq*1+2], 1
+ movhps xm9, [dst4q+strideq*1-2]
+ pinsrd xm10, [dst8q+strideq*0-2], 2
+ pinsrd xm5, [dstq +strideq*0+2], 2
+ pinsrd xm10, [dst8q+strideq*1-2], 3
+ pinsrd xm5, [dstq +strideq*1+2], 3
+ shufps xm11, xm6, xm9, q3131
+ shufps xm6, xm9, q2020
+ movu m9, [blend_4x8_3+8]
+ vinserti128 m6, xm10, 1
+ vinserti128 m5, xm11, 1
+ vpblendvb m6, [rsp+gprsize+16+8], m9
+ %endif
+%else
+ lea r13, [blend_8x8_1+16]
+ movq xm6, [dstq +strideq*2-2]
+ movq xm9, [dstq +stride3q -2]
+ movq xm5, [top1q +2]
+ movq xm10, [top2q +2]
+ movu m11, [r13+hq*2*2+16*2]
+ vinserti128 m6, [dst4q+strideq*0-2], 1
+ vinserti128 m9, [dst4q+strideq*1-2], 1
+ vinserti128 m5, [dstq +strideq*0+2], 1
+ vinserti128 m10, [dstq +strideq*1+2], 1
+ punpcklqdq m6, m9
+ punpcklqdq m5, m10
+ vpblendvb m6, [rsp+gprsize+16+hq*8+64+8*2], m11
+%endif
+ ret
+.d1k1:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m6, [dstq+strideq*1-2]
+ vpbroadcastq m9, [dstq+strideq*2-2]
+ movd xm5, [topq+strideq*1+2]
+ movd xm10, [dstq+strideq*0+2]
+ psrldq m11, m6, 4
+ psrldq m12, m9, 4
+ vpblendd m5, m11, 0x10
+ movq xm11, [leftq+2]
+ vinserti128 m6, [dstq+stride3q -2], 1
+ punpckldq xm11, xm11
+ vpblendd m10, m12, 0x10
+ pcmpeqd m12, m12
+ pmovzxwd m11, xm11
+ psrld m12, 16
+ punpckldq m6, m9
+ vpbroadcastd m9, [dstq+strideq*4-2]
+ vpblendvb m6, m11, m12
+ punpckldq m5, m10
+ vpblendd m6, m9, 0x20
+ %else
+ movd xm5, [topq +strideq*1+2]
+ movq xm6, [dstq +strideq*1-2]
+ movq xm9, [dstq +stride3q -2]
+ movq xm10, [dst4q+strideq*1-2]
+ movd xm11, [dst4q+stride3q -2]
+ pinsrd xm5, [dstq +strideq*0+2], 1
+ movhps xm6, [dstq +strideq*2-2]
+ movhps xm9, [dst4q+strideq*0-2]
+ movhps xm10, [dst4q+strideq*2-2]
+ pinsrd xm11, [dst4q+strideq*4-2], 1
+ shufps xm5, xm6, q3110
+ shufps xm6, xm9, q2020
+ shufps xm9, xm10, q3131
+ shufps xm10, xm11, q1020
+ movu m11, [blend_4x8_2+4]
+ vinserti128 m6, xm10, 1
+ vinserti128 m5, xm9, 1
+ vpblendvb m6, [rsp+gprsize+16+4], m11
+ %endif
+%else
+ lea r13, [blend_8x8_1+16]
+ movq xm5, [top2q +2]
+ vbroadcasti128 m6, [dstq+strideq*1-2]
+ vbroadcasti128 m9, [dstq+strideq*2-2]
+ movhps xm5, [dstq+strideq*0+2]
+ shufps m10, m6, m9, q2121
+ vinserti128 m6, [dstq+stride3q -2], 1
+ vinserti128 m9, [dstq+strideq*4-2], 1
+ movu m11, [r13+hq*2*1+16*1]
+ vpblendd m5, m10, 0xF0
+ punpcklqdq m6, m9
+ vpblendvb m6, [rsp+gprsize+16+hq*8+64+8*1], m11
+%endif
+ ret
+.d2k1:
+%if %1 == 4
+ %if %2 == 4
+ movq xm11, [leftq]
+ movq xm6, [dstq+strideq*0-2]
+ movq xm9, [dstq+strideq*1-2]
+ vinserti128 m6, [dstq+strideq*2-2], 1
+ vinserti128 m9, [dstq+stride3q -2], 1
+ punpckldq xm11, xm11
+ psrldq m5, m6, 4
+ psrldq m10, m9, 4
+ pmovzxwd m11, xm11
+ punpckldq m6, m9
+ punpckldq m5, m10
+ pblendw m6, m11, 0x05
+ %else
+ movq xm5, [dstq +strideq*0-2]
+ movq xm9, [dstq +strideq*2-2]
+ movq xm10, [dst4q+strideq*0-2]
+ movq xm11, [dst4q+strideq*2-2]
+ movhps xm5, [dstq +strideq*1-2]
+ movhps xm9, [dstq +stride3q -2]
+ movhps xm10, [dst4q+strideq*1-2]
+ movhps xm11, [dst4q+stride3q -2]
+ shufps xm6, xm5, xm9, q2020
+ shufps xm5, xm9, q3131
+ shufps xm9, xm10, xm11, q2020
+ shufps xm10, xm11, q3131
+ pmovzxwd m11, [leftq]
+ vinserti128 m6, xm9, 1
+ vinserti128 m5, xm10, 1
+ pblendw m6, m11, 0x55
+ %endif
+%else
+ mova m11, [rsp+gprsize+16+hq*8+64]
+ movu xm5, [dstq+strideq*0-2]
+ movu xm9, [dstq+strideq*1-2]
+ vinserti128 m5, [dstq+strideq*2-2], 1
+ vinserti128 m9, [dstq+stride3q -2], 1
+ shufps m6, m5, m9, q1010
+ shufps m5, m9, q2121
+ pblendw m6, m11, 0x11
+%endif
+ ret
+.d3k1:
+%if %1 == 4
+ %if %2 == 4
+ vpbroadcastq m11, [dstq+strideq*1-2]
+ vpbroadcastq m12, [dstq+strideq*2-2]
+ movd xm6, [topq+strideq*1-2]
+ movd xm9, [dstq+strideq*0-2]
+ pblendw m11, [leftq-16+2], 0x01
+ pblendw m12, [leftq-16+4], 0x01
+ pinsrw xm9, [leftq- 0+0], 0
+ psrldq m5, m11, 4
+ psrldq m10, m12, 4
+ vinserti128 m5, [dstq+stride3q +2], 1
+ vinserti128 m10, [dstq+strideq*4+2], 1
+ vpblendd m6, m11, 0x10
+ vpblendd m9, m12, 0x10
+ punpckldq m6, m9
+ punpckldq m5, m10
+ %else
+ movd xm6, [topq +strideq*1-2]
+ movq xm5, [dstq +strideq*1-2]
+ movq xm9, [dstq +stride3q -2]
+ movq xm10, [dst4q+strideq*1-2]
+ movd xm11, [dst4q+stride3q +2]
+ pinsrw xm6, [dstq +strideq*0 ], 3
+ movhps xm5, [dstq +strideq*2-2]
+ movhps xm9, [dst4q+strideq*0-2]
+ movhps xm10, [dst4q+strideq*2-2]
+ pinsrd xm11, [dst4q+strideq*4+2], 1
+ shufps xm6, xm5, q2010
+ shufps xm5, xm9, q3131
+ shufps xm9, xm10, q2020
+ shufps xm10, xm11, q1031
+ movu m11, [blend_4x8_2]
+ vinserti128 m6, xm9, 1
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, [rsp+gprsize+16-4], m11
+ %endif
+%else
+ lea r13, [blend_8x8_1+8]
+ movq xm6, [top2q -2]
+ vbroadcasti128 m5, [dstq+strideq*1-2]
+ vbroadcasti128 m10, [dstq+strideq*2-2]
+ movhps xm6, [dstq+strideq*0-2]
+ punpcklqdq m9, m5, m10
+ vinserti128 m5, [dstq+stride3q -2], 1
+ vinserti128 m10, [dstq+strideq*4-2], 1
+ movu m11, [r13+hq*2*1+16*1]
+ vpblendd m6, m9, 0xF0
+ shufps m5, m10, q2121
+ vpblendvb m6, [rsp+gprsize+16+hq*8+64-8*1], m11
+%endif
+ ret
+.d4k1:
+%if %1 == 4
+ %if %2 == 4
+ vinserti128 m6, [dstq +strideq*0-2], 1
+ vinserti128 m9, [dstq +strideq*1-2], 1
+ movd xm5, [dstq +strideq*2+2]
+ movd xm10, [dstq +stride3q +2]
+ pblendw m6, [leftq-16+0], 0x01
+ pblendw m9, [leftq-16+2], 0x01
+ vinserti128 m5, [dst4q+strideq*0+2], 1
+ vinserti128 m10, [dst4q+strideq*1+2], 1
+ vpblendd m6, [topq +strideq*0-2], 0x01
+ vpblendd m9, [topq +strideq*1-2], 0x01
+ punpckldq m5, m10
+ punpckldq m6, m9
+ %else
+ movd xm6, [topq +strideq*0-2]
+ movq xm5, [dstq +strideq*2-2]
+ movq xm9, [dst4q+strideq*0-2]
+ movd xm10, [dst4q+strideq*2+2]
+ pinsrd xm6, [topq +strideq*1-2], 1
+ movhps xm5, [dstq +stride3q -2]
+ movhps xm9, [dst4q+strideq*1-2]
+ pinsrd xm10, [dst4q+stride3q +2], 1
+ pinsrd xm6, [dstq +strideq*0-2], 2
+ pinsrd xm10, [dst8q+strideq*0+2], 2
+ pinsrd xm6, [dstq +strideq*1-2], 3
+ pinsrd xm10, [dst8q+strideq*1+2], 3
+ shufps xm11, xm5, xm9, q2020
+ shufps xm5, xm9, q3131
+ movu m9, [blend_4x8_3]
+ vinserti128 m6, xm11, 1
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, [rsp+gprsize+16-8], m9
+ %endif
+%else
+ lea r13, [blend_8x8_1]
+ movu m11, [r13+hq*2*2+16*2]
+ movq xm6, [top1q -2]
+ movq xm9, [top2q -2]
+ movq xm5, [dstq +strideq*2+2]
+ movq xm10, [dstq +stride3q +2]
+ vinserti128 m6, [dstq +strideq*0-2], 1
+ vinserti128 m9, [dstq +strideq*1-2], 1
+ vinserti128 m5, [dst4q+strideq*0+2], 1
+ vinserti128 m10, [dst4q+strideq*1+2], 1
+ punpcklqdq m6, m9
+ vpblendvb m6, [rsp+gprsize+16+hq*8+64-8*2], m11
+ punpcklqdq m5, m10
+%endif
+ ret
+.d5k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [topq +strideq*0-1]
+ movd xm9, [topq +strideq*1-1]
+ movd xm5, [dstq +strideq*2+1]
+ movd xm10, [dstq +stride3q +1]
+ pcmpeqd m12, m12
+ pmovzxbw m11, [leftq-8+1]
+ psrld m12, 24
+ vinserti128 m6, [dstq +strideq*0-1], 1
+ vinserti128 m9, [dstq +strideq*1-1], 1
+ vinserti128 m5, [dst4q+strideq*0+1], 1
+ vinserti128 m10, [dst4q+strideq*1+1], 1
+ punpckldq m6, m9
+ pxor m9, m9
+ vpblendd m12, m9, 0x0F
+ punpckldq m5, m10
+ vpblendvb m6, m11, m12
+ %else
+ movd xm6, [topq +strideq*0-1]
+ movq xm5, [dstq +strideq*2-1]
+ movq xm9, [dst4q+strideq*0-1]
+ movd xm10, [dst4q+strideq*2+1]
+ pinsrd xm6, [topq +strideq*1-1], 1
+ movhps xm5, [dstq +stride3q -1]
+ movhps xm9, [dst4q+strideq*1-1]
+ pinsrd xm10, [dst4q+stride3q +1], 1
+ pinsrd xm6, [dstq +strideq*0-1], 2
+ pinsrd xm10, [dst8q+strideq*0+1], 2
+ pinsrd xm6, [dstq +strideq*1-1], 3
+ pinsrd xm10, [dst8q+strideq*1+1], 3
+ shufps xm11, xm5, xm9, q2020
+ vinserti128 m6, xm11, 1
+ pmovzxbw m11, [leftq-3]
+ psrldq xm5, 2
+ psrldq xm9, 2
+ shufps xm5, xm9, q2020
+ movu m9, [blend_4x8_1]
+ vinserti128 m5, xm10, 1
+ vpblendvb m6, m11, m9
+ %endif
+%else
+ lea r13, [blend_8x8_0]
+ movu m11, [r13+hq*2*2+16*2]
+ movq xm6, [top1q -1]
+ movq xm9, [top2q -1]
+ movq xm5, [dstq +strideq*2+1]
+ movq xm10, [dstq +stride3q +1]
+ vinserti128 m6, [dstq +strideq*0-1], 1
+ vinserti128 m9, [dstq +strideq*1-1], 1
+ vinserti128 m5, [dst4q+strideq*0+1], 1
+ vinserti128 m10, [dst4q+strideq*1+1], 1
+ punpcklqdq m6, m9
+ punpcklqdq m5, m10
+ vpblendvb m6, [rsp+gprsize+80+hq*8+64-8*2], m11
+%endif
+ ret
+.d6k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm6, [topq +strideq*0]
+ movd xm9, [topq +strideq*1]
+ movd xm5, [dstq +strideq*2]
+ movd xm10, [dstq +stride3q ]
+ vinserti128 m6, [dstq +strideq*0], 1
+ vinserti128 m9, [dstq +strideq*1], 1
+ vinserti128 m5, [dst4q+strideq*0], 1
+ vinserti128 m10, [dst4q+strideq*1], 1
+ punpckldq m6, m9
+ punpckldq m5, m10
+ %else
+ movd xm5, [dstq +strideq*2]
+ movd xm6, [topq +strideq*0]
+ movd xm9, [dst4q+strideq*2]
+ pinsrd xm5, [dstq +stride3q ], 1
+ pinsrd xm6, [topq +strideq*1], 1
+ pinsrd xm9, [dst4q+stride3q ], 1
+ pinsrd xm5, [dst4q+strideq*0], 2
+ pinsrd xm6, [dstq +strideq*0], 2
+ pinsrd xm9, [dst8q+strideq*0], 2
+ pinsrd xm5, [dst4q+strideq*1], 3
+ pinsrd xm6, [dstq +strideq*1], 3
+ pinsrd xm9, [dst8q+strideq*1], 3
+ vinserti128 m6, xm5, 1
+ vinserti128 m5, xm9, 1
+ %endif
+%else
+ movq xm5, [dstq +strideq*2]
+ movq xm9, [dst4q+strideq*0]
+ movq xm6, [top1q ]
+ movq xm10, [dstq +strideq*0]
+ movhps xm5, [dstq +stride3q ]
+ movhps xm9, [dst4q+strideq*1]
+ movhps xm6, [top2q ]
+ movhps xm10, [dstq +strideq*1]
+ vinserti128 m5, xm9, 1
+ vinserti128 m6, xm10, 1
+%endif
+ ret
+.d7k1:
+%if %1 == 4
+ %if %2 == 4
+ movd xm5, [dstq +strideq*2-1]
+ movd xm9, [dstq +stride3q -1]
+ movd xm6, [topq +strideq*0+1]
+ movd xm10, [topq +strideq*1+1]
+ pinsrb xm5, [leftq+ 5], 0
+ pinsrb xm9, [leftq+ 7], 0
+ vinserti128 m6, [dstq +strideq*0+1], 1
+ vinserti128 m10, [dstq +strideq*1+1], 1
+ vinserti128 m5, [dst4q+strideq*0-1], 1
+ vinserti128 m9, [dst4q+strideq*1-1], 1
+ punpckldq m6, m10
+ punpckldq m5, m9
+ %else
+ movd xm6, [topq +strideq*0+1]
+ movq xm9, [dstq +strideq*2-1]
+ movq xm10, [dst4q+strideq*0-1]
+ movd xm11, [dst4q+strideq*2-1]
+ pinsrd xm6, [topq +strideq*1+1], 1
+ movhps xm9, [dstq +stride3q -1]
+ movhps xm10, [dst4q+strideq*1-1]
+ pinsrd xm11, [dst4q+stride3q -1], 1
+ pinsrd xm6, [dstq +strideq*0+1], 2
+ pinsrd xm11, [dst8q+strideq*0-1], 2
+ pinsrd xm6, [dstq +strideq*1+1], 3
+ pinsrd xm11, [dst8q+strideq*1-1], 3
+ shufps xm5, xm9, xm10, q2020
+ vinserti128 m5, xm11, 1
+ pmovzxbw m11, [leftq+5]
+ psrldq xm9, 2
+ psrldq xm10, 2
+ shufps xm9, xm10, q2020
+ movu m10, [blend_4x8_1+8]
+ vinserti128 m6, xm9, 1
+ vpblendvb m5, m11, m10
+ %endif
+%else
+ lea r13, [blend_8x8_0+16]
+ movq xm5, [dstq +strideq*2-1]
+ movq xm9, [dst4q+strideq*0-1]
+ movq xm6, [top1q +1]
+ movq xm10, [dstq +strideq*0+1]
+ movhps xm5, [dstq +stride3q -1]
+ movhps xm9, [dst4q+strideq*1-1]
+ movhps xm6, [top2q +1]
+ movhps xm10, [dstq +strideq*1+1]
+ movu m11, [r13+hq*2*2+16*2]
+ vinserti128 m5, xm9, 1
+ vinserti128 m6, xm10, 1
+ vpblendvb m5, [rsp+gprsize+80+hq*8+64+8*2], m11
+%endif
+ ret
+
+.border_block:
+ DEFINE_ARGS dst, stride, left, top, pri, sec, stride3, dst4, edge
+%define rstk rsp
+%assign stack_offset stack_offset_entry
+%if %1 == 4 && %2 == 8
+ PUSH r9
+ %assign regs_used 10
+%else
+ %assign regs_used 9
+%endif
+%if STACK_ALIGNMENT < 32
+ PUSH r%+regs_used
+ %assign regs_used regs_used+1
+%endif
+ ALLOC_STACK 2*16+(%2+4)*32, 16
+%define px rsp+2*16+2*32
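+ ; px addresses a temporary buffer of 16-bit pixels with rows 32 bytes
+ ; apart, padded by 2 rows above/below and 2 pixels left/right of the
+ ; block; unavailable border samples are filled with 0x8000 (m14), which
+ ; reads as both a large unsigned and a negative signed value so that the
+ ; clip path can discard them with pminuw/pmaxsw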
+
+ pcmpeqw m14, m14
+ psllw m14, 15 ; 0x8000
+
+ ; prepare pixel buffers - body/right
+%if %1 == 4
+ INIT_XMM avx2
+%endif
+%if %2 == 8
+ lea dst4q, [dstq+strideq*4]
+%endif
+ lea stride3q, [strideq*3]
+ test edgeb, 2 ; have_right
+ jz .no_right
+ pmovzxbw m1, [dstq+strideq*0]
+ pmovzxbw m2, [dstq+strideq*1]
+ pmovzxbw m3, [dstq+strideq*2]
+ pmovzxbw m4, [dstq+stride3q]
+ mova [px+0*32], m1
+ mova [px+1*32], m2
+ mova [px+2*32], m3
+ mova [px+3*32], m4
+%if %2 == 8
+ pmovzxbw m1, [dst4q+strideq*0]
+ pmovzxbw m2, [dst4q+strideq*1]
+ pmovzxbw m3, [dst4q+strideq*2]
+ pmovzxbw m4, [dst4q+stride3q]
+ mova [px+4*32], m1
+ mova [px+5*32], m2
+ mova [px+6*32], m3
+ mova [px+7*32], m4
+%endif
+ jmp .body_done
+.no_right:
+%if %1 == 4
+ movd xm1, [dstq+strideq*0]
+ movd xm2, [dstq+strideq*1]
+ movd xm3, [dstq+strideq*2]
+ movd xm4, [dstq+stride3q]
+ pmovzxbw xm1, xm1
+ pmovzxbw xm2, xm2
+ pmovzxbw xm3, xm3
+ pmovzxbw xm4, xm4
+ movq [px+0*32], xm1
+ movq [px+1*32], xm2
+ movq [px+2*32], xm3
+ movq [px+3*32], xm4
+%else
+ pmovzxbw xm1, [dstq+strideq*0]
+ pmovzxbw xm2, [dstq+strideq*1]
+ pmovzxbw xm3, [dstq+strideq*2]
+ pmovzxbw xm4, [dstq+stride3q]
+ mova [px+0*32], xm1
+ mova [px+1*32], xm2
+ mova [px+2*32], xm3
+ mova [px+3*32], xm4
+%endif
+ movd [px+0*32+%1*2], xm14
+ movd [px+1*32+%1*2], xm14
+ movd [px+2*32+%1*2], xm14
+ movd [px+3*32+%1*2], xm14
+%if %2 == 8
+ %if %1 == 4
+ movd xm1, [dst4q+strideq*0]
+ movd xm2, [dst4q+strideq*1]
+ movd xm3, [dst4q+strideq*2]
+ movd xm4, [dst4q+stride3q]
+ pmovzxbw xm1, xm1
+ pmovzxbw xm2, xm2
+ pmovzxbw xm3, xm3
+ pmovzxbw xm4, xm4
+ movq [px+4*32], xm1
+ movq [px+5*32], xm2
+ movq [px+6*32], xm3
+ movq [px+7*32], xm4
+ %else
+ pmovzxbw xm1, [dst4q+strideq*0]
+ pmovzxbw xm2, [dst4q+strideq*1]
+ pmovzxbw xm3, [dst4q+strideq*2]
+ pmovzxbw xm4, [dst4q+stride3q]
+ mova [px+4*32], xm1
+ mova [px+5*32], xm2
+ mova [px+6*32], xm3
+ mova [px+7*32], xm4
+ %endif
+ movd [px+4*32+%1*2], xm14
+ movd [px+5*32+%1*2], xm14
+ movd [px+6*32+%1*2], xm14
+ movd [px+7*32+%1*2], xm14
+%endif
+.body_done:
+
+ ; top
+ test edgeb, 4 ; have_top
+ jz .no_top
+ test edgeb, 1 ; have_left
+ jz .top_no_left
+ test edgeb, 2 ; have_right
+ jz .top_no_right
+ pmovzxbw m1, [topq+strideq*0-(%1/2)]
+ pmovzxbw m2, [topq+strideq*1-(%1/2)]
+ movu [px-2*32-%1], m1
+ movu [px-1*32-%1], m2
+ jmp .top_done
+.top_no_right:
+ pmovzxbw m1, [topq+strideq*0-%1]
+ pmovzxbw m2, [topq+strideq*1-%1]
+ movu [px-2*32-%1*2], m1
+ movu [px-1*32-%1*2], m2
+ movd [px-2*32+%1*2], xm14
+ movd [px-1*32+%1*2], xm14
+ jmp .top_done
+.top_no_left:
+ test edgeb, 2 ; have_right
+ jz .top_no_left_right
+ pmovzxbw m1, [topq+strideq*0]
+ pmovzxbw m2, [topq+strideq*1]
+ mova [px-2*32+0], m1
+ mova [px-1*32+0], m2
+ movd [px-2*32-4], xm14
+ movd [px-1*32-4], xm14
+ jmp .top_done
+.top_no_left_right:
+%if %1 == 4
+ movd xm1, [topq+strideq*0]
+ pinsrd xm1, [topq+strideq*1], 1
+ pmovzxbw xm1, xm1
+ movq [px-2*32+0], xm1
+ movhps [px-1*32+0], xm1
+%else
+ pmovzxbw xm1, [topq+strideq*0]
+ pmovzxbw xm2, [topq+strideq*1]
+ mova [px-2*32+0], xm1
+ mova [px-1*32+0], xm2
+%endif
+ movd [px-2*32-4], xm14
+ movd [px-1*32-4], xm14
+ movd [px-2*32+%1*2], xm14
+ movd [px-1*32+%1*2], xm14
+ jmp .top_done
+.no_top:
+ movu [px-2*32-%1], m14
+ movu [px-1*32-%1], m14
+.top_done:
+
+ ; left
+ test edgeb, 1 ; have_left
+ jz .no_left
+ pmovzxbw xm1, [leftq+ 0]
+%if %2 == 8
+ pmovzxbw xm2, [leftq+ 8]
+%endif
+ movd [px+0*32-4], xm1
+ pextrd [px+1*32-4], xm1, 1
+ pextrd [px+2*32-4], xm1, 2
+ pextrd [px+3*32-4], xm1, 3
+%if %2 == 8
+ movd [px+4*32-4], xm2
+ pextrd [px+5*32-4], xm2, 1
+ pextrd [px+6*32-4], xm2, 2
+ pextrd [px+7*32-4], xm2, 3
+%endif
+ jmp .left_done
+.no_left:
+ movd [px+0*32-4], xm14
+ movd [px+1*32-4], xm14
+ movd [px+2*32-4], xm14
+ movd [px+3*32-4], xm14
+%if %2 == 8
+ movd [px+4*32-4], xm14
+ movd [px+5*32-4], xm14
+ movd [px+6*32-4], xm14
+ movd [px+7*32-4], xm14
+%endif
+.left_done:
+
+ ; bottom
+ DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge
+ test edgeb, 8 ; have_bottom
+ jz .no_bottom
+ lea dst8q, [dstq+%2*strideq]
+ test edgeb, 1 ; have_left
+ jz .bottom_no_left
+ test edgeb, 2 ; have_right
+ jz .bottom_no_right
+ pmovzxbw m1, [dst8q-(%1/2)]
+ pmovzxbw m2, [dst8q+strideq-(%1/2)]
+ movu [px+(%2+0)*32-%1], m1
+ movu [px+(%2+1)*32-%1], m2
+ jmp .bottom_done
+.bottom_no_right:
+ pmovzxbw m1, [dst8q-%1]
+ pmovzxbw m2, [dst8q+strideq-%1]
+ movu [px+(%2+0)*32-%1*2], m1
+ movu [px+(%2+1)*32-%1*2], m2
+%if %1 == 8
+ movd [px+(%2-1)*32+%1*2], xm14 ; overwritten by previous movu
+%endif
+ movd [px+(%2+0)*32+%1*2], xm14
+ movd [px+(%2+1)*32+%1*2], xm14
+ jmp .bottom_done
+.bottom_no_left:
+ test edgeb, 2 ; have_right
+ jz .bottom_no_left_right
+ pmovzxbw m1, [dst8q]
+ pmovzxbw m2, [dst8q+strideq]
+ mova [px+(%2+0)*32+0], m1
+ mova [px+(%2+1)*32+0], m2
+ movd [px+(%2+0)*32-4], xm14
+ movd [px+(%2+1)*32-4], xm14
+ jmp .bottom_done
+.bottom_no_left_right:
+%if %1 == 4
+ movd xm1, [dst8q]
+ pinsrd xm1, [dst8q+strideq], 1
+ pmovzxbw xm1, xm1
+ movq [px+(%2+0)*32+0], xm1
+ movhps [px+(%2+1)*32+0], xm1
+%else
+ pmovzxbw xm1, [dst8q]
+ pmovzxbw xm2, [dst8q+strideq]
+ mova [px+(%2+0)*32+0], xm1
+ mova [px+(%2+1)*32+0], xm2
+%endif
+ movd [px+(%2+0)*32-4], xm14
+ movd [px+(%2+1)*32-4], xm14
+ movd [px+(%2+0)*32+%1*2], xm14
+ movd [px+(%2+1)*32+%1*2], xm14
+ jmp .bottom_done
+.no_bottom:
+ movu [px+(%2+0)*32-%1], m14
+ movu [px+(%2+1)*32-%1], m14
+.bottom_done:
+
+ ; actual filter
+ INIT_YMM avx2
+ DEFINE_ARGS dst, stride, pridmp, damping, pri, secdmp, stride3, zero
+%undef edged
+ ; register to shuffle values into after packing
+ vbroadcasti128 m12, [shufb_lohi]
+
+ mov dampingd, r7m
+ xor zerod, zerod
+ movifnidn prid, prim
+ sub dampingd, 31
+ movifnidn secdmpd, secdmpm
+ or prid, 0
+ jz .border_sec_only
+ movd xm0, prid
+ lzcnt pridmpd, prid
+ add pridmpd, dampingd
+ cmovs pridmpd, zerod
+ mov [rsp+0], pridmpq ; pri_shift
+ or secdmpd, 0
+ jz .border_pri_only
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ cmovs secdmpd, zerod
+ mov [rsp+8], secdmpq ; sec_shift
+
+ DEFINE_ARGS dst, stride, pridmp, table, pri, secdmp, stride3
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+
+ ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, dir, table, pri, sec, stride3
+ vpbroadcastb m0, xm0 ; pri_strength
+ vpbroadcastb m1, xm1 ; sec_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ lea secq, [tableq+12] ; sec_taps
+
+ BORDER_PREP_REGS %1, %2
+%if %1*%2*2/mmsize > 1
+.border_v_loop:
+%endif
+ BORDER_LOAD_BLOCK %1, %2, 1
+.border_k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1
+ ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1
+ ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1
+ dec kq
+ jge .border_k_loop
+
+ vpbroadcastd m10, [pw_2048]
+ BORDER_ADJUST_PIXEL %1, m10, 1
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+ lea dstq, [dstq+strideq*vloop_lines]
+ add stkq, 32*vloop_lines
+ dec hd
+ jg .border_v_loop
+%endif
+ RET
+
+.border_pri_only:
+ DEFINE_ARGS dst, stride, pridmp, table, pri, _, stride3
+ lea tableq, [tap_table]
+ vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
+ DEFINE_ARGS dst, stride, dir, table, pri, _, stride3
+ vpbroadcastb m0, xm0 ; pri_strength
+ and prid, 1
+ lea priq, [tableq+priq*2+8] ; pri_taps
+ BORDER_PREP_REGS %1, %2
+ vpbroadcastd m1, [pw_2048]
+%if %1*%2*2/mmsize > 1
+.border_pri_v_loop:
+%endif
+ BORDER_LOAD_BLOCK %1, %2
+.border_pri_k_loop:
+ vpbroadcastb m2, [priq+kq] ; pri_taps
+ ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
+ dec kq
+ jge .border_pri_k_loop
+ BORDER_ADJUST_PIXEL %1, m1
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+ lea dstq, [dstq+strideq*vloop_lines]
+ add stkq, 32*vloop_lines
+ dec hd
+ jg .border_pri_v_loop
+%endif
+ RET
+
+.border_sec_only:
+ DEFINE_ARGS dst, stride, _, damping, _, secdmp, stride3, zero
+ movd xm1, secdmpd
+ lzcnt secdmpd, secdmpd
+ add secdmpd, dampingd
+ cmovs secdmpd, zerod
+ mov [rsp+8], secdmpq ; sec_shift
+ DEFINE_ARGS dst, stride, _, table, _, secdmp, stride3
+ lea tableq, [tap_table]
+ vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
+ DEFINE_ARGS dst, stride, dir, table, _, sec, stride3
+ vpbroadcastb m1, xm1 ; sec_strength
+ lea secq, [tableq+12] ; sec_taps
+ BORDER_PREP_REGS %1, %2
+ vpbroadcastd m0, [pw_2048]
+%if %1*%2*2/mmsize > 1
+.border_sec_v_loop:
+%endif
+ BORDER_LOAD_BLOCK %1, %2
+.border_sec_k_loop:
+ vpbroadcastb m3, [secq+kq] ; sec_taps
+ ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
+ ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
+ dec kq
+ jge .border_sec_k_loop
+ BORDER_ADJUST_PIXEL %1, m0
+%if %1*%2*2/mmsize > 1
+ %define vloop_lines (mmsize/(%1*2))
+ lea dstq, [dstq+strideq*vloop_lines]
+ add stkq, 32*vloop_lines
+ dec hd
+ jg .border_sec_v_loop
+%endif
+ RET
+%endmacro
+
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
+
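+; cdef_dir: estimate the dominant direction of an 8x8 block; the pixels are
+; centred around 128, partial sums are formed along the horizontal/vertical,
+; diagonal and "alt" orientations, and each set is squared and weighted by
+; div_table to produce the per-direction costs computed below.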
+INIT_YMM avx2
+cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
+ lea stride3q, [strideq*3]
+ movq xm0, [srcq+strideq*0]
+ movq xm1, [srcq+strideq*1]
+ movq xm2, [srcq+strideq*2]
+ movq xm3, [srcq+stride3q]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m4, [srcq+strideq*0]
+ vpbroadcastq m5, [srcq+strideq*1]
+ vpbroadcastq m6, [srcq+strideq*2]
+ vpbroadcastq m7, [srcq+stride3q]
+ vpbroadcastd m8, [pw_128]
+ pxor m9, m9
+
+ vpblendd m0, m0, m7, 0xf0
+ vpblendd m1, m1, m6, 0xf0
+ vpblendd m2, m2, m5, 0xf0
+ vpblendd m3, m3, m4, 0xf0
+
+ punpcklbw m0, m9
+ punpcklbw m1, m9
+ punpcklbw m2, m9
+ punpcklbw m3, m9
+
+ psubw m0, m8
+ psubw m1, m8
+ psubw m2, m8
+ psubw m3, m8
+
+ ; shuffle registers to generate partial_sum_diag[0-1] together
+ vpermq m7, m0, q1032
+ vpermq m6, m1, q1032
+ vpermq m5, m2, q1032
+ vpermq m4, m3, q1032
+
+ ; start with partial_sum_hv[0-1]
+ paddw m8, m0, m1
+ paddw m9, m2, m3
+ phaddw m10, m0, m1
+ phaddw m11, m2, m3
+ paddw m8, m9
+ phaddw m10, m11
+ vextracti128 xm9, m8, 1
+ vextracti128 xm11, m10, 1
+ paddw xm8, xm9 ; partial_sum_hv[1]
+ phaddw xm10, xm11 ; partial_sum_hv[0]
+ vinserti128 m8, xm10, 1
+ vpbroadcastd m9, [div_table+44]
+ pmaddwd m8, m8
+ pmulld m8, m9 ; cost6[2a-d] | cost2[a-d]
+
+ ; create aggregates [lower half]:
+ ; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+
+ ; m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0
+ ; m10= m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+
+ ; m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x
+ ; and [upper half]:
+ ; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+
+ ; m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567
+ ; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+
+ ; m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx
+ ; and then shuffle m10 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd
+
+ pslldq m9, m1, 2
+ psrldq m10, m1, 14
+ pslldq m11, m2, 4
+ psrldq m12, m2, 12
+ pslldq m13, m3, 6
+ psrldq m14, m3, 10
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13
+ paddw m10, m14
+ pslldq m11, m4, 8
+ psrldq m12, m4, 8
+ pslldq m13, m5, 10
+ psrldq m14, m5, 6
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13
+ paddw m10, m14
+ pslldq m11, m6, 12
+ psrldq m12, m6, 4
+ pslldq m13, m7, 14
+ psrldq m14, m7, 2
+ paddw m9, m11
+ paddw m10, m12
+ paddw m9, m13
+ paddw m10, m14 ; partial_sum_diag[0/1][8-14,zero]
+ vbroadcasti128 m14, [shufw_6543210x]
+ vbroadcasti128 m13, [div_table+16]
+ vbroadcasti128 m12, [div_table+0]
+ paddw m9, m0 ; partial_sum_diag[0/1][0-7]
+ pshufb m10, m14
+ punpckhwd m11, m9, m10
+ punpcklwd m9, m10
+ pmaddwd m11, m11
+ pmaddwd m9, m9
+ pmulld m11, m13
+ pmulld m9, m12
+ paddd m9, m11 ; cost0[a-d] | cost4[a-d]
+
+ ; merge horizontally and vertically for partial_sum_alt[0-3]
+ paddw m10, m0, m1
+ paddw m11, m2, m3
+ paddw m12, m4, m5
+ paddw m13, m6, m7
+ phaddw m0, m4
+ phaddw m1, m5
+ phaddw m2, m6
+ phaddw m3, m7
+
+ ; create aggregates [lower half]:
+ ; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234
+ ; m11= m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx
+ ; and [upper half]:
+ ; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
+ ; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
+ ; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd
+
+ pslldq m4, m11, 2
+ psrldq m11, 14
+ pslldq m5, m12, 4
+ psrldq m12, 12
+ pslldq m6, m13, 6
+ psrldq m13, 10
+ paddw m4, m10
+ paddw m11, m12
+ vpbroadcastd m12, [div_table+44]
+ paddw m5, m6
+ paddw m11, m13 ; partial_sum_alt[3/2] right
+ vbroadcasti128 m13, [div_table+32]
+ paddw m4, m5 ; partial_sum_alt[3/2] left
+ pshuflw m5, m11, q3012
+ punpckhwd m6, m11, m4
+ punpcklwd m4, m5
+ pmaddwd m6, m6
+ pmaddwd m4, m4
+ pmulld m6, m12
+ pmulld m4, m13
+ paddd m4, m6 ; cost7[a-d] | cost5[a-d]
+
+ ; create aggregates [lower half]:
+ ; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234
+ ; m1 = m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx
+ ; and [upper half]:
+ ; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
+ ; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
+ ; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd
+
+ pslldq m5, m1, 2
+ psrldq m1, 14
+ pslldq m6, m2, 4
+ psrldq m2, 12
+ pslldq m7, m3, 6
+ psrldq m3, 10
+ paddw m5, m0
+ paddw m1, m2
+ paddw m6, m7
+ paddw m1, m3 ; partial_sum_alt[0/1] right
+ paddw m5, m6 ; partial_sum_alt[0/1] left
+ pshuflw m0, m1, q3012
+ punpckhwd m1, m5
+ punpcklwd m5, m0
+ pmaddwd m1, m1
+ pmaddwd m5, m5
+ pmulld m1, m12
+ pmulld m5, m13
+ paddd m5, m1 ; cost1[a-d] | cost3[a-d]
+
+ mova xm0, [pd_47130256+ 16]
+ mova m1, [pd_47130256]
+ phaddd m9, m8
+ phaddd m5, m4
+ phaddd m9, m5
+ vpermd m0, m9 ; cost[0-3]
+ vpermd m1, m9 ; cost[4-7] | cost[0-3]
+
+ ; now find the best cost
+ pmaxsd xm2, xm0, xm1
+ pshufd xm3, xm2, q1032
+ pmaxsd xm2, xm3
+ pshufd xm3, xm2, q2301
+ pmaxsd xm2, xm3 ; best cost
+
+ ; find the idx using minpos
+ ; make everything other than the best cost negative via subtraction
+ ; find the min of unsigned 16-bit ints to sort out the negative values
+ psubd xm4, xm1, xm2
+ psubd xm3, xm0, xm2
+ packssdw xm3, xm4
+ phminposuw xm3, xm3
+
+ ; convert idx to 32-bits
+ psrld xm3, 16
+ movd eax, xm3
+
+ ; get idx^4 complement
+ vpermd m3, m1
+ psubd xm2, xm3
+ psrld xm2, 10
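+ ; var = (best_cost - cost[idx ^ 4]) >> 10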
+ movd [varq], xm2
+ RET
+
+%endif ; ARCH_X86_64
--- /dev/null
+++ b/src/x86/cdef_avx512.asm
@@ -1,0 +1,867 @@
+; Copyright © 2020, VideoLAN and dav1d authors
+; Copyright © 2020, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+%if HAVE_AVX512ICL && ARCH_X86_64
+
+%macro DUP4 1-*
+ %rep %0
+ times 4 db %1
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro DIRS 16 ; cdef_directions[]
+ %rep 4 + 16 + 4 ; 6 7 0 1 2 3 4 5 6 7 0 1
+ ; masking away unused bits allows us to use a single vpaddd {1to16}
+ ; instruction instead of having to do vpbroadcastd + paddb
+ db %13 & 0x3f, -%13 & 0x3f
+ %rotate 1
+ %endrep
+%endmacro
+
+SECTION_RODATA 64
+
+lut_perm_4x4: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
+ db 16, 17, 0, 1, 2, 3, 4, 5, 18, 19, 8, 9, 10, 11, 12, 13
+ db 20, 21, 80, 81, 82, 83, 84, 85, 22, 23, 32, 33, 34, 35, 36, 37
+ db 98, 99,100,101,102,103,104,105, 50, 51, 52, 53, 54, 55, 56, 57
+lut_perm_4x8a: db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
+ db 96, 97, 0, 1, 2, 3, 4, 5, 98, 99, 8, 9, 10, 11, 12, 13
+lut_perm_4x8b:db 100,101, 16, 17, 18, 19, 20, 21,102,103, 24, 25, 26, 27, 28, 29
+ db 104,105, 32, 33, 34, 35, 36, 37,106,107, 40, 41, 42, 43, 44, 45
+ db 108,109, 48, 49, 50, 51, 52, 53,110,111, 56, 57, 58, 59, 60, 61
+ db 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95
+pd_01234567: dd 0, 1, 2, 3, 4, 5, 6, 7
+lut_perm_8x8a: db 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
+ db -1, -1, 34, 35, 36, 37, 38, 39, -1, -1, 50, 51, 52, 53, 54, 55
+ db -1, -1, 66, 67, 68, 69, 70, 71, -1, -1, 82, 83, 84, 85, 86, 87
+ db 96, 97, 98, 99,100,101,102,103,112,113,114,115,116,117,118,119
+lut_perm_8x8b: db 4, 5, 6, 7, 8, 9, 10, 11, 20, 21, 22, 23, 24, 25, 26, 27
+ db 36, 37, 38, 39, 40, 41, 42, 43, 52, 53, 54, 55, 56, 57, 58, 59
+ db 68, 69, 70, 71, 72, 73, 74, 75, 84, 85, 86, 87, 88, 89, 90, 91
+ db 100,101,102,103,104,105,106,107,116,117,118,119,120,121,122,123
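+; one 64-bit mask per edge-flag combination; a set bit marks a lut index backed
+; by valid pixels (used with vpshufbitqmb to discard out-of-frame taps)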
+edge_mask: dq 0x00003c3c3c3c0000, 0x00003f3f3f3f0000 ; 0000, 0001
+ dq 0x0000fcfcfcfc0000, 0x0000ffffffff0000 ; 0010, 0011
+ dq 0x00003c3c3c3c3c3c, 0x00003f3f3f3f3f3f ; 0100, 0101
+ dq 0x0000fcfcfcfcfcfc, 0x0000ffffffffffff ; 0110, 0111
+ dq 0x3c3c3c3c3c3c0000, 0x3f3f3f3f3f3f0000 ; 1000, 1001
+ dq 0xfcfcfcfcfcfc0000, 0xffffffffffff0000 ; 1010, 1011
+ dq 0x3c3c3c3c3c3c3c3c, 0x3f3f3f3f3f3f3f3f ; 1100, 1101
+ dq 0xfcfcfcfcfcfcfcfc, 0xffffffffffffffff ; 1110, 1111
+px_idx: DUP4 18, 19, 20, 21, 26, 27, 28, 29, 34, 35, 36, 37, 42, 43, 44, 45
+cdef_dirs: DIRS -7,-14, 1, -6, 1, 2, 1, 10, 9, 18, 8, 17, 8, 16, 8, 15
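+; gf2p8affineqb bit matrices implementing per-byte logical right shifts by 0..6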
+gf_shr: dq 0x0102040810204080, 0x0102040810204080 ; >> 0, >> 0
+ dq 0x0204081020408000, 0x0408102040800000 ; >> 1, >> 2
+ dq 0x0810204080000000, 0x1020408000000000 ; >> 3, >> 4
+ dq 0x2040800000000000, 0x4080000000000000 ; >> 5, >> 6
+ times 16 db 0 ; realign to 64 bytes (cdef_dirs is only 48 bytes)
+end_perm_w8clip:db 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30
+ db 32, 36, 40, 44, 34, 38, 42, 46, 48, 52, 56, 60, 50, 54, 58, 62
+ db 1, 5, 9, 13, 3, 7, 11, 15, 17, 21, 25, 29, 19, 23, 27, 31
+ db 33, 37, 41, 45, 35, 39, 43, 47, 49, 53, 57, 61, 51, 55, 59, 63
+end_perm: db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+ db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
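+; cdef tap weights: pri {4, 2} (even strength) / {3, 3} (odd strength), sec {2, 1},
+; stored left-shifted by 4; the pri pair is selected by pri_strength & 1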
+pri_tap: db 64, 64, 32, 32, 48, 48, 48, 48 ; left-shifted by 4
+sec_tap: db 32, 32, 16, 16
+pd_268435568: dd 268435568
+
+SECTION .text
+
+%if WIN64
+DECLARE_REG_TMP 5, 6
+%else
+DECLARE_REG_TMP 8, 5
+%endif
+
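+; strategy: the 4x4 block and its border pixels are packed into a single zmm
+; register ("lut", laid out below); every tap then becomes a byte permute
+; (vpermb) with precomputed indices from px_idx and cdef_dirs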
+; lut:
+; t0 t1 t2 t3 t4 t5 t6 t7
+; T0 T1 T2 T3 T4 T5 T6 T7
+; L0 L1 00 01 02 03 04 05
+; L2 L3 10 11 12 13 14 15
+; L4 L5 20 21 22 23 24 25
+; L6 L7 30 31 32 33 34 35
+; 4e 4f 40 41 42 43 44 45
+; 5e 5f 50 51 52 53 54 55
+
+INIT_ZMM avx512icl
+cglobal cdef_filter_4x4, 4, 8, 13, dst, stride, left, top, pri, sec, dir, damping, edge
+%define base r7-edge_mask
+ movq xmm0, [dstq+strideq*0]
+ movhps xmm0, [dstq+strideq*1]
+ lea r7, [edge_mask]
+ movq xmm1, [topq+strideq*0-2]
+ movhps xmm1, [topq+strideq*1-2]
+ mov r6d, edgem
+ vinserti32x4 ym0, ymm0, [leftq], 1
+ lea r2, [strideq*3]
+ vinserti32x4 ym1, ymm1, [dstq+strideq*2], 1
+ mova m5, [base+lut_perm_4x4]
+ vinserti32x4 m0, [dstq+r2], 2
+ test r6b, 0x08 ; avoid buffer overread
+ jz .main
+ lea r3, [dstq+strideq*4-4]
+ vinserti32x4 m1, [r3+strideq*0], 2
+ vinserti32x4 m0, [r3+strideq*1], 3
+.main:
+ movifnidn prid, prim
+ mov t0d, dirm
+ mova m3, [base+px_idx]
+ mov r3d, dampingm
+ vpermi2b m5, m0, m1 ; lut
+ vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+ pxor m7, m7
+ lea r3, [r7+r3*8] ; gf_shr + (damping - 30) * 8
+ vpermb m6, m3, m5 ; px
+ cmp r6d, 0x0f
+ jne .mask_edges ; mask edges only if required
+ test prid, prid
+ jz .sec_only
+ vpaddd m1, m3, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+ vpermb m1, m1, m5 ; k0p0 k0p1 k1p0 k1p1
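+ ; per-tap contribution (cf. constrain() in the C reference):
+ ;   sum += pri_tap * sign(diff) * imin(abs(diff), imax(0, pri_strength - (abs(diff) >> shift)))
+ ; the sign is folded into the tap (apply_sign) so vpdpbusd can accumulate unsigned diffs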
+%macro CDEF_FILTER_4x4_PRI 0
+ vpcmpub k1, m6, m1, 6 ; px > pN
+ psubb m2, m1, m6
+ lzcnt r6d, prid
+ vpsubb m2{k1}, m6, m1 ; abs(diff)
+ vpbroadcastb m4, prid
+ and prid, 1
+ vgf2p8affineqb m9, m2, [r3+r6*8] {1to8}, 0 ; abs(diff) >> shift
+ movifnidn t1d, secm
+ vpbroadcastd m10, [base+pri_tap+priq*4]
+ vpsubb m10{k1}, m7, m10 ; apply_sign(pri_tap)
+ psubusb m4, m9 ; imax(0, pri_strength - (abs(diff) >> shift))
+ pminub m2, m4
+ vpdpbusd m0, m2, m10 ; sum
+%endmacro
+ CDEF_FILTER_4x4_PRI
+ test t1d, t1d ; sec
+ jz .end_no_clip
+ call .sec
+.end_clip:
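+ ; clamp the result to [min, max] of px and all sampled tap pixels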
+ pminub m4, m6, m1
+ pmaxub m1, m6
+ pminub m5, m2, m3
+ pmaxub m2, m3
+ pminub m4, m5
+ pmaxub m2, m1
+ psrldq m1, m4, 2
+ psrldq m3, m2, 2
+ pminub m1, m4
+ vpcmpw k1, m0, m7, 1
+ vpshldd m6, m0, 8
+ pmaxub m2, m3
+ pslldq m3, m1, 1
+ psubw m7, m0
+ paddusw m0, m6 ; clip >0xff
+ vpsubusw m0{k1}, m6, m7 ; clip <0x00
+ pslldq m4, m2, 1
+ pminub m1, m3
+ pmaxub m2, m4
+ pmaxub m0, m1
+ pminub m0, m2
+ jmp .end
+.sec_only:
+ movifnidn t1d, secm
+ call .sec
+.end_no_clip:
+ vpshldd m6, m0, 8 ; (px << 8) + ((sum > -8) << 4)
+ paddw m0, m6 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+.end:
+ mova xm1, [base+end_perm]
+ vpermb m0, m1, m0 ; output in bits 8-15 of each dword
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ pextrd [dstq+strideq*2], xm0, 2
+ pextrd [dstq+r2 ], xm0, 3
+ RET
+.mask_edges_sec_only:
+ movifnidn t1d, secm
+ call .mask_edges_sec
+ jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+ vpbroadcastq m8, [base+edge_mask+r6*8]
+ test prid, prid
+ jz .mask_edges_sec_only
+ vpaddd m2, m3, [base+cdef_dirs+(t0+2)*4] {1to16}
+ vpshufbitqmb k1, m8, m2 ; index in-range
+ mova m1, m6
+ vpermb m1{k1}, m2, m5
+ CDEF_FILTER_4x4_PRI
+ test t1d, t1d
+ jz .end_no_clip
+ call .mask_edges_sec
+ jmp .end_clip
+.mask_edges_sec:
+ vpaddd m4, m3, [base+cdef_dirs+(t0+4)*4] {1to16}
+ vpaddd m9, m3, [base+cdef_dirs+(t0+0)*4] {1to16}
+ vpshufbitqmb k1, m8, m4
+ mova m2, m6
+ vpermb m2{k1}, m4, m5
+ vpshufbitqmb k1, m8, m9
+ mova m3, m6
+ vpermb m3{k1}, m9, m5
+ jmp .sec_main
+ALIGN function_align
+.sec:
+ vpaddd m2, m3, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+ vpaddd m3, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+ vpermb m2, m2, m5 ; k0s0 k0s1 k1s0 k1s1
+ vpermb m3, m3, m5 ; k0s2 k0s3 k1s2 k1s3
+.sec_main:
+ vpbroadcastd m8, [base+sec_tap]
+ vpcmpub k1, m6, m2, 6
+ psubb m4, m2, m6
+ vpbroadcastb m12, t1d
+ lzcnt t1d, t1d
+ vpsubb m4{k1}, m6, m2
+ vpcmpub k2, m6, m3, 6
+ vpbroadcastq m11, [r3+t1*8]
+ gf2p8affineqb m10, m4, m11, 0
+ psubb m5, m3, m6
+ mova m9, m8
+ vpsubb m8{k1}, m7, m8
+ psubusb m10, m12, m10
+ vpsubb m5{k2}, m6, m3
+ pminub m4, m10
+ vpdpbusd m0, m4, m8
+ gf2p8affineqb m11, m5, m11, 0
+ vpsubb m9{k2}, m7, m9
+ psubusb m12, m11
+ pminub m5, m12
+ vpdpbusd m0, m5, m9
+ ret
+
+DECLARE_REG_TMP 2, 7
+
+; lut top lut bottom
+; t0 t1 t2 t3 t4 t5 t6 t7 L4 L5 20 21 22 23 24 25
+; T0 T1 T2 T3 T4 T5 T6 T7 L6 L7 30 31 32 33 34 35
+; L0 L1 00 01 02 03 04 05 L8 L9 40 41 42 43 44 45
+; L2 L3 10 11 12 13 14 15 La Lb 50 51 52 53 54 55
+; L4 L5 20 21 22 23 24 25 Lc Ld 60 61 62 63 64 65
+; L6 L7 30 31 32 33 34 35 Le Lf 70 71 72 73 74 75
+; L8 L9 40 41 42 43 44 45 8e 8f 80 81 82 83 84 85
+; La Lb 50 51 52 53 54 55 9e 9f 90 91 92 93 94 95
+
+cglobal cdef_filter_4x8, 4, 9, 22, dst, stride, left, top, \
+ pri, sec, dir, damping, edge
+%define base r8-edge_mask
+ vpbroadcastd ym21, strided
+ mov r6d, edgem
+ lea r8, [edge_mask]
+ movq xm1, [topq+strideq*0-2]
+ pmulld ym21, [base+pd_01234567]
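+ ; ym21 = stride*{0..7}, per-row offsets for the dst gather/scatter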
+ kxnorb k1, k1, k1
+ movq xm2, [topq+strideq*1-2]
+ vpgatherdq m0{k1}, [dstq+ym21] ; +0+1 +2+3 +4+5 +6+7
+ mova m14, [base+lut_perm_4x8a]
+ movu m15, [base+lut_perm_4x8b]
+ test r6b, 0x08 ; avoid buffer overread
+ jz .main
+ lea r7, [dstq+strideq*8-2]
+ vinserti32x4 ym1, [r7+strideq*0], 1
+ vinserti32x4 ym2, [r7+strideq*1], 1
+.main:
+ punpcklqdq ym1, ym2
+ vinserti32x4 m1, [leftq], 2 ; -2-1 +8+9 left ____
+ movifnidn prid, prim
+ mov t0d, dirm
+ mova m16, [base+px_idx]
+ mov r3d, dampingm
+ vpermi2b m14, m0, m1 ; lut top
+ vpermi2b m15, m0, m1 ; lut bottom
+ vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+ pxor m20, m20
+ lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
+ vpermb m2, m16, m14 ; pxt
+ vpermb m3, m16, m15 ; pxb
+ mova m1, m0
+ cmp r6b, 0x0f
+ jne .mask_edges ; mask edges only if required
+ test prid, prid
+ jz .sec_only
+ vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+ vpermb m4, m6, m14 ; pNt k0p0 k0p1 k1p0 k1p1
+ vpermb m5, m6, m15 ; pNb
+%macro CDEF_FILTER_4x8_PRI 0
+ vpcmpub k1, m2, m4, 6 ; pxt > pNt
+ vpcmpub k2, m3, m5, 6 ; pxb > pNb
+ psubb m6, m4, m2
+ psubb m7, m5, m3
+ lzcnt r6d, prid
+ vpsubb m6{k1}, m2, m4 ; abs(diff_top)
+ vpsubb m7{k2}, m3, m5 ; abs(diff_bottom)
+ vpbroadcastb m13, prid
+ vpbroadcastq m9, [r3+r6*8]
+ and prid, 1
+ vpbroadcastd m11, [base+pri_tap+priq*4]
+ vgf2p8affineqb m8, m6, m9, 0 ; abs(dt) >> shift
+ vgf2p8affineqb m9, m7, m9, 0 ; abs(db) >> shift
+ mova m10, m11
+ movifnidn t1d, secm
+ vpsubb m10{k1}, m20, m11 ; apply_sign(pri_tap_top)
+ vpsubb m11{k2}, m20, m11 ; apply_sign(pri_tap_bottom)
+ psubusb m12, m13, m8 ; imax(0, pri_strength - (abs(dt) >> shift))
+ psubusb m13, m13, m9 ; imax(0, pri_strength - (abs(db) >> shift))
+ pminub m6, m12
+ pminub m7, m13
+ vpdpbusd m0, m6, m10 ; sum top
+ vpdpbusd m1, m7, m11 ; sum bottom
+%endmacro
+ CDEF_FILTER_4x8_PRI
+ test t1d, t1d ; sec
+ jz .end_no_clip
+ call .sec
+.end_clip:
+ pminub m10, m4, m2
+ pminub m12, m6, m8
+ pminub m11, m5, m3
+ pminub m13, m7, m9
+ pmaxub m4, m2
+ pmaxub m6, m8
+ pmaxub m5, m3
+ pmaxub m7, m9
+ pminub m10, m12
+ pminub m11, m13
+ pmaxub m4, m6
+ pmaxub m5, m7
+ mov r2d, 0xAAAAAAAA
+ kmovd k1, r2d
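+ ; k1 selects odd words for the vpblendmw merges below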
+ kxnorb k2, k2, k2 ; hw lw
+ vpshrdd m12, m0, m1, 16 ; m1lw m0hw
+ vpshrdd m6, m10, m11, 16 ; m11lw m10hw
+ vpshrdd m8, m4, m5, 16 ; m5lw m4hw
+ vpblendmw m7{k1}, m10, m11 ; m11hw m10lw
+ vpblendmw m9{k1}, m4, m5 ; m5hw m4lw
+ vpblendmw m4{k1}, m0, m12 ; m1lw m0lw
+ vpblendmw m5{k1}, m12, m1 ; m1hw m0hw
+ vpshrdd m2, m3, 16
+ pminub m6, m7
+ pmaxub m8, m9
+ mova ym14, [base+end_perm]
+ vpcmpw k1, m4, m20, 1
+ vpshldw m2, m5, 8
+ pslldq m7, m6, 1
+ pslldq m9, m8, 1
+ psubw m5, m20, m4
+ paddusw m0, m4, m2 ; clip >0xff
+ pminub m6, m7
+ pmaxub m8, m9
+ psubusw m0{k1}, m2, m5 ; clip <0x00
+ pmaxub m0, m6
+ pminub m0, m8
+ vpermb m0, m14, m0
+ vpscatterdd [dstq+ym21]{k2}, ym0
+ RET
+.sec_only:
+ movifnidn t1d, secm
+ call .sec
+.end_no_clip:
+ mova ym4, [base+end_perm]
+ kxnorb k1, k1, k1
+ vpshldd m2, m0, 8 ; (px << 8) + ((sum > -8) << 4)
+ vpshldd m3, m1, 8
+ paddw m0, m2 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ paddw m1, m3
+ pslld m0, 16
+ vpshrdd m0, m1, 16
+ vpermb m0, m4, m0 ; output in bits 8-15 of each word
+ vpscatterdd [dstq+ym21]{k1}, ym0
+ RET
+.mask_edges_sec_only:
+ movifnidn t1d, secm
+ call .mask_edges_sec
+ jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+ mov t1d, r6d
+ or r6d, 8 ; top 4x4 has bottom
+ or t1d, 4 ; bottom 4x4 has top
+ vpbroadcastq m17, [base+edge_mask+r6*8]
+ vpbroadcastq m18, [base+edge_mask+t1*8]
+ test prid, prid
+ jz .mask_edges_sec_only
+ vpaddd m6, m16, [base+cdef_dirs+(t0+2)*4] {1to16}
+ vpshufbitqmb k1, m17, m6 ; index in-range
+ vpshufbitqmb k2, m18, m6
+ mova m4, m2
+ mova m5, m3
+ vpermb m4{k1}, m6, m14
+ vpermb m5{k2}, m6, m15
+ CDEF_FILTER_4x8_PRI
+ test t1d, t1d
+ jz .end_no_clip
+ call .mask_edges_sec
+ jmp .end_clip
+.mask_edges_sec:
+ vpaddd m10, m16, [base+cdef_dirs+(t0+4)*4] {1to16}
+ vpaddd m11, m16, [base+cdef_dirs+(t0+0)*4] {1to16}
+ vpshufbitqmb k1, m17, m10
+ vpshufbitqmb k2, m18, m10
+ vpshufbitqmb k3, m17, m11
+ vpshufbitqmb k4, m18, m11
+ mova m6, m2
+ mova m7, m3
+ mova m8, m2
+ mova m9, m3
+ vpermb m6{k1}, m10, m14
+ vpermb m7{k2}, m10, m15
+ vpermb m8{k3}, m11, m14
+ vpermb m9{k4}, m11, m15
+ jmp .sec_main
+ALIGN function_align
+.sec:
+ vpaddd m8, m16, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+ vpaddd m9, m16, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+ vpermb m6, m8, m14 ; pNt k0s0 k0s1 k1s0 k1s1
+ vpermb m7, m8, m15 ; pNb
+ vpermb m8, m9, m14 ; pNt k0s2 k0s3 k1s2 k1s3
+ vpermb m9, m9, m15 ; pNb
+.sec_main:
+ vpbroadcastb m18, t1d
+ lzcnt t1d, t1d
+ vpcmpub k1, m2, m6, 6
+ vpcmpub k2, m3, m7, 6
+ vpcmpub k3, m2, m8, 6
+ vpcmpub k4, m3, m9, 6
+ vpbroadcastq m17, [r3+t1*8]
+ psubb m10, m6, m2
+ psubb m11, m7, m3
+ psubb m12, m8, m2
+ psubb m13, m9, m3
+ vpsubb m10{k1}, m2, m6 ; abs(dt0)
+ vpsubb m11{k2}, m3, m7 ; abs(db0)
+ vpsubb m12{k3}, m2, m8 ; abs(dt1)
+ vpsubb m13{k4}, m3, m9 ; abs(db1)
+ vpbroadcastd m19, [base+sec_tap]
+ gf2p8affineqb m14, m10, m17, 0 ; abs(dt0) >> shift
+ gf2p8affineqb m15, m11, m17, 0 ; abs(db0) >> shift
+ gf2p8affineqb m16, m12, m17, 0 ; abs(dt1) >> shift
+ gf2p8affineqb m17, m13, m17, 0 ; abs(db1) >> shift
+ psubusb m14, m18, m14 ; imax(0, sec_strength - (abs(dt0) >> shift))
+ psubusb m15, m18, m15 ; imax(0, sec_strength - (abs(db0) >> shift))
+ psubusb m16, m18, m16 ; imax(0, sec_strength - (abs(dt1) >> shift))
+ psubusb m17, m18, m17 ; imax(0, sec_strength - (abs(db1) >> shift))
+ pminub m10, m14
+ pminub m11, m15
+ pminub m12, m16
+ pminub m13, m17
+ mova m14, m19
+ mova m15, m19
+ mova m16, m19
+ vpsubb m14{k1}, m20, m19 ; apply_sign(sec_tap_top_0)
+ vpsubb m15{k2}, m20, m19 ; apply_sign(sec_tap_bottom_0)
+ vpsubb m16{k3}, m20, m19 ; apply_sign(sec_tap_top_1)
+ vpsubb m19{k4}, m20, m19 ; apply_sign(sec_tap_bottom_1)
+ vpdpbusd m0, m10, m14
+ vpdpbusd m1, m11, m15
+ vpdpbusd m0, m12, m16
+ vpdpbusd m1, m13, m19
+ ret
+
+; lut tl lut tr
+; t0 t1 t2 t3 t4 t5 t6 t7 t6 t7 t8 t9 ta tb tc td
+; T0 T1 T2 T3 T4 T5 T6 T7 T6 T7 T8 T9 TA TB TC TD
+; L0 L1 00 01 02 03 04 05 04 05 06 07 08 09 0a 0b
+; L2 L3 10 11 12 13 14 15 14 15 16 17 18 19 1a 1b
+; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b
+; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b
+; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b
+; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b
+; lut bl lut br
+; L4 L5 20 21 22 23 24 25 24 25 26 27 28 29 2a 2b
+; L6 L7 30 31 32 33 34 35 34 35 36 37 38 39 3a 3b
+; L8 L9 40 41 42 43 44 45 44 45 46 47 48 49 4a 4b
+; La Lb 50 51 52 53 54 55 54 55 56 57 58 59 5a 5b
+; Lc Ld 60 61 62 63 64 65 64 65 66 67 68 69 6a 6b
+; Le Lf 70 71 72 73 74 75 74 75 76 77 78 79 7a 7b
+; 8e 8f 80 81 82 83 84 85 84 85 86 87 88 89 8a 8b
+; 9e 9f 90 91 92 93 94 95 94 95 96 97 98 99 9a 9b
+
+cglobal cdef_filter_8x8, 4, 11, 32, 4*64, dst, stride, left, top, \
+ pri, sec, dir, damping, edge
+%define base r8-edge_mask
+ mov r6d, edgem
+ lea r10, [dstq+strideq*4-2]
+ movu xmm0, [topq+strideq*0-2]
+ movu xmm1, [dstq+strideq*2-2]
+ movu xmm2, [r10 +strideq*2 ]
+ lea r8, [edge_mask]
+ lea r9, [strideq*3]
+ pmovzxwq m10, [leftq-4]
+ vinserti32x4 ym0, ymm0, [topq+strideq*1-2], 1
+ vinserti32x4 ym1, ymm1, [dstq+r9 -2], 1
+ vinserti32x4 ym2, ymm2, [r10 +r9 ], 1
+ lea r7, [r10 +strideq*4 ]
+ pmovzxwq m11, [leftq+4]
+ vinserti32x4 m0, [dstq+strideq*0-2], 2
+ vinserti32x4 m1, [r10 +strideq*0 ], 2
+ mova m12, [base+lut_perm_8x8a]
+ movu m13, [base+lut_perm_8x8b]
+ vinserti32x4 m0, [dstq+strideq*1-2], 3
+ vinserti32x4 m1, [r10 +strideq*1 ], 3
+ test r6b, 0x08 ; avoid buffer overread
+ jz .main
+ vinserti32x4 m2, [r7 +strideq*0], 2
+ vinserti32x4 m2, [r7 +strideq*1], 3
+.main:
+ mov t1d, 0x11111100
+ mova m14, m12
+ mova m15, m13
+ kmovd k1, t1d
+ kshiftrd k2, k1, 8
+ movifnidn prid, prim
+ mov t0d, dirm
+ mova m30, [base+px_idx]
+ mov r3d, dampingm
+ vpermi2b m12, m0, m1 ; lut tl
+ vpermi2b m14, m1, m2 ; lut bl
+ vpermi2b m13, m0, m1 ; lut tr
+ vpermi2b m15, m1, m2 ; lut br
+ vpblendmw m12{k1}, m12, m10
+ vpblendmw m14{k2}, m14, m11
+ vpbroadcastd m0, [base+pd_268435568] ; (1 << 28) + (7 << 4)
+ pxor m31, m31
+ lea r3, [r8+r3*8] ; gf_shr + (damping - 30) * 8
+ vpermb m4, m30, m12 ; pxtl
+ vpermb m5, m30, m13 ; pxtr
+ vpermb m6, m30, m14 ; pxbl
+ vpermb m7, m30, m15 ; pxbr
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ cmp r6b, 0x0f
+ jne .mask_edges ; mask edges only if required
+ test prid, prid
+ jz .sec_only
+ vpaddd m11, m30, [base+cdef_dirs+(t0+2)*4] {1to16} ; dir
+ vpermb m8, m11, m12 ; pNtl k0p0 k0p1 k1p0 k1p1
+ vpermb m9, m11, m13 ; pNtr
+ vpermb m10, m11, m14 ; pNbl
+ vpermb m11, m11, m15 ; pNbr
+%macro CDEF_FILTER_8x8_PRI 0
+ vpcmpub k1, m4, m8, 6 ; pxtl > pNtl
+ vpcmpub k2, m5, m9, 6 ; pxtr > pNtr
+ vpcmpub k3, m6, m10, 6 ; pxbl > pNbl
+ vpcmpub k4, m7, m11, 6 ; pxbr > pNbr
+ psubb m16, m8, m4
+ psubb m17, m9, m5
+ psubb m18, m10, m6
+ psubb m19, m11, m7
+ lzcnt r6d, prid
+ vpsubb m16{k1}, m4, m8 ; abs(diff_tl)
+ vpsubb m17{k2}, m5, m9 ; abs(diff_tr)
+ vpsubb m18{k3}, m6, m10 ; abs(diff_bl)
+ vpsubb m19{k4}, m7, m11 ; abs(diff_br)
+ vpbroadcastq m28, [r3+r6*8]
+ vpbroadcastb m29, prid
+ and prid, 1
+ vpbroadcastd m27, [base+pri_tap+priq*4]
+ vgf2p8affineqb m20, m16, m28, 0 ; abs(dtl) >> shift
+ vgf2p8affineqb m21, m17, m28, 0 ; abs(dtr) >> shift
+ vgf2p8affineqb m22, m18, m28, 0 ; abs(dbl) >> shift
+ vgf2p8affineqb m23, m19, m28, 0 ; abs(dbr) >> shift
+ mova m24, m27
+ mova m25, m27
+ mova m26, m27
+ movifnidn t1d, secm
+ vpsubb m24{k1}, m31, m27 ; apply_sign(pri_tap_tl)
+ vpsubb m25{k2}, m31, m27 ; apply_sign(pri_tap_tr)
+ vpsubb m26{k3}, m31, m27 ; apply_sign(pri_tap_bl)
+ vpsubb m27{k4}, m31, m27 ; apply_sign(pri_tap_br)
+ psubusb m20, m29, m20 ; imax(0, pri_strength - (abs(dtl) >> shift))
+ psubusb m21, m29, m21 ; imax(0, pri_strength - (abs(dtr) >> shift))
+ psubusb m22, m29, m22 ; imax(0, pri_strength - (abs(dbl) >> shift))
+ psubusb m23, m29, m23 ; imax(0, pri_strength - (abs(dbr) >> shift))
+ pminub m16, m20
+ pminub m17, m21
+ pminub m18, m22
+ pminub m19, m23
+ vpdpbusd m0, m16, m24 ; sum tl
+ vpdpbusd m1, m17, m25 ; sum tr
+ vpdpbusd m2, m18, m26 ; sum bl
+ vpdpbusd m3, m19, m27 ; sum br
+%endmacro
+ CDEF_FILTER_8x8_PRI
+ test t1d, t1d ; sec
+ jz .end_no_clip
+ call .sec
+.end_clip:
+ pminub m20, m8, m4
+ pminub m24, m12, m16
+ pminub m21, m9, m5
+ pminub m25, m13, m17
+ pminub m22, m10, m6
+ pminub m26, m14, m18
+ pminub m23, m11, m7
+ pminub m27, m15, m19
+ pmaxub m8, m4
+ pmaxub m12, m16
+ pmaxub m9, m5
+ pmaxub m13, m17
+ pmaxub m10, m6
+ pmaxub m14, m18
+ pmaxub m11, m7
+ pmaxub m15, m19
+ pminub m20, m24
+ pminub m21, m25
+ pminub m22, m26
+ pminub m23, m27
+ pmaxub m8, m12
+ pmaxub m9, m13
+ pmaxub m10, m14
+ pmaxub m11, m15
+ mov r2d, 0xAAAAAAAA
+ kmovd k1, r2d
+ vpshrdd m24, m0, m1, 16
+ vpshrdd m25, m2, m3, 16
+ vpshrdd m12, m20, m21, 16
+ vpshrdd m14, m22, m23, 16
+ vpshrdd m16, m8, m9, 16
+ vpshrdd m18, m10, m11, 16
+ vpblendmw m13{k1}, m20, m21
+ vpblendmw m15{k1}, m22, m23
+ vpblendmw m17{k1}, m8, m9
+ vpblendmw m19{k1}, m10, m11
+ vpblendmw m20{k1}, m0, m24
+ vpblendmw m21{k1}, m24, m1
+ vpblendmw m22{k1}, m2, m25
+ vpblendmw m23{k1}, m25, m3
+ vpshrdd m4, m5, 16
+ vpshrdd m6, m7, 16
+ pminub m12, m13
+ pminub m14, m15
+ pmaxub m16, m17
+ pmaxub m18, m19
+ mova m8, [base+end_perm_w8clip]
+ vpcmpw k2, m20, m31, 1
+ vpcmpw k3, m22, m31, 1
+ vpshldw m4, m21, 8
+ vpshldw m6, m23, 8
+ kunpckdq k1, k1, k1
+ kxnorb k4, k4, k4
+ vpshrdw m11, m12, m14, 8
+ vpshrdw m15, m16, m18, 8
+ vpblendmb m13{k1}, m12, m14
+ vpblendmb m17{k1}, m16, m18
+ psubw m21, m31, m20
+ psubw m23, m31, m22
+ paddusw m0, m20, m4 ; clip >0xff
+ paddusw m1, m22, m6
+ pminub m11, m13
+ pmaxub m15, m17
+ psubusw m0{k2}, m4, m21 ; clip <0x00
+ psubusw m1{k3}, m6, m23
+ psrlw m0, 8
+ vmovdqu8 m0{k1}, m1
+ pmaxub m0, m11
+ pminub m0, m15
+ vpermb m0, m8, m0
+ add r10, 2
+ vextracti32x4 xm1, m0, 1
+ vextracti32x4 xm2, m0, 2
+ vextracti32x4 xm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*2], xm1
+ movq [r10 +strideq*0], xm2
+ movq [r10 +strideq*2], xm3
+ movhps [dstq+strideq*1], xm0
+ movhps [dstq+r9 ], xm1
+ movhps [r10 +strideq*1], xm2
+ movhps [r10 +r9 ], xm3
+ RET
+.sec_only:
+ movifnidn t1d, secm
+ call .sec
+.end_no_clip:
+ mova xm8, [base+end_perm]
+ kxnorb k1, k1, k1
+ vpshldd m4, m0, 8 ; (px << 8) + ((sum > -8) << 4)
+ vpshldd m5, m1, 8
+ vpshldd m6, m2, 8
+ vpshldd m7, m3, 8
+ paddw m0, m4 ; (px << 8) + ((sum + (sum > -8) + 7) << 4)
+ paddw m1, m5
+ paddw m2, m6
+ paddw m3, m7
+ vpermb m0, m8, m0
+ vpermb m1, m8, m1
+ vpermb m2, m8, m2
+ vpermb m3, m8, m3
+ add r10, 2
+ punpckldq m4, m0, m1
+ punpckhdq m0, m1
+ punpckldq m5, m2, m3
+ punpckhdq m2, m3
+ movq [dstq+strideq*0], xm4
+ movq [dstq+strideq*2], xm0
+ movq [r10 +strideq*0], xm5
+ movq [r10 +strideq*2], xm2
+ movhps [dstq+strideq*1], xm4
+ movhps [dstq+r9 ], xm0
+ movhps [r10 +strideq*1], xm5
+ movhps [r10 +r9 ], xm2
+ RET
+.mask_edges_sec_only:
+ movifnidn t1d, secm
+ call .mask_edges_sec
+ jmp .end_no_clip
+ALIGN function_align
+.mask_edges:
+ mov t0d, r6d
+ mov t1d, r6d
+ or t0d, 0xA ; top-left 4x4 has bottom and right
+ or t1d, 0x9 ; top-right 4x4 has bottom and left
+ vpbroadcastq m26, [base+edge_mask+t0*8]
+ vpbroadcastq m27, [base+edge_mask+t1*8]
+ mov t1d, r6d
+ or r6d, 0x6 ; bottom-left 4x4 has top and right
+ or t1d, 0x5 ; bottom-right 4x4 has top and left
+ vpbroadcastq m28, [base+edge_mask+r6*8]
+ vpbroadcastq m29, [base+edge_mask+t1*8]
+ mov t0d, dirm
+ test prid, prid
+ jz .mask_edges_sec_only
+ vpaddd m20, m30, [base+cdef_dirs+(t0+2)*4] {1to16}
+ vpshufbitqmb k1, m26, m20 ; index in-range
+ vpshufbitqmb k2, m27, m20
+ vpshufbitqmb k3, m28, m20
+ vpshufbitqmb k4, m29, m20
+ mova m8, m4
+ mova m9, m5
+ mova m10, m6
+ mova m11, m7
+ vpermb m8{k1}, m20, m12
+ vpermb m9{k2}, m20, m13
+ vpermb m10{k3}, m20, m14
+ vpermb m11{k4}, m20, m15
+ mova [rsp+0x00], m26
+ mova [rsp+0x40], m27
+ mova [rsp+0x80], m28
+ mova [rsp+0xC0], m29
+ CDEF_FILTER_8x8_PRI
+ test t1d, t1d
+ jz .end_no_clip
+ mova m26, [rsp+0x00]
+ mova m27, [rsp+0x40]
+ mova m28, [rsp+0x80]
+ mova m29, [rsp+0xC0]
+ call .mask_edges_sec
+ jmp .end_clip
+.mask_edges_sec:
+ vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16}
+ vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16}
+ vpshufbitqmb k1, m26, m20
+ vpshufbitqmb k2, m27, m20
+ vpshufbitqmb k3, m28, m20
+ vpshufbitqmb k4, m29, m20
+ mova m16, m4
+ mova m17, m5
+ mova m18, m6
+ mova m19, m7
+ vpermb m16{k1}, m20, m12
+ vpermb m17{k2}, m20, m13
+ vpermb m18{k3}, m20, m14
+ vpermb m19{k4}, m20, m15
+ vpshufbitqmb k1, m26, m21
+ vpshufbitqmb k2, m27, m21
+ vpshufbitqmb k3, m28, m21
+ vpshufbitqmb k4, m29, m21
+ vpermb m12, m21, m12
+ vpermb m13, m21, m13
+ vpermb m14, m21, m14
+ vpermb m15, m21, m15
+ vpblendmb m12{k1}, m4, m12
+ vpblendmb m13{k2}, m5, m13
+ vpblendmb m14{k3}, m6, m14
+ vpblendmb m15{k4}, m7, m15
+ jmp .sec_main
+ALIGN function_align
+.sec:
+ vpaddd m20, m30, [base+cdef_dirs+(t0+4)*4] {1to16} ; dir + 2
+ vpaddd m21, m30, [base+cdef_dirs+(t0+0)*4] {1to16} ; dir - 2
+ vpermb m16, m20, m12 ; pNtl k0s0 k0s1 k1s0 k1s1
+ vpermb m17, m20, m13 ; pNtr
+ vpermb m18, m20, m14 ; pNbl
+ vpermb m19, m20, m15 ; pNbr
+ vpermb m12, m21, m12 ; pNtl k0s2 k0s3 k1s2 k1s3
+ vpermb m13, m21, m13 ; pNtr
+ vpermb m14, m21, m14 ; pNbl
+ vpermb m15, m21, m15 ; pNbr
+.sec_main:
+%macro CDEF_FILTER_8x8_SEC 4-5 0 ; load constants
+ vpcmpub k1, m4, %1, 6
+ vpcmpub k2, m5, %2, 6
+ vpcmpub k3, m6, %3, 6
+ vpcmpub k4, m7, %4, 6
+ psubb m20, %1, m4
+ psubb m21, %2, m5
+ psubb m22, %3, m6
+ psubb m23, %4, m7
+%if %5
+ vpbroadcastb m28, t1d
+ lzcnt t1d, t1d
+ vpbroadcastq m29, [r3+t1*8]
+%endif
+ vpsubb m20{k1}, m4, %1
+ vpsubb m21{k2}, m5, %2
+ vpsubb m22{k3}, m6, %3
+ vpsubb m23{k4}, m7, %4
+ gf2p8affineqb m24, m20, m29, 0
+ gf2p8affineqb m25, m21, m29, 0
+ gf2p8affineqb m26, m22, m29, 0
+ gf2p8affineqb m27, m23, m29, 0
+%if %5
+ vpbroadcastd m30, [base+sec_tap]
+%endif
+ psubusb m24, m28, m24
+ psubusb m25, m28, m25
+ psubusb m26, m28, m26
+ psubusb m27, m28, m27
+ pminub m20, m24
+ pminub m21, m25
+ pminub m22, m26
+ pminub m23, m27
+ mova m24, m30
+ mova m25, m30
+ mova m26, m30
+ mova m27, m30
+ vpsubb m24{k1}, m31, m30
+ vpsubb m25{k2}, m31, m30
+ vpsubb m26{k3}, m31, m30
+ vpsubb m27{k4}, m31, m30
+ vpdpbusd m0, m20, m24
+ vpdpbusd m1, m21, m25
+ vpdpbusd m2, m22, m26
+ vpdpbusd m3, m23, m27
+%endmacro
+ CDEF_FILTER_8x8_SEC m16, m17, m18, m19, 1
+ CDEF_FILTER_8x8_SEC m12, m13, m14, m15
+ ret
+
+%endif ; HAVE_AVX512ICL && ARCH_X86_64