ref: 5fe20ec7dd89c88453a61cd47d5d01e49d6cc6c2
parent: 2b567aaa367c63975ad7f4fcbad96051a276a8d1
author: Henrik Gramner <gramner@twoorioles.com>
date: Tue Jun 30 22:49:12 EDT 2020
x86: Split AVX2 and AVX-512 mc asm into separate files
--- a/src/meson.build
+++ b/src/meson.build
@@ -176,13 +176,14 @@
if dav1d_bitdepths.contains('8')
libdav1d_sources_asm += files(
'x86/cdef_avx512.asm',
+ 'x86/mc_avx512.asm',
'x86/cdef_avx2.asm',
+ 'x86/mc_avx2.asm',
'x86/film_grain.asm',
'x86/ipred.asm',
'x86/itx.asm',
'x86/loopfilter.asm',
'x86/looprestoration.asm',
- 'x86/mc.asm',
'x86/cdef_sse.asm',
'x86/film_grain_ssse3.asm',
'x86/ipred_ssse3.asm',
--- a/src/x86/mc.asm
+++ /dev/null
@@ -1,8067 +0,0 @@
-; Copyright © 2018, VideoLAN and dav1d authors
-; Copyright © 2018, Two Orioles, LLC
-; All rights reserved.
-;
-; Redistribution and use in source and binary forms, with or without
-; modification, are permitted provided that the following conditions are met:
-;
-; 1. Redistributions of source code must retain the above copyright notice, this
-; list of conditions and the following disclaimer.
-;
-; 2. Redistributions in binary form must reproduce the above copyright notice,
-; this list of conditions and the following disclaimer in the documentation
-; and/or other materials provided with the distribution.
-;
-; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-%include "ext/x86/x86inc.asm"
-
-%if ARCH_X86_64
-
-SECTION_RODATA 64
-
-; dav1d_obmc_masks[] with 64-x interleaved
-obmc_masks: db 0, 0, 0, 0
- ; 2
- db 45, 19, 64, 0
- ; 4
- db 39, 25, 50, 14, 59, 5, 64, 0
- ; 8
- db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
- ; 16
- db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
- db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
- ; 32
- db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
- db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
- db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
- db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
-
-bidir_sctr_w4: dd 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-wm_420_perm4: db 1, 3, 9, 11, 5, 7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31
- db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63
- db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
- db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
-wm_420_perm8: db 1, 3, 17, 19, 5, 7, 21, 23, 9, 11, 25, 27, 13, 15, 29, 31
- db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63
- db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30
- db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
-wm_420_perm16: db 1, 3, 33, 35, 5, 7, 37, 39, 9, 11, 41, 43, 13, 15, 45, 47
- db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63
- db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46
- db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
-wm_420_mask: db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
- db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127
- db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
- db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
-wm_422_mask: db 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62
- db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
- db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126
- db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
-wm_444_mask: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
- db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
- db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
- db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
-bilin_h_perm16: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
- db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
- db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39
- db 41, 40, 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47
-bilin_h_perm32: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
- db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
- db 17, 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23
- db 25, 24, 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31
-bilin_v_perm8: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
- db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
- db 32, 80, 33, 81, 34, 82, 35, 83, 36, 84, 37, 85, 38, 86, 39, 87
- db 64, 32, 65, 33, 66, 34, 67, 35, 68, 36, 69, 37, 70, 38, 71, 39
-bilin_v_perm16: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
- db 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
- db 64, 16, 65, 17, 66, 18, 67, 19, 68, 20, 69, 21, 70, 22, 71, 23
- db 72, 24, 73, 25, 74, 26, 75, 27, 76, 28, 77, 29, 78, 30, 79, 31
-bilin_v_perm32: db 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7
- db 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15
- db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
- db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31
-bilin_v_perm64: dq 0, 4, 1, 5, 2, 6, 3, 7
-spel_h_perm16a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
- db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
- db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
- db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
-spel_h_perm16b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
- db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
- db 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 41, 39, 40, 41, 42
- db 44, 45, 46, 47, 45, 46, 47, 48, 46, 47, 48, 49, 47, 48, 49, 50
-spel_h_perm16c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
- db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
- db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
- db 48, 49, 50, 51, 49, 50, 51, 52, 50, 51, 52, 53, 51, 52, 53, 54
-spel_h_perm32a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
- db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
- db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
- db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
-spel_h_perm32b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
- db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
- db 20, 21, 22, 23, 21, 22, 23, 24, 22, 23, 24, 25, 23, 24, 25, 26
- db 28, 29, 30, 31, 29, 30, 31, 32, 30, 31, 32, 33, 31, 32, 33, 34
-spel_h_perm32c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
- db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
- db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
- db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
-spel_hv_perm4a: db 8, 9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23
- db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31
-spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39
- db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47
- db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55
- db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63
-
-warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
- db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
-warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
- db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
-subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
- db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
-subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
-subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
-subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
-bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
-bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
-deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
-blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
-wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
-pb_8x0_8x8: times 8 db 0
- times 8 db 8
-bdct_lb_dw: times 4 db 0
- times 4 db 4
- times 4 db 8
- times 4 db 12
-
-ALIGN 32
-rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7
-resize_shuf: times 5 db 0
- db 1, 2, 3, 4, 5, 6
- times 5+8 db 7
-
-ALIGN 8
-wm_420_perm64: dq 0xfedcba9876543210
-wm_420_sign: dd 0x01020102, 0x01010101
-wm_422_sign: dd 0x80808080, 0x7f7f7f7f
-wm_sign_avx512: dd 0x40804080, 0xc0c0c0c0, 0x40404040
-
-ALIGN 4
-pb_0123: db 0, 1, 2, 3
-pb_4567: db 4, 5, 6, 7
-pw_m128: times 2 dw -128
-pw_m256: times 2 dw -256
-pw_32: times 2 dw 32
-pw_34: times 2 dw 34
-pw_258: times 2 dw 258
-pw_512: times 2 dw 512
-pw_1024: times 2 dw 1024
-pw_2048: times 2 dw 2048
-pw_6903: times 2 dw 6903
-pw_8192: times 2 dw 8192
-pd_2: dd 2
-pd_32: dd 32
-pd_63: dd 63
-pd_512: dd 512
-pd_32768: dd 32768
-pd_0x3ff: dd 0x3ff
-pd_0x4000: dd 0x4000
-pq_0x40000000: dq 0x40000000
-
-%define pb_m64 (wm_sign_avx512+4)
-%define pb_64 (wm_sign_avx512+8)
-%define pb_127 (wm_422_sign +4)
-
-cextern mc_subpel_filters
-cextern mc_warp_filter
-%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
-
-%macro BASE_JMP_TABLE 3-*
- %xdefine %1_%2_table (%%table - %3)
- %xdefine %%base %1_%2
- %%table:
- %rep %0 - 2
- dw %%base %+ _w%3 - %%base
- %rotate 1
- %endrep
-%endmacro
-
-%macro HV_JMP_TABLE 5-*
- %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
- %xdefine %%base %1_%3
- %assign %%types %4
- %if %%types & 1
- %xdefine %1_%2_h_%3_table (%%h - %5)
- %%h:
- %rep %0 - 4
- dw %%prefix %+ .h_w%5 - %%base
- %rotate 1
- %endrep
- %rotate 4
- %endif
- %if %%types & 2
- %xdefine %1_%2_v_%3_table (%%v - %5)
- %%v:
- %rep %0 - 4
- dw %%prefix %+ .v_w%5 - %%base
- %rotate 1
- %endrep
- %rotate 4
- %endif
- %if %%types & 4
- %xdefine %1_%2_hv_%3_table (%%hv - %5)
- %%hv:
- %rep %0 - 4
- dw %%prefix %+ .hv_w%5 - %%base
- %rotate 1
- %endrep
- %endif
-%endmacro
-
-%macro BIDIR_JMP_TABLE 1-*
- %xdefine %1_table (%%table - 2*%2)
- %xdefine %%base %1_table
- %xdefine %%prefix mangle(private_prefix %+ _%1)
- %%table:
- %rep %0 - 1
- dd %%prefix %+ .w%2 - %%base
- %rotate 1
- %endrep
-%endmacro
-
-%macro SCALED_JMP_TABLE 1-*
- %xdefine %1_table (%%table - %2)
- %xdefine %%base mangle(private_prefix %+ _%1)
-%%table:
- %rep %0 - 1
- dw %%base %+ .w%2 - %%base
- %rotate 1
- %endrep
- %rotate 1
-%%dy_1024:
- %xdefine %1_dy1_table (%%dy_1024 - %2)
- %rep %0 - 1
- dw %%base %+ .dy1_w%2 - %%base
- %rotate 1
- %endrep
- %rotate 1
-%%dy_2048:
- %xdefine %1_dy2_table (%%dy_2048 - %2)
- %rep %0 - 1
- dw %%base %+ .dy2_w%2 - %%base
- %rotate 1
- %endrep
-%endmacro
-
-%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put)
-%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep)
-%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_avx512icl.prep)
-
-%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
-
-BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
-BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
-SCALED_JMP_TABLE put_8tap_scaled_avx2, 2, 4, 8, 16, 32, 64, 128
-SCALED_JMP_TABLE prep_8tap_scaled_avx2, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32
-BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32
-BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32
-
-%if HAVE_AVX512ICL
-BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
-HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE avg_avx512icl, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE w_avg_avx512icl, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE mask_avx512icl, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE w_mask_420_avx512icl, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE w_mask_422_avx512icl, 4, 8, 16, 32, 64, 128
-BIDIR_JMP_TABLE w_mask_444_avx512icl, 4, 8, 16, 32, 64, 128
-%endif ; HAVE_AVX512ICL
-
-SECTION .text
-
-INIT_XMM avx2
-DECLARE_REG_TMP 4, 6, 7
-cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
- movifnidn mxyd, r6m ; mx
- lea t2, [put_avx2]
- tzcnt wd, wm
- movifnidn hd, hm
- test mxyd, mxyd
- jnz .h
- mov mxyd, r7m ; my
- test mxyd, mxyd
- jnz .v
-.put:
- movzx wd, word [t2+wq*2+table_offset(put,)]
- add wq, t2
- jmp wq
-.put_w2:
- movzx t0d, word [srcq+ssq*0]
- movzx t1d, word [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- mov [dstq+dsq*0], t0w
- mov [dstq+dsq*1], t1w
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .put_w2
- RET
-.put_w4:
- mov t0d, [srcq+ssq*0]
- mov t1d, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- mov [dstq+dsq*0], t0d
- mov [dstq+dsq*1], t1d
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .put_w4
- RET
-.put_w8:
- mov t0, [srcq+ssq*0]
- mov t1, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- mov [dstq+dsq*0], t0
- mov [dstq+dsq*1], t1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .put_w8
- RET
-.put_w16:
- movu m0, [srcq+ssq*0]
- movu m1, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- mova [dstq+dsq*0], m0
- mova [dstq+dsq*1], m1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .put_w16
- RET
-INIT_YMM avx2
-.put_w32:
- movu m0, [srcq+ssq*0]
- movu m1, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- mova [dstq+dsq*0], m0
- mova [dstq+dsq*1], m1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .put_w32
- RET
-.put_w64:
- movu m0, [srcq+ssq*0+32*0]
- movu m1, [srcq+ssq*0+32*1]
- movu m2, [srcq+ssq*1+32*0]
- movu m3, [srcq+ssq*1+32*1]
- lea srcq, [srcq+ssq*2]
- mova [dstq+dsq*0+32*0], m0
- mova [dstq+dsq*0+32*1], m1
- mova [dstq+dsq*1+32*0], m2
- mova [dstq+dsq*1+32*1], m3
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .put_w64
- RET
-.put_w128:
- movu m0, [srcq+32*0]
- movu m1, [srcq+32*1]
- movu m2, [srcq+32*2]
- movu m3, [srcq+32*3]
- add srcq, ssq
- mova [dstq+32*0], m0
- mova [dstq+32*1], m1
- mova [dstq+32*2], m2
- mova [dstq+32*3], m3
- add dstq, dsq
- dec hd
- jg .put_w128
- RET
-.h:
- ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
- ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
- imul mxyd, 0xff01
- vbroadcasti128 m4, [bilin_h_shuf8]
- add mxyd, 16 << 8
- movd xm5, mxyd
- mov mxyd, r7m ; my
- vpbroadcastw m5, xm5
- test mxyd, mxyd
- jnz .hv
- movzx wd, word [t2+wq*2+table_offset(put, _bilin_h)]
- vpbroadcastd m3, [pw_2048]
- add wq, t2
- jmp wq
-.h_w2:
- movd xm0, [srcq+ssq*0]
- pinsrd xm0, [srcq+ssq*1], 1
- lea srcq, [srcq+ssq*2]
- pshufb xm0, xm4
- pmaddubsw xm0, xm5
- pmulhrsw xm0, xm3
- packuswb xm0, xm0
- pextrw [dstq+dsq*0], xm0, 0
- pextrw [dstq+dsq*1], xm0, 2
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .h_w2
- RET
-.h_w4:
- mova xm4, [bilin_h_shuf4]
-.h_w4_loop:
- movq xm0, [srcq+ssq*0]
- movhps xm0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- pshufb xm0, xm4
- pmaddubsw xm0, xm5
- pmulhrsw xm0, xm3
- packuswb xm0, xm0
- movd [dstq+dsq*0], xm0
- pextrd [dstq+dsq*1], xm0, 1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .h_w4_loop
- RET
-.h_w8:
- movu xm0, [srcq+ssq*0]
- movu xm1, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- pshufb xm0, xm4
- pshufb xm1, xm4
- pmaddubsw xm0, xm5
- pmaddubsw xm1, xm5
- pmulhrsw xm0, xm3
- pmulhrsw xm1, xm3
- packuswb xm0, xm1
- movq [dstq+dsq*0], xm0
- movhps [dstq+dsq*1], xm0
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .h_w8
- RET
-.h_w16:
- movu xm0, [srcq+ssq*0+8*0]
- vinserti128 m0, m0, [srcq+ssq*1+8*0], 1
- movu xm1, [srcq+ssq*0+8*1]
- vinserti128 m1, m1, [srcq+ssq*1+8*1], 1
- lea srcq, [srcq+ssq*2]
- pshufb m0, m4
- pshufb m1, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmulhrsw m0, m3
- pmulhrsw m1, m3
- packuswb m0, m1
- mova [dstq+dsq*0], xm0
- vextracti128 [dstq+dsq*1], m0, 1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .h_w16
- RET
-.h_w32:
- movu m0, [srcq+8*0]
- movu m1, [srcq+8*1]
- add srcq, ssq
- pshufb m0, m4
- pshufb m1, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmulhrsw m0, m3
- pmulhrsw m1, m3
- packuswb m0, m1
- mova [dstq], m0
- add dstq, dsq
- dec hd
- jg .h_w32
- RET
-.h_w64:
- movu m0, [srcq+8*0]
- movu m1, [srcq+8*1]
- pshufb m0, m4
- pshufb m1, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmulhrsw m0, m3
- pmulhrsw m1, m3
- packuswb m0, m1
- movu m1, [srcq+8*4]
- movu m2, [srcq+8*5]
- add srcq, ssq
- pshufb m1, m4
- pshufb m2, m4
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- pmulhrsw m1, m3
- pmulhrsw m2, m3
- packuswb m1, m2
- mova [dstq+32*0], m0
- mova [dstq+32*1], m1
- add dstq, dsq
- dec hd
- jg .h_w64
- RET
-.h_w128:
- mov t1, -32*3
-.h_w128_loop:
- movu m0, [srcq+t1+32*3+8*0]
- movu m1, [srcq+t1+32*3+8*1]
- pshufb m0, m4
- pshufb m1, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmulhrsw m0, m3
- pmulhrsw m1, m3
- packuswb m0, m1
- mova [dstq+t1+32*3], m0
- add t1, 32
- jle .h_w128_loop
- add srcq, ssq
- add dstq, dsq
- dec hd
- jg .h_w128
- RET
-.v:
- movzx wd, word [t2+wq*2+table_offset(put, _bilin_v)]
- imul mxyd, 0xff01
- vpbroadcastd m5, [pw_2048]
- add mxyd, 16 << 8
- add wq, t2
- movd xm4, mxyd
- vpbroadcastw m4, xm4
- jmp wq
-.v_w2:
- movd xm0, [srcq+ssq*0]
-.v_w2_loop:
- pinsrw xm1, xm0, [srcq+ssq*1], 1 ; 0 1
- lea srcq, [srcq+ssq*2]
- pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1
- pshuflw xm1, xm1, q2301 ; 1 0
- punpcklbw xm1, xm0, xm1
- pmaddubsw xm1, xm4
- pmulhrsw xm1, xm5
- packuswb xm1, xm1
- pextrw [dstq+dsq*0], xm1, 1
- pextrw [dstq+dsq*1], xm1, 0
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .v_w2_loop
- RET
-.v_w4:
- movd xm0, [srcq+ssq*0]
-.v_w4_loop:
- vpbroadcastd xm1, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vpblendd xm2, xm1, xm0, 0x01 ; 0 1
- vpbroadcastd xm0, [srcq+ssq*0]
- vpblendd xm1, xm1, xm0, 0x02 ; 1 2
- punpcklbw xm1, xm2
- pmaddubsw xm1, xm4
- pmulhrsw xm1, xm5
- packuswb xm1, xm1
- movd [dstq+dsq*0], xm1
- pextrd [dstq+dsq*1], xm1, 1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .v_w4_loop
- RET
-.v_w8:
- movq xm0, [srcq+ssq*0]
-.v_w8_loop:
- movq xm3, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- punpcklbw xm1, xm3, xm0
- movq xm0, [srcq+ssq*0]
- punpcklbw xm2, xm0, xm3
- pmaddubsw xm1, xm4
- pmaddubsw xm2, xm4
- pmulhrsw xm1, xm5
- pmulhrsw xm2, xm5
- packuswb xm1, xm2
- movq [dstq+dsq*0], xm1
- movhps [dstq+dsq*1], xm1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .v_w8_loop
- RET
-.v_w16:
- movu xm0, [srcq+ssq*0]
-.v_w16_loop:
- vbroadcasti128 m2, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vpblendd m3, m2, m0, 0x0f ; 0 1
- vbroadcasti128 m0, [srcq+ssq*0]
- vpblendd m2, m2, m0, 0xf0 ; 1 2
- punpcklbw m1, m2, m3
- punpckhbw m2, m3
- pmaddubsw m1, m4
- pmaddubsw m2, m4
- pmulhrsw m1, m5
- pmulhrsw m2, m5
- packuswb m1, m2
- mova [dstq+dsq*0], xm1
- vextracti128 [dstq+dsq*1], m1, 1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .v_w16_loop
- RET
-.v_w32:
-%macro PUT_BILIN_V_W32 0
- movu m0, [srcq+ssq*0]
-%%loop:
- movu m3, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- punpcklbw m1, m3, m0
- punpckhbw m2, m3, m0
- movu m0, [srcq+ssq*0]
- pmaddubsw m1, m4
- pmaddubsw m2, m4
- pmulhrsw m1, m5
- pmulhrsw m2, m5
- packuswb m1, m2
- mova [dstq+dsq*0], m1
- punpcklbw m1, m0, m3
- punpckhbw m2, m0, m3
- pmaddubsw m1, m4
- pmaddubsw m2, m4
- pmulhrsw m1, m5
- pmulhrsw m2, m5
- packuswb m1, m2
- mova [dstq+dsq*1], m1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg %%loop
-%endmacro
- PUT_BILIN_V_W32
- RET
-.v_w64:
- movu m0, [srcq+32*0]
- movu m1, [srcq+32*1]
-.v_w64_loop:
- add srcq, ssq
- movu m3, [srcq+32*0]
- punpcklbw m2, m3, m0
- punpckhbw m0, m3, m0
- pmaddubsw m2, m4
- pmaddubsw m0, m4
- pmulhrsw m2, m5
- pmulhrsw m0, m5
- packuswb m2, m0
- mova m0, m3
- movu m3, [srcq+32*1]
- mova [dstq+32*0], m2
- punpcklbw m2, m3, m1
- punpckhbw m1, m3, m1
- pmaddubsw m2, m4
- pmaddubsw m1, m4
- pmulhrsw m2, m5
- pmulhrsw m1, m5
- packuswb m2, m1
- mova m1, m3
- mova [dstq+32*1], m2
- add dstq, dsq
- dec hd
- jg .v_w64_loop
- RET
-.v_w128:
- mov t0, dstq
- mov t1, srcq
- lea t2d, [hq+(3<<8)]
-.v_w128_loop:
- PUT_BILIN_V_W32
- movzx hd, t2b
- add t0, 32
- add t1, 32
- mov dstq, t0
- mov srcq, t1
- sub t2d, 1<<8
- jg .v_w128_loop
- RET
-.hv:
- ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
- ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
- movzx wd, word [t2+wq*2+table_offset(put, _bilin_hv)]
- WIN64_SPILL_XMM 8
- shl mxyd, 11 ; can't shift by 12 due to signed overflow
- vpbroadcastd m7, [pw_2048]
- movd xm6, mxyd
- add wq, t2
- vpbroadcastw m6, xm6
- jmp wq
-.hv_w2:
- vpbroadcastd xm0, [srcq+ssq*0]
- pshufb xm0, xm4
- pmaddubsw xm0, xm5
-.hv_w2_loop:
- movd xm1, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- pinsrd xm1, [srcq+ssq*0], 1
- pshufb xm1, xm4
- pmaddubsw xm1, xm5 ; 1 _ 2 _
- shufps xm2, xm0, xm1, q1032 ; 0 _ 1 _
- mova xm0, xm1
- psubw xm1, xm2
- paddw xm1, xm1
- pmulhw xm1, xm6
- paddw xm1, xm2
- pmulhrsw xm1, xm7
- packuswb xm1, xm1
- pextrw [dstq+dsq*0], xm1, 0
- pextrw [dstq+dsq*1], xm1, 2
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .hv_w2_loop
- RET
-.hv_w4:
- mova xm4, [bilin_h_shuf4]
- movddup xm0, [srcq+ssq*0]
- pshufb xm0, xm4
- pmaddubsw xm0, xm5
-.hv_w4_loop:
- movq xm1, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- movhps xm1, [srcq+ssq*0]
- pshufb xm1, xm4
- pmaddubsw xm1, xm5 ; 1 2
- shufps xm2, xm0, xm1, q1032 ; 0 1
- mova xm0, xm1
- psubw xm1, xm2
- paddw xm1, xm1
- pmulhw xm1, xm6
- paddw xm1, xm2
- pmulhrsw xm1, xm7
- packuswb xm1, xm1
- movd [dstq+dsq*0], xm1
- pextrd [dstq+dsq*1], xm1, 1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .hv_w4_loop
- RET
-.hv_w8:
- vbroadcasti128 m0, [srcq+ssq*0]
- pshufb m0, m4
- pmaddubsw m0, m5
-.hv_w8_loop:
- movu xm1, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vinserti128 m1, m1, [srcq+ssq*0], 1
- pshufb m1, m4
- pmaddubsw m1, m5 ; 1 2
- vperm2i128 m2, m0, m1, 0x21 ; 0 1
- mova m0, m1
- psubw m1, m2
- paddw m1, m1
- pmulhw m1, m6
- paddw m1, m2
- pmulhrsw m1, m7
- vextracti128 xm2, m1, 1
- packuswb xm1, xm2
- movq [dstq+dsq*0], xm1
- movhps [dstq+dsq*1], xm1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .hv_w8_loop
- RET
-.hv_w16:
- movu m0, [srcq+ssq*0+8*0]
- vinserti128 m0, m0, [srcq+ssq*0+8*1], 1
- pshufb m0, m4
- pmaddubsw m0, m5
-.hv_w16_loop:
- movu xm2, [srcq+ssq*1+8*0]
- vinserti128 m2, m2, [srcq+ssq*1+8*1], 1
- lea srcq, [srcq+ssq*2]
- movu xm3, [srcq+ssq*0+8*0]
- vinserti128 m3, m3, [srcq+ssq*0+8*1], 1
- pshufb m2, m4
- pshufb m3, m4
- pmaddubsw m2, m5
- psubw m1, m2, m0
- paddw m1, m1
- pmulhw m1, m6
- paddw m1, m0
- pmaddubsw m0, m3, m5
- psubw m3, m0, m2
- paddw m3, m3
- pmulhw m3, m6
- paddw m3, m2
- pmulhrsw m1, m7
- pmulhrsw m3, m7
- packuswb m1, m3
- vpermq m1, m1, q3120
- mova [dstq+dsq*0], xm1
- vextracti128 [dstq+dsq*1], m1, 1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .hv_w16_loop
- RET
-.hv_w32:
- xor t2d, t2d
-.hv_w32gt:
- mov t0, dstq
- mov t1, srcq
-%if WIN64
- movaps r4m, xmm8
-%endif
-.hv_w32_loop0:
- movu m0, [srcq+8*0]
- vinserti128 m0, m0, [srcq+8*2], 1
- movu m1, [srcq+8*1]
- vinserti128 m1, m1, [srcq+8*3], 1
- pshufb m0, m4
- pshufb m1, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
-.hv_w32_loop:
- add srcq, ssq
- movu xm2, [srcq+8*1]
- vinserti128 m2, m2, [srcq+8*3], 1
- pshufb m2, m4
- pmaddubsw m2, m5
- psubw m3, m2, m1
- paddw m3, m3
- pmulhw m3, m6
- paddw m3, m1
- mova m1, m2
- pmulhrsw m8, m3, m7
- movu xm2, [srcq+8*0]
- vinserti128 m2, m2, [srcq+8*2], 1
- pshufb m2, m4
- pmaddubsw m2, m5
- psubw m3, m2, m0
- paddw m3, m3
- pmulhw m3, m6
- paddw m3, m0
- mova m0, m2
- pmulhrsw m3, m7
- packuswb m3, m8
- mova [dstq], m3
- add dstq, dsq
- dec hd
- jg .hv_w32_loop
- movzx hd, t2b
- add t0, 32
- add t1, 32
- mov dstq, t0
- mov srcq, t1
- sub t2d, 1<<8
- jg .hv_w32_loop0
-%if WIN64
- movaps xmm8, r4m
-%endif
- RET
-.hv_w64:
- lea t2d, [hq+(1<<8)]
- jmp .hv_w32gt
-.hv_w128:
- lea t2d, [hq+(3<<8)]
- jmp .hv_w32gt
-
-%macro PREP_BILIN 0
-DECLARE_REG_TMP 3, 5, 6
-cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
- movifnidn mxyd, r5m ; mx
- lea t2, [prep%+SUFFIX]
- tzcnt wd, wm
- movifnidn hd, hm
- test mxyd, mxyd
- jnz .h
- mov mxyd, r6m ; my
- test mxyd, mxyd
- jnz .v
-.prep:
- movzx wd, word [t2+wq*2+table_offset(prep,)]
- add wq, t2
- lea stride3q, [strideq*3]
- jmp wq
-.prep_w4:
- movd xm0, [srcq+strideq*0]
- pinsrd xm0, [srcq+strideq*1], 1
- pinsrd xm0, [srcq+strideq*2], 2
- pinsrd xm0, [srcq+stride3q ], 3
- lea srcq, [srcq+strideq*4]
- pmovzxbw ym0, xm0
- psllw ym0, 4
- mova [tmpq], ym0
- add tmpq, 32
- sub hd, 4
- jg .prep_w4
- RET
-.prep_w8:
- movq xm0, [srcq+strideq*0]
-%if cpuflag(avx512)
- movq xm1, [srcq+strideq*1]
- vinserti128 ym0, [srcq+strideq*2], 1
- vinserti128 ym1, [srcq+stride3q ], 1
- lea srcq, [srcq+strideq*4]
- punpcklqdq ym0, ym1
- pmovzxbw m0, ym0
- psllw m0, 4
- mova [tmpq], m0
-%else
- movhps xm0, [srcq+strideq*1]
- movq xm1, [srcq+strideq*2]
- movhps xm1, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- pmovzxbw m0, xm0
- pmovzxbw m1, xm1
- psllw m0, 4
- psllw m1, 4
- mova [tmpq+32*0], m0
- mova [tmpq+32*1], m1
-%endif
- add tmpq, 32*2
- sub hd, 4
- jg .prep_w8
- RET
-.prep_w16:
-%if cpuflag(avx512)
- movu xm0, [srcq+strideq*0]
- vinserti128 ym0, [srcq+strideq*1], 1
- movu xm1, [srcq+strideq*2]
- vinserti128 ym1, [srcq+stride3q ], 1
- pmovzxbw m0, ym0
- pmovzxbw m1, ym1
-%else
- pmovzxbw m0, [srcq+strideq*0]
- pmovzxbw m1, [srcq+strideq*1]
- pmovzxbw m2, [srcq+strideq*2]
- pmovzxbw m3, [srcq+stride3q ]
-%endif
- lea srcq, [srcq+strideq*4]
- psllw m0, 4
- psllw m1, 4
-%if notcpuflag(avx512)
- psllw m2, 4
- psllw m3, 4
-%endif
- mova [tmpq+mmsize*0], m0
- mova [tmpq+mmsize*1], m1
-%if notcpuflag(avx512)
- mova [tmpq+32*2], m2
- mova [tmpq+32*3], m3
-%endif
- add tmpq, 32*4
- sub hd, 4
- jg .prep_w16
- RET
-.prep_w32:
-%if cpuflag(avx512)
- pmovzxbw m0, [srcq+strideq*0]
- pmovzxbw m1, [srcq+strideq*1]
- pmovzxbw m2, [srcq+strideq*2]
- pmovzxbw m3, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
-%else
- pmovzxbw m0, [srcq+strideq*0+16*0]
- pmovzxbw m1, [srcq+strideq*0+16*1]
- pmovzxbw m2, [srcq+strideq*1+16*0]
- pmovzxbw m3, [srcq+strideq*1+16*1]
- lea srcq, [srcq+strideq*2]
-%endif
- psllw m0, 4
- psllw m1, 4
- psllw m2, 4
- psllw m3, 4
- mova [tmpq+mmsize*0], m0
- mova [tmpq+mmsize*1], m1
- mova [tmpq+mmsize*2], m2
- mova [tmpq+mmsize*3], m3
- add tmpq, mmsize*4
- sub hd, mmsize*4/(32*2)
- jg .prep_w32
- RET
-.prep_w64:
-%if cpuflag(avx512)
- pmovzxbw m0, [srcq+strideq*0+32*0]
- pmovzxbw m1, [srcq+strideq*0+32*1]
- pmovzxbw m2, [srcq+strideq*1+32*0]
- pmovzxbw m3, [srcq+strideq*1+32*1]
- lea srcq, [srcq+strideq*2]
-%else
- pmovzxbw m0, [srcq+16*0]
- pmovzxbw m1, [srcq+16*1]
- pmovzxbw m2, [srcq+16*2]
- pmovzxbw m3, [srcq+16*3]
- add srcq, strideq
-%endif
- psllw m0, 4
- psllw m1, 4
- psllw m2, 4
- psllw m3, 4
- mova [tmpq+mmsize*0], m0
- mova [tmpq+mmsize*1], m1
- mova [tmpq+mmsize*2], m2
- mova [tmpq+mmsize*3], m3
- add tmpq, mmsize*4
-%if cpuflag(avx512)
- sub hd, 2
-%else
- dec hd
-%endif
- jg .prep_w64
- RET
-.prep_w128:
- pmovzxbw m0, [srcq+(mmsize/2)*0]
- pmovzxbw m1, [srcq+(mmsize/2)*1]
- pmovzxbw m2, [srcq+(mmsize/2)*2]
- pmovzxbw m3, [srcq+(mmsize/2)*3]
- psllw m0, 4
- psllw m1, 4
- psllw m2, 4
- psllw m3, 4
- mova [tmpq+mmsize*0], m0
- mova [tmpq+mmsize*1], m1
- mova [tmpq+mmsize*2], m2
- mova [tmpq+mmsize*3], m3
-%if notcpuflag(avx512)
- pmovzxbw m0, [srcq+16*4]
- pmovzxbw m1, [srcq+16*5]
- pmovzxbw m2, [srcq+16*6]
- pmovzxbw m3, [srcq+16*7]
-%endif
- add tmpq, 32*8
- add srcq, strideq
-%if notcpuflag(avx512)
- psllw m0, 4
- psllw m1, 4
- psllw m2, 4
- psllw m3, 4
- mova [tmpq-32*4], m0
- mova [tmpq-32*3], m1
- mova [tmpq-32*2], m2
- mova [tmpq-32*1], m3
-%endif
- dec hd
- jg .prep_w128
- RET
-.h:
- ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
- ; = (16 - mx) * src[x] + mx * src[x + 1]
- imul mxyd, 0xff01
- add mxyd, 16 << 8
-%if cpuflag(avx512)
- vpbroadcastw m5, mxyd
-%else
- movd xm5, mxyd
- vbroadcasti128 m4, [bilin_h_shuf8]
- vpbroadcastw m5, xm5
-%endif
- mov mxyd, r6m ; my
- test mxyd, mxyd
- jnz .hv
- movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
- add wq, t2
- lea stride3q, [strideq*3]
- jmp wq
-.h_w4:
- vbroadcasti128 ym4, [bilin_h_shuf4]
-.h_w4_loop:
- movq xm0, [srcq+strideq*0]
- movhps xm0, [srcq+strideq*1]
- movq xm1, [srcq+strideq*2]
- movhps xm1, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vinserti128 ym0, xm1, 1
- pshufb ym0, ym4
- pmaddubsw ym0, ym5
- mova [tmpq], ym0
- add tmpq, 32
- sub hd, 4
- jg .h_w4_loop
- RET
-.h_w8:
-%if cpuflag(avx512)
- vbroadcasti128 m4, [bilin_h_shuf8]
-.h_w8_loop:
- movu xm0, [srcq+strideq*0]
- vinserti128 ym0, [srcq+strideq*1], 1
- vinserti128 m0, [srcq+strideq*2], 2
- vinserti128 m0, [srcq+stride3q ], 3
- lea srcq, [srcq+strideq*4]
- pshufb m0, m4
- pmaddubsw m0, m5
- mova [tmpq+64*0], m0
-%else
-.h_w8_loop:
- movu xm0, [srcq+strideq*0]
- vinserti128 m0, [srcq+strideq*1], 1
- movu xm1, [srcq+strideq*2]
- vinserti128 m1, [srcq+stride3q ], 1
- lea srcq, [srcq+strideq*4]
- pshufb m0, m4
- pshufb m1, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- mova [tmpq+32*0], m0
- mova [tmpq+32*1], m1
-%endif
- add tmpq, 32*2
- sub hd, 4
- jg .h_w8_loop
- RET
-.h_w16:
-%if cpuflag(avx512icl)
- mova m4, [bilin_h_perm16]
-.h_w16_loop:
- movu ym0, [srcq+strideq*0]
- vinserti32x8 m0, [srcq+strideq*1], 1
- movu ym1, [srcq+strideq*2]
- vinserti32x8 m1, [srcq+stride3q ], 1
- lea srcq, [srcq+strideq*4]
- vpermb m0, m4, m0
- vpermb m1, m4, m1
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- mova [tmpq+64*0], m0
- mova [tmpq+64*1], m1
-%else
-.h_w16_loop:
- movu xm0, [srcq+strideq*0+8*0]
- vinserti128 m0, [srcq+strideq*0+8*1], 1
- movu xm1, [srcq+strideq*1+8*0]
- vinserti128 m1, [srcq+strideq*1+8*1], 1
- movu xm2, [srcq+strideq*2+8*0]
- vinserti128 m2, [srcq+strideq*2+8*1], 1
- movu xm3, [srcq+stride3q +8*0]
- vinserti128 m3, [srcq+stride3q +8*1], 1
- lea srcq, [srcq+strideq*4]
- pshufb m0, m4
- pshufb m1, m4
- pshufb m2, m4
- pshufb m3, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- pmaddubsw m3, m5
- mova [tmpq+32*0], m0
- mova [tmpq+32*1], m1
- mova [tmpq+32*2], m2
- mova [tmpq+32*3], m3
-%endif
- add tmpq, 32*4
- sub hd, 4
- jg .h_w16_loop
- RET
-.h_w32:
-%if cpuflag(avx512icl)
- mova m4, [bilin_h_perm32]
-.h_w32_loop:
- vpermb m0, m4, [srcq+strideq*0]
- vpermb m1, m4, [srcq+strideq*1]
- vpermb m2, m4, [srcq+strideq*2]
- vpermb m3, m4, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
-%else
-.h_w32_loop:
- movu xm0, [srcq+strideq*0+8*0]
- vinserti128 m0, [srcq+strideq*0+8*1], 1
- movu xm1, [srcq+strideq*0+8*2]
- vinserti128 m1, [srcq+strideq*0+8*3], 1
- movu xm2, [srcq+strideq*1+8*0]
- vinserti128 m2, [srcq+strideq*1+8*1], 1
- movu xm3, [srcq+strideq*1+8*2]
- vinserti128 m3, [srcq+strideq*1+8*3], 1
- lea srcq, [srcq+strideq*2]
- pshufb m0, m4
- pshufb m1, m4
- pshufb m2, m4
- pshufb m3, m4
-%endif
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- pmaddubsw m3, m5
- mova [tmpq+mmsize*0], m0
- mova [tmpq+mmsize*1], m1
- mova [tmpq+mmsize*2], m2
- mova [tmpq+mmsize*3], m3
- add tmpq, mmsize*4
- sub hd, mmsize*4/(32*2)
- jg .h_w32_loop
- RET
-.h_w64:
-%if cpuflag(avx512icl)
- mova m4, [bilin_h_perm32]
-.h_w64_loop:
- vpermb m0, m4, [srcq+strideq*0+32*0]
- vpermb m1, m4, [srcq+strideq*0+32*1]
- vpermb m2, m4, [srcq+strideq*1+32*0]
- vpermb m3, m4, [srcq+strideq*1+32*1]
- lea srcq, [srcq+strideq*2]
-%else
-.h_w64_loop:
- movu xm0, [srcq+8*0]
- vinserti128 m0, [srcq+8*1], 1
- movu xm1, [srcq+8*2]
- vinserti128 m1, [srcq+8*3], 1
- movu xm2, [srcq+8*4]
- vinserti128 m2, [srcq+8*5], 1
- movu xm3, [srcq+8*6]
- vinserti128 m3, [srcq+8*7], 1
- add srcq, strideq
- pshufb m0, m4
- pshufb m1, m4
- pshufb m2, m4
- pshufb m3, m4
-%endif
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- pmaddubsw m3, m5
- mova [tmpq+mmsize*0], m0
- mova [tmpq+mmsize*1], m1
- mova [tmpq+mmsize*2], m2
- mova [tmpq+mmsize*3], m3
- add tmpq, mmsize*4
-%if cpuflag(avx512)
- sub hd, 2
-%else
- dec hd
-%endif
- jg .h_w64_loop
- RET
-.h_w128:
-%if cpuflag(avx512icl)
- mova m4, [bilin_h_perm32]
-.h_w128_loop:
- vpermb m0, m4, [srcq+32*0]
- vpermb m1, m4, [srcq+32*1]
- vpermb m2, m4, [srcq+32*2]
- vpermb m3, m4, [srcq+32*3]
-%else
-.h_w128_loop:
- movu xm0, [srcq+8*0]
- vinserti128 m0, [srcq+8*1], 1
- movu xm1, [srcq+8*2]
- vinserti128 m1, [srcq+8*3], 1
- movu xm2, [srcq+8*4]
- vinserti128 m2, [srcq+8*5], 1
- movu xm3, [srcq+8*6]
- vinserti128 m3, [srcq+8*7], 1
- pshufb m0, m4
- pshufb m1, m4
- pshufb m2, m4
- pshufb m3, m4
-%endif
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- pmaddubsw m3, m5
- mova [tmpq+mmsize*0], m0
- mova [tmpq+mmsize*1], m1
- mova [tmpq+mmsize*2], m2
- mova [tmpq+mmsize*3], m3
-%if notcpuflag(avx512)
- movu xm0, [srcq+8* 8]
- vinserti128 m0, [srcq+8* 9], 1
- movu xm1, [srcq+8*10]
- vinserti128 m1, [srcq+8*11], 1
- movu xm2, [srcq+8*12]
- vinserti128 m2, [srcq+8*13], 1
- movu xm3, [srcq+8*14]
- vinserti128 m3, [srcq+8*15], 1
-%endif
- add tmpq, 32*8
- add srcq, strideq
-%if notcpuflag(avx512)
- pshufb m0, m4
- pshufb m1, m4
- pshufb m2, m4
- pshufb m3, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- pmaddubsw m3, m5
- mova [tmpq-32*4], m0
- mova [tmpq-32*3], m1
- mova [tmpq-32*2], m2
- mova [tmpq-32*1], m3
-%endif
- dec hd
- jg .h_w128_loop
- RET
-.v:
- WIN64_SPILL_XMM 7
- movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
- imul mxyd, 0xff01
- add mxyd, 16 << 8
- add wq, t2
- lea stride3q, [strideq*3]
-%if cpuflag(avx512)
- vpbroadcastw m6, mxyd
-%else
- movd xm6, mxyd
- vpbroadcastw m6, xm6
-%endif
- jmp wq
-.v_w4:
-%if cpuflag(avx512)
- vpbroadcastd xm0, [srcq+strideq*0]
- mov r3d, 0x29
- vbroadcasti128 ym3, [bilin_v_shuf4]
- kmovb k1, r3d
-.v_w4_loop:
- vpblendmd xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____
- vpbroadcastd ym2, [srcq+strideq*2]
- vpbroadcastd ym2{k1}, [srcq+stride3q ] ; __2_ 23__
- lea srcq, [srcq+strideq*4]
- vpbroadcastd ym0, [srcq+strideq*0]
- punpckhqdq ym2{k1}, ym1, ym0 ; 012_ 234_
- pshufb ym2, ym3
-%else
- movd xm0, [srcq+strideq*0]
-.v_w4_loop:
- vpbroadcastd m1, [srcq+strideq*2]
- vpbroadcastd xm2, [srcq+strideq*1]
- vpbroadcastd m3, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vpblendd m1, m1, m0, 0x05 ; 0 2 2 2
- vpbroadcastd m0, [srcq+strideq*0]
- vpblendd m3, m3, m2, 0x0f ; 1 1 3 3
- vpblendd m2, m1, m0, 0xa0 ; 0 2 2 4
- vpblendd m1, m1, m3, 0xaa ; 0 1 2 3
- vpblendd m2, m2, m3, 0x55 ; 1 2 3 4
- punpcklbw m2, m1
-%endif
- pmaddubsw ym2, ym6
- mova [tmpq], ym2
- add tmpq, 32
- sub hd, 4
- jg .v_w4_loop
- RET
-.v_w8:
-%if cpuflag(avx512icl)
- mova m5, [bilin_v_perm8]
- vbroadcasti128 ym0, [srcq+strideq*0]
-%else
- movq xm0, [srcq+strideq*0]
-%endif
-.v_w8_loop:
-%if cpuflag(avx512icl)
- vinserti128 ym1, ym0, [srcq+strideq*1], 1
- vpbroadcastq ym0, [srcq+strideq*2]
- vinserti128 m1, [srcq+stride3q ], 2
- lea srcq, [srcq+strideq*4]
- vinserti128 ym0, [srcq+strideq*0], 0
- vpermt2b m1, m5, m0
- pmaddubsw m1, m6
- mova [tmpq], m1
-%else
- vpbroadcastq m1, [srcq+strideq*2]
- vpbroadcastq m2, [srcq+strideq*1]
- vpbroadcastq m3, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vpblendd m1, m1, m0, 0x03 ; 0 2 2 2
- vpbroadcastq m0, [srcq+strideq*0]
- vpblendd m3, m3, m2, 0x33 ; 1 3 1 3
- vpblendd m2, m1, m3, 0x0f ; 1 3 2 2
- vpblendd m1, m1, m3, 0xf0 ; 0 2 1 3
- vpblendd m2, m2, m0, 0xc0 ; 1 3 2 4
- punpcklbw m3, m2, m1
- punpckhbw m2, m1
- pmaddubsw m3, m6
- pmaddubsw m2, m6
- mova [tmpq+32*0], m3
- mova [tmpq+32*1], m2
-%endif
- add tmpq, 32*2
- sub hd, 4
- jg .v_w8_loop
- RET
-.v_w16:
-%if cpuflag(avx512icl)
- mova m5, [bilin_v_perm16]
- movu xm0, [srcq+strideq*0]
-.v_w16_loop:
- movu xm2, [srcq+strideq*2]
- vinserti128 ym1, ym0, [srcq+strideq*1], 1
- vpermt2b m1, m5, m2
- vinserti128 ym2, [srcq+stride3q ], 1
- lea srcq, [srcq+strideq*4]
- movu xm0, [srcq+strideq*0]
- vpermt2b m2, m5, m0
- pmaddubsw m1, m6
- pmaddubsw m2, m6
- mova [tmpq+64*0], m1
- mova [tmpq+64*1], m2
-%else
- vbroadcasti128 m0, [srcq+strideq*0]
-.v_w16_loop:
- vbroadcasti128 m1, [srcq+strideq*2]
- vbroadcasti128 m2, [srcq+strideq*1]
- vbroadcasti128 m3, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- shufpd m4, m0, m1, 0x0c ; 0 2 ; 0l2l 0h2h
- vbroadcasti128 m0, [srcq+strideq*0]
- shufpd m2, m2, m3, 0x0c ; 1 3 ; 1l3l 1h3h
- shufpd m1, m1, m0, 0x0c ; 2 4 ; 2l4l 2h4h
- punpcklbw m3, m2, m4
- punpcklbw m5, m1, m2
- punpckhbw m1, m2
- punpckhbw m2, m4
- pmaddubsw m3, m6
- pmaddubsw m5, m6
- pmaddubsw m2, m6
- pmaddubsw m1, m6
- mova [tmpq+32*0], m3
- mova [tmpq+32*1], m5
- mova [tmpq+32*2], m2
- mova [tmpq+32*3], m1
-%endif
- add tmpq, 32*4
- sub hd, 4
- jg .v_w16_loop
- RET
-.v_w32:
-%if cpuflag(avx512icl)
- mova m5, [bilin_v_perm32]
- movu ym0, [srcq+strideq*0]
-.v_w32_loop:
- movu ym2, [srcq+strideq*1]
- movu ym3, [srcq+strideq*2]
- movu ym4, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vpermt2b m0, m5, m2
- vpermt2b m2, m5, m3
- vpermt2b m3, m5, m4
- pmaddubsw m1, m0, m6
- movu ym0, [srcq+strideq*0]
- vpermt2b m4, m5, m0
- pmaddubsw m2, m6
- pmaddubsw m3, m6
- pmaddubsw m4, m6
- mova [tmpq+64*0], m1
- mova [tmpq+64*1], m2
- mova [tmpq+64*2], m3
- mova [tmpq+64*3], m4
- add tmpq, 64*4
-%else
- vpermq ym0, [srcq+strideq*0], q3120
-.v_w32_loop:
- vpermq ym1, [srcq+strideq*1], q3120
- vpermq ym2, [srcq+strideq*2], q3120
- vpermq ym3, [srcq+stride3q ], q3120
- lea srcq, [srcq+strideq*4]
- punpcklbw m4, m1, m0
- punpckhbw m5, m1, m0
- vpermq ym0, [srcq+strideq*0], q3120
- pmaddubsw m4, m6
- pmaddubsw m5, m6
- mova [tmpq+32*0], ym4
- mova [tmpq+32*1], ym5
- punpcklbw m4, m2, m1
- punpckhbw m5, m2, m1
- pmaddubsw m4, m6
- pmaddubsw m5, m6
- mova [tmpq+32*2], ym4
- mova [tmpq+32*3], ym5
- add tmpq, 32*8
- punpcklbw m4, m3, m2
- punpckhbw m5, m3, m2
- punpcklbw m1, m0, m3
- punpckhbw m2, m0, m3
- pmaddubsw m4, m6
- pmaddubsw m5, m6
- pmaddubsw m1, m6
- pmaddubsw m2, m6
- mova [tmpq-32*4], m4
- mova [tmpq-32*3], m5
- mova [tmpq-32*2], m1
- mova [tmpq-32*1], m2
-%endif
- sub hd, 4
- jg .v_w32_loop
- RET
-.v_w64:
-%if cpuflag(avx512)
- mova m5, [bilin_v_perm64]
- vpermq m0, m5, [srcq+strideq*0]
-.v_w64_loop:
- vpermq m1, m5, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- punpcklbw m4, m1, m0
- punpckhbw m2, m1, m0
- vpermq m0, m5, [srcq+strideq*0]
- punpcklbw m3, m0, m1
- punpckhbw m1, m0, m1
- pmaddubsw m4, m6
- pmaddubsw m2, m6
- pmaddubsw m3, m6
- pmaddubsw m1, m6
- mova [tmpq+64*0], m4
- mova [tmpq+64*1], m2
- mova [tmpq+64*2], m3
- mova [tmpq+64*3], m1
- add tmpq, 64*4
-%else
- vpermq m0, [srcq+strideq*0+32*0], q3120
- vpermq m1, [srcq+strideq*0+32*1], q3120
-.v_w64_loop:
- vpermq m2, [srcq+strideq*1+32*0], q3120
- vpermq m3, [srcq+strideq*1+32*1], q3120
- lea srcq, [srcq+strideq*2]
- punpcklbw m4, m2, m0
- punpckhbw m5, m2, m0
- pmaddubsw m4, m6
- pmaddubsw m5, m6
- mova [tmpq+32*0], m4
- mova [tmpq+32*1], m5
- punpcklbw m4, m3, m1
- punpckhbw m5, m3, m1
- vpermq m0, [srcq+strideq*0+32*0], q3120
- vpermq m1, [srcq+strideq*0+32*1], q3120
- pmaddubsw m4, m6
- pmaddubsw m5, m6
- mova [tmpq+32*2], m4
- mova [tmpq+32*3], m5
- add tmpq, 32*8
- punpcklbw m4, m0, m2
- punpckhbw m5, m0, m2
- punpcklbw m2, m1, m3
- punpckhbw m3, m1, m3
- pmaddubsw m4, m6
- pmaddubsw m5, m6
- pmaddubsw m2, m6
- pmaddubsw m3, m6
- mova [tmpq-32*4], m4
- mova [tmpq-32*3], m5
- mova [tmpq-32*2], m2
- mova [tmpq-32*1], m3
-%endif
- sub hd, 2
- jg .v_w64_loop
- RET
-.v_w128:
-%if cpuflag(avx512)
- mova m5, [bilin_v_perm64]
- vpermq m0, m5, [srcq+strideq*0+ 0]
- vpermq m1, m5, [srcq+strideq*0+64]
-.v_w128_loop:
- vpermq m2, m5, [srcq+strideq*1+ 0]
- vpermq m3, m5, [srcq+strideq*1+64]
- lea srcq, [srcq+strideq*2]
- punpcklbw m4, m2, m0
- punpckhbw m0, m2, m0
- pmaddubsw m4, m6
- pmaddubsw m0, m6
- mova [tmpq+64*0], m4
- mova [tmpq+64*1], m0
- punpcklbw m4, m3, m1
- punpckhbw m1, m3, m1
- pmaddubsw m4, m6
- pmaddubsw m1, m6
- mova [tmpq+64*2], m4
- mova [tmpq+64*3], m1
- vpermq m0, m5, [srcq+strideq*0+ 0]
- vpermq m1, m5, [srcq+strideq*0+64]
- punpcklbw m4, m0, m2
- punpckhbw m2, m0, m2
- pmaddubsw m4, m6
- pmaddubsw m2, m6
- mova [tmpq+64*4], m4
- mova [tmpq+64*5], m2
- punpcklbw m4, m1, m3
- punpckhbw m3, m1, m3
- pmaddubsw m4, m6
- pmaddubsw m3, m6
- mova [tmpq+64*6], m4
- mova [tmpq+64*7], m3
- add tmpq, 64*8
- sub hd, 2
- jg .v_w128_loop
-%else
- mov t0, tmpq
- mov t1, srcq
- lea t2d, [hq+(3<<8)]
-.v_w128_loop0:
- vpermq m0, [srcq+strideq*0], q3120
-.v_w128_loop:
- vpermq m1, [srcq+strideq*1], q3120
- lea srcq, [srcq+strideq*2]
- punpcklbw m2, m1, m0
- punpckhbw m3, m1, m0
- vpermq m0, [srcq+strideq*0], q3120
- punpcklbw m4, m0, m1
- punpckhbw m5, m0, m1
- pmaddubsw m2, m6
- pmaddubsw m3, m6
- pmaddubsw m4, m6
- pmaddubsw m5, m6
- mova [tmpq+32*0], m2
- mova [tmpq+32*1], m3
- mova [tmpq+32*8], m4
- mova [tmpq+32*9], m5
- add tmpq, 32*16
- sub hd, 2
- jg .v_w128_loop
- movzx hd, t2b
- add t0, 64
- add t1, 32
- mov tmpq, t0
- mov srcq, t1
- sub t2d, 1<<8
- jg .v_w128_loop0
-%endif
- RET
-.hv:
- ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
- ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
- %assign stack_offset stack_offset - stack_size_padded
- WIN64_SPILL_XMM 7
- movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
- shl mxyd, 11
-%if cpuflag(avx512)
- vpbroadcastw m6, mxyd
-%else
- movd xm6, mxyd
- vpbroadcastw m6, xm6
-%endif
- add wq, t2
- lea stride3q, [strideq*3]
- jmp wq
-.hv_w4:
- vbroadcasti128 ym4, [bilin_h_shuf4]
- vpbroadcastq ym0, [srcq+strideq*0]
- pshufb ym0, ym4
- pmaddubsw ym0, ym5
-.hv_w4_loop:
- movq xm1, [srcq+strideq*1]
- movhps xm1, [srcq+strideq*2]
- movq xm2, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- movhps xm2, [srcq+strideq*0]
- vinserti128 ym1, xm2, 1
- pshufb ym1, ym4
- pmaddubsw ym1, ym5 ; 1 2 3 4
-%if cpuflag(avx512)
- valignq ym2, ym1, ym0, 3 ; 0 1 2 3
-%else
- vpblendd ym2, ym1, ym0, 0xc0
- vpermq ym2, ym2, q2103 ; 0 1 2 3
-%endif
- mova ym0, ym1
- psubw ym1, ym2
- pmulhrsw ym1, ym6
- paddw ym1, ym2
- mova [tmpq], ym1
- add tmpq, 32
- sub hd, 4
- jg .hv_w4_loop
- RET
-.hv_w8:
-%if cpuflag(avx512)
- vbroadcasti128 m4, [bilin_h_shuf8]
-%endif
- vbroadcasti128 m0, [srcq+strideq*0]
- pshufb m0, m4
- pmaddubsw m0, m5
-.hv_w8_loop:
- movu xm1, [srcq+strideq*1]
-%if cpuflag(avx512)
- vinserti128 ym1, [srcq+strideq*2], 1
- vinserti128 m1, [srcq+stride3q ], 2
- lea srcq, [srcq+strideq*4]
- vinserti128 m1, [srcq+strideq*0], 3
- pshufb m1, m4
- pmaddubsw m1, m5 ; 1 2 3 4
- valignq m2, m1, m0, 6 ; 0 1 2 3
- mova m0, m1
- psubw m1, m2
- pmulhrsw m1, m6
- paddw m1, m2
- mova [tmpq], m1
-%else
- vinserti128 m1, m1, [srcq+strideq*2], 1
- movu xm2, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vinserti128 m2, m2, [srcq+strideq*0], 1
- pshufb m1, m4
- pshufb m2, m4
- pmaddubsw m1, m5 ; 1 2
- vperm2i128 m3, m0, m1, 0x21 ; 0 1
- pmaddubsw m0, m2, m5 ; 3 4
- vperm2i128 m2, m1, m0, 0x21 ; 2 3
- psubw m1, m3
- pmulhrsw m1, m6
- paddw m1, m3
- psubw m3, m0, m2
- pmulhrsw m3, m6
- paddw m3, m2
- mova [tmpq+32*0], m1
- mova [tmpq+32*1], m3
-%endif
- add tmpq, 32*2
- sub hd, 4
- jg .hv_w8_loop
- RET
-.hv_w16:
-%if cpuflag(avx512icl)
- mova m4, [bilin_h_perm16]
- vbroadcasti32x8 m0, [srcq+strideq*0]
- vpermb m0, m4, m0
-%else
- movu xm0, [srcq+strideq*0+8*0]
- vinserti128 m0, [srcq+strideq*0+8*1], 1
- pshufb m0, m4
-%endif
- pmaddubsw m0, m5
-.hv_w16_loop:
-%if cpuflag(avx512icl)
- movu ym1, [srcq+strideq*1]
- vinserti32x8 m1, [srcq+strideq*2], 1
- movu ym2, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vinserti32x8 m2, [srcq+strideq*0], 1
- vpermb m1, m4, m1
- vpermb m2, m4, m2
- pmaddubsw m1, m5 ; 1 2
- vshufi32x4 m3, m0, m1, q1032 ; 0 1
- pmaddubsw m0, m2, m5 ; 3 4
- vshufi32x4 m2, m1, m0, q1032 ; 2 3
- psubw m1, m3
- pmulhrsw m1, m6
- paddw m1, m3
- psubw m3, m0, m2
- pmulhrsw m3, m6
- paddw m3, m2
- mova [tmpq+64*0], m1
- mova [tmpq+64*1], m3
-%else
- movu xm1, [srcq+strideq*1+8*0]
- vinserti128 m1, [srcq+strideq*1+8*1], 1
- lea srcq, [srcq+strideq*2]
- movu xm2, [srcq+strideq*0+8*0]
- vinserti128 m2, [srcq+strideq*0+8*1], 1
- pshufb m1, m4
- pshufb m2, m4
- pmaddubsw m1, m5
- psubw m3, m1, m0
- pmulhrsw m3, m6
- paddw m3, m0
- pmaddubsw m0, m2, m5
- psubw m2, m0, m1
- pmulhrsw m2, m6
- paddw m2, m1
- mova [tmpq+32*0], m3
- mova [tmpq+32*1], m2
-%endif
- add tmpq, mmsize*2
- sub hd, mmsize*2/(16*2)
- jg .hv_w16_loop
- RET
-.hv_w32:
-%if cpuflag(avx512icl)
- mova m4, [bilin_h_perm32]
- vpermb m0, m4, [srcq+strideq*0]
- pmaddubsw m0, m5
-.hv_w32_loop:
- vpermb m1, m4, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vpermb m2, m4, [srcq+strideq*0]
- pmaddubsw m1, m5
- psubw m3, m1, m0
- pmulhrsw m3, m6
- paddw m3, m0
- pmaddubsw m0, m2, m5
- psubw m2, m0, m1
- pmulhrsw m2, m6
- paddw m2, m1
- mova [tmpq+64*0], m3
- mova [tmpq+64*1], m2
- add tmpq, 64*2
- sub hd, 2
-%else
- movu xm0, [srcq+8*0]
- vinserti128 m0, [srcq+8*1], 1
- movu xm1, [srcq+8*2]
- vinserti128 m1, [srcq+8*3], 1
- pshufb m0, m4
- pshufb m1, m4
- pmaddubsw m0, m5
- pmaddubsw m1, m5
-.hv_w32_loop:
- add srcq, strideq
- movu xm2, [srcq+8*0]
- vinserti128 m2, m2, [srcq+8*1], 1
- pshufb m2, m4
- pmaddubsw m2, m5
- psubw m3, m2, m0
- pmulhrsw m3, m6
- paddw m3, m0
- mova m0, m2
- mova [tmpq+ 0], m3
- movu xm2, [srcq+8*2]
- vinserti128 m2, m2, [srcq+8*3], 1
- pshufb m2, m4
- pmaddubsw m2, m5
- psubw m3, m2, m1
- pmulhrsw m3, m6
- paddw m3, m1
- mova m1, m2
- mova [tmpq+32], m3
- add tmpq, 32*2
- dec hd
-%endif
- jg .hv_w32_loop
- RET
-.hv_w64:
-%if cpuflag(avx512icl)
- mova m4, [bilin_h_perm32]
- vpermb m0, m4, [srcq+32*0]
- vpermb m1, m4, [srcq+32*1]
- pmaddubsw m0, m5
- pmaddubsw m1, m5
-.hv_w64_loop:
- add srcq, strideq
- vpermb m2, m4, [srcq+32*0]
- vpermb m3, m4, [srcq+32*1]
- pmaddubsw m2, m5
- pmaddubsw m3, m5
- psubw m7, m2, m0
- psubw m8, m3, m1
- pmulhrsw m7, m6
- pmulhrsw m8, m6
- paddw m7, m0
- paddw m8, m1
- mova [tmpq+ 0], m7
- mova [tmpq+64], m8
- mova m0, m2
- mova m1, m3
- add tmpq, 64*2
- dec hd
- jg .hv_w64_loop
-%else
- mov t0, tmpq
- mov t1, srcq
- lea t2d, [hq+(3<<8)]
-.hv_w64_loop0:
- movu xm0, [srcq+strideq*0+8*0]
- vinserti128 m0, m0, [srcq+strideq*0+8*1], 1
- pshufb m0, m4
- pmaddubsw m0, m5
-.hv_w64_loop:
- movu xm1, [srcq+strideq*1+8*0]
- vinserti128 m1, m1, [srcq+strideq*1+8*1], 1
- lea srcq, [srcq+strideq*2]
- movu xm2, [srcq+strideq*0+8*0]
- vinserti128 m2, m2, [srcq+strideq*0+8*1], 1
- pshufb m1, m4
- pshufb m2, m4
- pmaddubsw m1, m5
- psubw m3, m1, m0
- pmulhrsw m3, m6
- paddw m3, m0
- pmaddubsw m0, m2, m5
- psubw m2, m0, m1
- pmulhrsw m2, m6
- paddw m2, m1
- mova [tmpq+32*0], m3
- add tmpq, 32*8
- mova [tmpq-32*4], m2
- sub hd, 2
- jg .hv_w64_loop
- movzx hd, t2b
- add t0, 32
- add t1, 16
- mov tmpq, t0
- mov srcq, t1
- sub t2d, 1<<8
- jg .hv_w64_loop0
-%endif
- RET
-.hv_w128:
-%if cpuflag(avx512icl)
- mova m4, [bilin_h_perm32]
- vpermb m0, m4, [srcq+32*0]
- vpermb m1, m4, [srcq+32*1]
- vpermb m2, m4, [srcq+32*2]
- vpermb m3, m4, [srcq+32*3]
- pmaddubsw m0, m5
- pmaddubsw m1, m5
- pmaddubsw m2, m5
- pmaddubsw m3, m5
-.hv_w128_loop:
- add srcq, strideq
- vpermb m7, m4, [srcq+32*0]
- vpermb m8, m4, [srcq+32*1]
- vpermb m9, m4, [srcq+32*2]
- vpermb m10, m4, [srcq+32*3]
- pmaddubsw m7, m5
- pmaddubsw m8, m5
- pmaddubsw m9, m5
- pmaddubsw m10, m5
- psubw m11, m7, m0
- psubw m12, m8, m1
- psubw m13, m9, m2
- psubw m14, m10, m3
- pmulhrsw m11, m6
- pmulhrsw m12, m6
- pmulhrsw m13, m6
- pmulhrsw m14, m6
- paddw m11, m0
- paddw m12, m1
- paddw m13, m2
- paddw m14, m3
- mova [tmpq+64*0], m11
- mova [tmpq+64*1], m12
- mova [tmpq+64*2], m13
- mova [tmpq+64*3], m14
- mova m0, m7
- mova m1, m8
- mova m2, m9
- mova m3, m10
- add tmpq, 64*4
- dec hd
- jg .hv_w128_loop
-%else
- mov t0, tmpq
- mov t1, srcq
- lea t2d, [hq+(7<<8)]
-.hv_w128_loop0:
- movu xm0, [srcq+strideq*0+8*0]
- vinserti128 m0, m0, [srcq+strideq*0+8*1], 1
- pshufb m0, m4
- pmaddubsw m0, m5
-.hv_w128_loop:
- movu xm1, [srcq+strideq*1+8*0]
- vinserti128 m1, m1, [srcq+strideq*1+8*1], 1
- lea srcq, [srcq+strideq*2]
- movu xm2, [srcq+strideq*0+8*0]
- vinserti128 m2, m2, [srcq+strideq*0+8*1], 1
- pshufb m1, m4
- pshufb m2, m4
- pmaddubsw m1, m5
- psubw m3, m1, m0
- pmulhrsw m3, m6
- paddw m3, m0
- pmaddubsw m0, m2, m5
- psubw m2, m0, m1
- pmulhrsw m2, m6
- paddw m2, m1
- mova [tmpq+32*0], m3
- mova [tmpq+32*8], m2
- add tmpq, 32*16
- sub hd, 2
- jg .hv_w128_loop
- movzx hd, t2b
- add t0, mmsize
- add t1, mmsize/2
- mov tmpq, t0
- mov srcq, t1
- sub t2d, 1<<8
- jg .hv_w128_loop0
-%endif
- RET
-%endmacro
-
-; int8_t subpel_filters[5][15][8]
-%assign FILTER_REGULAR (0*15 << 16) | 3*15
-%assign FILTER_SMOOTH (1*15 << 16) | 4*15
-%assign FILTER_SHARP (2*15 << 16) | 3*15
-
-%macro FN 4 ; fn, type, type_h, type_v
-cglobal %1_%2
- mov t0d, FILTER_%3
-%ifidn %3, %4
- mov t1d, t0d
-%else
- mov t1d, FILTER_%4
-%endif
-%ifnidn %2, regular ; skip the jump in the last filter
- jmp mangle(private_prefix %+ _%1 %+ SUFFIX)
-%endif
-%endmacro
-
-%if WIN64
-DECLARE_REG_TMP 4, 5
-%else
-DECLARE_REG_TMP 7, 8
-%endif
-
-%define PUT_8TAP_FN FN put_8tap,
-
-PUT_8TAP_FN sharp, SHARP, SHARP
-PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
-PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
-PUT_8TAP_FN smooth, SMOOTH, SMOOTH
-PUT_8TAP_FN sharp_regular, SHARP, REGULAR
-PUT_8TAP_FN regular_sharp, REGULAR, SHARP
-PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
-PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
-PUT_8TAP_FN regular, REGULAR, REGULAR
-
-cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
- imul mxd, mxm, 0x010101
- add mxd, t0d ; 8tap_h, mx, 4tap_h
- imul myd, mym, 0x010101
- add myd, t1d ; 8tap_v, my, 4tap_v
- lea r8, [put_avx2]
- movsxd wq, wm
- movifnidn hd, hm
- test mxd, 0xf00
- jnz .h
- test myd, 0xf00
- jnz .v
- tzcnt wd, wd
- movzx wd, word [r8+wq*2+table_offset(put,)]
- add wq, r8
- lea r6, [ssq*3]
- lea r7, [dsq*3]
-%if WIN64
- pop r8
-%endif
- jmp wq
-.h:
- test myd, 0xf00
- jnz .hv
- vpbroadcastd m5, [pw_34] ; 2 + (8 << 2)
- WIN64_SPILL_XMM 11
- cmp wd, 4
- jl .h_w2
- vbroadcasti128 m6, [subpel_h_shufA]
- je .h_w4
- tzcnt wd, wd
- vbroadcasti128 m7, [subpel_h_shufB]
- vbroadcasti128 m8, [subpel_h_shufC]
- shr mxd, 16
- sub srcq, 3
- movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)]
- vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0]
- vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4]
- add wq, r8
- jmp wq
-.h_w2:
- movzx mxd, mxb
- dec srcq
- mova xm4, [subpel_h_shuf4]
- vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
-.h_w2_loop:
- movq xm0, [srcq+ssq*0]
- movhps xm0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- pshufb xm0, xm4
- pmaddubsw xm0, xm3
- phaddw xm0, xm0
- paddw xm0, xm5
- psraw xm0, 6
- packuswb xm0, xm0
- pextrw [dstq+dsq*0], xm0, 0
- pextrw [dstq+dsq*1], xm0, 1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .h_w2_loop
- RET
-.h_w4:
- movzx mxd, mxb
- dec srcq
- vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
-.h_w4_loop:
- movq xm0, [srcq+ssq*0]
- movq xm1, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- pshufb xm0, xm6
- pshufb xm1, xm6
- pmaddubsw xm0, xm3
- pmaddubsw xm1, xm3
- phaddw xm0, xm1
- paddw xm0, xm5
- psraw xm0, 6
- packuswb xm0, xm0
- movd [dstq+dsq*0], xm0
- pextrd [dstq+dsq*1], xm0, 1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .h_w4_loop
- RET
-.h_w8:
-%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
- pshufb m%2, m%1, m7
- pshufb m%3, m%1, m8
- pshufb m%1, m6
- pmaddubsw m%4, m%2, m9
- pmaddubsw m%2, m10
- pmaddubsw m%3, m10
- pmaddubsw m%1, m9
- paddw m%3, m%4
- paddw m%1, m%2
- phaddw m%1, m%3
- paddw m%1, m5
- psraw m%1, 6
-%endmacro
- movu xm0, [srcq+ssq*0]
- vinserti128 m0, m0, [srcq+ssq*1], 1
- lea srcq, [srcq+ssq*2]
- PUT_8TAP_H 0, 1, 2, 3
- vextracti128 xm1, m0, 1
- packuswb xm0, xm1
- movq [dstq+dsq*0], xm0
- movhps [dstq+dsq*1], xm0
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .h_w8
- RET
-.h_w16:
- movu xm0, [srcq+ssq*0+8*0]
- vinserti128 m0, m0, [srcq+ssq*1+8*0], 1
- movu xm1, [srcq+ssq*0+8*1]
- vinserti128 m1, m1, [srcq+ssq*1+8*1], 1
- PUT_8TAP_H 0, 2, 3, 4
- lea srcq, [srcq+ssq*2]
- PUT_8TAP_H 1, 2, 3, 4
- packuswb m0, m1
- mova [dstq+dsq*0], xm0
- vextracti128 [dstq+dsq*1], m0, 1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .h_w16
- RET
-.h_w32:
- xor r6d, r6d
- jmp .h_start
-.h_w64:
- mov r6, -32*1
- jmp .h_start
-.h_w128:
- mov r6, -32*3
-.h_start:
- sub srcq, r6
- sub dstq, r6
- mov r4, r6
-.h_loop:
- movu m0, [srcq+r6+8*0]
- movu m1, [srcq+r6+8*1]
- PUT_8TAP_H 0, 2, 3, 4
- PUT_8TAP_H 1, 2, 3, 4
- packuswb m0, m1
- mova [dstq+r6], m0
- add r6, 32
- jle .h_loop
- add srcq, ssq
- add dstq, dsq
- mov r6, r4
- dec hd
- jg .h_loop
- RET
-.v:
- %assign stack_offset stack_offset - stack_size_padded
- WIN64_SPILL_XMM 16
- movzx mxd, myb
- shr myd, 16
- cmp hd, 6
- cmovs myd, mxd
- tzcnt r6d, wd
- movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
- vpbroadcastd m7, [pw_512]
- lea myq, [r8+myq*8+subpel_filters-put_avx2]
- vpbroadcastw m8, [myq+0]
- vpbroadcastw m9, [myq+2]
- vpbroadcastw m10, [myq+4]
- vpbroadcastw m11, [myq+6]
- add r6, r8
- lea ss3q, [ssq*3]
- sub srcq, ss3q
- jmp r6
-.v_w2:
- movd xm2, [srcq+ssq*0]
- pinsrw xm2, [srcq+ssq*1], 2
- pinsrw xm2, [srcq+ssq*2], 4
- pinsrw xm2, [srcq+ss3q ], 6 ; 0 1 2 3
- lea srcq, [srcq+ssq*4]
- movd xm3, [srcq+ssq*0]
- vpbroadcastd xm1, [srcq+ssq*1]
- vpbroadcastd xm0, [srcq+ssq*2]
- add srcq, ss3q
- vpblendd xm3, xm3, xm1, 0x02 ; 4 5
- vpblendd xm1, xm1, xm0, 0x02 ; 5 6
- palignr xm4, xm3, xm2, 4 ; 1 2 3 4
- punpcklbw xm3, xm1 ; 45 56
- punpcklbw xm1, xm2, xm4 ; 01 12
- punpckhbw xm2, xm4 ; 23 34
-.v_w2_loop:
- pmaddubsw xm5, xm1, xm8 ; a0 b0
- mova xm1, xm2
- pmaddubsw xm2, xm9 ; a1 b1
- paddw xm5, xm2
- mova xm2, xm3
- pmaddubsw xm3, xm10 ; a2 b2
- paddw xm5, xm3
- vpbroadcastd xm4, [srcq+ssq*0]
- vpblendd xm3, xm0, xm4, 0x02 ; 6 7
- vpbroadcastd xm0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vpblendd xm4, xm4, xm0, 0x02 ; 7 8
- punpcklbw xm3, xm4 ; 67 78
- pmaddubsw xm4, xm3, xm11 ; a3 b3
- paddw xm5, xm4
- pmulhrsw xm5, xm7
- packuswb xm5, xm5
- pextrw [dstq+dsq*0], xm5, 0
- pextrw [dstq+dsq*1], xm5, 2
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .v_w2_loop
- RET
-.v_w4:
- movd xm2, [srcq+ssq*0]
- pinsrd xm2, [srcq+ssq*1], 1
- pinsrd xm2, [srcq+ssq*2], 2
- pinsrd xm2, [srcq+ss3q ], 3 ; 0 1 2 3
- lea srcq, [srcq+ssq*4]
- movd xm3, [srcq+ssq*0]
- vpbroadcastd xm1, [srcq+ssq*1]
- vpbroadcastd xm0, [srcq+ssq*2]
- add srcq, ss3q
- vpblendd xm3, xm3, xm1, 0x02 ; 4 5
- vpblendd xm1, xm1, xm0, 0x02 ; 5 6
- palignr xm4, xm3, xm2, 4 ; 1 2 3 4
- punpcklbw xm3, xm1 ; 45 56
- punpcklbw xm1, xm2, xm4 ; 01 12
- punpckhbw xm2, xm4 ; 23 34
-.v_w4_loop:
- pmaddubsw xm5, xm1, xm8 ; a0 b0
- mova xm1, xm2
- pmaddubsw xm2, xm9 ; a1 b1
- paddw xm5, xm2
- mova xm2, xm3
- pmaddubsw xm3, xm10 ; a2 b2
- paddw xm5, xm3
- vpbroadcastd xm4, [srcq+ssq*0]
- vpblendd xm3, xm0, xm4, 0x02 ; 6 7
- vpbroadcastd xm0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vpblendd xm4, xm4, xm0, 0x02 ; 7 8
- punpcklbw xm3, xm4 ; 67 78
- pmaddubsw xm4, xm3, xm11 ; a3 b3
- paddw xm5, xm4
- pmulhrsw xm5, xm7
- packuswb xm5, xm5
- movd [dstq+dsq*0], xm5
- pextrd [dstq+dsq*1], xm5, 1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .v_w4_loop
- RET
-.v_w8:
- movq xm1, [srcq+ssq*0]
- vpbroadcastq m4, [srcq+ssq*1]
- vpbroadcastq m2, [srcq+ssq*2]
- vpbroadcastq m5, [srcq+ss3q ]
- lea srcq, [srcq+ssq*4]
- vpbroadcastq m3, [srcq+ssq*0]
- vpbroadcastq m6, [srcq+ssq*1]
- vpbroadcastq m0, [srcq+ssq*2]
- add srcq, ss3q
- vpblendd m1, m1, m4, 0x30
- vpblendd m4, m4, m2, 0x30
- punpcklbw m1, m4 ; 01 12
- vpblendd m2, m2, m5, 0x30
- vpblendd m5, m5, m3, 0x30
- punpcklbw m2, m5 ; 23 34
- vpblendd m3, m3, m6, 0x30
- vpblendd m6, m6, m0, 0x30
- punpcklbw m3, m6 ; 45 56
-.v_w8_loop:
- pmaddubsw m5, m1, m8 ; a0 b0
- mova m1, m2
- pmaddubsw m2, m9 ; a1 b1
- paddw m5, m2
- mova m2, m3
- pmaddubsw m3, m10 ; a2 b2
- paddw m5, m3
- vpbroadcastq m4, [srcq+ssq*0]
- vpblendd m3, m0, m4, 0x30
- vpbroadcastq m0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vpblendd m4, m4, m0, 0x30
- punpcklbw m3, m4 ; 67 78
- pmaddubsw m4, m3, m11 ; a3 b3
- paddw m5, m4
- pmulhrsw m5, m7
- vextracti128 xm4, m5, 1
- packuswb xm5, xm4
- movq [dstq+dsq*0], xm5
- movhps [dstq+dsq*1], xm5
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .v_w8_loop
- RET
-.v_w16:
-.v_w32:
-.v_w64:
-.v_w128:
- lea r6d, [wq-16]
- mov r4, dstq
- mov r7, srcq
- shl r6d, 4
- mov r6b, hb
-.v_w16_loop0:
- vbroadcasti128 m4, [srcq+ssq*0]
- vbroadcasti128 m5, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vbroadcasti128 m0, [srcq+ssq*1]
- vbroadcasti128 m6, [srcq+ssq*0]
- lea srcq, [srcq+ssq*2]
- vbroadcasti128 m1, [srcq+ssq*0]
- vbroadcasti128 m2, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vbroadcasti128 m3, [srcq+ssq*0]
- shufpd m4, m4, m0, 0x0c
- shufpd m5, m5, m1, 0x0c
- punpcklbw m1, m4, m5 ; 01
- punpckhbw m4, m5 ; 34
- shufpd m6, m6, m2, 0x0c
- punpcklbw m2, m5, m6 ; 12
- punpckhbw m5, m6 ; 45
- shufpd m0, m0, m3, 0x0c
- punpcklbw m3, m6, m0 ; 23
- punpckhbw m6, m0 ; 56
-.v_w16_loop:
- vbroadcasti128 m12, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vbroadcasti128 m13, [srcq+ssq*0]
- pmaddubsw m14, m1, m8 ; a0
- pmaddubsw m15, m2, m8 ; b0
- mova m1, m3
- mova m2, m4
- pmaddubsw m3, m9 ; a1
- pmaddubsw m4, m9 ; b1
- paddw m14, m3
- paddw m15, m4
- mova m3, m5
- mova m4, m6
- pmaddubsw m5, m10 ; a2
- pmaddubsw m6, m10 ; b2
- paddw m14, m5
- paddw m15, m6
- shufpd m6, m0, m12, 0x0d
- shufpd m0, m12, m13, 0x0c
- punpcklbw m5, m6, m0 ; 67
- punpckhbw m6, m0 ; 78
- pmaddubsw m12, m5, m11 ; a3
- pmaddubsw m13, m6, m11 ; b3
- paddw m14, m12
- paddw m15, m13
- pmulhrsw m14, m7
- pmulhrsw m15, m7
- packuswb m14, m15
- vpermq m14, m14, q3120
- mova [dstq+dsq*0], xm14
- vextracti128 [dstq+dsq*1], m14, 1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .v_w16_loop
- movzx hd, r6b
- add r4, 16
- add r7, 16
- mov dstq, r4
- mov srcq, r7
- sub r6d, 1<<8
- jg .v_w16_loop0
- RET
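-; Combined horizontal+vertical (hv) path: each row is first filtered
-; horizontally into a 16-bit intermediate (pmulhrsw with pw_8192), then the
-; vertical filter accumulates those intermediates in 32 bits and rounds with
-; pd_512 before the final shift.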
-.hv:
- %assign stack_offset stack_offset - stack_size_padded
- WIN64_SPILL_XMM 16
- cmp wd, 4
- jg .hv_w8
- movzx mxd, mxb
- dec srcq
- vpbroadcastd m7, [r8+mxq*8+subpel_filters-put_avx2+2]
- movzx mxd, myb
- shr myd, 16
- cmp hd, 6
- cmovs myd, mxd
- vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
- lea ss3q, [ssq*3]
- sub srcq, ss3q
- punpcklbw m0, m0
- psraw m0, 8 ; sign-extend
- vpbroadcastd m8, [pw_8192]
- vpbroadcastd m9, [pd_512]
- pshufd m10, m0, q0000
- pshufd m11, m0, q1111
- pshufd m12, m0, q2222
- pshufd m13, m0, q3333
- cmp wd, 4
- je .hv_w4
- vbroadcasti128 m6, [subpel_h_shuf4]
- movq xm2, [srcq+ssq*0]
- movhps xm2, [srcq+ssq*1]
- movq xm0, [srcq+ssq*2]
- movhps xm0, [srcq+ss3q ]
- lea srcq, [srcq+ssq*4]
- vpbroadcastq m3, [srcq+ssq*0]
- vpbroadcastq m4, [srcq+ssq*1]
- vpbroadcastq m1, [srcq+ssq*2]
- add srcq, ss3q
- vpblendd m2, m2, m3, 0x30
- vpblendd m0, m0, m1, 0x30
- vpblendd m2, m2, m4, 0xc0
- pshufb m2, m6
- pshufb m0, m6
- pmaddubsw m2, m7
- pmaddubsw m0, m7
- phaddw m2, m0
- pmulhrsw m2, m8
- vextracti128 xm3, m2, 1
- palignr xm4, xm3, xm2, 4
- punpcklwd xm1, xm2, xm4 ; 01 12
- punpckhwd xm2, xm4 ; 23 34
- pshufd xm0, xm3, q2121
- punpcklwd xm3, xm0 ; 45 56
-.hv_w2_loop:
- pmaddwd xm5, xm1, xm10 ; a0 b0
- mova xm1, xm2
- pmaddwd xm2, xm11 ; a1 b1
- paddd xm5, xm2
- mova xm2, xm3
- pmaddwd xm3, xm12 ; a2 b2
- paddd xm5, xm3
- movq xm4, [srcq+ssq*0]
- movhps xm4, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- pshufb xm4, xm6
- pmaddubsw xm4, xm7
- phaddw xm4, xm4
- pmulhrsw xm4, xm8
- palignr xm3, xm4, xm0, 12
- mova xm0, xm4
- punpcklwd xm3, xm0 ; 67 78
- pmaddwd xm4, xm3, xm13 ; a3 b3
- paddd xm5, xm9
- paddd xm5, xm4
- psrad xm5, 10
- packssdw xm5, xm5
- packuswb xm5, xm5
- pextrw [dstq+dsq*0], xm5, 0
- pextrw [dstq+dsq*1], xm5, 1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .hv_w2_loop
- RET
-.hv_w4:
- mova m6, [subpel_h_shuf4]
- vpbroadcastq m2, [srcq+ssq*0]
- vpbroadcastq m4, [srcq+ssq*1]
- vpbroadcastq m0, [srcq+ssq*2]
- vpbroadcastq m5, [srcq+ss3q ]
- lea srcq, [srcq+ssq*4]
- vpbroadcastq m3, [srcq+ssq*0]
- vpblendd m2, m2, m4, 0xcc ; 0 1
- vpbroadcastq m4, [srcq+ssq*1]
- vpbroadcastq m1, [srcq+ssq*2]
- add srcq, ss3q
- vpblendd m0, m0, m5, 0xcc ; 2 3
- vpblendd m3, m3, m4, 0xcc ; 4 5
- pshufb m2, m6
- pshufb m0, m6
- pshufb m3, m6
- pshufb m1, m6
- pmaddubsw m2, m7
- pmaddubsw m0, m7
- pmaddubsw m3, m7
- pmaddubsw m1, m7
- phaddw m2, m0
- phaddw m3, m1
- pmulhrsw m2, m8
- pmulhrsw m3, m8
- palignr m4, m3, m2, 4
- punpcklwd m1, m2, m4 ; 01 12
- punpckhwd m2, m4 ; 23 34
- pshufd m0, m3, q2121
- punpcklwd m3, m0 ; 45 56
-.hv_w4_loop:
- pmaddwd m5, m1, m10 ; a0 b0
- mova m1, m2
- pmaddwd m2, m11 ; a1 b1
- paddd m5, m2
- mova m2, m3
- pmaddwd m3, m12 ; a2 b2
- paddd m5, m3
- vpbroadcastq m4, [srcq+ssq*0]
- vpbroadcastq m3, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vpblendd m4, m4, m3, 0xcc ; 7 8
- pshufb m4, m6
- pmaddubsw m4, m7
- phaddw m4, m4
- pmulhrsw m4, m8
- palignr m3, m4, m0, 12
- mova m0, m4
- punpcklwd m3, m0 ; 67 78
- pmaddwd m4, m3, m13 ; a3 b3
- paddd m5, m9
- paddd m5, m4
- psrad m5, 10
- vextracti128 xm4, m5, 1
- packssdw xm5, xm4
- packuswb xm5, xm5
- pshuflw xm5, xm5, q3120
- movd [dstq+dsq*0], xm5
- pextrd [dstq+dsq*1], xm5, 1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .hv_w4_loop
- RET
-.hv_w8:
- shr mxd, 16
- sub srcq, 3
- vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0]
- vpbroadcastd m11, [r8+mxq*8+subpel_filters-put_avx2+4]
- movzx mxd, myb
- shr myd, 16
- cmp hd, 6
- cmovs myd, mxd
- vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
- lea ss3q, [ssq*3]
- sub srcq, ss3q
- punpcklbw m0, m0
- psraw m0, 8 ; sign-extend
- pshufd m12, m0, q0000
- pshufd m13, m0, q1111
- pshufd m14, m0, q2222
- pshufd m15, m0, q3333
- lea r6d, [wq-8]
- mov r4, dstq
- mov r7, srcq
- shl r6d, 5
- mov r6b, hb
-.hv_w8_loop0:
- vbroadcasti128 m7, [subpel_h_shufA]
- vbroadcasti128 m8, [subpel_h_shufB]
- vbroadcasti128 m9, [subpel_h_shufC]
- movu xm4, [srcq+ssq*0]
- movu xm5, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- movu xm6, [srcq+ssq*0]
- vbroadcasti128 m0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vpblendd m4, m4, m0, 0xf0 ; 0 3
- vinserti128 m5, m5, [srcq+ssq*0], 1 ; 1 4
- vinserti128 m6, m6, [srcq+ssq*1], 1 ; 2 5
- lea srcq, [srcq+ssq*2]
- vinserti128 m0, m0, [srcq+ssq*0], 1 ; 3 6
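-; HV_H_W8 performs the horizontal 8-tap filter on one register of rows: the
-; three shuffles gather overlapping source bytes, pmaddubsw applies the two
-; 4-tap halves of the filter (m10/m11), and paddw/phaddw merge the partial
-; sums into 16-bit results in %1.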
-%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
- pshufb %3, %1, %6
- pshufb %4, %1, %7
- pshufb %1, %5
- pmaddubsw %2, %3, m10
- pmaddubsw %4, m11
- pmaddubsw %3, m11
- pmaddubsw %1, m10
- paddw %2, %4
- paddw %1, %3
- phaddw %1, %2
-%endmacro
- HV_H_W8 m4, m1, m2, m3, m7, m8, m9
- HV_H_W8 m5, m1, m2, m3, m7, m8, m9
- HV_H_W8 m6, m1, m2, m3, m7, m8, m9
- HV_H_W8 m0, m1, m2, m3, m7, m8, m9
- vpbroadcastd m7, [pw_8192]
- vpermq m4, m4, q3120
- vpermq m5, m5, q3120
- vpermq m6, m6, q3120
- pmulhrsw m0, m7
- pmulhrsw m4, m7
- pmulhrsw m5, m7
- pmulhrsw m6, m7
- vpermq m7, m0, q3120
- punpcklwd m1, m4, m5 ; 01
- punpckhwd m4, m5 ; 34
- punpcklwd m2, m5, m6 ; 12
- punpckhwd m5, m6 ; 45
- punpcklwd m3, m6, m7 ; 23
- punpckhwd m6, m7 ; 56
-.hv_w8_loop:
- vextracti128 r6m, m0, 1 ; not enough registers
- movu xm0, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- vinserti128 m0, m0, [srcq+ssq*0], 1 ; 7 8
- pmaddwd m8, m1, m12 ; a0
- pmaddwd m9, m2, m12 ; b0
- mova m1, m3
- mova m2, m4
- pmaddwd m3, m13 ; a1
- pmaddwd m4, m13 ; b1
- paddd m8, m3
- paddd m9, m4
- mova m3, m5
- mova m4, m6
- pmaddwd m5, m14 ; a2
- pmaddwd m6, m14 ; b2
- paddd m8, m5
- paddd m9, m6
- vbroadcasti128 m6, [subpel_h_shufB]
- vbroadcasti128 m7, [subpel_h_shufC]
- vbroadcasti128 m5, [subpel_h_shufA]
- HV_H_W8 m0, m5, m6, m7, m5, m6, m7
- vpbroadcastd m5, [pw_8192]
- vpbroadcastd m7, [pd_512]
- vbroadcasti128 m6, r6m
- pmulhrsw m0, m5
- paddd m8, m7
- paddd m9, m7
- vpermq m7, m0, q3120 ; 7 8
- shufpd m6, m6, m7, 0x04 ; 6 7
- punpcklwd m5, m6, m7 ; 67
- punpckhwd m6, m7 ; 78
- pmaddwd m7, m5, m15 ; a3
- paddd m8, m7
- pmaddwd m7, m6, m15 ; b3
- paddd m7, m9
- psrad m8, 10
- psrad m7, 10
- packssdw m8, m7
- vextracti128 xm7, m8, 1
- packuswb xm8, xm7
- pshufd xm7, xm8, q3120
- movq [dstq+dsq*0], xm7
- movhps [dstq+dsq*1], xm7
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .hv_w8_loop
- movzx hd, r6b
- add r4, 8
- add r7, 8
- mov dstq, r4
- mov srcq, r7
- sub r6d, 1<<8
- jg .hv_w8_loop0
- RET
-
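-; PREP_8TAP_H filters one batch of rows horizontally into the 16-bit prep
-; intermediate. The AVX-512 path gathers source bytes with vpermb and
-; accumulates with vpdpbusd dot products; the AVX2 path uses
-; pshufb+pmaddubsw+phaddw followed by pmulhrsw with pw_8192, which is
-; effectively a rounded shift right by 2.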
-%macro PREP_8TAP_H 0
- %if cpuflag(avx512)
- vpermb m10, m5, m0
- vpermb m11, m5, m1
- vpermb m12, m6, m0
- vpermb m13, m6, m1
- vpermb m14, m7, m0
- vpermb m15, m7, m1
- mova m0, m4
- mova m2, m4
- mova m1, m4
- mova m3, m4
- vpdpbusd m0, m10, m8
- vpdpbusd m2, m12, m8
- vpdpbusd m1, m11, m8
- vpdpbusd m3, m13, m8
- vpdpbusd m0, m12, m9
- vpdpbusd m2, m14, m9
- vpdpbusd m1, m13, m9
- vpdpbusd m3, m15, m9
- packssdw m0, m2
- packssdw m1, m3
- psraw m0, 2
- psraw m1, 2
- mova [tmpq+ 0], m0
- mova [tmpq+64], m1
- %else
- pshufb m1, m0, m5
- pshufb m2, m0, m6
- pshufb m3, m0, m7
- pmaddubsw m1, m8
- pmaddubsw m0, m2, m8
- pmaddubsw m2, m9
- pmaddubsw m3, m9
- paddw m1, m2
- paddw m0, m3
- phaddw m0, m1, m0
- pmulhrsw m0, m4
- %endif
-%endmacro
-
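-; PREP_8TAP_V_W4 is the vertical 8-tap filter for width 4, producing four
-; output rows per call; %1 is the rounding constant and %2-%5 are the four
-; broadcast tap pairs.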
-%macro PREP_8TAP_V_W4 5 ; round, weights
- movd xm0, [srcq+strideq*0]
- vpbroadcastd ym1, [srcq+strideq*2]
- vpbroadcastd xm2, [srcq+strideq*1]
- vpbroadcastd ym3, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vpblendd ym1, ym1, ym0, 0x01 ; 0 2 2 _ 2 _ _ _
- vpblendd ym3, ym3, ym2, 0x03 ; 1 1 3 3 3 3 _ _
- vpbroadcastd ym0, [srcq+strideq*0]
- vpbroadcastd ym2, [srcq+strideq*1]
- vpblendd ym1, ym1, ym0, 0x68 ; 0 2 2 4 2 4 4 _
- vpbroadcastd ym0, [srcq+strideq*2]
- vbroadcasti128 ym5, [deint_shuf4]
- vpblendd ym3, ym3, ym2, 0xc0 ; 1 1 3 3 3 3 5 5
- vpblendd ym2, ym3, ym1, 0x55 ; 0 1 2 3 2 3 4 5
- vpblendd ym3, ym3, ym1, 0xaa ; 1 2 3 4 3 4 5 _
- punpcklbw ym1, ym2, ym3 ; 01 12 23 34
- vpblendd ym3, ym3, ym0, 0x80 ; 1 2 3 4 3 4 5 6
- punpckhbw ym2, ym3 ; 23 34 45 56
-.v_w4_loop:
- pinsrd xm0, [srcq+stride3q ], 1
- lea srcq, [srcq+strideq*4]
- vpbroadcastd ym3, [srcq+strideq*0]
- vpbroadcastd ym4, [srcq+strideq*1]
- vpblendd ym3, ym3, ym4, 0x20 ; _ _ 8 _ 8 9 _ _
- vpblendd ym3, ym3, ym0, 0x03 ; 6 7 8 _ 8 9 _ _
- vpbroadcastd ym0, [srcq+strideq*2]
- vpblendd ym3, ym3, ym0, 0x40 ; 6 7 8 _ 8 9 a _
- pshufb ym3, ym5 ; 67 78 89 9a
- pmaddubsw ym4, ym1, ym%2
- vperm2i128 ym1, ym2, ym3, 0x21 ; 45 56 67 78
- pmaddubsw ym2, ym%3
- paddw ym4, ym2
- mova ym2, ym3
- pmaddubsw ym3, ym%5
- paddw ym3, ym4
- pmaddubsw ym4, ym1, ym%4
- paddw ym3, ym4
- pmulhrsw ym3, ym%1
- mova [tmpq], ym3
-%endmacro
-
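-; PREP_8TAP emits the prep_8tap entry point; the cpuflag() conditionals in
-; the body select between the AVX2 and AVX-512 (Ice Lake) code paths, so the
-; same macro serves both instruction sets.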
-%macro PREP_8TAP 0
- %if WIN64
- DECLARE_REG_TMP 6, 4
- %else
- DECLARE_REG_TMP 6, 7
-%endif
-
-%define PREP_8TAP_FN FN prep_8tap,
-
-PREP_8TAP_FN sharp, SHARP, SHARP
-PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
-PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
-PREP_8TAP_FN smooth, SMOOTH, SMOOTH
-PREP_8TAP_FN sharp_regular, SHARP, REGULAR
-PREP_8TAP_FN regular_sharp, REGULAR, SHARP
-PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
-PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
-PREP_8TAP_FN regular, REGULAR, REGULAR
-
-cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
- imul mxd, mxm, 0x010101
- add mxd, t0d ; 8tap_h, mx, 4tap_h
- imul myd, mym, 0x010101
- add myd, t1d ; 8tap_v, my, 4tap_v
- lea r7, [prep%+SUFFIX]
- movsxd wq, wm
- movifnidn hd, hm
- test mxd, 0xf00
- jnz .h
- test myd, 0xf00
- jnz .v
- tzcnt wd, wd
- movzx wd, word [r7+wq*2+table_offset(prep,)]
- add wq, r7
- lea r6, [strideq*3]
-%if WIN64
- pop r7
-%endif
- jmp wq
-.h:
- test myd, 0xf00
- jnz .hv
-%if cpuflag(avx512)
- vpbroadcastd m4, [pd_2]
-%else
- vpbroadcastd m4, [pw_8192]
- vbroadcasti128 m5, [subpel_h_shufA]
-%endif
- WIN64_SPILL_XMM 10
- cmp wd, 4
- je .h_w4
- tzcnt wd, wd
-%if notcpuflag(avx512)
- vbroadcasti128 m6, [subpel_h_shufB]
- vbroadcasti128 m7, [subpel_h_shufC]
-%endif
- shr mxd, 16
- sub srcq, 3
- movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
- vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
- vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
- add wq, r7
- jmp wq
-.h_w4:
-%if cpuflag(avx512)
- mov r3d, 0x4
- kmovb k1, r3d
- vbroadcasti128 ym5, [subpel_h_shufA]
-%endif
- movzx mxd, mxb
- dec srcq
- vpbroadcastd ym6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
- lea stride3q, [strideq*3]
-.h_w4_loop:
-%if cpuflag(avx512icl)
- mova ym0, ym4
- mova ym1, ym4
- movq xm2, [srcq+strideq*0]
- movq xm3, [srcq+strideq*1]
- vpbroadcastq ym2{k1}, [srcq+strideq*2]
- vpbroadcastq ym3{k1}, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- pshufb ym2, ym5
- pshufb ym3, ym5
- vpdpbusd ym0, ym2, ym6
- vpdpbusd ym1, ym3, ym6
- packssdw ym0, ym1
- psraw ym0, 2
-%else
- movq xm0, [srcq+strideq*0]
- vpbroadcastq m2, [srcq+strideq*2]
- movq xm1, [srcq+strideq*1]
- vpblendd m0, m0, m2, 0xf0
- vpbroadcastq m2, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vpblendd m1, m1, m2, 0xf0
- pshufb m0, m5
- pshufb m1, m5
- pmaddubsw m0, m6
- pmaddubsw m1, m6
- phaddw m0, m1
- pmulhrsw m0, m4
-%endif
- mova [tmpq], ym0
- add tmpq, 32
- sub hd, 4
- jg .h_w4_loop
- RET
-.h_w8:
-%if cpuflag(avx512)
- vbroadcasti128 m5, [subpel_h_shufA]
- vbroadcasti128 m6, [subpel_h_shufB]
- vbroadcasti128 m7, [subpel_h_shufC]
- lea stride3q, [strideq*3]
-%endif
-.h_w8_loop:
- movu xm0, [srcq+strideq*0]
- vinserti128 ym0, [srcq+strideq*1], 1
-%if cpuflag(avx512)
- vinserti128 m0, [srcq+strideq*2], 2
- vinserti128 m0, [srcq+stride3q ], 3
-%endif
- lea srcq, [srcq+strideq*(mmsize/(8*2))]
-%if cpuflag(avx512icl)
- mova m10, m4
- mova m11, m4
- pshufb m1, m0, m5
- pshufb m2, m0, m6
- pshufb m3, m0, m7
- vpdpbusd m10, m1, m8
- vpdpbusd m11, m2, m8
- vpdpbusd m10, m2, m9
- vpdpbusd m11, m3, m9
- packssdw m10, m11
- psraw m0, m10, 2
-%else
- PREP_8TAP_H
-%endif
- mova [tmpq], m0
- add tmpq, mmsize
- sub hd, mmsize/(8*2)
- jg .h_w8_loop
- RET
-.h_w16:
-%if cpuflag(avx512icl)
- mova m5, [spel_h_perm16a]
- mova m6, [spel_h_perm16b]
- mova m7, [spel_h_perm16c]
- lea stride3q, [strideq*3]
-.h_w16_loop:
- movu ym0, [srcq+strideq*0]
- movu ym1, [srcq+strideq*2]
- vinserti32x8 m0, [srcq+strideq*1], 1
- vinserti32x8 m1, [srcq+stride3q ], 1
- lea srcq, [srcq+strideq*4]
- PREP_8TAP_H
-%else
-.h_w16_loop:
- movu xm0, [srcq+strideq*0+8*0]
- vinserti128 m0, [srcq+strideq*0+8*1], 1
- PREP_8TAP_H
- mova [tmpq+32*0], m0
- movu xm0, [srcq+strideq*1+8*0]
- vinserti128 m0, m0, [srcq+strideq*1+8*1], 1
- lea srcq, [srcq+strideq*2]
- PREP_8TAP_H
- mova [tmpq+32*1], m0
-%endif
- add tmpq, mmsize*2
- sub hd, mmsize*2/(16*2)
- jg .h_w16_loop
- RET
-.h_w32:
-%if cpuflag(avx512icl)
- mova m5, [spel_h_perm32a]
- mova m6, [spel_h_perm32b]
- mova m7, [spel_h_perm32c]
-.h_w32_loop:
- movu m0, [srcq+strideq*0]
- movu m1, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- PREP_8TAP_H
- add tmpq, 64*2
- sub hd, 2
- jg .h_w32_loop
- RET
-%else
- xor r6d, r6d
- jmp .h_start
-%endif
-.h_w64:
-%if cpuflag(avx512)
- xor r6d, r6d
-%else
- mov r6, -32*1
-%endif
- jmp .h_start
-.h_w128:
-%if cpuflag(avx512)
- mov r6, -64*1
-%else
- mov r6, -32*3
-%endif
-.h_start:
-%if cpuflag(avx512)
- mova m5, [spel_h_perm32a]
- mova m6, [spel_h_perm32b]
- mova m7, [spel_h_perm32c]
-%endif
- sub srcq, r6
- mov r5, r6
-.h_loop:
-%if cpuflag(avx512icl)
- movu m0, [srcq+r6+32*0]
- movu m1, [srcq+r6+32*1]
- PREP_8TAP_H
-%else
- movu xm0, [srcq+r6+8*0]
- vinserti128 ym0, [srcq+r6+8*1], 1
- PREP_8TAP_H
- mova [tmpq+32*0], m0
- movu xm0, [srcq+r6+8*2]
- vinserti128 ym0, [srcq+r6+8*3], 1
- PREP_8TAP_H
- mova [tmpq+32*1], m0
-%endif
- add tmpq, mmsize*2
- add r6, mmsize
- jle .h_loop
- add srcq, strideq
- mov r6, r5
- dec hd
- jg .h_loop
- RET
-.v:
- %assign stack_offset stack_offset - stack_size_padded
- WIN64_SPILL_XMM 16
- movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
- shr myd, 16 ; Note that the code is 8-tap only; having
- cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4
- cmove myd, mxd ; had a negligible effect on performance.
- ; TODO: Would a 6-tap code path be worth it?
-%if cpuflag(avx512)
- tzcnt wd, wd
- movzx wd, word [r7+wq*2+table_offset(prep, _8tap_v)]
- add wq, r7
-%endif
- lea myq, [r7+myq*8+subpel_filters-prep%+SUFFIX]
- lea stride3q, [strideq*3]
- sub srcq, stride3q
- vpbroadcastd m7, [pw_8192]
- vpbroadcastw m8, [myq+0]
- vpbroadcastw m9, [myq+2]
- vpbroadcastw m10, [myq+4]
- vpbroadcastw m11, [myq+6]
-%if cpuflag(avx512)
- jmp wq
-%else
- cmp wd, 8
- jg .v_w16
- je .v_w8
-%endif
-.v_w4:
-%if cpuflag(avx512)
- AVX512_MM_PERMUTATION
- PREP_8TAP_V_W4 23, 24, 25, 26, 27
- AVX512_MM_PERMUTATION
-%else
- PREP_8TAP_V_W4 7, 8, 9, 10, 11
-%endif
- add tmpq, 32
- sub hd, 4
- jg .v_w4_loop
-%if cpuflag(avx512)
- vzeroupper
-%endif
- RET
-.v_w8:
-%if cpuflag(avx512)
- mov r3d, 0xf044
- kmovw k1, r3d
- kshiftrw k2, k1, 8
- movq xm0, [srcq+strideq*0]
- vpbroadcastq ym1, [srcq+strideq*1]
- vpbroadcastq m2, [srcq+strideq*2]
- vpbroadcastq m3, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vpbroadcastq m4, [srcq+strideq*0]
- vpbroadcastq m5, [srcq+strideq*1]
- vpbroadcastq m6, [srcq+strideq*2]
- vmovdqa64 ym0{k1}, ym1
- vmovdqa64 ym1{k1}, ym2
- vmovdqa64 m2{k1}, m3
- vmovdqa64 m3{k1}, m4
- vmovdqa64 m4{k1}, m5
- vmovdqa64 m5{k1}, m6
- punpcklbw ym0, ym1 ; 01 12 __ __
- punpcklbw m2, m3 ; 23 34 23 34
- punpcklbw m4, m5 ; 45 56 45 56
- vmovdqa64 m0{k2}, m2 ; 01 12 23 34
- vmovdqa64 m2{k2}, m4 ; 23 34 45 56
-.v_w8_loop:
- vpbroadcastq m1, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vpbroadcastq m3, [srcq+strideq*0]
- vpbroadcastq m5, [srcq+strideq*1]
- pmaddubsw m14, m0, m8
- pmaddubsw m15, m2, m9
- vpblendmq m0{k1}, m6, m1
- vpblendmq m2{k1}, m1, m3
- vpbroadcastq m6, [srcq+strideq*2]
- paddw m14, m15
- punpcklbw m2, m0, m2 ; 67 78 67 78
- vpblendmq m12{k1}, m3, m5
- vpblendmq m13{k1}, m5, m6
- vpblendmq m0{k2}, m4, m2 ; 45 56 67 78
- punpcklbw m4, m12, m13 ; 89 9a 89 9a
- vmovdqa64 m2{k2}, m4 ; 67 78 89 9a
- pmaddubsw m12, m0, m10
- pmaddubsw m13, m2, m11
- paddw m14, m12
- paddw m14, m13
- pmulhrsw m14, m7
- mova [tmpq], m14
-%else
- movq xm1, [srcq+strideq*0]
- vpbroadcastq m4, [srcq+strideq*1]
- vpbroadcastq m2, [srcq+strideq*2]
- vpbroadcastq m5, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vpbroadcastq m3, [srcq+strideq*0]
- vpbroadcastq m6, [srcq+strideq*1]
- vpbroadcastq m0, [srcq+strideq*2]
- vpblendd m1, m1, m4, 0x30
- vpblendd m4, m4, m2, 0x30
- punpcklbw m1, m4 ; 01 12
- vpblendd m2, m2, m5, 0x30
- vpblendd m5, m5, m3, 0x30
- punpcklbw m2, m5 ; 23 34
- vpblendd m3, m3, m6, 0x30
- vpblendd m6, m6, m0, 0x30
- punpcklbw m3, m6 ; 45 56
-.v_w8_loop:
- vpbroadcastq m4, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- pmaddubsw m5, m2, m9 ; a1
- pmaddubsw m6, m2, m8 ; b0
- vpblendd m2, m0, m4, 0x30
- vpbroadcastq m0, [srcq+strideq*0]
- vpblendd m4, m4, m0, 0x30
- punpcklbw m2, m4 ; 67 78
- pmaddubsw m1, m8 ; a0
- pmaddubsw m4, m3, m9 ; b1
- paddw m5, m1
- mova m1, m3
- pmaddubsw m3, m10 ; a2
- paddw m6, m4
- paddw m5, m3
- vpbroadcastq m4, [srcq+strideq*1]
- vpblendd m3, m0, m4, 0x30
- vpbroadcastq m0, [srcq+strideq*2]
- vpblendd m4, m4, m0, 0x30
- punpcklbw m3, m4 ; 89 9a
- pmaddubsw m4, m2, m11 ; a3
- paddw m5, m4
- pmaddubsw m4, m2, m10 ; b2
- paddw m6, m4
- pmaddubsw m4, m3, m11 ; b3
- paddw m6, m4
- pmulhrsw m5, m7
- pmulhrsw m6, m7
- mova [tmpq+32*0], m5
- mova [tmpq+32*1], m6
-%endif
- add tmpq, 32*2
- sub hd, 4
- jg .v_w8_loop
- RET
-.v_w16:
-%if cpuflag(avx512)
- mov r3d, 0xf0
- kmovb k1, r3d
- vbroadcasti128 m0, [srcq+strideq*0]
- vbroadcasti128 m1, [srcq+strideq*1]
- vbroadcasti128 m2, [srcq+strideq*2]
- vbroadcasti128 m3, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vbroadcasti128 m4, [srcq+strideq*0]
- vbroadcasti128 m5, [srcq+strideq*1]
- vbroadcasti128 m6, [srcq+strideq*2]
- vmovdqa64 m0{k1}, m1
- vmovdqa64 m1{k1}, m2
- vmovdqa64 m2{k1}, m3
- vmovdqa64 m3{k1}, m4
- vmovdqa64 m4{k1}, m5
- vmovdqa64 m5{k1}, m6
- shufpd m0, m2, 0xcc ; 0a_2a 0b_2b 1a_3a 1b_3b
- shufpd m1, m3, 0xcc ; 1a_3a 1b_3b 2a_4a 2b_4b
- shufpd m4, m4, 0x44 ; 4a_-- 4b_-- 5a_-- 5b_--
- shufpd m5, m5, 0x44 ; 5a_-- 5b_-- 6a_-- 6b_--
- punpckhbw m2, m0, m1 ; 23a 23b 34a 34b
- punpcklbw m0, m1 ; 01a 01b 12a 12b
- punpcklbw m4, m5 ; 45a 45b 56a 56b
-.v_w16_loop:
- vbroadcasti128 m3, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vbroadcasti128 m5, [srcq+strideq*0]
- vpblendmq m1{k1}, m6, m3
- vmovdqa64 m3{k1}, m5
- pmaddubsw m12, m0, m8
- pmaddubsw m13, m2, m8
- pmaddubsw m14, m2, m9
- pmaddubsw m15, m4, m9
- pmaddubsw m0, m4, m10
- vbroadcasti128 m2, [srcq+strideq*1]
- vbroadcasti128 m6, [srcq+strideq*2]
- paddw m12, m14
- paddw m13, m15
- paddw m12, m0
- vmovdqa64 m5{k1}, m2
- vmovdqa64 m2{k1}, m6
- mova m0, m4
- shufpd m1, m5, 0xcc ; 6a_8a 6b_8b 7a_9a 7b_9b
- shufpd m3, m2, 0xcc ; 7a_9a 7b_9b 8a_Aa 8b_Ab
- punpcklbw m2, m1, m3 ; 67a 67b 78a 78b
- punpckhbw m4, m1, m3 ; 89a 89b 9Aa 9Ab
- pmaddubsw m14, m2, m10
- pmaddubsw m15, m2, m11
- paddw m13, m14
- paddw m12, m15
- pmaddubsw m14, m4, m11
- paddw m13, m14
- pmulhrsw m12, m7
- pmulhrsw m13, m7
- mova [tmpq+ 0], m12
- mova [tmpq+64], m13
- add tmpq, 64*2
- sub hd, 4
- jg .v_w16_loop
-%else
- lea r6d, [wq-16]
- mov r5, tmpq
- mov r7, srcq
- shl r6d, 4
- mov r6b, hb
-.v_w16_loop0:
- vbroadcasti128 m4, [srcq+strideq*0]
- vbroadcasti128 m5, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vbroadcasti128 m0, [srcq+strideq*1]
- vbroadcasti128 m6, [srcq+strideq*0]
- lea srcq, [srcq+strideq*2]
- vbroadcasti128 m1, [srcq+strideq*0]
- vbroadcasti128 m2, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vbroadcasti128 m3, [srcq+strideq*0]
- shufpd m4, m4, m0, 0x0c
- shufpd m5, m5, m1, 0x0c
- punpcklbw m1, m4, m5 ; 01
- punpckhbw m4, m5 ; 34
- shufpd m6, m6, m2, 0x0c
- punpcklbw m2, m5, m6 ; 12
- punpckhbw m5, m6 ; 45
- shufpd m0, m0, m3, 0x0c
- punpcklbw m3, m6, m0 ; 23
- punpckhbw m6, m0 ; 56
-.v_w16_loop:
- vbroadcasti128 m12, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vbroadcasti128 m13, [srcq+strideq*0]
- pmaddubsw m14, m1, m8 ; a0
- pmaddubsw m15, m2, m8 ; b0
- mova m1, m3
- mova m2, m4
- pmaddubsw m3, m9 ; a1
- pmaddubsw m4, m9 ; b1
- paddw m14, m3
- paddw m15, m4
- mova m3, m5
- mova m4, m6
- pmaddubsw m5, m10 ; a2
- pmaddubsw m6, m10 ; b2
- paddw m14, m5
- paddw m15, m6
- shufpd m6, m0, m12, 0x0d
- shufpd m0, m12, m13, 0x0c
- punpcklbw m5, m6, m0 ; 67
- punpckhbw m6, m0 ; 78
- pmaddubsw m12, m5, m11 ; a3
- pmaddubsw m13, m6, m11 ; b3
- paddw m14, m12
- paddw m15, m13
- pmulhrsw m14, m7
- pmulhrsw m15, m7
- mova [tmpq+wq*0], m14
- mova [tmpq+wq*2], m15
- lea tmpq, [tmpq+wq*4]
- sub hd, 2
- jg .v_w16_loop
- movzx hd, r6b
- add r5, 32
- add r7, 16
- mov tmpq, r5
- mov srcq, r7
- sub r6d, 1<<8
- jg .v_w16_loop0
-%endif
- RET
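-; The AVX-512 build gets dedicated w32/w64/w128 vertical paths that use
-; bilin_v_perm64 to rearrange each row so adjacent rows can be
-; byte-interleaved with punpcklbw before filtering.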
-%if cpuflag(avx512)
-.v_w32:
- mova m18, [bilin_v_perm64]
- movu ym0, [srcq+strideq*0]
- movu ym1, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- movu ym2, [srcq+strideq*0]
- movu ym3, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- movu ym4, [srcq+strideq*0]
- movu ym5, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- movu ym6, [srcq+strideq*0]
- vpermq m0, m18, m0
- vpermq m1, m18, m1
- vpermq m2, m18, m2
- vpermq m3, m18, m3
- vpermq m4, m18, m4
- vpermq m5, m18, m5
- vpermq m6, m18, m6
- punpcklbw m0, m1
- punpcklbw m1, m2
- punpcklbw m2, m3
- punpcklbw m3, m4
- punpcklbw m4, m5
- punpcklbw m5, m6
-.v_w32_loop:
- movu ym12, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- movu ym13, [srcq+strideq*0]
- pmaddubsw m14, m0, m8
- pmaddubsw m16, m2, m9
- pmaddubsw m15, m1, m8
- pmaddubsw m17, m3, m9
- mova m0, m2
- mova m1, m3
- vpermq m12, m18, m12
- vpermq m13, m18, m13
- paddw m14, m16
- paddw m15, m17
- pmaddubsw m16, m4, m10
- pmaddubsw m17, m5, m10
- punpcklbw m6, m12
- punpcklbw m12, m13
- mova m2, m4
- mova m3, m5
- paddw m14, m16
- paddw m15, m17
- pmaddubsw m16, m6, m11
- pmaddubsw m17, m12, m11
- mova m4, m6
- mova m5, m12
- paddw m14, m16
- paddw m15, m17
- pmulhrsw m14, m7
- pmulhrsw m15, m7
- mova m6, m13
- mova [tmpq+ 0], m14
- mova [tmpq+64], m15
- add tmpq, 64*2
- sub hd, 2
- jg .v_w32_loop
- vzeroupper
- RET
-.v_w64:
- mov r6d, hd
- mov wd, 64
- jmp .v_start
-.v_w128:
- lea r6d, [(1<<8)+hq]
- mov wd, 128
-.v_start:
- WIN64_SPILL_XMM 27
- mova m26, [bilin_v_perm64]
- mov r5, tmpq
- mov r7, srcq
-.v_loop0:
- vpermq m0, m26, [srcq+strideq*0]
- vpermq m1, m26, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vpermq m2, m26, [srcq+strideq*0]
- vpermq m3, m26, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vpermq m4, m26, [srcq+strideq*0]
- vpermq m5, m26, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vpermq m6, m26, [srcq+strideq*0]
- punpckhbw m12, m0, m1
- punpcklbw m0, m1
- punpckhbw m13, m1, m2
- punpcklbw m1, m2
- punpckhbw m14, m2, m3
- punpcklbw m2, m3
- punpckhbw m15, m3, m4
- punpcklbw m3, m4
- punpckhbw m16, m4, m5
- punpcklbw m4, m5
- punpckhbw m17, m5, m6
- punpcklbw m5, m6
-.v_loop:
- vpermq m18, m26, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vpermq m19, m26, [srcq+strideq*0]
- pmaddubsw m20, m0, m8
- pmaddubsw m21, m12, m8
- pmaddubsw m22, m1, m8
- pmaddubsw m23, m13, m8
- mova m0, m2
- mova m12, m14
- mova m1, m3
- mova m13, m15
- pmaddubsw m2, m9
- pmaddubsw m14, m9
- pmaddubsw m3, m9
- pmaddubsw m15, m9
- punpckhbw m24, m6, m18
- punpcklbw m6, m18
- paddw m20, m2
- paddw m21, m14
- paddw m22, m3
- paddw m23, m15
- mova m2, m4
- mova m14, m16
- mova m3, m5
- mova m15, m17
- pmaddubsw m4, m10
- pmaddubsw m16, m10
- pmaddubsw m5, m10
- pmaddubsw m17, m10
- punpckhbw m25, m18, m19
- punpcklbw m18, m19
- paddw m20, m4
- paddw m21, m16
- paddw m22, m5
- paddw m23, m17
- mova m4, m6
- mova m16, m24
- mova m5, m18
- mova m17, m25
- pmaddubsw m6, m11
- pmaddubsw m24, m11
- pmaddubsw m18, m11
- pmaddubsw m25, m11
- paddw m20, m6
- paddw m21, m24
- paddw m22, m18
- paddw m23, m25
- pmulhrsw m20, m7
- pmulhrsw m21, m7
- pmulhrsw m22, m7
- pmulhrsw m23, m7
- mova m6, m19
- mova [tmpq+wq*0+ 0], m20
- mova [tmpq+wq*0+64], m21
- mova [tmpq+wq*2+ 0], m22
- mova [tmpq+wq*2+64], m23
- lea tmpq, [tmpq+wq*4]
- sub hd, 2
- jg .v_loop
- movzx hd, r6b
- add r5, 64*2
- add r7, 64
- mov tmpq, r5
- mov srcq, r7
- sub r6d, 1<<8
- jg .v_loop0
-%endif
- RET
-.hv:
- %assign stack_offset stack_offset - stack_size_padded
- %assign stack_size_padded 0
- WIN64_SPILL_XMM 16
- cmp wd, 4
- je .hv_w4
- shr mxd, 16
- sub srcq, 3
- vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
- vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
- movzx mxd, myb
- shr myd, 16
- cmp hd, 4
- cmove myd, mxd
-%if cpuflag(avx512)
- tzcnt wd, wd
- vpbroadcastd m8, [pd_2]
- movzx wd, word [r7+wq*2+table_offset(prep, _8tap_hv)]
- vpbroadcastd m9, [pd_32]
- add wq, r7
-%endif
- vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
- lea stride3q, [strideq*3]
- sub srcq, stride3q
- punpcklbw m0, m0
- psraw m0, 8 ; sign-extend
- pshufd m12, m0, q0000
- pshufd m13, m0, q1111
- pshufd m14, m0, q2222
- pshufd m15, m0, q3333
-%if cpuflag(avx512)
- jmp wq
-%else
- jmp .hv_w8
-%endif
-.hv_w4:
- movzx mxd, mxb
- dec srcq
- vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
- movzx mxd, myb
- shr myd, 16
- cmp hd, 4
- cmove myd, mxd
- vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
- lea stride3q, [strideq*3]
- sub srcq, stride3q
-%if cpuflag(avx512)
- mov r3d, 0x04
- kmovb k1, r3d
- kshiftlb k2, k1, 2
- kshiftlb k3, k1, 4
- vpbroadcastd m10, [pd_2]
- vbroadcasti128 m16, [subpel_h_shufA]
-%else
- mova m7, [subpel_h_shuf4]
- pmovzxbd m9, [deint_shuf4]
- vpbroadcastd m10, [pw_8192]
-%endif
- punpcklbw m0, m0
- psraw m0, 8 ; sign-extend
- vpbroadcastd m11, [pd_32]
- pshufd m12, m0, q0000
- pshufd m13, m0, q1111
- pshufd m14, m0, q2222
- pshufd m15, m0, q3333
-%if cpuflag(avx512icl)
- movq xm3, [srcq+strideq*0]
- vpbroadcastq ym2, [srcq+strideq*1]
- vpbroadcastq ym3{k1}, [srcq+strideq*2]
- vpbroadcastq m2{k2}, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vpbroadcastq m3{k2}, [srcq+strideq*0]
- vpbroadcastq m2{k3}, [srcq+strideq*1]
- vpbroadcastq m3{k3}, [srcq+strideq*2]
- mova m17, [spel_hv_perm4a]
- movu m18, [spel_hv_perm4b]
- mova m0, m10
- mova m1, m10
- pshufb m2, m16
- pshufb m3, m16
- vpdpbusd m0, m2, m8
- vpdpbusd m1, m3, m8
- packssdw m0, m1 ; _ 0 1 2 3 4 5 6
- psraw m0, 2
- vpermb m1, m17, m0 ; 01 12 23 34
- vpermb m2, m18, m0 ; 23 34 45 56
-.hv_w4_loop:
- movq xm3, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- movq xm4, [srcq+strideq*0]
- vpbroadcastq ym3{k1}, [srcq+strideq*1]
- vpbroadcastq ym4{k1}, [srcq+strideq*2]
- mova ym5, ym10
- mova ym6, ym10
- pshufb ym3, ym16
- pshufb ym4, ym16
- vpdpbusd ym5, ym3, ym8
- vpdpbusd ym6, ym4, ym8
- mova m7, m11
- packssdw ym5, ym6 ; 7 8 9 a _ _ _ _
- psraw ym5, 2
- valignq m0, m5, m0, 4 ; _ 4 5 6 7 8 9 a
- vpdpwssd m7, m1, m12
- vpdpwssd m7, m2, m13
- vpermb m1, m17, m0 ; 45 56 67 78
- vpermb m2, m18, m0 ; 67 78 89 9a
- vpdpwssd m7, m1, m14
- vpdpwssd m7, m2, m15
- psrad m7, 6
- vpmovdw [tmpq], m7
-%else
- vpbroadcastq m2, [srcq+strideq*0]
- vpbroadcastq m4, [srcq+strideq*1]
- vpbroadcastq m0, [srcq+strideq*2]
- vpbroadcastq m5, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vpbroadcastq m3, [srcq+strideq*0]
- vpbroadcastq m6, [srcq+strideq*1]
- vpbroadcastq m1, [srcq+strideq*2]
- vpblendd m2, m2, m4, 0xcc ; 0 1
- vpblendd m0, m0, m5, 0xcc ; 2 3
- vpblendd m3, m3, m6, 0xcc ; 4 5
- pshufb m2, m7 ; 00 01 10 11 02 03 12 13
- pshufb m0, m7 ; 20 21 30 31 22 23 32 33
- pshufb m3, m7 ; 40 41 50 51 42 43 52 53
- pshufb m1, m7 ; 60 61 60 61 62 63 62 63
- pmaddubsw m2, m8
- pmaddubsw m0, m8
- pmaddubsw m3, m8
- pmaddubsw m1, m8
- phaddw m2, m0 ; 0a 1a 2a 3a 0b 1b 2b 3b
- phaddw m3, m1 ; 4a 5a 6a __ 4b 5b 6b __
- pmulhrsw m2, m10
- pmulhrsw m3, m10
- palignr m4, m3, m2, 4 ; 1a 2a 3a 4a 1b 2b 3b 4b
- punpcklwd m1, m2, m4 ; 01 12
- punpckhwd m2, m4 ; 23 34
- pshufd m0, m3, q2121
- punpcklwd m3, m0 ; 45 56
-.hv_w4_loop:
- pmaddwd m5, m1, m12 ; a0 b0
- pmaddwd m6, m2, m12 ; c0 d0
- pmaddwd m2, m13 ; a1 b1
- pmaddwd m4, m3, m13 ; c1 d1
- mova m1, m3
- pmaddwd m3, m14 ; a2 b2
- paddd m5, m2
- vpbroadcastq m2, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- paddd m6, m4
- paddd m5, m3
- vpbroadcastq m4, [srcq+strideq*0]
- vpbroadcastq m3, [srcq+strideq*1]
- vpblendd m2, m2, m4, 0xcc
- vpbroadcastq m4, [srcq+strideq*2]
- vpblendd m3, m3, m4, 0xcc
- pshufb m2, m7
- pshufb m3, m7
- pmaddubsw m2, m8
- pmaddubsw m3, m8
- phaddw m2, m3
- pmulhrsw m2, m10
- palignr m3, m2, m0, 12
- mova m0, m2
- punpcklwd m2, m3, m0 ; 67 78
- punpckhwd m3, m0 ; 89 9a
- pmaddwd m4, m2, m14 ; c2 d2
- paddd m6, m11
- paddd m5, m11
- paddd m6, m4
- pmaddwd m4, m2, m15 ; a3 b3
- paddd m5, m4
- pmaddwd m4, m3, m15 ; c3 d3
- paddd m6, m4
- psrad m5, 6
- psrad m6, 6
- packssdw m5, m6
- vpermd m5, m9, m5
- mova [tmpq], m5
-%endif
- add tmpq, 32
- sub hd, 4
- jg .hv_w4_loop
-%if cpuflag(avx512)
- vzeroupper
-%endif
- RET
-.hv_w8:
-%if cpuflag(avx512icl)
- WIN64_SPILL_XMM 24
- vbroadcasti128 m16, [subpel_h_shufA]
- vbroadcasti128 m17, [subpel_h_shufB]
- vbroadcasti128 m18, [subpel_h_shufC]
- vinserti128 ym0, [srcq+strideq*0], 1
- vinserti128 m0, [srcq+strideq*1], 2
- vinserti128 m0, [srcq+strideq*2], 3
- movu xm1, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vinserti128 ym1, [srcq+strideq*0], 1
- vinserti128 m1, [srcq+strideq*1], 2
- vinserti128 m1, [srcq+strideq*2], 3
- mova m2, m8
- mova m4, m8
- mova m3, m8
- mova m5, m8
- pshufb m20, m0, m16
- pshufb m21, m0, m17
- pshufb m22, m0, m18
- pshufb m23, m1, m16
- pshufb m6, m1, m17
- pshufb m7, m1, m18
- vpdpbusd m2, m20, m10
- vpdpbusd m4, m21, m10
- vpdpbusd m2, m21, m11
- vpdpbusd m4, m22, m11
- vpdpbusd m3, m23, m10
- vpdpbusd m5, m6, m10
- vpdpbusd m3, m6, m11
- vpdpbusd m5, m7, m11
- packssdw m2, m4
- packssdw m3, m5
- psraw m2, 2 ; _ 0 1 2
- psraw m3, 2 ; 3 4 5 6
- valignq m0, m3, m2, 2 ; 0 1 2 3
- valignq m1, m3, m2, 4 ; 1 2 3 4
- valignq m2, m3, m2, 6 ; 2 3 4 5
- punpcklwd m4, m0, m1 ; 01a 12a 23a 34a
- punpckhwd m5, m0, m1 ; 01b 12b 23b 34b
- punpcklwd m6, m2, m3 ; 23a 34a 45a 56a
- punpckhwd m7, m2, m3 ; 23b 34b 45b 56b
-.hv_w8_loop:
- movu xm19, [srcq+stride3q ]
- lea srcq, [srcq+strideq*4]
- vinserti128 ym19, [srcq+strideq*0], 1
- vinserti128 m19, [srcq+strideq*1], 2
- vinserti128 m19, [srcq+strideq*2], 3
- mova m20, m9
- mova m21, m9
- mova m22, m8
- mova m23, m8
- vpdpwssd m20, m4, m12
- vpdpwssd m21, m5, m12
- vpdpwssd m20, m6, m13
- vpdpwssd m21, m7, m13
- pshufb m0, m19, m16
- pshufb m1, m19, m17
- pshufb m2, m19, m18
- vpdpbusd m22, m0, m10
- vpdpbusd m23, m1, m10
- vpdpbusd m22, m1, m11
- vpdpbusd m23, m2, m11
- packssdw m22, m23
- psraw m22, 2 ; 7 8 9 A
- valignq m0, m22, m3, 2 ; 4 5 6 7
- valignq m1, m22, m3, 4 ; 5 6 7 8
- valignq m2, m22, m3, 6 ; 6 7 8 9
- mova m3, m22
- punpcklwd m4, m0, m1 ; 45a 56a 67a 78a
- punpckhwd m5, m0, m1 ; 45b 56b 67b 78b
- punpcklwd m6, m2, m3 ; 67a 78a 89a 9Aa
- punpckhwd m7, m2, m3 ; 67b 78b 89b 9Ab
- vpdpwssd m20, m4, m14
- vpdpwssd m21, m5, m14
- vpdpwssd m20, m6, m15
- vpdpwssd m21, m7, m15
- psrad m20, 6
- psrad m21, 6
- packssdw m20, m21
- mova [tmpq], m20
- add tmpq, 64
- sub hd, 4
- jg .hv_w8_loop
-%else
- lea r6d, [wq-8]
- mov r5, tmpq
- mov r7, srcq
- shl r6d, 5
- mov r6b, hb
-.hv_w8_loop0:
- vbroadcasti128 m7, [subpel_h_shufA]
- vbroadcasti128 m8, [subpel_h_shufB]
- vbroadcasti128 m9, [subpel_h_shufC]
- movu xm4, [srcq+strideq*0]
- movu xm5, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- movu xm6, [srcq+strideq*0]
- vbroadcasti128 m0, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vpblendd m4, m4, m0, 0xf0 ; 0 3
- vinserti128 m5, m5, [srcq+strideq*0], 1 ; 1 4
- vinserti128 m6, m6, [srcq+strideq*1], 1 ; 2 5
- lea srcq, [srcq+strideq*2]
- vinserti128 m0, m0, [srcq+strideq*0], 1 ; 3 6
- HV_H_W8 m4, m1, m2, m3, m7, m8, m9
- HV_H_W8 m5, m1, m2, m3, m7, m8, m9
- HV_H_W8 m6, m1, m2, m3, m7, m8, m9
- HV_H_W8 m0, m1, m2, m3, m7, m8, m9
- vpbroadcastd m7, [pw_8192]
- vpermq m4, m4, q3120
- vpermq m5, m5, q3120
- vpermq m6, m6, q3120
- pmulhrsw m0, m7
- pmulhrsw m4, m7
- pmulhrsw m5, m7
- pmulhrsw m6, m7
- vpermq m7, m0, q3120
- punpcklwd m1, m4, m5 ; 01
- punpckhwd m4, m5 ; 34
- punpcklwd m2, m5, m6 ; 12
- punpckhwd m5, m6 ; 45
- punpcklwd m3, m6, m7 ; 23
- punpckhwd m6, m7 ; 56
-.hv_w8_loop:
- vextracti128 [tmpq], m0, 1 ; not enough registers
- movu xm0, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vinserti128 m0, m0, [srcq+strideq*0], 1 ; 7 8
- pmaddwd m8, m1, m12 ; a0
- pmaddwd m9, m2, m12 ; b0
- mova m1, m3
- mova m2, m4
- pmaddwd m3, m13 ; a1
- pmaddwd m4, m13 ; b1
- paddd m8, m3
- paddd m9, m4
- mova m3, m5
- mova m4, m6
- pmaddwd m5, m14 ; a2
- pmaddwd m6, m14 ; b2
- paddd m8, m5
- paddd m9, m6
- vbroadcasti128 m6, [subpel_h_shufB]
- vbroadcasti128 m7, [subpel_h_shufC]
- vbroadcasti128 m5, [subpel_h_shufA]
- HV_H_W8 m0, m5, m6, m7, m5, m6, m7
- vpbroadcastd m5, [pw_8192]
- vpbroadcastd m7, [pd_32]
- vbroadcasti128 m6, [tmpq]
- pmulhrsw m0, m5
- paddd m8, m7
- paddd m9, m7
- vpermq m7, m0, q3120 ; 7 8
- shufpd m6, m6, m7, 0x04 ; 6 7
- punpcklwd m5, m6, m7 ; 67
- punpckhwd m6, m7 ; 78
- pmaddwd m7, m5, m15 ; a3
- paddd m8, m7
- pmaddwd m7, m6, m15 ; b3
- paddd m7, m9
- psrad m8, 6
- psrad m7, 6
- packssdw m8, m7
- vpermq m7, m8, q3120
- mova [tmpq+wq*0], xm7
- vextracti128 [tmpq+wq*2], m7, 1
- lea tmpq, [tmpq+wq*4]
- sub hd, 2
- jg .hv_w8_loop
- movzx hd, r6b
- add r5, 16
- add r7, 8
- mov tmpq, r5
- mov srcq, r7
- sub r6d, 1<<8
- jg .hv_w8_loop0
-%endif
- RET
-%if cpuflag(avx512icl)
-.hv_w16:
- mov wd, 16*2
- jmp .hv_start
-.hv_w32:
- mov wd, 32*2
- jmp .hv_start
-.hv_w64:
- mov wd, 64*2
- jmp .hv_start
-.hv_w128:
- mov wd, 128*2
-.hv_start:
- WIN64_SPILL_XMM 31
- mova m16, [spel_h_perm16a]
- mova m17, [spel_h_perm16b]
- mova m18, [spel_h_perm16c]
- lea r6d, [wq*8-16*2*8+hq]
- mov r5, tmpq
- mov r7, srcq
-.hv_loop0:
- movu ym0, [srcq+strideq*0]
- vinserti32x8 m0, [srcq+strideq*1], 1
- lea srcq, [srcq+strideq*2]
- movu ym1, [srcq+strideq*0]
- vinserti32x8 m1, [srcq+strideq*1], 1
- lea srcq, [srcq+strideq*2]
- movu ym2, [srcq+strideq*0]
- vinserti32x8 m2, [srcq+strideq*1], 1
- lea srcq, [srcq+strideq*2]
- movu ym3, [srcq+strideq*0]
- mova m4, m8
- mova m5, m8
- mova m6, m8
- mova m7, m8
- vpermb m19, m16, m0
- vpermb m20, m17, m0
- vpermb m21, m18, m0
- vpermb m22, m16, m1
- vpermb m23, m17, m1
- vpermb m24, m18, m1
- vpermb m25, m16, m2
- vpermb m26, m17, m2
- vpermb m27, m18, m2
- vpermb ym28, ym16, ym3
- vpermb ym29, ym17, ym3
- vpermb ym30, ym18, ym3
- mova m0, m8
- mova m1, m8
- mova ym2, ym8
- mova ym3, ym8
- vpdpbusd m4, m19, m10
- vpdpbusd m5, m20, m10
- vpdpbusd m6, m22, m10
- vpdpbusd m7, m23, m10
- vpdpbusd m0, m25, m10
- vpdpbusd m1, m26, m10
- vpdpbusd ym2, ym28, ym10
- vpdpbusd ym3, ym29, ym10
- vpdpbusd m4, m20, m11
- vpdpbusd m5, m21, m11
- vpdpbusd m6, m23, m11
- vpdpbusd m7, m24, m11
- vpdpbusd m0, m26, m11
- vpdpbusd m1, m27, m11
- vpdpbusd ym2, ym29, ym11
- vpdpbusd ym3, ym30, ym11
- packssdw m4, m5
- packssdw m6, m7
- packssdw m0, m1
- packssdw ym2, ym3
- psraw m4, 2 ; 0a 0b 1a 1b
- psraw m6, 2 ; 2a 2b 3a 3b
- psraw m0, 2 ; 4a 4b 5a 5b
- psraw ym2, 2 ; 6a 6b __ __
- vshufi32x4 m5, m4, m6, q1032 ; 1a 1b 2a 2b
- vshufi32x4 m7, m6, m0, q1032 ; 3a 3b 4a 4b
- vshufi32x4 m1, m0, m2, q1032 ; 5a 5b 6a 6b
- punpcklwd m2, m4, m5 ; 01a 01c 12a 12c
- punpckhwd m3, m4, m5 ; 01b 01d 12b 12d
- punpcklwd m4, m6, m7 ; 23a 23c 34a 34c
- punpckhwd m5, m6, m7 ; 23b 23d 34b 34d
- punpcklwd m6, m0, m1 ; 45a 45c 56a 56c
- punpckhwd m7, m0, m1 ; 45b 45d 56b 56d
-.hv_loop:
- movu ym19, [srcq+strideq*1]
- lea srcq, [srcq+strideq*2]
- vinserti32x8 m19, [srcq+strideq*0], 1
- mova m20, m9
- mova m21, m9
- mova m22, m8
- mova m23, m8
- vpdpwssd m20, m2, m12
- vpdpwssd m21, m3, m12
- vpdpwssd m20, m4, m13
- vpdpwssd m21, m5, m13
- vpermb m24, m16, m19
- vpermb m25, m17, m19
- vpermb m26, m18, m19
- vpdpbusd m22, m24, m10
- vpdpbusd m23, m25, m10
- vpdpbusd m22, m25, m11
- vpdpbusd m23, m26, m11
- packssdw m22, m23
- psraw m22, 2 ; 7a 7b 8a 8b
- vshufi32x4 m0, m1, m22, q1032 ; 6a 6b 7a 7b
- mova m2, m4
- mova m3, m5
- mova m1, m22
- mova m4, m6
- mova m5, m7
- punpcklwd m6, m0, m1 ; 67a 67c 78a 78c
- punpckhwd m7, m0, m1 ; 67b 67d 78b 78d
- vpdpwssd m20, m4, m14
- vpdpwssd m21, m5, m14
- vpdpwssd m20, m6, m15
- vpdpwssd m21, m7, m15
- psrad m20, 6
- psrad m21, 6
- packssdw m20, m21
- mova [tmpq+wq*0], ym20
- vextracti32x8 [tmpq+wq*1], m20, 1
- lea tmpq, [tmpq+wq*2]
- sub hd, 2
- jg .hv_loop
- movzx hd, r6b
- add r5, 32
- add r7, 16
- mov tmpq, r5
- mov srcq, r7
- sub r6d, 1<<8
- jg .hv_loop0
-%endif
- RET
-%endmacro
-
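-; Helper macros for the scaled MC functions: movifprep only emits the mov
-; when assembling the prep variant, and the REMAP_REG machinery shifts the
-; register aliases by one so that prep_8tap_scaled, whose argument list is
-; one register shorter, can share the put_8tap_scaled register numbering.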
-%macro movifprep 2
- %if isprep
- mov %1, %2
- %endif
-%endmacro
-
-%macro REMAP_REG 2
- %xdefine r%1 r%2
- %xdefine r%1q r%2q
- %xdefine r%1d r%2d
-%endmacro
-
-%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
- %if isprep
- %xdefine r14_save r14
- %assign %%i 14
- %rep 14
- %assign %%j %%i-1
- REMAP_REG %%i, %%j
- %assign %%i %%i-1
- %endrep
- %endif
-%endmacro
-
-%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
- %if isprep
- %assign %%i 1
- %rep 13
- %assign %%j %%i+1
- REMAP_REG %%i, %%j
- %assign %%i %%i+1
- %endrep
- %xdefine r14 r14_save
- %undef r14_save
- %endif
-%endmacro
-
-%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
- MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
- RET
- %if %1
- MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
- %endif
-%endmacro
-
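-; MC_8TAP_SCALED_H loads eight horizontally scaled source positions (the
-; per-pixel offsets held in r4-rX) from two consecutive rows, filters them
-; with the per-pixel coefficients in m15/m10 and rounds into the 16-bit
-; intermediate with m12.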
-%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
- movq xm%1, [srcq+ r4]
- movq xm%2, [srcq+ r6]
- movhps xm%1, [srcq+ r7]
- movhps xm%2, [srcq+ r9]
- vinserti128 m%1, [srcq+r10], 1
- vinserti128 m%2, [srcq+r11], 1
- vpbroadcastq m%5, [srcq+r13]
- vpbroadcastq m%6, [srcq+ rX]
- add srcq, ssq
- movq xm%3, [srcq+ r4]
- movq xm%4, [srcq+ r6]
- movhps xm%3, [srcq+ r7]
- movhps xm%4, [srcq+ r9]
- vinserti128 m%3, [srcq+r10], 1
- vinserti128 m%4, [srcq+r11], 1
- vpbroadcastq m%7, [srcq+r13]
- vpbroadcastq m%8, [srcq+ rX]
- add srcq, ssq
- vpblendd m%1, m%5, 0xc0
- vpblendd m%2, m%6, 0xc0
- vpblendd m%3, m%7, 0xc0
- vpblendd m%4, m%8, 0xc0
- pmaddubsw m%1, m15
- pmaddubsw m%2, m10
- pmaddubsw m%3, m15
- pmaddubsw m%4, m10
- phaddw m%1, m%2
- phaddw m%3, m%4
- phaddw m%1, m%3
- pmulhrsw m%1, m12
-%endmacro
-
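-; MC_8TAP_SCALED generates put_8tap_scaled or prep_8tap_scaled depending on
-; %1. dx/dy are the per-pixel steps with a 10-bit fraction; dy == 1024 (one
-; source row per output row) and dy == 2048 (two rows) branch to the
-; specialized .dy1/.dy2 paths.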
-%macro MC_8TAP_SCALED 1
-%ifidn %1, put
- %assign isprep 0
- %if required_stack_alignment <= STACK_ALIGNMENT
-cglobal put_8tap_scaled, 4, 15, 16, 96, dst, ds, src, ss, w, h, mx, my, dx, dy
- %else
-cglobal put_8tap_scaled, 4, 14, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
- %endif
- %xdefine base_reg r12
- %define rndshift 10
-%else
- %assign isprep 1
- %if required_stack_alignment <= STACK_ALIGNMENT
-cglobal prep_8tap_scaled, 4, 15, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
- %xdefine tmp_stridem r14q
- %else
-cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
- %define tmp_stridem qword [rsp+104]
- %endif
- %xdefine base_reg r11
- %define rndshift 6
-%endif
- lea base_reg, [%1_8tap_scaled_avx2]
-%define base base_reg-%1_8tap_scaled_avx2
- tzcnt wd, wm
- vpbroadcastd m8, dxm
-%if isprep && UNIX64
- movd xm14, mxd
- vpbroadcastd m14, xm14
- mov r5d, t0d
- DECLARE_REG_TMP 5, 7
-%else
- vpbroadcastd m14, mxm
-%endif
- mov dyd, dym
-%ifidn %1, put
- %if WIN64
- mov r8d, hm
- DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
- %define hm r5m
- %define dxm r8m
- %else
- DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
- %define hm r6m
- %endif
- %if required_stack_alignment > STACK_ALIGNMENT
- %define dsm [rsp+96]
- %define rX r1
- %define rXd r1d
- %else
- %define dsm dsq
- %define rX r14
- %define rXd r14d
- %endif
-%else ; prep
- %if WIN64
- mov r7d, hm
- DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
- %define hm r4m
- %define dxm r7m
- %else
- DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
- %define hm [rsp+96]
- %endif
- MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
- %define rX r14
- %define rXd r14d
-%endif
- vpbroadcastd m10, [base+pd_0x3ff]
- vpbroadcastd m12, [base+pw_8192]
-%ifidn %1, put
- vpbroadcastd m13, [base+pd_512]
-%else
- vpbroadcastd m13, [base+pd_32]
-%endif
- pxor m9, m9
- lea ss3q, [ssq*3]
- movzx r7d, t1b
- shr t1d, 16
- cmp hd, 6
- cmovs t1d, r7d
- sub srcq, ss3q
- cmp dyd, 1024
- je .dy1
- cmp dyd, 2048
- je .dy2
- movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
- add wq, base_reg
- jmp wq
-%ifidn %1, put
-.w2:
- mov myd, mym
- movzx t0d, t0b
- dec srcq
- movd xm15, t0d
- punpckldq m8, m9, m8
- paddd m14, m8 ; mx+dx*[0-1]
- vpbroadcastd m11, [base+pd_0x4000]
- vpbroadcastd xm15, xm15
- pand m8, m14, m10
- psrld m8, 6
- paddd xm15, xm8
- movd r4d, xm15
- pextrd r6d, xm15, 1
- vbroadcasti128 m5, [base+bdct_lb_dw]
- vbroadcasti128 m6, [base+subpel_s_shuf2]
- vpbroadcastd m15, [base+subpel_filters+r4*8+2]
- vpbroadcastd m7, [base+subpel_filters+r6*8+2]
- pcmpeqd m8, m9
- psrld m14, 10
- movq xm0, [srcq+ssq*0]
- movq xm1, [srcq+ssq*2]
- movhps xm0, [srcq+ssq*1]
- movhps xm1, [srcq+ss3q ]
- lea srcq, [srcq+ssq*4]
- pshufb m14, m5
- paddb m14, m6
- vinserti128 m0, [srcq+ssq*0], 1
- vinserti128 m1, [srcq+ssq*2], 1
- vpbroadcastq m2, [srcq+ssq*1]
- vpbroadcastq m3, [srcq+ss3q ]
- lea srcq, [srcq+ssq*4]
- vpblendd m15, m7, 0xaa
- vpblendd m0, m2, 0xc0 ; 0 1 4 5
- vpblendd m1, m3, 0xc0 ; 2 3 6 7
- pblendvb m15, m11, m8
- pshufb m0, m14
- pshufb m1, m14
- pmaddubsw m0, m15
- pmaddubsw m1, m15
- phaddw m0, m1
- pmulhrsw m0, m12 ; 0 1 2 3 4 5 6 7
- vextracti128 xm1, m0, 1 ; 4 5 6 7
- palignr xm2, xm1, xm0, 4 ; 1 2 3 4
- punpcklwd xm3, xm0, xm2 ; 01 12
- punpckhwd xm0, xm2 ; 23 34
- pshufd xm4, xm1, q0321 ; 5 6 7 _
- punpcklwd xm2, xm1, xm4 ; 45 56
- punpckhwd xm4, xm1, xm4 ; 67 __
-.w2_loop:
- and myd, 0x3ff
- mov r6d, 64 << 24
- mov r4d, myd
- shr r4d, 6
- lea r4d, [t1+r4]
- cmovnz r6q, [base+subpel_filters+r4*8]
- movq xm11, r6q
- punpcklbw xm11, xm11
- psraw xm11, 8
- pshufd xm8, xm11, q0000
- pshufd xm9, xm11, q1111
- pshufd xm10, xm11, q2222
- pshufd xm11, xm11, q3333
- pmaddwd xm5, xm3, xm8
- pmaddwd xm6, xm0, xm9
- pmaddwd xm7, xm2, xm10
- pmaddwd xm8, xm4, xm11
- paddd xm5, xm6
- paddd xm7, xm8
- paddd xm5, xm13
- paddd xm5, xm7
- psrad xm5, 10
- packssdw xm5, xm5
- packuswb xm5, xm5
- pextrw [dstq], xm5, 0
- add dstq, dsq
- dec hd
- jz .ret
- add myd, dyd
- test myd, ~0x3ff
- jz .w2_loop
- movq xm5, [srcq]
- test myd, 0x400
- jz .w2_skip_line
- add srcq, ssq
- shufps xm3, xm0, q1032 ; 01 12
- shufps xm0, xm2, q1032 ; 23 34
- shufps xm2, xm4, q1032 ; 45 56
- pshufb xm5, xm14
- pmaddubsw xm5, xm15
- phaddw xm5, xm5
- pmulhrsw xm5, xm12
- palignr xm1, xm5, xm1, 12
- punpcklqdq xm1, xm1 ; 6 7 6 7
- punpcklwd xm4, xm1, xm5 ; 67 __
- jmp .w2_loop
-.w2_skip_line:
- movhps xm5, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- mova xm3, xm0 ; 01 12
- mova xm0, xm2 ; 23 34
- pshufb xm5, xm14
- pmaddubsw xm5, xm15
- phaddw xm5, xm5
- pmulhrsw xm5, xm12 ; 6 7 6 7
- palignr xm1, xm5, xm1, 8 ; 4 5 6 7
- pshufd xm5, xm1, q0321 ; 5 6 7 _
- punpcklwd xm2, xm1, xm5 ; 45 56
- punpckhwd xm4, xm1, xm5 ; 67 __
- jmp .w2_loop
-%endif
-.w4:
- mov myd, mym
- vbroadcasti128 m7, [base+rescale_mul]
- movzx t0d, t0b
- dec srcq
- movd xm15, t0d
- pmaddwd m8, m7
- vpbroadcastd m11, [base+pd_0x4000]
- vpbroadcastd xm15, xm15
- paddd m14, m8 ; mx+dx*[0-3]
- pand m0, m14, m10
- psrld m0, 6
- paddd xm15, xm0
- movd r4d, xm15
- pextrd r6d, xm15, 1
- pextrd r11d, xm15, 2
- pextrd r13d, xm15, 3
- movd xm15, [base+subpel_filters+r4*8+2]
- vbroadcasti128 m5, [base+bdct_lb_dw]
- vpbroadcastq m6, [base+subpel_s_shuf2]
- pinsrd xm15, [base+subpel_filters+r6*8+2], 1
- pcmpeqd m0, m9
- psrld m14, 10
- movu xm7, [srcq+ssq*0]
- movu xm9, [srcq+ssq*1]
- pinsrd xm15, [base+subpel_filters+r11*8+2], 2
- movu xm8, [srcq+ssq*2]
- movu xm10, [srcq+ss3q ]
- pinsrd xm15, [base+subpel_filters+r13*8+2], 3
- lea srcq, [srcq+ssq*4]
- pshufb m14, m5
- paddb m14, m6
- vinserti128 m7, [srcq+ssq*0], 1
- vinserti128 m9, [srcq+ssq*1], 1
- vinserti128 m15, xm15, 1
- vinserti128 m8, [srcq+ssq*2], 1
- vinserti128 m10, [srcq+ss3q ], 1
- lea srcq, [srcq+ssq*4]
- pblendvb m15, m11, m0
- pshufb m7, m14
- pshufb m9, m14
- pshufb m8, m14
- pshufb m10, m14
- pmaddubsw m7, m15
- pmaddubsw m9, m15
- pmaddubsw m8, m15
- pmaddubsw m10, m15
- phaddw m7, m9
- phaddw m8, m10
- pmulhrsw m7, m12 ; 0 1 4 5
- pmulhrsw m8, m12 ; 2 3 6 7
- vextracti128 xm9, m7, 1 ; 4 5
- vextracti128 xm3, m8, 1 ; 6 7
- shufps xm4, xm7, xm8, q1032 ; 1 2
- shufps xm5, xm8, xm9, q1032 ; 3 4
- shufps xm6, xm9, xm3, q1032 ; 5 6
- psrldq xm11, xm3, 8 ; 7 _
- punpcklwd xm0, xm7, xm4 ; 01
- punpckhwd xm7, xm4 ; 12
- punpcklwd xm1, xm8, xm5 ; 23
- punpckhwd xm8, xm5 ; 34
- punpcklwd xm2, xm9, xm6 ; 45
- punpckhwd xm9, xm6 ; 56
- punpcklwd xm3, xm11 ; 67
- mova [rsp+0x00], xm7
- mova [rsp+0x10], xm8
- mova [rsp+0x20], xm9
-.w4_loop:
- and myd, 0x3ff
- mov r6d, 64 << 24
- mov r4d, myd
- shr r4d, 6
- lea r4d, [t1+r4]
- cmovnz r6q, [base+subpel_filters+r4*8]
- movq xm10, r6q
- punpcklbw xm10, xm10
- psraw xm10, 8
- pshufd xm7, xm10, q0000
- pshufd xm8, xm10, q1111
- pshufd xm9, xm10, q2222
- pshufd xm10, xm10, q3333
- pmaddwd xm4, xm0, xm7
- pmaddwd xm5, xm1, xm8
- pmaddwd xm6, xm2, xm9
- pmaddwd xm7, xm3, xm10
- paddd xm4, xm5
- paddd xm6, xm7
- paddd xm4, xm13
- paddd xm4, xm6
- psrad xm4, rndshift
- packssdw xm4, xm4
-%ifidn %1, put
- packuswb xm4, xm4
- movd [dstq], xm4
- add dstq, dsq
-%else
- movq [tmpq], xm4
- add tmpq, 8
-%endif
- dec hd
- jz .ret
- add myd, dyd
- test myd, ~0x3ff
- jz .w4_loop
- movu xm4, [srcq]
- test myd, 0x400
- jz .w4_skip_line
- mova xm0, [rsp+0x00]
- mova [rsp+0x00], xm1
- mova xm1, [rsp+0x10]
- mova [rsp+0x10], xm2
- mova xm2, [rsp+0x20]
- mova [rsp+0x20], xm3
- pshufb xm4, xm14
- pmaddubsw xm4, xm15
- phaddw xm4, xm4
- pmulhrsw xm4, xm12
- punpcklwd xm3, xm11, xm4
- mova xm11, xm4
- add srcq, ssq
- jmp .w4_loop
-.w4_skip_line:
- movu xm5, [srcq+ssq*1]
- movu m6, [rsp+0x10]
- pshufb xm4, xm14
- pshufb xm5, xm14
- pmaddubsw xm4, xm15
- pmaddubsw xm5, xm15
- movu [rsp+0x00], m6
- phaddw xm4, xm5
- pmulhrsw xm4, xm12
- punpcklwd xm9, xm11, xm4
- mova [rsp+0x20], xm9
- psrldq xm11, xm4, 8
- mova xm0, xm1
- mova xm1, xm2
- mova xm2, xm3
- punpcklwd xm3, xm4, xm11
- lea srcq, [srcq+ssq*2]
- jmp .w4_loop
-.w8:
-%ifidn %1, put
- movifnidn dsm, dsq
-%endif
- shr t0d, 16
- sub srcq, 3
- movd xm15, t0d
- pmaddwd m8, [base+rescale_mul]
- vpbroadcastq m11, [base+pq_0x40000000]
- vpbroadcastd m15, xm15
- paddd m14, m8 ; mx+dx*[0-7]
- pand m6, m14, m10
- psrld m6, 6
- paddd m15, m6
- pcmpeqd m6, m9
- vextracti128 xm7, m15, 1
- movd r4d, xm15
- pextrd r6d, xm15, 2
- pextrd r7d, xm15, 1
- pextrd r9d, xm15, 3
- movd r10d, xm7
- pextrd r11d, xm7, 2
- pextrd r13d, xm7, 1
- pextrd rXd, xm7, 3
- movq xm15, [base+subpel_filters+r4*8]
- movq xm10, [base+subpel_filters+r6*8]
- movhps xm15, [base+subpel_filters+r7*8]
- movhps xm10, [base+subpel_filters+r9*8]
- vinserti128 m15, [base+subpel_filters+r10*8], 1
- vinserti128 m10, [base+subpel_filters+r11*8], 1
- vpbroadcastq m9, [base+subpel_filters+r13*8]
- vpbroadcastq m8, [base+subpel_filters+rX*8]
- psrld m14, 10
- mova [rsp], xm14
- vextracti128 xm7, m14, 1
- movd r4d, xm14
- pextrd r6d, xm14, 2
- pextrd r7d, xm14, 1
- pextrd r9d, xm14, 3
- movd r10d, xm7
- pextrd r11d, xm7, 2
- pextrd r13d, xm7, 1
- pextrd rXd, xm7, 3
- pshufd m5, m6, q1100
- pshufd m6, m6, q3322
- vpblendd m15, m9, 0xc0
- vpblendd m10, m8, 0xc0
- pblendvb m15, m11, m5
- pblendvb m10, m11, m6
- vbroadcasti128 m14, [base+subpel_s_shuf8]
- MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
- MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
- MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
- MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
- mov myd, mym
- mov dyd, dym
- pshufb m0, m14 ; 01a 01b
- pshufb m1, m14 ; 23a 23b
- pshufb m2, m14 ; 45a 45b
- pshufb m3, m14 ; 67a 67b
- vbroadcasti128 m14, [base+wswap]
-.w8_loop:
- and myd, 0x3ff
- mov r6d, 64 << 24
- mov r4d, myd
- shr r4d, 6
- lea r4d, [t1+r4]
- cmovnz r6q, [base+subpel_filters+r4*8]
- movq xm11, r6q
- punpcklbw xm11, xm11
- psraw xm11, 8
- vinserti128 m11, xm11, 1
- pshufd m8, m11, q0000
- pshufd m9, m11, q1111
- pmaddwd m4, m0, m8
- pmaddwd m5, m1, m9
- pshufd m8, m11, q2222
- pshufd m11, m11, q3333
- pmaddwd m6, m2, m8
- pmaddwd m7, m3, m11
- paddd m4, m5
- paddd m6, m7
- paddd m4, m13
- paddd m4, m6
- psrad m4, rndshift
- vextracti128 xm5, m4, 1
- packssdw xm4, xm5
-%ifidn %1, put
- packuswb xm4, xm4
- movq [dstq], xm4
- add dstq, dsm
-%else
- mova [tmpq], xm4
- add tmpq, 16
-%endif
- dec hd
- jz .ret
- add myd, dyd
- test myd, ~0x3ff
- jz .w8_loop
- test myd, 0x400
- mov [rsp+16], myd
- mov r4d, [rsp+ 0]
- mov r6d, [rsp+ 8]
- mov r7d, [rsp+ 4]
- mov r9d, [rsp+12]
- jz .w8_skip_line
- vpbroadcastq m6, [srcq+r13]
- vpbroadcastq m7, [srcq+ rX]
- movq xm4, [srcq+ r4]
- movq xm5, [srcq+ r6]
- movhps xm4, [srcq+ r7]
- movhps xm5, [srcq+ r9]
- vinserti128 m4, [srcq+r10], 1
- vinserti128 m5, [srcq+r11], 1
- add srcq, ssq
- mov myd, [rsp+16]
- mov dyd, dym
- pshufb m0, m14
- pshufb m1, m14
- pshufb m2, m14
- pshufb m3, m14
- vpblendd m4, m6, 0xc0
- vpblendd m5, m7, 0xc0
- pmaddubsw m4, m15
- pmaddubsw m5, m10
- phaddw m4, m5
- pslld m5, m4, 16
- paddw m4, m5
- pmulhrsw m4, m12
- pblendw m0, m1, 0xaa
- pblendw m1, m2, 0xaa
- pblendw m2, m3, 0xaa
- pblendw m3, m4, 0xaa
- jmp .w8_loop
-.w8_skip_line:
- mova m0, m1
- mova m1, m2
- mova m2, m3
- vpbroadcastq m7, [srcq+r13]
- vpbroadcastq m8, [srcq+ rX]
- movq xm3, [srcq+ r4]
- movq xm4, [srcq+ r6]
- movhps xm3, [srcq+ r7]
- movhps xm4, [srcq+ r9]
- vinserti128 m3, [srcq+r10], 1
- vinserti128 m4, [srcq+r11], 1
- add srcq, ssq
- movq xm5, [srcq+ r4]
- movq xm6, [srcq+ r6]
- movhps xm5, [srcq+ r7]
- movhps xm6, [srcq+ r9]
- vinserti128 m5, [srcq+r10], 1
- vinserti128 m6, [srcq+r11], 1
- vpbroadcastq m9, [srcq+r13]
- vpbroadcastq m11, [srcq+ rX]
- add srcq, ssq
- mov myd, [rsp+16]
- mov dyd, dym
- vpblendd m3, m7, 0xc0
- vpblendd m4, m8, 0xc0
- vpblendd m5, m9, 0xc0
- vpblendd m6, m11, 0xc0
- pmaddubsw m3, m15
- pmaddubsw m4, m10
- pmaddubsw m5, m15
- pmaddubsw m6, m10
- phaddw m3, m4
- phaddw m5, m6
- psrld m4, m3, 16
- pslld m6, m5, 16
- paddw m3, m4
- paddw m5, m6
- pblendw m3, m5, 0xaa
- pmulhrsw m3, m12
- jmp .w8_loop
-.w16:
- mov dword [rsp+48], 2
- movifprep tmp_stridem, 32
- jmp .w_start
-.w32:
- mov dword [rsp+48], 4
- movifprep tmp_stridem, 64
- jmp .w_start
-.w64:
- mov dword [rsp+48], 8
- movifprep tmp_stridem, 128
- jmp .w_start
-.w128:
- mov dword [rsp+48], 16
- movifprep tmp_stridem, 256
-.w_start:
-%ifidn %1, put
- movifnidn dsm, dsq
-%endif
- shr t0d, 16
- sub srcq, 3
- pmaddwd m8, [base+rescale_mul]
- movd xm15, t0d
- mov [rsp+72], t0d
- mov [rsp+56], srcq
- mov [rsp+64], r0q ; dstq / tmpq
-%if UNIX64
- mov hm, hd
-%endif
- shl dword dxm, 3 ; dx*8
- vpbroadcastd m15, xm15
- paddd m14, m8 ; mx+dx*[0-7]
- jmp .hloop
-.hloop_prep:
- dec dword [rsp+48]
- jz .ret
- add qword [rsp+64], 8*(isprep+1)
- mov hd, hm
- vpbroadcastd m8, dxm
- vpbroadcastd m10, [base+pd_0x3ff]
- paddd m14, m8, [rsp+16]
- vpbroadcastd m15, [rsp+72]
- pxor m9, m9
- mov srcq, [rsp+56]
- mov r0q, [rsp+64] ; dstq / tmpq
-.hloop:
- vpbroadcastq m11, [base+pq_0x40000000]
- pand m6, m14, m10
- psrld m6, 6
- paddd m15, m6
- pcmpeqd m6, m9
- vextracti128 xm7, m15, 1
- movd r4d, xm15
- pextrd r6d, xm15, 2
- pextrd r7d, xm15, 1
- pextrd r9d, xm15, 3
- movd r10d, xm7
- pextrd r11d, xm7, 2
- pextrd r13d, xm7, 1
- pextrd rXd, xm7, 3
- movu [rsp+16], m14
- movq xm15, [base+subpel_filters+ r4*8]
- movq xm10, [base+subpel_filters+ r6*8]
- movhps xm15, [base+subpel_filters+ r7*8]
- movhps xm10, [base+subpel_filters+ r9*8]
- vinserti128 m15, [base+subpel_filters+r10*8], 1
- vinserti128 m10, [base+subpel_filters+r11*8], 1
- vpbroadcastq m9, [base+subpel_filters+r13*8]
- vpbroadcastq m8, [base+subpel_filters+ rX*8]
- psrld m14, 10
- vextracti128 xm7, m14, 1
- mova [rsp], xm14
- movd r4d, xm14
- pextrd r6d, xm14, 2
- pextrd r7d, xm14, 1
- pextrd r9d, xm14, 3
- movd r10d, xm7
- pextrd r11d, xm7, 2
- pextrd r13d, xm7, 1
- pextrd rXd, xm7, 3
- pshufd m5, m6, q1100
- pshufd m6, m6, q3322
- vpblendd m15, m9, 0xc0
- vpblendd m10, m8, 0xc0
- pblendvb m15, m11, m5
- pblendvb m10, m11, m6
- vbroadcasti128 m14, [base+subpel_s_shuf8]
- MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
- MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
- MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
- MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
- mov myd, mym
- mov dyd, dym
- pshufb m0, m14 ; 01a 01b
- pshufb m1, m14 ; 23a 23b
- pshufb m2, m14 ; 45a 45b
- pshufb m3, m14 ; 67a 67b
- vbroadcasti128 m14, [base+wswap]
-.vloop:
- and myd, 0x3ff
- mov r6d, 64 << 24
- mov r4d, myd
- shr r4d, 6
- lea r4d, [t1+r4]
- cmovnz r6q, [base+subpel_filters+r4*8]
- movq xm11, r6q
- punpcklbw xm11, xm11
- psraw xm11, 8
- vinserti128 m11, xm11, 1
- pshufd m8, m11, q0000
- pshufd m9, m11, q1111
- pmaddwd m4, m0, m8
- pmaddwd m5, m1, m9
- pshufd m8, m11, q2222
- pshufd m11, m11, q3333
- pmaddwd m6, m2, m8
- pmaddwd m7, m3, m11
- paddd m4, m5
- paddd m6, m7
- paddd m4, m13
- paddd m4, m6
- psrad m4, rndshift
- vextracti128 xm5, m4, 1
- packssdw xm4, xm5
-%ifidn %1, put
- packuswb xm4, xm4
- movq [dstq], xm4
- add dstq, dsm
-%else
- mova [tmpq], xm4
- add tmpq, tmp_stridem
-%endif
- dec hd
- jz .hloop_prep
- add myd, dyd
- test myd, ~0x3ff
- jz .vloop
- test myd, 0x400
- mov [rsp+52], myd
- mov r4d, [rsp+ 0]
- mov r6d, [rsp+ 8]
- mov r7d, [rsp+ 4]
- mov r9d, [rsp+12]
- jz .skip_line
- vpbroadcastq m6, [srcq+r13]
- vpbroadcastq m7, [srcq+ rX]
- movq xm4, [srcq+ r4]
- movq xm5, [srcq+ r6]
- movhps xm4, [srcq+ r7]
- movhps xm5, [srcq+ r9]
- vinserti128 m4, [srcq+r10], 1
- vinserti128 m5, [srcq+r11], 1
- add srcq, ssq
- mov myd, [rsp+52]
- mov dyd, dym
- pshufb m0, m14
- pshufb m1, m14
- pshufb m2, m14
- pshufb m3, m14
- vpblendd m4, m6, 0xc0
- vpblendd m5, m7, 0xc0
- pmaddubsw m4, m15
- pmaddubsw m5, m10
- phaddw m4, m5
- pslld m5, m4, 16
- paddw m4, m5
- pmulhrsw m4, m12
- pblendw m0, m1, 0xaa
- pblendw m1, m2, 0xaa
- pblendw m2, m3, 0xaa
- pblendw m3, m4, 0xaa
- jmp .vloop
-.skip_line:
- mova m0, m1
- mova m1, m2
- mova m2, m3
- vpbroadcastq m7, [srcq+r13]
- vpbroadcastq m8, [srcq+ rX]
- movq xm3, [srcq+ r4]
- movq xm4, [srcq+ r6]
- movhps xm3, [srcq+ r7]
- movhps xm4, [srcq+ r9]
- vinserti128 m3, [srcq+r10], 1
- vinserti128 m4, [srcq+r11], 1
- add srcq, ssq
- movq xm5, [srcq+ r4]
- movq xm6, [srcq+ r6]
- movhps xm5, [srcq+ r7]
- movhps xm6, [srcq+ r9]
- vinserti128 m5, [srcq+r10], 1
- vinserti128 m6, [srcq+r11], 1
- vpbroadcastq m9, [srcq+r13]
- vpbroadcastq m11, [srcq+ rX]
- add srcq, ssq
- mov myd, [rsp+52]
- mov dyd, dym
- vpblendd m3, m7, 0xc0
- vpblendd m4, m8, 0xc0
- vpblendd m5, m9, 0xc0
- vpblendd m6, m11, 0xc0
- pmaddubsw m3, m15
- pmaddubsw m4, m10
- pmaddubsw m5, m15
- pmaddubsw m6, m10
- phaddw m3, m4
- phaddw m5, m6
- psrld m4, m3, 16
- pslld m6, m5, 16
- paddw m3, m4
- paddw m5, m6
- pblendw m3, m5, 0xaa
- pmulhrsw m3, m12
- jmp .vloop
-.dy1:
- movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
- add wq, base_reg
- jmp wq
-%ifidn %1, put
-.dy1_w2:
- mov myd, mym
- movzx t0d, t0b
- dec srcq
- movd xm15, t0d
- punpckldq m8, m9, m8
- paddd m14, m8 ; mx+dx*[0-1]
- vpbroadcastd m11, [base+pd_0x4000]
- vpbroadcastd xm15, xm15
- pand m8, m14, m10
- psrld m8, 6
- paddd xm15, xm8
- movd r4d, xm15
- pextrd r6d, xm15, 1
- vbroadcasti128 m5, [base+bdct_lb_dw]
- vbroadcasti128 m6, [base+subpel_s_shuf2]
- vpbroadcastd m15, [base+subpel_filters+r4*8+2]
- vpbroadcastd m7, [base+subpel_filters+r6*8+2]
- pcmpeqd m8, m9
- psrld m14, 10
- movq xm0, [srcq+ssq*0]
- movq xm1, [srcq+ssq*2]
- movhps xm0, [srcq+ssq*1]
- movhps xm1, [srcq+ss3q ]
- lea srcq, [srcq+ssq*4]
- shr myd, 6
- mov r4d, 64 << 24
- lea myd, [t1+myq]
- cmovnz r4q, [base+subpel_filters+myq*8]
- pshufb m14, m5
- paddb m14, m6
- vinserti128 m0, [srcq+ssq*0], 1
- vinserti128 m1, [srcq+ssq*2], 1
- vpbroadcastq m2, [srcq+ssq*1]
- add srcq, ss3q
- movq xm10, r4q
- punpcklbw xm10, xm10
- psraw xm10, 8
- vpblendd m15, m7, 0xaa
- pblendvb m15, m11, m8
- pshufd xm8, xm10, q0000
- pshufd xm9, xm10, q1111
- pshufd xm11, xm10, q3333
- pshufd xm10, xm10, q2222
- vpblendd m0, m2, 0xc0
- pshufb m1, m14
- pshufb m0, m14
- pmaddubsw m1, m15
- pmaddubsw m0, m15
- phaddw m0, m1
- pmulhrsw m0, m12
- vextracti128 xm1, m0, 1
- palignr xm2, xm1, xm0, 4
- pshufd xm4, xm1, q2121
- punpcklwd xm3, xm0, xm2 ; 01 12
- punpckhwd xm0, xm2 ; 23 34
- punpcklwd xm2, xm1, xm4 ; 45 56
-.dy1_w2_loop:
- movq xm1, [srcq+ssq*0]
- movhps xm1, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- pmaddwd xm5, xm3, xm8
- pmaddwd xm6, xm0, xm9
- pmaddwd xm7, xm2, xm10
- mova xm3, xm0
- mova xm0, xm2
- paddd xm5, xm13
- paddd xm6, xm7
- pshufb xm1, xm14
- pmaddubsw xm1, xm15
- phaddw xm1, xm1
- pmulhrsw xm1, xm12
- palignr xm7, xm1, xm4, 12
- punpcklwd xm2, xm7, xm1 ; 67 78
- pmaddwd xm7, xm2, xm11
- mova xm4, xm1
- paddd xm5, xm6
- paddd xm5, xm7
- psrad xm5, rndshift
- packssdw xm5, xm5
- packuswb xm5, xm5
- pextrw [dstq+dsq*0], xm5, 0
- pextrw [dstq+dsq*1], xm5, 1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .dy1_w2_loop
- RET
-%endif
-.dy1_w4:
- mov myd, mym
- vbroadcasti128 m7, [base+rescale_mul]
- movzx t0d, t0b
- dec srcq
- movd xm15, t0d
- pmaddwd m8, m7
- vpbroadcastd m11, [base+pd_0x4000]
- vpbroadcastd xm15, xm15
- paddd m14, m8 ; mx+dx*[0-3]
- pand m8, m14, m10
- psrld m8, 6
- paddd xm15, xm8
- vpermq m8, m8, q3120
- movd r4d, xm15
- pextrd r6d, xm15, 2
- pextrd r11d, xm15, 1
- pextrd r13d, xm15, 3
- movd xm15, [base+subpel_filters+r4*8+2]
- vpbroadcastd m7, [base+subpel_filters+r6*8+2]
- movu xm2, [srcq+ssq*0]
- movu xm3, [srcq+ssq*2]
- vbroadcasti128 m5, [base+bdct_lb_dw]
- vpbroadcastq m6, [base+subpel_s_shuf2]
- pcmpeqd m8, m9
- psrld m14, 10
- pinsrd xm15, [base+subpel_filters+r11*8+2], 1
- vpblendd m7, [base+subpel_filters+r13*8+2-20], 0x20
- vinserti128 m2, [srcq+ssq*1], 1
- vinserti128 m3, [srcq+ss3q ], 1
- lea srcq, [srcq+ssq*4]
- shr myd, 6
- mov r4d, 64 << 24
- lea myd, [t1+myq]
- cmovnz r4q, [base+subpel_filters+myq*8]
- pshufb m14, m5
- paddb m14, m6
- movu xm4, [srcq+ssq*0]
- movu xm5, [srcq+ssq*2]
- vinserti128 m4, [srcq+ssq*1], 1
- add srcq, ss3q
- vpblendd m15, m7, 0x30
- punpcklqdq m15, m15
- pblendvb m15, m11, m8
- movq xm10, r4q
- punpcklbw xm10, xm10
- psraw xm10, 8
- vinserti128 m10, xm10, 1
- pshufb m2, m14
- pshufb m3, m14
- pshufb m4, m14
- pshufb xm5, xm14
- vpermq m2, m2, q3120
- vpermq m3, m3, q3120
- vpermq m4, m4, q3120
- vpermq m5, m5, q3120
- pshufd m7, m10, q0000
- pshufd m8, m10, q1111
- pshufd m9, m10, q2222
- pshufd m10, m10, q3333
- pmaddubsw m2, m15
- pmaddubsw m3, m15
- pmaddubsw m4, m15
- pmaddubsw m5, m15
- phaddw m2, m3
- phaddw m4, m5
- pmulhrsw m2, m12
- pmulhrsw m4, m12
- palignr m5, m4, m2, 4
- pshufd m3, m4, q2121
- punpcklwd m0, m2, m5 ; 01 12
- punpckhwd m1, m2, m5 ; 23 34
- punpcklwd m2, m4, m3 ; 45 56
-.dy1_w4_loop:
- movu xm11, [srcq+ssq*0]
- vinserti128 m11, [srcq+ssq*1], 1
- lea srcq, [srcq+ssq*2]
- pmaddwd m4, m0, m7
- pmaddwd m5, m1, m8
- pmaddwd m6, m2, m9
- mova m0, m1
- mova m1, m2
- paddd m4, m13
- paddd m5, m6
- pshufb m11, m14
- vpermq m11, m11, q3120
- pmaddubsw m11, m15
- phaddw m11, m11
- pmulhrsw m11, m12
- palignr m6, m11, m3, 12
- punpcklwd m2, m6, m11 ; 67 78
- mova m3, m11
- pmaddwd m6, m2, m10
- paddd m4, m5
- paddd m4, m6
- psrad m4, rndshift
- vextracti128 xm5, m4, 1
- packssdw xm4, xm5
-%ifidn %1, put
- packuswb xm4, xm4
- pshuflw xm4, xm4, q3120
- movd [dstq+dsq*0], xm4
- pextrd [dstq+dsq*1], xm4, 1
- lea dstq, [dstq+dsq*2]
-%else
- pshufd xm4, xm4, q3120
- mova [tmpq], xm4
- add tmpq, 16
-%endif
- sub hd, 2
- jg .dy1_w4_loop
- MC_8TAP_SCALED_RET
-.dy1_w8:
-%ifidn %1, put
- movifnidn dsm, dsq
-%endif
- shr t0d, 16
- sub srcq, 3
- movd xm15, t0d
- pmaddwd m8, [base+rescale_mul]
- vpbroadcastq m11, [base+pq_0x40000000]
- vpbroadcastd m15, xm15
- paddd m14, m8 ; mx+dx*[0-7]
- pand m6, m14, m10
- psrld m6, 6
- paddd m15, m6
- pcmpeqd m6, m9
- vextracti128 xm7, m15, 1
- movd r4d, xm15
- pextrd r6d, xm15, 2
- pextrd r7d, xm15, 1
- pextrd r9d, xm15, 3
- movd r10d, xm7
- pextrd r11d, xm7, 2
- pextrd r13d, xm7, 1
- pextrd rXd, xm7, 3
- movq xm15, [base+subpel_filters+ r4*8]
- movq xm10, [base+subpel_filters+ r6*8]
- movhps xm15, [base+subpel_filters+ r7*8]
- movhps xm10, [base+subpel_filters+ r9*8]
- vinserti128 m15, [base+subpel_filters+r10*8], 1
- vinserti128 m10, [base+subpel_filters+r11*8], 1
- vpbroadcastq m9, [base+subpel_filters+r13*8]
- vpbroadcastq m8, [base+subpel_filters+ rX*8]
- psrld m14, 10
- vextracti128 xm7, m14, 1
- movd r4d, xm14
- pextrd r6d, xm14, 2
- pextrd r7d, xm14, 1
- pextrd r9d, xm14, 3
- movd r10d, xm7
- pextrd r11d, xm7, 2
- pextrd r13d, xm7, 1
- pextrd rXd, xm7, 3
- mov [rsp+32], r7d
- pshufd m5, m6, q1100
- pshufd m6, m6, q3322
- vpblendd m15, m9, 0xc0
- vpblendd m10, m8, 0xc0
- pblendvb m15, m11, m5
- pblendvb m10, m11, m6
- vbroadcasti128 m14, [base+subpel_s_shuf8]
- MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
- MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
- MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
- MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
- mov myd, mym
- movu [rsp], m10
- pshufb m0, m14 ; 01a 01b
- pshufb m1, m14 ; 23a 23b
- pshufb m2, m14 ; 45a 45b
- pshufb m3, m14 ; 67a 67b
- shr myd, 6
- lea myd, [t1+myq]
- mov t1d, 64 << 24
- cmovnz t1q, [base+subpel_filters+myq*8]
- vbroadcasti128 m14, [base+wswap]
- movq xm11, t1q
- punpcklbw xm11, xm11
- psraw xm11, 8
- vinserti128 m11, xm11, 1
- mov r7d, [rsp+32]
- pshufd m8, m11, q0000
- pshufd m9, m11, q1111
- pshufd m10, m11, q2222
- pshufd m11, m11, q3333
-.dy1_w8_loop:
- pmaddwd m4, m0, m8
- pmaddwd m5, m1, m9
- pmaddwd m6, m2, m10
- pmaddwd m7, m3, m11
- paddd m4, m5
- paddd m6, m7
- paddd m4, m13
- paddd m4, m6
- psrad m4, rndshift
- vextracti128 xm5, m4, 1
- packssdw xm4, xm5
-%ifidn %1, put
- packuswb xm4, xm4
- movq [dstq], xm4
- add dstq, dsm
-%else
- mova [tmpq], xm4
- add tmpq, 16
-%endif
- dec hd
- jz .ret
- movq xm4, [srcq+ r4]
- movq xm5, [srcq+ r6]
- movhps xm4, [srcq+ r7]
- movhps xm5, [srcq+ r9]
- vinserti128 m4, [srcq+r10], 1
- vinserti128 m5, [srcq+r11], 1
- vpbroadcastq m6, [srcq+r13]
- vpbroadcastq m7, [srcq+ rX]
- add srcq, ssq
- pshufb m0, m14
- pshufb m1, m14
- pshufb m2, m14
- pshufb m3, m14
- vpblendd m4, m6, 0xc0
- vpblendd m5, m7, 0xc0
- pmaddubsw m4, m15
- pmaddubsw m5, [rsp]
- phaddw m4, m5
- pslld m5, m4, 16
- paddw m4, m5
- pmulhrsw m4, m12
- pblendw m0, m1, 0xaa
- pblendw m1, m2, 0xaa
- pblendw m2, m3, 0xaa
- pblendw m3, m4, 0xaa
- jmp .dy1_w8_loop
-.dy1_w16:
- mov dword [rsp+72], 2
- movifprep tmp_stridem, 32
- jmp .dy1_w_start
-.dy1_w32:
- mov dword [rsp+72], 4
- movifprep tmp_stridem, 64
- jmp .dy1_w_start
-.dy1_w64:
- mov dword [rsp+72], 8
- movifprep tmp_stridem, 128
- jmp .dy1_w_start
-.dy1_w128:
- mov dword [rsp+72], 16
- movifprep tmp_stridem, 256
-.dy1_w_start:
-%ifidn %1, put
- movifnidn dsm, dsq
-%endif
- shr t0d, 16
- sub srcq, 3
- pmaddwd m8, [base+rescale_mul]
- movd xm15, t0d
- mov [rsp+76], t0d
- mov [rsp+80], srcq
- mov [rsp+88], r0q ; dstq / tmpq
-%if UNIX64
- mov hm, hd
-%endif
- shl dword dxm, 3 ; dx*8
- vpbroadcastd m15, xm15
- paddd m14, m8 ; mx+dx*[0-7]
- jmp .dy1_hloop
-.dy1_hloop_prep:
- dec dword [rsp+72]
- jz .ret
- add qword [rsp+88], 8*(isprep+1)
- mov hd, hm
- vpbroadcastd m8, dxm
- vpbroadcastd m10, [base+pd_0x3ff]
- paddd m14, m8, [rsp+32]
- vpbroadcastd m15, [rsp+76]
- pxor m9, m9
- mov srcq, [rsp+80]
- mov r0q, [rsp+88] ; dstq / tmpq
-.dy1_hloop:
- vpbroadcastq m11, [base+pq_0x40000000]
- pand m6, m14, m10
- psrld m6, 6
- paddd m15, m6
- pcmpeqd m6, m9
- vextracti128 xm7, m15, 1
- movd r4d, xm15
- pextrd r6d, xm15, 2
- pextrd r7d, xm15, 1
- pextrd r9d, xm15, 3
- movd r10d, xm7
- pextrd r11d, xm7, 2
- pextrd r13d, xm7, 1
- pextrd rXd, xm7, 3
- movu [rsp+32], m14
- movq xm15, [base+subpel_filters+ r4*8]
- movq xm10, [base+subpel_filters+ r6*8]
- movhps xm15, [base+subpel_filters+ r7*8]
- movhps xm10, [base+subpel_filters+ r9*8]
- vinserti128 m15, [base+subpel_filters+r10*8], 1
- vinserti128 m10, [base+subpel_filters+r11*8], 1
- vpbroadcastq m9, [base+subpel_filters+r13*8]
- vpbroadcastq m8, [base+subpel_filters+ rX*8]
- psrld m14, 10
- vextracti128 xm7, m14, 1
- movq [rsp+64], xm14
- movd r4d, xm14
- pextrd r6d, xm14, 2
- pextrd r7d, xm14, 1
- pextrd r9d, xm14, 3
- movd r10d, xm7
- pextrd r11d, xm7, 2
- pextrd r13d, xm7, 1
- pextrd rXd, xm7, 3
- pshufd m5, m6, q1100
- pshufd m6, m6, q3322
- vpblendd m15, m9, 0xc0
- vpblendd m10, m8, 0xc0
- pblendvb m15, m11, m5
- pblendvb m10, m11, m6
- vbroadcasti128 m14, [base+subpel_s_shuf8]
- MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
- MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
- MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
- MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
- mov myd, mym
- movu [rsp], m10
- pshufb m0, m14 ; 01a 01b
- pshufb m1, m14 ; 23a 23b
- pshufb m2, m14 ; 45a 45b
- pshufb m3, m14 ; 67a 67b
- shr myd, 6
- mov r4d, 64 << 24
- lea myd, [t1+myq]
- cmovnz r4q, [base+subpel_filters+myq*8]
- vbroadcasti128 m14, [base+wswap]
- movq xm11, r4q
- punpcklbw xm11, xm11
- psraw xm11, 8
- vinserti128 m11, xm11, 1
- mov r4d, [rsp+64]
- mov r7d, [rsp+68]
- pshufd m8, m11, q0000
- pshufd m9, m11, q1111
- pshufd m10, m11, q2222
- pshufd m11, m11, q3333
-.dy1_vloop:
- pmaddwd m4, m0, m8
- pmaddwd m5, m1, m9
- pmaddwd m6, m2, m10
- pmaddwd m7, m3, m11
- paddd m4, m5
- paddd m6, m7
- paddd m4, m13
- paddd m4, m6
- psrad m4, rndshift
- vextracti128 xm5, m4, 1
- packssdw xm4, xm5
-%ifidn %1, put
- packuswb xm4, xm4
- movq [dstq], xm4
- add dstq, dsm
-%else
- mova [tmpq], xm4
- add tmpq, tmp_stridem
-%endif
- dec hd
- jz .dy1_hloop_prep
- movq xm4, [srcq+ r4]
- movq xm5, [srcq+ r6]
- movhps xm4, [srcq+ r7]
- movhps xm5, [srcq+ r9]
- vinserti128 m4, [srcq+r10], 1
- vinserti128 m5, [srcq+r11], 1
- vpbroadcastq m6, [srcq+r13]
- vpbroadcastq m7, [srcq+ rX]
- add srcq, ssq
- pshufb m0, m14
- pshufb m1, m14
- pshufb m2, m14
- pshufb m3, m14
- vpblendd m4, m6, 0xc0
- vpblendd m5, m7, 0xc0
- pmaddubsw m4, m15
- pmaddubsw m5, [rsp]
- phaddw m4, m5
- pslld m5, m4, 16
- paddw m4, m5
- pmulhrsw m4, m12
- pblendw m0, m1, 0xaa
- pblendw m1, m2, 0xaa
- pblendw m2, m3, 0xaa
- pblendw m3, m4, 0xaa
- jmp .dy1_vloop
-.dy2:
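- ; dy2: two new source rows are loaded per output row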
- movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
- add wq, base_reg
- jmp wq
-%ifidn %1, put
-.dy2_w2:
- mov myd, mym
- movzx t0d, t0b
- dec srcq
- movd xm15, t0d
- punpckldq m8, m9, m8
- paddd m14, m8 ; mx+dx*[0-1]
- vpbroadcastd m11, [base+pd_0x4000]
- vpbroadcastd xm15, xm15
- pand m8, m14, m10
- psrld m8, 6
- paddd xm15, xm8
- movd r4d, xm15
- pextrd r6d, xm15, 1
- vbroadcasti128 m5, [base+bdct_lb_dw]
- vbroadcasti128 m6, [base+subpel_s_shuf2]
- vpbroadcastd m15, [base+subpel_filters+r4*8+2]
- vpbroadcastd m7, [base+subpel_filters+r6*8+2]
- pcmpeqd m8, m9
- psrld m14, 10
- movq xm0, [srcq+ssq*0]
- vpbroadcastq m2, [srcq+ssq*1]
- movhps xm0, [srcq+ssq*2]
- vpbroadcastq m3, [srcq+ss3q ]
- lea srcq, [srcq+ssq*4]
- pshufb m14, m5
- paddb m14, m6
- vpblendd m15, m7, 0xaa
- pblendvb m15, m11, m8
- movhps xm1, [srcq+ssq*0]
- vpbroadcastq m4, [srcq+ssq*1]
- lea srcq, [srcq+ssq*2]
- shr myd, 6
- mov r4d, 64 << 24
- lea myd, [t1+myq]
- cmovnz r4q, [base+subpel_filters+myq*8]
- vpblendd m0, m2, 0x30
- vpblendd m1, m4, 0xc0
- vpblendd m0, m3, 0xc0
- pshufb m0, m14
- pshufb m1, m14
- pmaddubsw m0, m15
- pmaddubsw m1, m15
- movq xm11, r4q
- punpcklbw xm11, xm11
- psraw xm11, 8
- phaddw m0, m1
- pmulhrsw m0, m12 ; 0 2 _ 4 1 3 _ 5
- pshufd xm8, xm11, q0000
- pshufd xm9, xm11, q1111
- pshufd xm10, xm11, q2222
- pshufd xm11, xm11, q3333
- pshufd m2, m0, q3110 ; 0 2 2 4 1 3 3 5
- vextracti128 xm1, m2, 1
- punpcklwd xm3, xm2, xm1 ; 01 23
- punpckhwd xm2, xm1 ; 23 45
-.dy2_w2_loop:
- movq xm6, [srcq+ssq*0]
- vpbroadcastq m7, [srcq+ssq*1]
- movhps xm6, [srcq+ssq*2]
- vpbroadcastq m1, [srcq+ss3q ]
- lea srcq, [srcq+ssq*4]
- pmaddwd xm4, xm3, xm8
- pmaddwd xm5, xm2, xm9
- vpblendd m6, m7, 0x30
- vpblendd m6, m1, 0xc0
- pshufb m6, m14
- pmaddubsw m6, m15
- phaddw m6, m6
- pmulhrsw m6, m12
- palignr m0, m6, m0, 8
- pshufd m2, m0, q3221
- vextracti128 xm1, m2, 1
- punpcklwd xm3, xm2, xm1 ; 45 67
- punpckhwd xm2, xm1 ; 67 89
- pmaddwd xm6, xm3, xm10
- pmaddwd xm7, xm2, xm11
- paddd xm4, xm5
- paddd xm4, xm13
- paddd xm6, xm7
- paddd xm4, xm6
- psrad xm4, rndshift
- packssdw xm4, xm4
- packuswb xm4, xm4
- pextrw [dstq+dsq*0], xm4, 0
- pextrw [dstq+dsq*1], xm4, 1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .dy2_w2_loop
- RET
-%endif
-.dy2_w4:
- mov myd, mym
- vbroadcasti128 m7, [base+rescale_mul]
- movzx t0d, t0b
- dec srcq
- movd xm15, t0d
- pmaddwd m8, m7
- vpbroadcastd m11, [base+pd_0x4000]
- vpbroadcastd xm15, xm15
- paddd m14, m8 ; mx+dx*[0-3]
- pand m8, m14, m10
- psrld m8, 6
- paddd xm15, xm8
- movd r4d, xm15
- pextrd r6d, xm15, 1
- pextrd r11d, xm15, 2
- pextrd r13d, xm15, 3
- movd xm15, [base+subpel_filters+r4*8+2]
- vbroadcasti128 m5, [base+bdct_lb_dw]
- vpbroadcastq m6, [base+subpel_s_shuf2]
- pinsrd xm15, [base+subpel_filters+r6*8+2], 1
- pcmpeqd m8, m9
- psrld m14, 10
- movu xm0, [srcq+ssq*0]
- movu xm2, [srcq+ssq*2]
- pinsrd xm15, [base+subpel_filters+r11*8+2], 2
- movu xm1, [srcq+ssq*1]
- movu xm3, [srcq+ss3q ]
- pinsrd xm15, [base+subpel_filters+r13*8+2], 3
- lea srcq, [srcq+ssq*4]
- shr myd, 6
- mov r4d, 64 << 24
- lea myd, [t1+myq]
- cmovnz r4q, [base+subpel_filters+myq*8]
- vinserti128 m15, xm15, 1
- pshufb m14, m5
- paddb m14, m6
- vinserti128 m2, [srcq+ssq*0], 1
- vinserti128 m3, [srcq+ssq*1], 1
- lea srcq, [srcq+ssq*2]
- pblendvb m15, m11, m8
- pshufb xm0, xm14
- pshufb m2, m14
- pshufb xm1, xm14
- pshufb m3, m14
- pmaddubsw xm0, xm15
- pmaddubsw m2, m15
- pmaddubsw xm1, xm15
- pmaddubsw m3, m15
- movq xm11, r4q
- punpcklbw xm11, xm11
- psraw xm11, 8
- vinserti128 m11, xm11, 1
- phaddw m0, m2
- phaddw m1, m3
- pmulhrsw m0, m12 ; 0 2 _ 4
- pmulhrsw m1, m12 ; 1 3 _ 5
- pshufd m8, m11, q0000
- pshufd m9, m11, q1111
- pshufd m10, m11, q2222
- pshufd m11, m11, q3333
- punpcklwd xm2, xm0, xm1
- punpckhwd m1, m0, m1 ; 23 45
- vinserti128 m0, m2, xm1, 1 ; 01 23
-.dy2_w4_loop:
- movu xm6, [srcq+ssq*0]
- movu xm7, [srcq+ssq*1]
- vinserti128 m6, [srcq+ssq*2], 1
- vinserti128 m7, [srcq+ss3q ], 1
- lea srcq, [srcq+ssq*4]
- pmaddwd m4, m0, m8
- pmaddwd m5, m1, m9
- pshufb m6, m14
- pshufb m7, m14
- pmaddubsw m6, m15
- pmaddubsw m7, m15
- psrld m2, m6, 16
- pslld m3, m7, 16
- paddw m6, m2
- paddw m7, m3
- pblendw m6, m7, 0xaa ; 67 89
- pmulhrsw m6, m12
- paddd m4, m5
- vpblendd m0, m1, m6, 0x0f
- mova m1, m6
- vpermq m0, m0, q1032 ; 45 67
- pmaddwd m6, m0, m10
- pmaddwd m7, m1, m11
- paddd m4, m13
- paddd m6, m7
- paddd m4, m6
- psrad m4, rndshift
- vextracti128 xm5, m4, 1
- packssdw xm4, xm5
-%ifidn %1, put
- packuswb xm4, xm4
- movd [dstq+dsq*0], xm4
- pextrd [dstq+dsq*1], xm4, 1
- lea dstq, [dstq+dsq*2]
-%else
- mova [tmpq], xm4
- add tmpq, 16
-%endif
- sub hd, 2
- jg .dy2_w4_loop
- MC_8TAP_SCALED_RET
-.dy2_w8:
-%ifidn %1, put
- movifnidn dsm, dsq
-%endif
- shr t0d, 16
- sub srcq, 3
- movd xm15, t0d
- pmaddwd m8, [base+rescale_mul]
- vpbroadcastq m11, [base+pq_0x40000000]
- vpbroadcastd m15, xm15
- paddd m14, m8 ; mx+dx*[0-7]
- pand m6, m14, m10
- psrld m6, 6
- paddd m15, m6
- pcmpeqd m6, m9
- vextracti128 xm7, m15, 1
- movd r4d, xm15
- pextrd r6d, xm15, 2
- pextrd r7d, xm15, 1
- pextrd r9d, xm15, 3
- movd r10d, xm7
- pextrd r11d, xm7, 2
- pextrd r13d, xm7, 1
- pextrd rXd, xm7, 3
- movq xm15, [base+subpel_filters+ r4*8]
- movq xm10, [base+subpel_filters+ r6*8]
- movhps xm15, [base+subpel_filters+ r7*8]
- movhps xm10, [base+subpel_filters+ r9*8]
- vinserti128 m15, [base+subpel_filters+r10*8], 1
- vinserti128 m10, [base+subpel_filters+r11*8], 1
- vpbroadcastq m9, [base+subpel_filters+r13*8]
- vpbroadcastq m8, [base+subpel_filters+ rX*8]
- psrld m14, 10
- vextracti128 xm7, m14, 1
- movd r4d, xm14
- pextrd r6d, xm14, 2
- pextrd r7d, xm14, 1
- pextrd r9d, xm14, 3
- movd r10d, xm7
- pextrd r11d, xm7, 2
- pextrd r13d, xm7, 1
- pextrd rXd, xm7, 3
- mov [rsp], r7d
- pshufd m5, m6, q1100
- pshufd m6, m6, q3322
- vpblendd m15, m9, 0xc0
- vpblendd m10, m8, 0xc0
- pblendvb m15, m11, m5
- pblendvb m10, m11, m6
- vbroadcasti128 m14, [base+subpel_s_shuf8]
- MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
- MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
- MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
- MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
- mov myd, mym
- pshufb m0, m14 ; 01a 01b
- pshufb m1, m14 ; 23a 23b
- pshufb m2, m14 ; 45a 45b
- pshufb m3, m14 ; 67a 67b
- shr myd, 6
- lea myd, [t1+myq]
- mov t1d, 64 << 24
- cmovnz t1q, [base+subpel_filters+myq*8]
- movq xm11, t1q
- punpcklbw xm11, xm11
- psraw xm11, 8
- vinserti128 m11, xm11, 1
- mov r7d, [rsp]
- pshufd m8, m11, q0000
- pshufd m9, m11, q1111
- pshufd m14, m11, q2222
- pshufd m11, m11, q3333
-.dy2_w8_loop:
- pmaddwd m4, m0, m8
- pmaddwd m5, m1, m9
- pmaddwd m6, m2, m14
- pmaddwd m7, m3, m11
- paddd m4, m5
- paddd m6, m7
- paddd m4, m13
- paddd m4, m6
- psrad m4, rndshift
- vextracti128 xm5, m4, 1
- packssdw xm4, xm5
-%ifidn %1, put
- packuswb xm4, xm4
- movq [dstq], xm4
- add dstq, dsm
-%else
- mova [tmpq], xm4
- add tmpq, 16
-%endif
- dec hd
- jz .ret
- mova m0, m1
- mova m1, m2
- mova m2, m3
- movq xm3, [srcq+ r4]
- movq xm4, [srcq+ r6]
- movhps xm3, [srcq+ r7]
- movhps xm4, [srcq+ r9]
- vinserti128 m3, [srcq+r10], 1
- vinserti128 m4, [srcq+r11], 1
- vpbroadcastq m5, [srcq+r13]
- vpbroadcastq m6, [srcq+ rX]
- add srcq, ssq
- vpblendd m3, m5, 0xc0
- vpblendd m4, m6, 0xc0
- pmaddubsw m3, m15
- pmaddubsw m4, m10
- phaddw m3, m4
- movq xm4, [srcq+ r4]
- movq xm5, [srcq+ r6]
- movhps xm4, [srcq+ r7]
- movhps xm5, [srcq+ r9]
- vinserti128 m4, [srcq+r10], 1
- vinserti128 m5, [srcq+r11], 1
- vpbroadcastq m6, [srcq+r13]
- vpbroadcastq m7, [srcq+ rX]
- add srcq, ssq
- vpblendd m4, m6, 0xc0
- vpblendd m5, m7, 0xc0
- pmaddubsw m4, m15
- pmaddubsw m5, m10
- phaddw m4, m5
- psrld m5, m3, 16
- pslld m6, m4, 16
- paddw m3, m5
- paddw m4, m6
- pblendw m3, m4, 0xaa
- pmulhrsw m3, m12
- jmp .dy2_w8_loop
-.dy2_w16:
- mov dword [rsp+40], 2
- movifprep tmp_stridem, 32
- jmp .dy2_w_start
-.dy2_w32:
- mov dword [rsp+40], 4
- movifprep tmp_stridem, 64
- jmp .dy2_w_start
-.dy2_w64:
- mov dword [rsp+40], 8
- movifprep tmp_stridem, 128
- jmp .dy2_w_start
-.dy2_w128:
- mov dword [rsp+40], 16
- movifprep tmp_stridem, 256
-.dy2_w_start:
-%ifidn %1, put
- movifnidn dsm, dsq
-%endif
- shr t0d, 16
- sub srcq, 3
- pmaddwd m8, [base+rescale_mul]
- movd xm15, t0d
- mov [rsp+64], t0d
- mov [rsp+48], srcq
- mov [rsp+56], r0q ; dstq / tmpq
-%if UNIX64
- mov hm, hd
-%endif
- shl dword dxm, 3 ; dx*8
- vpbroadcastd m15, xm15
- paddd m14, m8 ; mx+dx*[0-7]
- jmp .dy2_hloop
-.dy2_hloop_prep:
- dec dword [rsp+40]
- jz .ret
- add qword [rsp+56], 8*(isprep+1)
- mov hd, hm
- vpbroadcastd m8, dxm
- vpbroadcastd m10, [base+pd_0x3ff]
- paddd m14, m8, [rsp]
- vpbroadcastd m15, [rsp+64]
- pxor m9, m9
- mov srcq, [rsp+48]
- mov r0q, [rsp+56] ; dstq / tmpq
-.dy2_hloop:
- vpbroadcastq m11, [base+pq_0x40000000]
- pand m6, m14, m10
- psrld m6, 6
- paddd m15, m6
- pcmpeqd m6, m9
- vextracti128 xm7, m15, 1
- movd r4d, xm15
- pextrd r6d, xm15, 2
- pextrd r7d, xm15, 1
- pextrd r9d, xm15, 3
- movd r10d, xm7
- pextrd r11d, xm7, 2
- pextrd r13d, xm7, 1
- pextrd rXd, xm7, 3
- movu [rsp], m14
- movq xm15, [base+subpel_filters+ r4*8]
- movq xm10, [base+subpel_filters+ r6*8]
- movhps xm15, [base+subpel_filters+ r7*8]
- movhps xm10, [base+subpel_filters+ r9*8]
- vinserti128 m15, [base+subpel_filters+r10*8], 1
- vinserti128 m10, [base+subpel_filters+r11*8], 1
- vpbroadcastq m9, [base+subpel_filters+r13*8]
- vpbroadcastq m8, [base+subpel_filters+ rX*8]
- psrld m14, 10
- vextracti128 xm7, m14, 1
- movq [rsp+32], xm14
- movd r4d, xm14
- pextrd r6d, xm14, 2
- pextrd r7d, xm14, 1
- pextrd r9d, xm14, 3
- movd r10d, xm7
- pextrd r11d, xm7, 2
- pextrd r13d, xm7, 1
- pextrd rXd, xm7, 3
- pshufd m5, m6, q1100
- pshufd m6, m6, q3322
- vpblendd m15, m9, 0xc0
- vpblendd m10, m8, 0xc0
- pblendvb m15, m11, m5
- pblendvb m10, m11, m6
- vbroadcasti128 m14, [base+subpel_s_shuf8]
- MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
- MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
- MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
- MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
- mov myd, mym
- pshufb m0, m14 ; 01a 01b
- pshufb m1, m14 ; 23a 23b
- pshufb m2, m14 ; 45a 45b
- pshufb m3, m14 ; 67a 67b
- shr myd, 6
- mov r4d, 64 << 24
- lea myd, [t1+myq]
- cmovnz r4q, [base+subpel_filters+myq*8]
- movq xm14, r4q
- punpcklbw xm14, xm14
- psraw xm14, 8
- vinserti128 m14, xm14, 1
- mov r4d, [rsp+32]
- mov r7d, [rsp+36]
- pshufd m8, m14, q0000
- pshufd m9, m14, q1111
- pshufd m11, m14, q2222
- pshufd m14, m14, q3333
-.dy2_vloop:
- pmaddwd m4, m0, m8
- pmaddwd m5, m1, m9
- pmaddwd m6, m2, m11
- pmaddwd m7, m3, m14
- paddd m4, m5
- paddd m6, m7
- paddd m4, m13
- paddd m4, m6
- psrad m4, rndshift
- vextracti128 xm5, m4, 1
- packssdw xm4, xm5
-%ifidn %1, put
- packuswb xm4, xm4
- movq [dstq], xm4
- add dstq, dsm
-%else
- mova [tmpq], xm4
- add tmpq, tmp_stridem
-%endif
- dec hd
- jz .dy2_hloop_prep
- mova m0, m1
- mova m1, m2
- mova m2, m3
- movq xm3, [srcq+ r4]
- movq xm4, [srcq+ r6]
- movhps xm3, [srcq+ r7]
- movhps xm4, [srcq+ r9]
- vinserti128 m3, [srcq+r10], 1
- vinserti128 m4, [srcq+r11], 1
- vpbroadcastq m5, [srcq+r13]
- vpbroadcastq m6, [srcq+ rX]
- add srcq, ssq
- vpblendd m3, m5, 0xc0
- vpblendd m4, m6, 0xc0
- pmaddubsw m3, m15
- pmaddubsw m4, m10
- phaddw m3, m4
- movq xm4, [srcq+ r4]
- movq xm5, [srcq+ r6]
- movhps xm4, [srcq+ r7]
- movhps xm5, [srcq+ r9]
- vinserti128 m4, [srcq+r10], 1
- vinserti128 m5, [srcq+r11], 1
- vpbroadcastq m6, [srcq+r13]
- vpbroadcastq m7, [srcq+ rX]
- add srcq, ssq
- vpblendd m4, m6, 0xc0
- vpblendd m5, m7, 0xc0
- pmaddubsw m4, m15
- pmaddubsw m5, m10
- phaddw m4, m5
- psrld m5, m3, 16
- pslld m6, m4, 16
- paddw m3, m5
- paddw m4, m6
- pblendw m3, m4, 0xaa
- pmulhrsw m3, m12
- jmp .dy2_vloop
-.ret:
- MC_8TAP_SCALED_RET 0
-%undef isprep
-%endmacro
-
-%macro BILIN_SCALED_FN 1
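- ; scaled bilinear MC reuses the 8-tap scaled code via a preset filter selector in t0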
-cglobal %1_bilin_scaled
- mov t0d, (5*15 << 16) | 5*15
- mov t1d, t0d
- jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX)
-%endmacro
-
-%if WIN64
-DECLARE_REG_TMP 6, 5
-%else
-DECLARE_REG_TMP 6, 8
-%endif
-
-%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
-%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
-
-BILIN_SCALED_FN put
-PUT_8TAP_SCALED_FN sharp, SHARP, SHARP
-PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
-PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
-PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
-PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
-PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
-PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
-PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
-PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
-MC_8TAP_SCALED put
-
-%if WIN64
-DECLARE_REG_TMP 5, 4
-%else
-DECLARE_REG_TMP 6, 7
-%endif
-
-BILIN_SCALED_FN prep
-PREP_8TAP_SCALED_FN sharp, SHARP, SHARP
-PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
-PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
-PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
-PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
-PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
-PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
-PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
-PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR
-MC_8TAP_SCALED prep
-
-%macro WARP_V 5 ; dst, 02, 46, 13, 57
- ; Can be done using gathers, but that's terribly slow on many CPUs
- lea tmp1d, [myq+deltaq*4]
- lea tmp2d, [myq+deltaq*1]
- shr myd, 10
- shr tmp1d, 10
- movq xm8, [filterq+myq *8]
- vinserti128 m8, [filterq+tmp1q*8], 1 ; a e
- lea tmp1d, [tmp2q+deltaq*4]
- lea myd, [tmp2q+deltaq*1]
- shr tmp2d, 10
- shr tmp1d, 10
- movq xm0, [filterq+tmp2q*8]
- vinserti128 m0, [filterq+tmp1q*8], 1 ; b f
- lea tmp1d, [myq+deltaq*4]
- lea tmp2d, [myq+deltaq*1]
- shr myd, 10
- shr tmp1d, 10
- movq xm9, [filterq+myq *8]
- vinserti128 m9, [filterq+tmp1q*8], 1 ; c g
- lea tmp1d, [tmp2q+deltaq*4]
- lea myd, [tmp2q+gammaq] ; my += gamma
- shr tmp2d, 10
- shr tmp1d, 10
- punpcklwd m8, m0
- movq xm0, [filterq+tmp2q*8]
- vinserti128 m0, [filterq+tmp1q*8], 1 ; d h
- punpcklwd m0, m9, m0
- punpckldq m9, m8, m0
- punpckhdq m0, m8, m0
- punpcklbw m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
- punpckhbw m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
- pmaddwd m%2, m8
- pmaddwd m9, m%3
- punpcklbw m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
- punpckhbw m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
- pmaddwd m8, m%4
- pmaddwd m0, m%5
- paddd m%2, m9
- paddd m0, m8
- paddd m%1, m0, m%2
-%endmacro
-
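- ; same as warp_affine_8x8, but stores 16-bit intermediates to tmp instead of pixels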
-cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts
-%if WIN64
- sub rsp, 0xa0
-%endif
- call mangle(private_prefix %+ _warp_affine_8x8_avx2).main
-.loop:
- psrad m7, 13
- psrad m0, 13
- packssdw m7, m0
- pmulhrsw m7, m14 ; (x + (1 << 6)) >> 7
- vpermq m7, m7, q3120
- mova [tmpq+tsq*0], xm7
- vextracti128 [tmpq+tsq*2], m7, 1
- dec r4d
- jz mangle(private_prefix %+ _warp_affine_8x8_avx2).end
- call mangle(private_prefix %+ _warp_affine_8x8_avx2).main2
- lea tmpq, [tmpq+tsq*4]
- jmp .loop
-
-cglobal warp_affine_8x8, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
- beta, filter, tmp1, delta, my, gamma
-%if WIN64
- sub rsp, 0xa0
- %assign xmm_regs_used 16
- %assign stack_size_padded 0xa0
- %assign stack_offset stack_offset+stack_size_padded
-%endif
- call .main
- jmp .start
-.loop:
- call .main2
- lea dstq, [dstq+dsq*2]
-.start:
- psrad m7, 18
- psrad m0, 18
- packusdw m7, m0
- pavgw m7, m11 ; (x + (1 << 10)) >> 11
- vextracti128 xm0, m7, 1
- packuswb xm7, xm0
- pshufd xm7, xm7, q3120
- movq [dstq+dsq*0], xm7
- movhps [dstq+dsq*1], xm7
- dec r4d
- jg .loop
-.end:
- RET
-ALIGN function_align
-.main:
- ; Stack args offset by one (r4m -> r5m etc.) due to call
-%if WIN64
- mov abcdq, r5m
- mov mxd, r6m
- movaps [rsp+stack_offset+0x10], xmm6
- movaps [rsp+stack_offset+0x20], xmm7
- movaps [rsp+0x28], xmm8
- movaps [rsp+0x38], xmm9
- movaps [rsp+0x48], xmm10
- movaps [rsp+0x58], xmm11
- movaps [rsp+0x68], xmm12
- movaps [rsp+0x78], xmm13
- movaps [rsp+0x88], xmm14
- movaps [rsp+0x98], xmm15
-%endif
- movsx alphad, word [abcdq+2*0]
- movsx betad, word [abcdq+2*1]
- mova m12, [warp_8x8_shufA]
- mova m13, [warp_8x8_shufB]
- vpbroadcastd m14, [pw_8192]
- vpbroadcastd m15, [pd_32768]
- pxor m11, m11
- lea filterq, [mc_warp_filter]
- lea tmp1q, [ssq*3+3]
- add mxd, 512+(64<<10)
- lea tmp2d, [alphaq*3]
- sub srcq, tmp1q ; src -= src_stride*3 + 3
- sub betad, tmp2d ; beta -= alpha*3
- mov myd, r7m
- call .h
- psrld m1, m0, 16
- call .h
- psrld m4, m0, 16
- call .h
- pblendw m1, m0, 0xaa ; 02
- call .h
- pblendw m4, m0, 0xaa ; 13
- call .h
- psrld m2, m1, 16
- pblendw m2, m0, 0xaa ; 24
- call .h
- psrld m5, m4, 16
- pblendw m5, m0, 0xaa ; 35
- call .h
- psrld m3, m2, 16
- pblendw m3, m0, 0xaa ; 46
- movsx deltad, word [abcdq+2*2]
- movsx gammad, word [abcdq+2*3]
- add myd, 512+(64<<10)
- mov r4d, 4
- lea tmp1d, [deltaq*3]
- sub gammad, tmp1d ; gamma -= delta*3
-.main2:
- call .h
- psrld m6, m5, 16
- pblendw m6, m0, 0xaa ; 57
- WARP_V 7, 1, 3, 4, 6
- call .h
- mova m1, m2
- mova m2, m3
- psrld m3, 16
- pblendw m3, m0, 0xaa ; 68
- WARP_V 0, 4, 6, 1, 3
- mova m4, m5
- mova m5, m6
- ret
-ALIGN function_align
-.h:
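- ; horizontal pass: filter 8 pixels of one source row, each with its own
- ; mx-derived 8-tap filter; advances srcq by ssq and mx by beta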
- lea tmp1d, [mxq+alphaq*4]
- lea tmp2d, [mxq+alphaq*1]
- vbroadcasti128 m10, [srcq]
- shr mxd, 10
- shr tmp1d, 10
- movq xm8, [filterq+mxq *8]
- vinserti128 m8, [filterq+tmp1q*8], 1
- lea tmp1d, [tmp2q+alphaq*4]
- lea mxd, [tmp2q+alphaq*1]
- shr tmp2d, 10
- shr tmp1d, 10
- movq xm0, [filterq+tmp2q*8]
- vinserti128 m0, [filterq+tmp1q*8], 1
- lea tmp1d, [mxq+alphaq*4]
- lea tmp2d, [mxq+alphaq*1]
- shr mxd, 10
- shr tmp1d, 10
- movq xm9, [filterq+mxq *8]
- vinserti128 m9, [filterq+tmp1q*8], 1
- lea tmp1d, [tmp2q+alphaq*4]
- lea mxd, [tmp2q+betaq] ; mx += beta
- shr tmp2d, 10
- shr tmp1d, 10
- punpcklqdq m8, m0 ; 0 1 4 5
- movq xm0, [filterq+tmp2q*8]
- vinserti128 m0, [filterq+tmp1q*8], 1
- punpcklqdq m9, m0 ; 2 3 6 7
- pshufb m0, m10, m12
- pmaddubsw m0, m8
- pshufb m10, m13
- pmaddubsw m10, m9
- add srcq, ssq
- phaddw m0, m10
- pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
- paddd m0, m15 ; rounded 14-bit result in upper 16 bits of dword
- ret
-
-%macro WRAP_YMM 1+
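- ; run %1 with ymm registers from within a zmm (AVX-512) function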
- INIT_YMM cpuname
- %1
- INIT_ZMM cpuname
-%endmacro
-
-%macro BIDIR_FN 1 ; op
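- ; common store loops for avg/w_avg/mask; %1 computes one register of output
- ; pixels and %1_INC_PTR advances the tmp pointers (zmm path when mmsize == 64)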
-%if mmsize == 64
- lea stride3q, [strideq*3]
- jmp wq
-.w4:
- cmp hd, 8
- jg .w4_h16
- WRAP_YMM %1 0
- vextracti32x4 xmm1, ym0, 1
- movd [dstq ], xm0
- pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xmm1
- pextrd [dstq+stride3q ], xmm1, 1
- jl .w4_ret
- lea dstq, [dstq+strideq*4]
- pextrd [dstq ], xm0, 2
- pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xmm1, 2
- pextrd [dstq+stride3q ], xmm1, 3
-.w4_ret:
- RET
-.w4_h16:
- vpbroadcastd m7, strided
- pmulld m7, [bidir_sctr_w4]
- %1 0
- kxnorw k1, k1, k1
- vpscatterdd [dstq+m7]{k1}, m0
- RET
-.w8:
- cmp hd, 4
- jne .w8_h8
- WRAP_YMM %1 0
- vextracti128 xmm1, ym0, 1
- movq [dstq ], xm0
- movq [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
- RET
-.w8_loop:
- %1_INC_PTR 2
- lea dstq, [dstq+strideq*4]
-.w8_h8:
- %1 0
- vextracti32x4 xmm1, ym0, 1
- vextracti32x4 xmm2, m0, 2
- vextracti32x4 xmm3, m0, 3
- movq [dstq ], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
- lea dstq, [dstq+strideq*4]
- movhps [dstq ], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
- sub hd, 8
- jg .w8_loop
- RET
-.w16_loop:
- %1_INC_PTR 2
- lea dstq, [dstq+strideq*4]
-.w16:
- %1 0
- vpermq m0, m0, q3120
- mova [dstq ], xm0
- vextracti32x4 [dstq+strideq*1], m0, 2
- vextracti32x4 [dstq+strideq*2], ym0, 1
- vextracti32x4 [dstq+stride3q ], m0, 3
- sub hd, 4
- jg .w16_loop
- RET
-.w32:
- pmovzxbq m7, [warp_8x8_shufA]
-.w32_loop:
- %1 0
- %1_INC_PTR 2
- vpermq m0, m7, m0
- mova [dstq+strideq*0], ym0
- vextracti32x8 [dstq+strideq*1], m0, 1
- lea dstq, [dstq+strideq*2]
- sub hd, 2
- jg .w32_loop
- RET
-.w64:
- pmovzxbq m7, [warp_8x8_shufA]
-.w64_loop:
- %1 0
- %1_INC_PTR 2
- vpermq m0, m7, m0
- mova [dstq], m0
- add dstq, strideq
- dec hd
- jg .w64_loop
- RET
-.w128:
- pmovzxbq m7, [warp_8x8_shufA]
-.w128_loop:
- %1 0
- vpermq m6, m7, m0
- %1 2
- mova [dstq+64*0], m6
- %1_INC_PTR 4
- vpermq m6, m7, m0
- mova [dstq+64*1], m6
- add dstq, strideq
- dec hd
- jg .w128_loop
- RET
-%else
- %1 0
- lea stride3q, [strideq*3]
- jmp wq
-.w4:
- vextracti128 xm1, m0, 1
- movd [dstq ], xm0
- pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xm1
- pextrd [dstq+stride3q ], xm1, 1
- cmp hd, 4
- je .ret
- lea dstq, [dstq+strideq*4]
- pextrd [dstq ], xm0, 2
- pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xm1, 2
- pextrd [dstq+stride3q ], xm1, 3
- cmp hd, 8
- je .ret
- %1 2
- lea dstq, [dstq+strideq*4]
- vextracti128 xm1, m0, 1
- movd [dstq ], xm0
- pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xm1
- pextrd [dstq+stride3q ], xm1, 1
- lea dstq, [dstq+strideq*4]
- pextrd [dstq ], xm0, 2
- pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xm1, 2
- pextrd [dstq+stride3q ], xm1, 3
-.ret:
- RET
-.w8_loop:
- %1_INC_PTR 2
- %1 0
- lea dstq, [dstq+strideq*4]
-.w8:
- vextracti128 xm1, m0, 1
- movq [dstq ], xm0
- movq [dstq+strideq*1], xm1
- movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xm1
- sub hd, 4
- jg .w8_loop
- RET
-.w16_loop:
- %1_INC_PTR 4
- %1 0
- lea dstq, [dstq+strideq*4]
-.w16:
- vpermq m0, m0, q3120
- mova [dstq ], xm0
- vextracti128 [dstq+strideq*1], m0, 1
- %1 2
- vpermq m0, m0, q3120
- mova [dstq+strideq*2], xm0
- vextracti128 [dstq+stride3q ], m0, 1
- sub hd, 4
- jg .w16_loop
- RET
-.w32_loop:
- %1_INC_PTR 4
- %1 0
- lea dstq, [dstq+strideq*2]
-.w32:
- vpermq m0, m0, q3120
- mova [dstq+strideq*0], m0
- %1 2
- vpermq m0, m0, q3120
- mova [dstq+strideq*1], m0
- sub hd, 2
- jg .w32_loop
- RET
-.w64_loop:
- %1_INC_PTR 4
- %1 0
- add dstq, strideq
-.w64:
- vpermq m0, m0, q3120
- mova [dstq], m0
- %1 2
- vpermq m0, m0, q3120
- mova [dstq+32], m0
- dec hd
- jg .w64_loop
- RET
-.w128_loop:
- %1 0
- add dstq, strideq
-.w128:
- vpermq m0, m0, q3120
- mova [dstq+0*32], m0
- %1 2
- vpermq m0, m0, q3120
- mova [dstq+1*32], m0
- %1_INC_PTR 8
- %1 -4
- vpermq m0, m0, q3120
- mova [dstq+2*32], m0
- %1 -2
- vpermq m0, m0, q3120
- mova [dstq+3*32], m0
- dec hd
- jg .w128_loop
- RET
-%endif
-%endmacro
-
-%macro AVG 1 ; src_offset
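- ; (a + b + 16) >> 5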
- mova m0, [tmp1q+(%1+0)*mmsize]
- paddw m0, [tmp2q+(%1+0)*mmsize]
- mova m1, [tmp1q+(%1+1)*mmsize]
- paddw m1, [tmp2q+(%1+1)*mmsize]
- pmulhrsw m0, m2
- pmulhrsw m1, m2
- packuswb m0, m1
-%endmacro
-
-%macro AVG_INC_PTR 1
- add tmp1q, %1*mmsize
- add tmp2q, %1*mmsize
-%endmacro
-
-%macro AVG_FN 0
-cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
-%define base r6-avg %+ SUFFIX %+ _table
- lea r6, [avg %+ SUFFIX %+ _table]
- tzcnt wd, wm
- movifnidn hd, hm
- movsxd wq, dword [r6+wq*4]
- vpbroadcastd m2, [base+pw_1024]
- add wq, r6
- BIDIR_FN AVG
-%endmacro
-
-%macro W_AVG 1 ; src_offset
- ; (a * weight + b * (16 - weight) + 128) >> 8
- ; = ((a - b) * weight + (b << 4) + 128) >> 8
- ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
- ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
- mova m0, [tmp1q+(%1+0)*mmsize]
- psubw m2, m0, [tmp2q+(%1+0)*mmsize]
- mova m1, [tmp1q+(%1+1)*mmsize]
- psubw m3, m1, [tmp2q+(%1+1)*mmsize]
- pmulhw m2, m4
- pmulhw m3, m4
- paddw m0, m2
- paddw m1, m3
- pmulhrsw m0, m5
- pmulhrsw m1, m5
- packuswb m0, m1
-%endmacro
-
-%define W_AVG_INC_PTR AVG_INC_PTR
-
-%macro W_AVG_FN 0
-cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
-%define base r6-w_avg %+ SUFFIX %+ _table
- lea r6, [w_avg %+ SUFFIX %+ _table]
- tzcnt wd, wm
- movifnidn hd, hm
- vpbroadcastw m4, r6m ; weight
- movsxd wq, dword [r6+wq*4]
- vpbroadcastd m5, [base+pw_2048]
- psllw m4, 12 ; (weight-16) << 12 when interpreted as signed
- add wq, r6
- cmp dword r6m, 7
- jg .weight_gt7
- mov r6, tmp1q
- pxor m0, m0
- mov tmp1q, tmp2q
- psubw m4, m0, m4 ; -weight
- mov tmp2q, r6
-.weight_gt7:
- BIDIR_FN W_AVG
-%endmacro
-
-%macro MASK 1 ; src_offset
- ; (a * m + b * (64 - m) + 512) >> 10
- ; = ((a - b) * m + (b << 6) + 512) >> 10
- ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
-%if mmsize == 64
- vpermq m3, m8, [maskq+%1*32]
-%else
- vpermq m3, [maskq+%1*16], q3120
-%endif
- mova m0, [tmp2q+(%1+0)*mmsize]
- psubw m1, m0, [tmp1q+(%1+0)*mmsize]
- psubb m3, m4, m3
- paddw m1, m1 ; (b - a) << 1
- paddb m3, m3
- punpcklbw m2, m4, m3 ; -m << 9
- pmulhw m1, m2
- paddw m0, m1
- mova m1, [tmp2q+(%1+1)*mmsize]
- psubw m2, m1, [tmp1q+(%1+1)*mmsize]
- paddw m2, m2
- punpckhbw m3, m4, m3
- pmulhw m2, m3
- paddw m1, m2
- pmulhrsw m0, m5
- pmulhrsw m1, m5
- packuswb m0, m1
-%endmacro
-
-%macro MASK_INC_PTR 1
- add maskq, %1*mmsize/2
- add tmp2q, %1*mmsize
- add tmp1q, %1*mmsize
-%endmacro
-
-%macro MASK_FN 0
-cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
-%define base r7-mask %+ SUFFIX %+ _table
- lea r7, [mask %+ SUFFIX %+ _table]
- tzcnt wd, wm
- movifnidn hd, hm
- mov maskq, maskmp
- movsxd wq, dword [r7+wq*4]
- pxor m4, m4
-%if mmsize == 64
- mova m8, [base+bilin_v_perm64]
-%endif
- vpbroadcastd m5, [base+pw_2048]
- add wq, r7
- BIDIR_FN MASK
-%endmacro
-
-%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
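- ; m%1 = diff-weighted blend of tmp1/tmp2, m%2 = mask data derived from 64-m
- ; (the mask m itself in 4:4:4 mode)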
- mova m%1, [tmp1q+mmsize*%3]
- mova m1, [tmp2q+mmsize*%3]
- psubw m1, m%1
- pabsw m%2, m1
- psubusw m%2, m6, m%2
- psrlw m%2, 8 ; 64 - m
- psllw m2, m%2, 10
- pmulhw m1, m2
- paddw m%1, m1
- mova m1, [tmp1q+mmsize*%4]
- mova m2, [tmp2q+mmsize*%4]
- psubw m2, m1
- pabsw m3, m2
- psubusw m3, m6, m3
-%if cpuflag(avx512icl)
- vpshldw m%2, m3, 8
- psllw m3, m%2, 10
-%if %5
- psubb m%2, m5, m%2
-%endif
-%else
- psrlw m3, 8
-%if %5
- packuswb m%2, m3
- psubb m%2, m5, m%2
- vpermq m%2, m%2, q3120
-%else
- phaddw m%2, m3
-%endif
- psllw m3, 10
-%endif
- pmulhw m2, m3
- paddw m1, m2
- pmulhrsw m%1, m7
- pmulhrsw m1, m7
- packuswb m%1, m1
-%endmacro
-
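- ; dst = (dst * (64 - m) + tmp * m + 32) >> 6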
-cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
-%define base r6-blend_avx2_table
- lea r6, [blend_avx2_table]
- tzcnt wd, wm
- movifnidn hd, hm
- movifnidn maskq, maskmp
- movsxd wq, dword [r6+wq*4]
- vpbroadcastd m4, [base+pb_64]
- vpbroadcastd m5, [base+pw_512]
- add wq, r6
- lea r6, [dsq*3]
- jmp wq
-.w4:
- movd xm0, [dstq+dsq*0]
- pinsrd xm0, [dstq+dsq*1], 1
- vpbroadcastd xm1, [dstq+dsq*2]
- pinsrd xm1, [dstq+r6 ], 3
- mova xm6, [maskq]
- psubb xm3, xm4, xm6
- punpcklbw xm2, xm3, xm6
- punpckhbw xm3, xm6
- mova xm6, [tmpq]
- add maskq, 4*4
- add tmpq, 4*4
- punpcklbw xm0, xm6
- punpckhbw xm1, xm6
- pmaddubsw xm0, xm2
- pmaddubsw xm1, xm3
- pmulhrsw xm0, xm5
- pmulhrsw xm1, xm5
- packuswb xm0, xm1
- movd [dstq+dsq*0], xm0
- pextrd [dstq+dsq*1], xm0, 1
- pextrd [dstq+dsq*2], xm0, 2
- pextrd [dstq+r6 ], xm0, 3
- lea dstq, [dstq+dsq*4]
- sub hd, 4
- jg .w4
- RET
-ALIGN function_align
-.w8:
- movq xm1, [dstq+dsq*0]
- movhps xm1, [dstq+dsq*1]
- vpbroadcastq m2, [dstq+dsq*2]
- vpbroadcastq m3, [dstq+r6 ]
- mova m0, [maskq]
- mova m6, [tmpq]
- add maskq, 8*4
- add tmpq, 8*4
- vpblendd m1, m2, 0x30
- vpblendd m1, m3, 0xc0
- psubb m3, m4, m0
- punpcklbw m2, m3, m0
- punpckhbw m3, m0
- punpcklbw m0, m1, m6
- punpckhbw m1, m6
- pmaddubsw m0, m2
- pmaddubsw m1, m3
- pmulhrsw m0, m5
- pmulhrsw m1, m5
- packuswb m0, m1
- vextracti128 xm1, m0, 1
- movq [dstq+dsq*0], xm0
- movhps [dstq+dsq*1], xm0
- movq [dstq+dsq*2], xm1
- movhps [dstq+r6 ], xm1
- lea dstq, [dstq+dsq*4]
- sub hd, 4
- jg .w8
- RET
-ALIGN function_align
-.w16:
- mova m0, [maskq]
- mova xm1, [dstq+dsq*0]
- vinserti128 m1, [dstq+dsq*1], 1
- psubb m3, m4, m0
- punpcklbw m2, m3, m0
- punpckhbw m3, m0
- mova m6, [tmpq]
- add maskq, 16*2
- add tmpq, 16*2
- punpcklbw m0, m1, m6
- punpckhbw m1, m6
- pmaddubsw m0, m2
- pmaddubsw m1, m3
- pmulhrsw m0, m5
- pmulhrsw m1, m5
- packuswb m0, m1
- mova [dstq+dsq*0], xm0
- vextracti128 [dstq+dsq*1], m0, 1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .w16
- RET
-ALIGN function_align
-.w32:
- mova m0, [maskq]
- mova m1, [dstq]
- mova m6, [tmpq]
- add maskq, 32
- add tmpq, 32
- psubb m3, m4, m0
- punpcklbw m2, m3, m0
- punpckhbw m3, m0
- punpcklbw m0, m1, m6
- punpckhbw m1, m6
- pmaddubsw m0, m2
- pmaddubsw m1, m3
- pmulhrsw m0, m5
- pmulhrsw m1, m5
- packuswb m0, m1
- mova [dstq], m0
- add dstq, dsq
- dec hd
- jg .w32
- RET
-
-cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask
-%define base r5-blend_v_avx2_table
- lea r5, [blend_v_avx2_table]
- tzcnt wd, wm
- movifnidn hd, hm
- movsxd wq, dword [r5+wq*4]
- vpbroadcastd m5, [base+pw_512]
- add wq, r5
- add maskq, obmc_masks-blend_v_avx2_table
- jmp wq
-.w2:
- vpbroadcastd xm2, [maskq+2*2]
-.w2_s0_loop:
- movd xm0, [dstq+dsq*0]
- pinsrw xm0, [dstq+dsq*1], 1
- movd xm1, [tmpq]
- add tmpq, 2*2
- punpcklbw xm0, xm1
- pmaddubsw xm0, xm2
- pmulhrsw xm0, xm5
- packuswb xm0, xm0
- pextrw [dstq+dsq*0], xm0, 0
- pextrw [dstq+dsq*1], xm0, 1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .w2_s0_loop
- RET
-ALIGN function_align
-.w4:
- vpbroadcastq xm2, [maskq+4*2]
-.w4_loop:
- movd xm0, [dstq+dsq*0]
- pinsrd xm0, [dstq+dsq*1], 1
- movq xm1, [tmpq]
- add tmpq, 4*2
- punpcklbw xm0, xm1
- pmaddubsw xm0, xm2
- pmulhrsw xm0, xm5
- packuswb xm0, xm0
- movd [dstq+dsq*0], xm0
- pextrd [dstq+dsq*1], xm0, 1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .w4_loop
- RET
-ALIGN function_align
-.w8:
- vbroadcasti128 m4, [maskq+8*2]
-.w8_loop:
- vpbroadcastq m2, [dstq+dsq*0]
- movq xm0, [dstq+dsq*1]
- vpblendd m0, m2, 0x30
- movq xm1, [tmpq+8*1]
- vinserti128 m1, [tmpq+8*0], 1
- add tmpq, 8*2
- punpcklbw m0, m1
- pmaddubsw m0, m4
- pmulhrsw m0, m5
- vextracti128 xm1, m0, 1
- packuswb xm0, xm1
- movhps [dstq+dsq*0], xm0
- movq [dstq+dsq*1], xm0
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .w8_loop
- RET
-ALIGN function_align
-.w16:
- vbroadcasti128 m3, [maskq+16*2]
- vbroadcasti128 m4, [maskq+16*3]
-.w16_loop:
- mova xm1, [dstq+dsq*0]
- vinserti128 m1, [dstq+dsq*1], 1
- mova m2, [tmpq]
- add tmpq, 16*2
- punpcklbw m0, m1, m2
- punpckhbw m1, m2
- pmaddubsw m0, m3
- pmaddubsw m1, m4
- pmulhrsw m0, m5
- pmulhrsw m1, m5
- packuswb m0, m1
- mova [dstq+dsq*0], xm0
- vextracti128 [dstq+dsq*1], m0, 1
- lea dstq, [dstq+dsq*2]
- sub hd, 2
- jg .w16_loop
- RET
-ALIGN function_align
-.w32:
- mova xm3, [maskq+16*4]
- vinserti128 m3, [maskq+16*6], 1
- mova xm4, [maskq+16*5]
- vinserti128 m4, [maskq+16*7], 1
-.w32_loop:
- mova m1, [dstq]
- mova m2, [tmpq]
- add tmpq, 32
- punpcklbw m0, m1, m2
- punpckhbw m1, m2
- pmaddubsw m0, m3
- pmaddubsw m1, m4
- pmulhrsw m0, m5
- pmulhrsw m1, m5
- packuswb m0, m1
- mova [dstq], m0
- add dstq, dsq
- dec hd
- jg .w32_loop
- RET
-
-cglobal blend_h, 4, 7, 6, dst, ds, tmp, w, h, mask
-%define base r5-blend_h_avx2_table
- lea r5, [blend_h_avx2_table]
- mov r6d, wd
- tzcnt wd, wd
- mov hd, hm
- movsxd wq, dword [r5+wq*4]
- vpbroadcastd m5, [base+pw_512]
- add wq, r5
- lea maskq, [base+obmc_masks+hq*2]
- lea hd, [hq*3]
- shr hd, 2 ; h * 3/4
- lea maskq, [maskq+hq*2]
- neg hq
- jmp wq
-.w2:
- movd xm0, [dstq+dsq*0]
- pinsrw xm0, [dstq+dsq*1], 1
- movd xm2, [maskq+hq*2]
- movd xm1, [tmpq]
- add tmpq, 2*2
- punpcklwd xm2, xm2
- punpcklbw xm0, xm1
- pmaddubsw xm0, xm2
- pmulhrsw xm0, xm5
- packuswb xm0, xm0
- pextrw [dstq+dsq*0], xm0, 0
- pextrw [dstq+dsq*1], xm0, 1
- lea dstq, [dstq+dsq*2]
- add hq, 2
- jl .w2
- RET
-ALIGN function_align
-.w4:
- mova xm3, [blend_shuf]
-.w4_loop:
- movd xm0, [dstq+dsq*0]
- pinsrd xm0, [dstq+dsq*1], 1
- movd xm2, [maskq+hq*2]
- movq xm1, [tmpq]
- add tmpq, 4*2
- pshufb xm2, xm3
- punpcklbw xm0, xm1
- pmaddubsw xm0, xm2
- pmulhrsw xm0, xm5
- packuswb xm0, xm0
- movd [dstq+dsq*0], xm0
- pextrd [dstq+dsq*1], xm0, 1
- lea dstq, [dstq+dsq*2]
- add hq, 2
- jl .w4_loop
- RET
-ALIGN function_align
-.w8:
- vbroadcasti128 m4, [blend_shuf]
- shufpd m4, m4, 0x03
-.w8_loop:
- vpbroadcastq m1, [dstq+dsq*0]
- movq xm0, [dstq+dsq*1]
- vpblendd m0, m1, 0x30
- vpbroadcastd m3, [maskq+hq*2]
- movq xm1, [tmpq+8*1]
- vinserti128 m1, [tmpq+8*0], 1
- add tmpq, 8*2
- pshufb m3, m4
- punpcklbw m0, m1
- pmaddubsw m0, m3
- pmulhrsw m0, m5
- vextracti128 xm1, m0, 1
- packuswb xm0, xm1
- movhps [dstq+dsq*0], xm0
- movq [dstq+dsq*1], xm0
- lea dstq, [dstq+dsq*2]
- add hq, 2
- jl .w8_loop
- RET
-ALIGN function_align
-.w16:
- vbroadcasti128 m4, [blend_shuf]
- shufpd m4, m4, 0x0c
-.w16_loop:
- mova xm1, [dstq+dsq*0]
- vinserti128 m1, [dstq+dsq*1], 1
- vpbroadcastd m3, [maskq+hq*2]
- mova m2, [tmpq]
- add tmpq, 16*2
- pshufb m3, m4
- punpcklbw m0, m1, m2
- punpckhbw m1, m2
- pmaddubsw m0, m3
- pmaddubsw m1, m3
- pmulhrsw m0, m5
- pmulhrsw m1, m5
- packuswb m0, m1
- mova [dstq+dsq*0], xm0
- vextracti128 [dstq+dsq*1], m0, 1
- lea dstq, [dstq+dsq*2]
- add hq, 2
- jl .w16_loop
- RET
-ALIGN function_align
-.w32: ; w32/w64/w128
- sub dsq, r6
-.w32_loop0:
- vpbroadcastw m3, [maskq+hq*2]
- mov wd, r6d
-.w32_loop:
- mova m1, [dstq]
- mova m2, [tmpq]
- add tmpq, 32
- punpcklbw m0, m1, m2
- punpckhbw m1, m2
- pmaddubsw m0, m3
- pmaddubsw m1, m3
- pmulhrsw m0, m5
- pmulhrsw m1, m5
- packuswb m0, m1
- mova [dstq], m0
- add dstq, 32
- sub wd, 32
- jg .w32_loop
- add dstq, dsq
- inc hq
- jl .w32_loop0
- RET
-
-cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
- bottomext, rightext
- ; we assume that the buffer (stride) is larger than width, so we can
- ; safely overwrite by a few bytes
-
- ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
- xor r12d, r12d
- lea r10, [ihq-1]
- cmp yq, ihq
- cmovs r10, yq
- test yq, yq
- cmovs r10, r12
- imul r10, sstrideq
- add srcq, r10
-
- ; ref += iclip(x, 0, iw - 1)
- lea r10, [iwq-1]
- cmp xq, iwq
- cmovs r10, xq
- test xq, xq
- cmovs r10, r12
- add srcq, r10
-
- ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
- lea bottomextq, [yq+bhq]
- sub bottomextq, ihq
- lea r3, [bhq-1]
- cmovs bottomextq, r12
-
- DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
- bottomext, rightext
-
- ; top_ext = iclip(-y, 0, bh - 1)
- neg topextq
- cmovs topextq, r12
- cmp bottomextq, bhq
- cmovns bottomextq, r3
- cmp topextq, bhq
- cmovg topextq, r3
-
- ; right_ext = iclip(x + bw - iw, 0, bw - 1)
- lea rightextq, [xq+bwq]
- sub rightextq, iwq
- lea r2, [bwq-1]
- cmovs rightextq, r12
-
- DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
- bottomext, rightext
-
- ; left_ext = iclip(-x, 0, bw - 1)
- neg leftextq
- cmovs leftextq, r12
- cmp rightextq, bwq
- cmovns rightextq, r2
- cmp leftextq, bwq
- cmovns leftextq, r2
-
- DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
- dst, dstride, src, sstride, bottomext, rightext
-
- ; center_h = bh - top_ext - bottom_ext
- lea r3, [bottomextq+topextq]
- sub centerhq, r3
-
- ; blk += top_ext * PXSTRIDE(dst_stride)
- mov r2, topextq
- imul r2, dstrideq
- add dstq, r2
- mov r9m, dstq
-
- ; center_w = bw - left_ext - right_ext
- mov centerwq, bwq
- lea r3, [rightextq+leftextq]
- sub centerwq, r3
-
-%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
-.v_loop_%3:
-%if %1
- ; left extension
- xor r3, r3
- vpbroadcastb m0, [srcq]
-.left_loop_%3:
- mova [dstq+r3], m0
- add r3, 32
- cmp r3, leftextq
- jl .left_loop_%3
-
- ; body
- lea r12, [dstq+leftextq]
-%endif
- xor r3, r3
-.body_loop_%3:
- movu m0, [srcq+r3]
-%if %1
- movu [r12+r3], m0
-%else
- movu [dstq+r3], m0
-%endif
- add r3, 32
- cmp r3, centerwq
- jl .body_loop_%3
-
-%if %2
- ; right extension
-%if %1
- add r12, centerwq
-%else
- lea r12, [dstq+centerwq]
-%endif
- xor r3, r3
- vpbroadcastb m0, [srcq+centerwq-1]
-.right_loop_%3:
- movu [r12+r3], m0
- add r3, 32
- cmp r3, rightextq
- jl .right_loop_%3
-
-%endif
- add dstq, dstrideq
- add srcq, sstrideq
- dec centerhq
- jg .v_loop_%3
-%endmacro
-
- test leftextq, leftextq
- jnz .need_left_ext
- test rightextq, rightextq
- jnz .need_right_ext
- v_loop 0, 0, 0
- jmp .body_done
-
-.need_left_ext:
- test rightextq, rightextq
- jnz .need_left_right_ext
- v_loop 1, 0, 1
- jmp .body_done
-
-.need_left_right_ext:
- v_loop 1, 1, 2
- jmp .body_done
-
-.need_right_ext:
- v_loop 0, 1, 3
-
-.body_done:
- ; bottom edge extension
- test bottomextq, bottomextq
- jz .top
- mov srcq, dstq
- sub srcq, dstrideq
- xor r1, r1
-.bottom_x_loop:
- mova m0, [srcq+r1]
- lea r3, [dstq+r1]
- mov r4, bottomextq
-.bottom_y_loop:
- mova [r3], m0
- add r3, dstrideq
- dec r4
- jg .bottom_y_loop
- add r1, 32
- cmp r1, bwq
- jl .bottom_x_loop
-
-.top:
- ; top edge extension
- test topextq, topextq
- jz .end
- mov srcq, r9m
- mov dstq, dstm
- xor r1, r1
-.top_x_loop:
- mova m0, [srcq+r1]
- lea r3, [dstq+r1]
- mov r4, topextq
-.top_y_loop:
- mova [r3], m0
- add r3, dstrideq
- dec r4
- jg .top_y_loop
- add r1, 32
- cmp r1, bwq
- jl .top_x_loop
-
-.end:
- RET
-
-cextern resize_filter
-
-INIT_YMM avx2
-cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \
- dst_w, h, src_w, dx, mx0
- sub dword mx0m, 4<<14
- sub dword src_wm, 8
- vpbroadcastd m5, dxm
- vpbroadcastd m8, mx0m
- vpbroadcastd m6, src_wm
-
- DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
- LEA r7, $$
-%define base r7-$$
-
- vpbroadcastd m3, [base+pw_m256]
- vpbroadcastd m7, [base+pd_63]
- vbroadcasti128 m15, [base+pb_8x0_8x8]
- pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
- pslld m5, 3 ; dx*8
- pslld m6, 14
- paddd m8, m2 ; mx+[0..7]*dx
- pxor m2, m2
-
- ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7
- ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = src_w, m7 = 0x3f, m15=0,8
-
-.loop_y:
- xor xd, xd
- mova m4, m8 ; per-line working version of mx
-
-.loop_x:
- pmaxsd m0, m4, m2
- psrad m9, m4, 8 ; filter offset (unmasked)
- pminsd m0, m6 ; iclip(mx, 0, src_w-8)
- psubd m1, m4, m0 ; pshufb offset
- psrad m0, 14 ; clipped src_x offset
- psrad m1, 14 ; pshufb edge_emu offset
- pand m9, m7 ; filter offset (masked)
-
- ; load source pixels - this ugly code is vpgatherdq emulation since
- ; directly using vpgatherdq on Haswell is quite a bit slower :(
- movd r8d, xm0
- pextrd r9d, xm0, 1
- pextrd r10d, xm0, 2
- pextrd r11d, xm0, 3
- vextracti128 xm0, m0, 1
- movq xm12, [srcq+r8]
- movq xm13, [srcq+r10]
- movhps xm12, [srcq+r9]
- movhps xm13, [srcq+r11]
- movd r8d, xm0
- pextrd r9d, xm0, 1
- pextrd r10d, xm0, 2
- pextrd r11d, xm0, 3
- vinserti128 m12, [srcq+r8], 1
- vinserti128 m13, [srcq+r10], 1
- vpbroadcastq m10, [srcq+r9]
- vpbroadcastq m11, [srcq+r11]
- vpblendd m12, m12, m10, 11000000b
- vpblendd m13, m13, m11, 11000000b
-
- ; if no emulation is required, we don't need to shuffle or emulate edges
- ; this also saves 2 quasi-vpgatherdqs
- vptest m1, m1
- jz .filter
-
- movd r8d, xm1
- pextrd r9d, xm1, 1
- pextrd r10d, xm1, 2
- pextrd r11d, xm1, 3
- movsxd r8, r8d
- movsxd r9, r9d
- movsxd r10, r10d
- movsxd r11, r11d
- vextracti128 xm1, m1, 1
- movq xm14, [base+resize_shuf+4+r8]
- movq xm0, [base+resize_shuf+4+r10]
- movhps xm14, [base+resize_shuf+4+r9]
- movhps xm0, [base+resize_shuf+4+r11]
- movd r8d, xm1
- pextrd r9d, xm1, 1
- pextrd r10d, xm1, 2
- pextrd r11d, xm1, 3
- movsxd r8, r8d
- movsxd r9, r9d
- movsxd r10, r10d
- movsxd r11, r11d
- vinserti128 m14, [base+resize_shuf+4+r8], 1
- vinserti128 m0, [base+resize_shuf+4+r10], 1
- vpbroadcastq m10, [base+resize_shuf+4+r9]
- vpbroadcastq m11, [base+resize_shuf+4+r11]
- vpblendd m14, m14, m10, 11000000b
- vpblendd m0, m0, m11, 11000000b
-
- paddb m14, m15
- paddb m0, m15
- pshufb m12, m14
- pshufb m13, m0
-
-.filter:
- movd r8d, xm9
- pextrd r9d, xm9, 1
- pextrd r10d, xm9, 2
- pextrd r11d, xm9, 3
- vextracti128 xm9, m9, 1
- movq xm10, [base+resize_filter+r8*8]
- movq xm11, [base+resize_filter+r10*8]
- movhps xm10, [base+resize_filter+r9*8]
- movhps xm11, [base+resize_filter+r11*8]
- movd r8d, xm9
- pextrd r9d, xm9, 1
- pextrd r10d, xm9, 2
- pextrd r11d, xm9, 3
- vinserti128 m10, [base+resize_filter+r8*8], 1
- vinserti128 m11, [base+resize_filter+r10*8], 1
- vpbroadcastq m14, [base+resize_filter+r9*8]
- vpbroadcastq m1, [base+resize_filter+r11*8]
- vpblendd m10, m10, m14, 11000000b
- vpblendd m11, m11, m1, 11000000b
-
- pmaddubsw m12, m10
- pmaddubsw m13, m11
- phaddw m12, m13
- vextracti128 xm13, m12, 1
- phaddsw xm12, xm13
- pmulhrsw xm12, xm3 ; x=(x+64)>>7
- packuswb xm12, xm12
- movq [dstq+xq], xm12
-
- paddd m4, m5
- add xd, 8
- cmp xd, dst_wd
- jl .loop_x
-
- add dstq, dst_strideq
- add srcq, src_strideq
- dec hd
- jg .loop_y
- RET
-
-INIT_YMM avx2
-PREP_BILIN
-PREP_8TAP
-AVG_FN
-W_AVG_FN
-MASK_FN
-
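- ; 2x2-subsampled mask: (m0 + m1 + m2 + m3 + 2 - sign) >> 2, built from the
- ; 64-m sums using the 258-sign bias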
-cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
-%define base r7-w_mask_420_avx2_table
- lea r7, [w_mask_420_avx2_table]
- tzcnt wd, wm
- mov r6d, r7m ; sign
- movifnidn hd, hm
- movsxd wq, [r7+wq*4]
- vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
- vpbroadcastd m7, [base+pw_2048]
- pmovzxbd m9, [base+deint_shuf4]
- vpbroadcastd m8, [base+wm_420_sign+r6*4] ; 258 - sign
- add wq, r7
- W_MASK 0, 4, 0, 1
- mov maskq, maskmp
- lea stride3q, [strideq*3]
- jmp wq
-.w4:
- vextracti128 xm1, m0, 1
- movd [dstq+strideq*0], xm0
- pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xm1
- pextrd [dstq+stride3q ], xm1, 1
- cmp hd, 8
- jl .w4_end
- lea dstq, [dstq+strideq*4]
- pextrd [dstq+strideq*0], xm0, 2
- pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xm1, 2
- pextrd [dstq+stride3q ], xm1, 3
- jg .w4_h16
-.w4_end:
- vextracti128 xm0, m4, 1
- vpblendd xm1, xm4, xm0, 0x05
- vpblendd xm4, xm4, xm0, 0x0a
- pshufd xm1, xm1, q2301
- psubw xm4, xm8, xm4
- psubw xm4, xm1
- psrlw xm4, 2
- packuswb xm4, xm4
- movq [maskq], xm4
- RET
-.w4_h16:
- W_MASK 0, 5, 2, 3
- lea dstq, [dstq+strideq*4]
- phaddd m4, m5
- vextracti128 xm1, m0, 1
- psubw m4, m8, m4
- psrlw m4, 2
- vpermd m4, m9, m4
- vextracti128 xm5, m4, 1
- packuswb xm4, xm5
- movd [dstq+strideq*0], xm0
- pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xm1
- pextrd [dstq+stride3q], xm1, 1
- lea dstq, [dstq+strideq*4]
- pextrd [dstq+strideq*0], xm0, 2
- pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xm1, 2
- pextrd [dstq+stride3q ], xm1, 3
- mova [maskq], xm4
- RET
-.w8_loop:
- add tmp1q, 2*32
- add tmp2q, 2*32
- W_MASK 0, 4, 0, 1
- lea dstq, [dstq+strideq*4]
- add maskq, 8
-.w8:
- vextracti128 xm2, m4, 1
- vextracti128 xm1, m0, 1
- psubw xm4, xm8, xm4
- psubw xm4, xm2
- psrlw xm4, 2
- packuswb xm4, xm4
- movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xm1
- movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xm1
- movq [maskq], xm4
- sub hd, 4
- jg .w8_loop
- RET
-.w16_loop:
- add tmp1q, 4*32
- add tmp2q, 4*32
- W_MASK 0, 4, 0, 1
- lea dstq, [dstq+strideq*4]
- add maskq, 16
-.w16:
- vpermq m0, m0, q3120
- mova [dstq+strideq*0], xm0
- vextracti128 [dstq+strideq*1], m0, 1
- W_MASK 0, 5, 2, 3
- punpckhqdq m1, m4, m5
- punpcklqdq m4, m5
- psubw m1, m8, m1
- psubw m1, m4
- psrlw m1, 2
- vpermq m0, m0, q3120
- packuswb m1, m1
- vpermd m1, m9, m1
- mova [dstq+strideq*2], xm0
- vextracti128 [dstq+stride3q ], m0, 1
- mova [maskq], xm1
- sub hd, 4
- jg .w16_loop
- RET
-.w32_loop:
- add tmp1q, 4*32
- add tmp2q, 4*32
- W_MASK 0, 4, 0, 1
- lea dstq, [dstq+strideq*2]
- add maskq, 16
-.w32:
- vpermq m0, m0, q3120
- mova [dstq+strideq*0], m0
- W_MASK 0, 5, 2, 3
- psubw m4, m8, m4
- psubw m4, m5
- psrlw m4, 2
- vpermq m0, m0, q3120
- packuswb m4, m4
- vpermd m4, m9, m4
- mova [dstq+strideq*1], m0
- mova [maskq], xm4
- sub hd, 2
- jg .w32_loop
- RET
-.w64_loop_even:
- psubw m10, m8, m4
- psubw m11, m8, m5
- dec hd
-.w64_loop:
- add tmp1q, 4*32
- add tmp2q, 4*32
- W_MASK 0, 4, 0, 1
- add dstq, strideq
-.w64:
- vpermq m0, m0, q3120
- mova [dstq+32*0], m0
- W_MASK 0, 5, 2, 3
- vpermq m0, m0, q3120
- mova [dstq+32*1], m0
- test hd, 1
- jz .w64_loop_even
- psubw m4, m10, m4
- psubw m5, m11, m5
- psrlw m4, 2
- psrlw m5, 2
- packuswb m4, m5
- vpermd m4, m9, m4
- mova [maskq], m4
- add maskq, 32
- dec hd
- jg .w64_loop
- RET
-.w128_loop_even:
- psubw m12, m8, m4
- psubw m13, m8, m5
- dec hd
-.w128_loop:
- W_MASK 0, 4, 0, 1
- add dstq, strideq
-.w128:
- vpermq m0, m0, q3120
- mova [dstq+32*0], m0
- W_MASK 0, 5, 2, 3
- vpermq m0, m0, q3120
- mova [dstq+32*1], m0
- add tmp1q, 8*32
- add tmp2q, 8*32
- test hd, 1
- jz .w128_even
- psubw m4, m10, m4
- psubw m5, m11, m5
- psrlw m4, 2
- psrlw m5, 2
- packuswb m4, m5
- vpermd m4, m9, m4
- mova [maskq+32*0], m4
- jmp .w128_odd
-.w128_even:
- psubw m10, m8, m4
- psubw m11, m8, m5
-.w128_odd:
- W_MASK 0, 4, -4, -3
- vpermq m0, m0, q3120
- mova [dstq+32*2], m0
- W_MASK 0, 5, -2, -1
- vpermq m0, m0, q3120
- mova [dstq+32*3], m0
- test hd, 1
- jz .w128_loop_even
- psubw m4, m12, m4
- psubw m5, m13, m5
- psrlw m4, 2
- psrlw m5, 2
- packuswb m4, m5
- vpermd m4, m9, m4
- mova [maskq+32*1], m4
- add maskq, 64
- dec hd
- jg .w128_loop
- RET
-
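- ; horizontally subsampled mask: (m0 + m1 + 1 - sign) >> 1 via the 128-sign bias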
-cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
-%define base r7-w_mask_422_avx2_table
- lea r7, [w_mask_422_avx2_table]
- tzcnt wd, wm
- mov r6d, r7m ; sign
- movifnidn hd, hm
- pxor m9, m9
- movsxd wq, dword [r7+wq*4]
- vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
- vpbroadcastd m7, [base+pw_2048]
- pmovzxbd m10, [base+deint_shuf4]
- vpbroadcastd m8, [base+wm_422_sign+r6*4] ; 128 - sign
- add wq, r7
- mov maskq, maskmp
- W_MASK 0, 4, 0, 1
- lea stride3q, [strideq*3]
- jmp wq
-.w4:
- vextracti128 xm1, m0, 1
- movd [dstq+strideq*0], xm0
- pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xm1
- pextrd [dstq+stride3q ], xm1, 1
- cmp hd, 8
- jl .w4_end
- lea dstq, [dstq+strideq*4]
- pextrd [dstq+strideq*0], xm0, 2
- pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xm1, 2
- pextrd [dstq+stride3q ], xm1, 3
- jg .w4_h16
-.w4_end:
- vextracti128 xm5, m4, 1
- packuswb xm4, xm5
- psubb xm5, xm8, xm4
- pavgb xm5, xm9
- pshufd xm5, xm5, q3120
- mova [maskq], xm5
- RET
-.w4_h16:
- W_MASK 0, 5, 2, 3
- lea dstq, [dstq+strideq*4]
- packuswb m4, m5
- psubb m5, m8, m4
- pavgb m5, m9
- vpermd m5, m10, m5
- vextracti128 xm1, m0, 1
- movd [dstq+strideq*0], xm0
- pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xm1
- pextrd [dstq+stride3q ], xm1, 1
- lea dstq, [dstq+strideq*4]
- pextrd [dstq+strideq*0], xm0, 2
- pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xm1, 2
- pextrd [dstq+stride3q ], xm1, 3
- mova [maskq], m5
- RET
-.w8_loop:
- add tmp1q, 32*2
- add tmp2q, 32*2
- W_MASK 0, 4, 0, 1
- lea dstq, [dstq+strideq*4]
- add maskq, 16
-.w8:
- vextracti128 xm5, m4, 1
- vextracti128 xm1, m0, 1
- packuswb xm4, xm5
- psubb xm5, xm8, xm4
- pavgb xm5, xm9
- pshufd xm5, xm5, q3120
- movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xm1
- movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xm1
- mova [maskq], xm5
- sub hd, 4
- jg .w8_loop
- RET
-.w16_loop:
- add tmp1q, 32*4
- add tmp2q, 32*4
- W_MASK 0, 4, 0, 1
- lea dstq, [dstq+strideq*4]
- add maskq, 32
-.w16:
- vpermq m0, m0, q3120
- mova [dstq+strideq*0], xm0
- vextracti128 [dstq+strideq*1], m0, 1
- W_MASK 0, 5, 2, 3
- packuswb m4, m5
- psubb m5, m8, m4
- pavgb m5, m9
- vpermq m0, m0, q3120
- vpermd m5, m10, m5
- mova [dstq+strideq*2], xm0
- vextracti128 [dstq+stride3q ], m0, 1
- mova [maskq], m5
- sub hd, 4
- jg .w16_loop
- RET
-.w32_loop:
- add tmp1q, 32*4
- add tmp2q, 32*4
- W_MASK 0, 4, 0, 1
- lea dstq, [dstq+strideq*2]
- add maskq, 32
-.w32:
- vpermq m0, m0, q3120
- mova [dstq+strideq*0], m0
- W_MASK 0, 5, 2, 3
- packuswb m4, m5
- psubb m5, m8, m4
- pavgb m5, m9
- vpermq m0, m0, q3120
- vpermd m5, m10, m5
- mova [dstq+strideq*1], m0
- mova [maskq], m5
- sub hd, 2
- jg .w32_loop
- RET
-.w64_loop:
- add tmp1q, 32*4
- add tmp2q, 32*4
- W_MASK 0, 4, 0, 1
- add dstq, strideq
- add maskq, 32
-.w64:
- vpermq m0, m0, q3120
- mova [dstq+32*0], m0
- W_MASK 0, 5, 2, 3
- packuswb m4, m5
- psubb m5, m8, m4
- pavgb m5, m9
- vpermq m0, m0, q3120
- vpermd m5, m10, m5
- mova [dstq+32*1], m0
- mova [maskq], m5
- dec hd
- jg .w64_loop
- RET
-.w128_loop:
- add tmp1q, 32*8
- add tmp2q, 32*8
- W_MASK 0, 4, 0, 1
- add dstq, strideq
- add maskq, 32*2
-.w128:
- vpermq m0, m0, q3120
- mova [dstq+32*0], m0
- W_MASK 0, 5, 2, 3
- packuswb m4, m5
- psubb m5, m8, m4
- pavgb m5, m9
- vpermq m0, m0, q3120
- vpermd m5, m10, m5
- mova [dstq+32*1], m0
- mova [maskq+32*0], m5
- W_MASK 0, 4, 4, 5
- vpermq m0, m0, q3120
- mova [dstq+32*2], m0
- W_MASK 0, 5, 6, 7
- packuswb m4, m5
- psubb m5, m8, m4
- pavgb m5, m9
- vpermq m0, m0, q3120
- vpermd m5, m10, m5
- mova [dstq+32*3], m0
- mova [maskq+32*1], m5
- dec hd
- jg .w128_loop
- RET
-
-cglobal w_mask_444, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
-%define base r7-w_mask_444_avx2_table
- lea r7, [w_mask_444_avx2_table]
- tzcnt wd, wm
- movifnidn hd, hm
- mov maskq, maskmp
- movsxd wq, dword [r7+wq*4]
- vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
- vpbroadcastd m5, [base+pb_64]
- vpbroadcastd m7, [base+pw_2048]
- add wq, r7
- W_MASK 0, 4, 0, 1, 1
- lea stride3q, [strideq*3]
- jmp wq
-.w4:
- vextracti128 xm1, m0, 1
- movd [dstq+strideq*0], xm0
- pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xm1
- pextrd [dstq+stride3q ], xm1, 1
- mova [maskq+32*0], m4
- cmp hd, 8
- jl .w4_end
- lea dstq, [dstq+strideq*4]
- pextrd [dstq+strideq*0], xm0, 2
- pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xm1, 2
- pextrd [dstq+stride3q ], xm1, 3
- je .w4_end
- W_MASK 0, 4, 2, 3, 1
- lea dstq, [dstq+strideq*4]
- vextracti128 xm1, m0, 1
- movd [dstq+strideq*0], xm0
- pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xm1
- pextrd [dstq+stride3q ], xm1, 1
- lea dstq, [dstq+strideq*4]
- pextrd [dstq+strideq*0], xm0, 2
- pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xm1, 2
- pextrd [dstq+stride3q ], xm1, 3
- mova [maskq+32*1], m4
-.w4_end:
- RET
-.w8_loop:
- add tmp1q, 32*2
- add tmp2q, 32*2
- W_MASK 0, 4, 0, 1, 1
- lea dstq, [dstq+strideq*4]
- add maskq, 32
-.w8:
- vextracti128 xm1, m0, 1
- movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xm1
- movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xm1
- mova [maskq], m4
- sub hd, 4
- jg .w8_loop
- RET
-.w16_loop:
- add tmp1q, 32*2
- add tmp2q, 32*2
- W_MASK 0, 4, 0, 1, 1
- lea dstq, [dstq+strideq*2]
- add maskq, 32
-.w16:
- vpermq m0, m0, q3120
- mova [dstq+strideq*0], xm0
- vextracti128 [dstq+strideq*1], m0, 1
- mova [maskq], m4
- sub hd, 2
- jg .w16_loop
- RET
-.w32_loop:
- add tmp1q, 32*2
- add tmp2q, 32*2
- W_MASK 0, 4, 0, 1, 1
- add dstq, strideq
- add maskq, 32
-.w32:
- vpermq m0, m0, q3120
- mova [dstq], m0
- mova [maskq], m4
- dec hd
- jg .w32_loop
- RET
-.w64_loop:
- add tmp1q, 32*4
- add tmp2q, 32*4
- W_MASK 0, 4, 0, 1, 1
- add dstq, strideq
- add maskq, 32*2
-.w64:
- vpermq m0, m0, q3120
- mova [dstq+32*0], m0
- mova [maskq+32*0], m4
- W_MASK 0, 4, 2, 3, 1
- vpermq m0, m0, q3120
- mova [dstq+32*1], m0
- mova [maskq+32*1], m4
- dec hd
- jg .w64_loop
- RET
-.w128_loop:
- add tmp1q, 32*8
- add tmp2q, 32*8
- W_MASK 0, 4, 0, 1, 1
- add dstq, strideq
- add maskq, 32*4
-.w128:
- vpermq m0, m0, q3120
- mova [dstq+32*0], m0
- mova [maskq+32*0], m4
- W_MASK 0, 4, 2, 3, 1
- vpermq m0, m0, q3120
- mova [dstq+32*1], m0
- mova [maskq+32*1], m4
- W_MASK 0, 4, 4, 5, 1
- vpermq m0, m0, q3120
- mova [dstq+32*2], m0
- mova [maskq+32*2], m4
- W_MASK 0, 4, 6, 7, 1
- vpermq m0, m0, q3120
- mova [dstq+32*3], m0
- mova [maskq+32*3], m4
- dec hd
- jg .w128_loop
- RET
-
-%if HAVE_AVX512ICL
-INIT_ZMM avx512icl
-PREP_BILIN
-PREP_8TAP
-AVG_FN
-W_AVG_FN
-MASK_FN
-
-cglobal w_mask_420, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
-%define base r7-w_mask_420_avx512icl_table
- lea r7, [w_mask_420_avx512icl_table]
- tzcnt wd, wm
- mov r6d, r7m ; sign
- movifnidn hd, hm
- movsxd wq, [r7+wq*4]
- vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
- vpbroadcastd m7, [base+pw_2048]
- vpbroadcastd m9, [base+pb_m64] ; -1 << 6
- mova ym10, [base+wm_420_mask+32]
- vpbroadcastd m8, [base+wm_sign_avx512+r6*8] ; (258 - sign) << 6
- add wq, r7
- mov maskq, maskmp
- lea stride3q, [strideq*3]
- jmp wq
-.w4:
- mova m5, [wm_420_perm4]
- cmp hd, 8
- jg .w4_h16
- WRAP_YMM W_MASK 0, 4, 0, 1
- vinserti128 ym5, [wm_420_perm4+32], 1
- vpermb ym4, ym5, ym4
- vpdpbusd ym8, ym4, ym9
- vextracti128 xmm1, m0, 1
- movd [dstq+strideq*0], xm0
- pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xmm1
- pextrd [dstq+stride3q ], xmm1, 1
- jl .w4_end
- lea dstq, [dstq+strideq*4]
- pextrd [dstq+strideq*0], xm0, 2
- pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xmm1, 2
- pextrd [dstq+stride3q ], xmm1, 3
-.w4_end:
- vpermb ym8, ym10, ym8
- movq [maskq], xm8
- RET
-.w4_h16:
- vpbroadcastd m11, strided
- pmulld m11, [bidir_sctr_w4]
- W_MASK 0, 4, 0, 1
- vpermb m4, m5, m4
- vpdpbusd m8, m4, m9
- kxnorw k1, k1, k1
- vpermb m8, m10, m8
- mova [maskq], xm8
- vpscatterdd [dstq+m11]{k1}, m0
- RET
-.w8:
- mova m5, [wm_420_perm8]
- cmp hd, 4
- jne .w8_h8
- WRAP_YMM W_MASK 0, 4, 0, 1
- vinserti128 ym5, [wm_420_perm8+32], 1
- vpermb ym4, ym5, ym4
- vpdpbusd ym8, ym4, ym9
- vpermb m8, m10, m8
- mova [maskq], xm8
- vextracti128 xmm1, ym0, 1
- movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
- RET
-.w8_loop:
- add tmp1q, 128
- add tmp2q, 128
- add maskq, 16
- lea dstq, [dstq+strideq*4]
-.w8_h8:
- W_MASK 0, 4, 0, 1
- vpermb m4, m5, m4
- mova m1, m8
- vpdpbusd m1, m4, m9
- vpermb m1, m10, m1
- mova [maskq], xm1
- vextracti32x4 xmm1, ym0, 1
- vextracti32x4 xmm2, m0, 2
- vextracti32x4 xmm3, m0, 3
- movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
- lea dstq, [dstq+strideq*4]
- movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
- sub hd, 8
- jg .w8_loop
- RET
-.w16:
- mova m5, [wm_420_perm16]
-.w16_loop:
- W_MASK 0, 4, 0, 1
- vpermb m4, m5, m4
- mova m1, m8
- vpdpbusd m1, m4, m9
- add tmp1q, 128
- add tmp2q, 128
- vpermb m1, m10, m1
- vpermq m0, m0, q3120
- mova [maskq], xm1
- add maskq, 16
- mova [dstq+strideq*0], xm0
- vextracti32x4 [dstq+strideq*1], m0, 2
- vextracti32x4 [dstq+strideq*2], ym0, 1
- vextracti32x4 [dstq+stride3q ], m0, 3
- lea dstq, [dstq+strideq*4]
- sub hd, 4
- jg .w16_loop
- RET
-.w32:
- pmovzxbq m5, [warp_8x8_shufA]
-.w32_loop:
- W_MASK 0, 4, 0, 1
- mova m1, m8
- vpdpbusd m1, m4, m9
- add tmp1q, 128
- add tmp2q, 128
- vpermb m1, m10, m1
- vpermq m0, m5, m0
- mova [maskq], xm1
- add maskq, 16
- mova [dstq+strideq*0], ym0
- vextracti32x8 [dstq+strideq*1], m0, 1
- lea dstq, [dstq+strideq*2]
- sub hd, 2
- jg .w32_loop
- RET
-.w64:
- pmovzxbq m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14
- psrlq m13, m12, 4 ; 1, 3, 5, 7, 9, 11, 13, 15
-.w64_loop:
- W_MASK 0, 4, 0, 2
- W_MASK 11, 5, 1, 3
- mova m2, m8
- vpdpbusd m2, m4, m9
- mova m3, m8
- vpdpbusd m3, m5, m9
- add tmp1q, 256
- add tmp2q, 256
- vpermt2b m2, m10, m3
- mova m1, m0
- vpermt2q m0, m12, m11
- vpermt2q m1, m13, m11
- mova [maskq], ym2
- add maskq, 32
- mova [dstq+strideq*0], m0
- mova [dstq+strideq*1], m1
- lea dstq, [dstq+strideq*2]
- sub hd, 2
- jg .w64_loop
- RET
-.w128:
- pmovzxbq m14, [wm_420_perm64]
- mova m10, [wm_420_mask]
- psrlq m15, m14, 4
-.w128_loop:
- W_MASK 0, 12, 0, 4
- W_MASK 11, 13, 1, 5
- mova m4, m8
- vpdpbusd m4, m12, m9
- mova m5, m8
- vpdpbusd m5, m13, m9
- mova m1, m0
- vpermt2q m0, m14, m11
- vpermt2q m1, m15, m11
- mova [dstq+strideq*0+64*0], m0
- mova [dstq+strideq*1+64*0], m1
- W_MASK 0, 12, 2, 6
- W_MASK 11, 13, 3, 7
- vprold m4, 16
- vprold m5, 16
- vpdpbusd m4, m12, m9
- vpdpbusd m5, m13, m9
- add tmp1q, 512
- add tmp2q, 512
- vpermt2b m4, m10, m5
- mova m1, m0
- vpermt2q m0, m14, m11
- vpermt2q m1, m15, m11
- mova [maskq], m4
- add maskq, 64
- mova [dstq+strideq*0+64*1], m0
- mova [dstq+strideq*1+64*1], m1
- lea dstq, [dstq+strideq*2]
- sub hd, 2
- jg .w128_loop
- RET
-
-cglobal w_mask_422, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
-%define base r7-w_mask_422_avx512icl_table
- lea r7, [w_mask_422_avx512icl_table]
- tzcnt wd, wm
- mov r6d, r7m ; sign
- movifnidn hd, hm
- movsxd wq, dword [r7+wq*4]
- vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
- vpbroadcastd m7, [base+pw_2048]
- vpbroadcastd m9, [base+pw_m128]
- mova m10, [base+wm_422_mask]
- vpbroadcastd m11, [base+pb_127]
- add wq, r7
- vpbroadcastd m8, [base+wm_sign_avx512+4+r6*4]
- mov maskq, maskmp
- lea stride3q, [strideq*3]
- jmp wq
-.w4:
- cmp hd, 8
- jg .w4_h16
- WRAP_YMM W_MASK 0, 4, 0, 1
- movhps xm10, [wm_422_mask+16]
- vpdpwssd ym8, ym4, ym9
- vpermb ym8, ym10, ym8
- vextracti128 xmm1, m0, 1
- movd [dstq+strideq*0], xm0
- pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xmm1
- pextrd [dstq+stride3q ], xmm1, 1
- jl .w4_end
- lea dstq, [dstq+strideq*4]
- pextrd [dstq+strideq*0], xm0, 2
- pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xmm1, 2
- pextrd [dstq+stride3q ], xmm1, 3
-.w4_end:
- pand xm8, xm11
- mova [maskq], xm8
- RET
-.w4_h16:
- vpbroadcastd m5, strided
- pmulld m5, [bidir_sctr_w4]
- W_MASK 0, 4, 0, 1
- vpdpwssd m8, m4, m9
- kxnorw k1, k1, k1
- vpermb m8, m10, m8
- pand ym8, ym11
- mova [maskq], ym8
- vpscatterdd [dstq+m5]{k1}, m0
- RET
-.w8:
- cmp hd, 4
- jne .w8_h8
- WRAP_YMM W_MASK 0, 4, 0, 1
- movhps xm10, [wm_422_mask+16]
- vpdpwssd ym8, ym4, ym9
- vpermb ym8, ym10, ym8
- pand xm8, xm11
- mova [maskq], xm8
- vextracti128 xmm1, ym0, 1
- movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
- RET
-.w8_loop:
- add tmp1q, 128
- add tmp2q, 128
- add maskq, 32
- lea dstq, [dstq+strideq*4]
-.w8_h8:
- W_MASK 0, 4, 0, 1
- mova m1, m8
- vpdpwssd m1, m4, m9
- vpermb m1, m10, m1
- pand ym1, ym11
- mova [maskq], ym1
- vextracti32x4 xmm1, ym0, 1
- vextracti32x4 xmm2, m0, 2
- vextracti32x4 xmm3, m0, 3
- movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
- lea dstq, [dstq+strideq*4]
- movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
- sub hd, 8
- jg .w8_loop
- RET
-.w16_loop:
- add tmp1q, 128
- add tmp2q, 128
- add maskq, 32
- lea dstq, [dstq+strideq*4]
-.w16:
- W_MASK 0, 4, 0, 1
- mova m1, m8
- vpdpwssd m1, m4, m9
- vpermb m1, m10, m1
- vpermq m0, m0, q3120
- pand ym1, ym11
- mova [maskq], ym1
- mova [dstq+strideq*0], xm0
- vextracti32x4 [dstq+strideq*1], m0, 2
- vextracti32x4 [dstq+strideq*2], ym0, 1
- vextracti32x4 [dstq+stride3q ], m0, 3
- sub hd, 4
- jg .w16_loop
- RET
-.w32:
- pmovzxbq m5, [warp_8x8_shufA]
-.w32_loop:
- W_MASK 0, 4, 0, 1
- mova m1, m8
- vpdpwssd m1, m4, m9
- add tmp1q, 128
- add tmp2q, 128
- vpermb m1, m10, m1
- vpermq m0, m5, m0
- pand ym1, ym11
- mova [maskq], ym1
- add maskq, 32
- mova [dstq+strideq*0], ym0
- vextracti32x8 [dstq+strideq*1], m0, 1
- lea dstq, [dstq+strideq*2]
- sub hd, 2
- jg .w32_loop
- RET
-.w64:
- pmovzxbq m5, [warp_8x8_shufA]
-.w64_loop:
- W_MASK 0, 4, 0, 1
- mova m1, m8
- vpdpwssd m1, m4, m9
- add tmp1q, 128
- add tmp2q, 128
- vpermb m1, m10, m1
- vpermq m0, m5, m0
- pand ym1, ym11
- mova [maskq], ym1
- add maskq, 32
- mova [dstq], m0
- add dstq, strideq
- dec hd
- jg .w64_loop
- RET
-.w128:
- pmovzxbq m13, [warp_8x8_shufA]
-.w128_loop:
- W_MASK 0, 4, 0, 1
- W_MASK 12, 5, 2, 3
- mova m2, m8
- vpdpwssd m2, m4, m9
- mova m3, m8
- vpdpwssd m3, m5, m9
- add tmp1q, 256
- add tmp2q, 256
- vpermt2b m2, m10, m3
- vpermq m0, m13, m0
- vpermq m1, m13, m12
- pand m2, m11
- mova [maskq], m2
- add maskq, 64
- mova [dstq+64*0], m0
- mova [dstq+64*1], m1
- add dstq, strideq
- dec hd
- jg .w128_loop
- RET
-
-cglobal w_mask_444, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
-%define base r7-w_mask_444_avx512icl_table
- lea r7, [w_mask_444_avx512icl_table]
- tzcnt wd, wm
- movifnidn hd, hm
- movsxd wq, dword [r7+wq*4]
- vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
- vpbroadcastd m5, [base+pb_64]
- vpbroadcastd m7, [base+pw_2048]
- mova m8, [base+wm_444_mask]
- add wq, r7
- mov maskq, maskmp
- lea stride3q, [strideq*3]
- jmp wq
-.w4:
- cmp hd, 8
- jg .w4_h16
- WRAP_YMM W_MASK 0, 4, 0, 1, 1
- vinserti128 ym8, [wm_444_mask+32], 1
- vpermb ym4, ym8, ym4
- mova [maskq], ym4
- vextracti128 xmm1, m0, 1
- movd [dstq+strideq*0], xm0
- pextrd [dstq+strideq*1], xm0, 1
- movd [dstq+strideq*2], xmm1
- pextrd [dstq+stride3q ], xmm1, 1
- jl .w4_end
- lea dstq, [dstq+strideq*4]
- pextrd [dstq+strideq*0], xm0, 2
- pextrd [dstq+strideq*1], xm0, 3
- pextrd [dstq+strideq*2], xmm1, 2
- pextrd [dstq+stride3q ], xmm1, 3
-.w4_end:
- RET
-.w4_h16:
- vpbroadcastd m9, strided
- pmulld m9, [bidir_sctr_w4]
- W_MASK 0, 4, 0, 1, 1
- vpermb m4, m8, m4
- kxnorw k1, k1, k1
- mova [maskq], m4
- vpscatterdd [dstq+m9]{k1}, m0
- RET
-.w8:
- cmp hd, 4
- jne .w8_h8
- WRAP_YMM W_MASK 0, 4, 0, 1, 1
- vinserti128 ym8, [wm_444_mask+32], 1
- vpermb ym4, ym8, ym4
- mova [maskq], ym4
- vextracti128 xmm1, ym0, 1
- movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xm0
- movhps [dstq+stride3q ], xmm1
- RET
-.w8_loop:
- add tmp1q, 128
- add tmp2q, 128
- add maskq, 64
- lea dstq, [dstq+strideq*4]
-.w8_h8:
- W_MASK 0, 4, 0, 1, 1
- vpermb m4, m8, m4
- mova [maskq], m4
- vextracti32x4 xmm1, ym0, 1
- vextracti32x4 xmm2, m0, 2
- vextracti32x4 xmm3, m0, 3
- movq [dstq+strideq*0], xm0
- movq [dstq+strideq*1], xmm1
- movq [dstq+strideq*2], xmm2
- movq [dstq+stride3q ], xmm3
- lea dstq, [dstq+strideq*4]
- movhps [dstq+strideq*0], xm0
- movhps [dstq+strideq*1], xmm1
- movhps [dstq+strideq*2], xmm2
- movhps [dstq+stride3q ], xmm3
- sub hd, 8
- jg .w8_loop
- RET
-.w16_loop:
- add tmp1q, 128
- add tmp2q, 128
- add maskq, 64
- lea dstq, [dstq+strideq*4]
-.w16:
- W_MASK 0, 4, 0, 1, 1
- vpermb m4, m8, m4
- vpermq m0, m0, q3120
- mova [maskq], m4
- mova [dstq+strideq*0], xm0
- vextracti32x4 [dstq+strideq*1], m0, 2
- vextracti32x4 [dstq+strideq*2], ym0, 1
- vextracti32x4 [dstq+stride3q ], m0, 3
- sub hd, 4
- jg .w16_loop
- RET
-.w32:
- pmovzxbq m9, [warp_8x8_shufA]
-.w32_loop:
- W_MASK 0, 4, 0, 1, 1
- vpermb m4, m8, m4
- add tmp1q, 128
- add tmp2q, 128
- vpermq m0, m9, m0
- mova [maskq], m4
- add maskq, 64
- mova [dstq+strideq*0], ym0
- vextracti32x8 [dstq+strideq*1], m0, 1
- lea dstq, [dstq+strideq*2]
- sub hd, 2
- jg .w32_loop
- RET
-.w64:
- pmovzxbq m9, [warp_8x8_shufA]
-.w64_loop:
- W_MASK 0, 4, 0, 1, 1
- vpermb m4, m8, m4
- add tmp1q, 128
- add tmp2q, 128
- vpermq m0, m9, m0
- mova [maskq], m4
- add maskq, 64
- mova [dstq], m0
- add dstq, strideq
- dec hd
- jg .w64_loop
- RET
-.w128:
- pmovzxbq m11, [warp_8x8_shufA]
-.w128_loop:
- W_MASK 0, 4, 0, 1, 1
- W_MASK 10, 9, 2, 3, 1
- vpermb m4, m8, m4
- vpermb m9, m8, m9
- add tmp1q, 256
- add tmp2q, 256
- vpermq m0, m11, m0
- vpermq m10, m11, m10
- mova [maskq+64*0], m4
- mova [maskq+64*1], m9
- add maskq, 128
- mova [dstq+64*0], m0
- mova [dstq+64*1], m10
- add dstq, strideq
- dec hd
- jg .w128_loop
- RET
-
-%endif ; HAVE_AVX512ICL
-
-%endif ; ARCH_X86_64
--- /dev/null
+++ b/src/x86/mc_avx2.asm
@@ -1,0 +1,6167 @@
+; Copyright © 2018-2020, VideoLAN and dav1d authors
+; Copyright © 2018-2020, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+%if ARCH_X86_64
+
+SECTION_RODATA 32
+
+; dav1d_obmc_masks[] with 64-x interleaved
+obmc_masks: db 0, 0, 0, 0
+ ; 2
+ db 45, 19, 64, 0
+ ; 4
+ db 39, 25, 50, 14, 59, 5, 64, 0
+ ; 8
+ db 36, 28, 42, 22, 48, 16, 53, 11, 57, 7, 61, 3, 64, 0, 64, 0
+ ; 16
+ db 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
+ db 56, 8, 58, 6, 60, 4, 61, 3, 64, 0, 64, 0, 64, 0, 64, 0
+ ; 32
+ db 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
+ db 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55, 9
+ db 56, 8, 57, 7, 58, 6, 59, 5, 60, 4, 60, 4, 61, 3, 62, 2
+ db 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0
+
+warp_8x8_shufA: db 0, 2, 4, 6, 1, 3, 5, 7, 1, 3, 5, 7, 2, 4, 6, 8
+ db 4, 6, 8, 10, 5, 7, 9, 11, 5, 7, 9, 11, 6, 8, 10, 12
+warp_8x8_shufB: db 2, 4, 6, 8, 3, 5, 7, 9, 3, 5, 7, 9, 4, 6, 8, 10
+ db 6, 8, 10, 12, 7, 9, 11, 13, 7, 9, 11, 13, 8, 10, 12, 14
+subpel_h_shuf4: db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12
+ db 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+subpel_v_shuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
+subpel_s_shuf2: db 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11
+subpel_s_shuf8: db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
+bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
+deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
+blend_shuf: db 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3
+pb_8x0_8x8: db 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8
+bdct_lb_dw: db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
+wswap: db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+rescale_mul: dd 0, 1, 2, 3, 4, 5, 6, 7
+resize_shuf: db 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7
+ db 7, 7, 7, 7, 7, 7, 7, 7
+
+wm_420_sign: dd 0x01020102, 0x01010101
+wm_422_sign: dd 0x80808080, 0x7f7f7f7f
+
+pb_64: times 4 db 64
+pw_m256: times 2 dw -256
+pw_32: times 2 dw 32
+pw_34: times 2 dw 34
+pw_258: times 2 dw 258
+pw_512: times 2 dw 512
+pw_1024: times 2 dw 1024
+pw_2048: times 2 dw 2048
+pw_6903: times 2 dw 6903
+pw_8192: times 2 dw 8192
+pd_32: dd 32
+pd_63: dd 63
+pd_512: dd 512
+pd_32768: dd 32768
+pd_0x3ff: dd 0x3ff
+pd_0x4000: dd 0x4000
+pq_0x40000000: dq 0x40000000
+
+cextern mc_subpel_filters
+cextern mc_warp_filter
+cextern resize_filter
+
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+
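+; note: the jump table macros below emit per-width offsets relative to a
+; base label; the entry points index them with tzcnt(w) before jumping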
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+%macro BIDIR_JMP_TABLE 1-*
+ %xdefine %1_table (%%table - 2*%2)
+ %xdefine %%base %1_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1)
+ %%table:
+ %rep %0 - 1
+ dd %%prefix %+ .w%2 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SCALED_JMP_TABLE 1-*
+ %xdefine %1_table (%%table - %2)
+ %xdefine %%base mangle(private_prefix %+ _%1)
+%%table:
+ %rep %0 - 1
+ dw %%base %+ .w%2 - %%base
+ %rotate 1
+ %endrep
+ %rotate 1
+%%dy_1024:
+ %xdefine %1_dy1_table (%%dy_1024 - %2)
+ %rep %0 - 1
+ dw %%base %+ .dy1_w%2 - %%base
+ %rotate 1
+ %endrep
+ %rotate 1
+%%dy_2048:
+ %xdefine %1_dy2_table (%%dy_2048 - %2)
+ %rep %0 - 1
+ dw %%base %+ .dy2_w%2 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine put_avx2 mangle(private_prefix %+ _put_bilin_avx2.put)
+%xdefine prep_avx2 mangle(private_prefix %+ _prep_bilin_avx2.prep)
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+BASE_JMP_TABLE put, avx2, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, avx2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, bilin, avx2, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx2, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE put, 8tap, avx2, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, avx2, 1, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE put_8tap_scaled_avx2, 2, 4, 8, 16, 32, 64, 128
+SCALED_JMP_TABLE prep_8tap_scaled_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE avg_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444_avx2, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE blend_avx2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_v_avx2, 2, 4, 8, 16, 32
+BIDIR_JMP_TABLE blend_h_avx2, 2, 4, 8, 16, 32, 32, 32
+
+SECTION .text
+
+INIT_XMM avx2
+DECLARE_REG_TMP 4, 6, 7
+cglobal put_bilin, 4, 8, 0, dst, ds, src, ss, w, h, mxy
+ movifnidn mxyd, r6m ; mx
+ lea t2, [put_avx2]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r7m ; my
+ test mxyd, mxyd
+ jnz .v
+.put:
+ movzx wd, word [t2+wq*2+table_offset(put,)]
+ add wq, t2
+ jmp wq
+.put_w2:
+ movzx t0d, word [srcq+ssq*0]
+ movzx t1d, word [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], t0w
+ mov [dstq+dsq*1], t1w
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w2
+ RET
+.put_w4:
+ mov t0d, [srcq+ssq*0]
+ mov t1d, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], t0d
+ mov [dstq+dsq*1], t1d
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w4
+ RET
+.put_w8:
+ mov t0, [srcq+ssq*0]
+ mov t1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mov [dstq+dsq*0], t0
+ mov [dstq+dsq*1], t1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w8
+ RET
+.put_w16:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w16
+ RET
+INIT_YMM avx2
+.put_w32:
+ movu m0, [srcq+ssq*0]
+ movu m1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0], m0
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w32
+ RET
+.put_w64:
+ movu m0, [srcq+ssq*0+32*0]
+ movu m1, [srcq+ssq*0+32*1]
+ movu m2, [srcq+ssq*1+32*0]
+ movu m3, [srcq+ssq*1+32*1]
+ lea srcq, [srcq+ssq*2]
+ mova [dstq+dsq*0+32*0], m0
+ mova [dstq+dsq*0+32*1], m1
+ mova [dstq+dsq*1+32*0], m2
+ mova [dstq+dsq*1+32*1], m3
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .put_w64
+ RET
+.put_w128:
+ movu m0, [srcq+32*0]
+ movu m1, [srcq+32*1]
+ movu m2, [srcq+32*2]
+ movu m3, [srcq+32*3]
+ add srcq, ssq
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ mova [dstq+32*2], m2
+ mova [dstq+32*3], m3
+ add dstq, dsq
+ dec hd
+ jg .put_w128
+ RET
+.h:
+ ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
+ ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
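+ ; a worked illustration of the packed-coefficient trick used below:
+ ; mx*0xff01 + (16 << 8) places (16-mx, mx) in one word (e.g. mx=4 -> 0x0c04),
+ ; bilin_h_shuf8 pairs up (src[x+1], src[x]) for pmaddubsw, and pmulhrsw
+ ; with 2048 performs the final (x + 8) >> 4 rounding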
+ imul mxyd, 0xff01
+ vbroadcasti128 m4, [bilin_h_shuf8]
+ add mxyd, 16 << 8
+ movd xm5, mxyd
+ mov mxyd, r7m ; my
+ vpbroadcastw m5, xm5
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [t2+wq*2+table_offset(put, _bilin_h)]
+ vpbroadcastd m3, [pw_2048]
+ add wq, t2
+ jmp wq
+.h_w2:
+ movd xm0, [srcq+ssq*0]
+ pinsrd xm0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+ pmulhrsw xm0, xm3
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2
+ RET
+.h_w4:
+ mova xm4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xm0, [srcq+ssq*0]
+ movhps xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+ pmulhrsw xm0, xm3
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+ssq*0]
+ movu xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pshufb xm1, xm4
+ pmaddubsw xm0, xm5
+ pmaddubsw xm1, xm5
+ pmulhrsw xm0, xm3
+ pmulhrsw xm1, xm3
+ packuswb xm0, xm1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+ssq*0+8*0]
+ vinserti128 m0, [srcq+ssq*1+8*0], 1
+ movu xm1, [srcq+ssq*0+8*1]
+ vinserti128 m1, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ add srcq, ssq
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .h_w32
+ RET
+.h_w64:
+ movu m0, [srcq+8*0]
+ movu m1, [srcq+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ movu m1, [srcq+8*4]
+ movu m2, [srcq+8*5]
+ add srcq, ssq
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmulhrsw m1, m3
+ pmulhrsw m2, m3
+ packuswb m1, m2
+ mova [dstq+32*0], m0
+ mova [dstq+32*1], m1
+ add dstq, dsq
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ mov t1, -32*3
+.h_w128_loop:
+ movu m0, [srcq+t1+32*3+8*0]
+ movu m1, [srcq+t1+32*3+8*1]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ packuswb m0, m1
+ mova [dstq+t1+32*3], m0
+ add t1, 32
+ jle .h_w128_loop
+ add srcq, ssq
+ add dstq, dsq
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ movzx wd, word [t2+wq*2+table_offset(put, _bilin_v)]
+ imul mxyd, 0xff01
+ vpbroadcastd m5, [pw_2048]
+ add mxyd, 16 << 8
+ add wq, t2
+ movd xm4, mxyd
+ vpbroadcastw m4, xm4
+ jmp wq
+.v_w2:
+ movd xm0, [srcq+ssq*0]
+.v_w2_loop:
+ pinsrw xm1, xm0, [srcq+ssq*1], 1 ; 0 1
+ lea srcq, [srcq+ssq*2]
+ pinsrw xm0, xm1, [srcq+ssq*0], 0 ; 2 1
+ pshuflw xm1, xm1, q2301 ; 1 0
+ punpcklbw xm1, xm0, xm1
+ pmaddubsw xm1, xm4
+ pmulhrsw xm1, xm5
+ packuswb xm1, xm1
+ pextrw [dstq+dsq*0], xm1, 1
+ pextrw [dstq+dsq*1], xm1, 0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xm0, [srcq+ssq*0]
+.v_w4_loop:
+ vpbroadcastd xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm2, xm1, xm0, 0x01 ; 0 1
+ vpbroadcastd xm0, [srcq+ssq*0]
+ vpblendd xm1, xm0, 0x02 ; 1 2
+ punpcklbw xm1, xm2
+ pmaddubsw xm1, xm4
+ pmulhrsw xm1, xm5
+ packuswb xm1, xm1
+ movd [dstq+dsq*0], xm1
+ pextrd [dstq+dsq*1], xm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm0, [srcq+ssq*0]
+.v_w8_loop:
+ movq xm3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw xm1, xm3, xm0
+ movq xm0, [srcq+ssq*0]
+ punpcklbw xm2, xm0, xm3
+ pmaddubsw xm1, xm4
+ pmaddubsw xm2, xm4
+ pmulhrsw xm1, xm5
+ pmulhrsw xm2, xm5
+ packuswb xm1, xm2
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
+.v_w16:
+ movu xm0, [srcq+ssq*0]
+.v_w16_loop:
+ vbroadcasti128 m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd m3, m2, m0, 0x0f ; 0 1
+ vbroadcasti128 m0, [srcq+ssq*0]
+ vpblendd m2, m0, 0xf0 ; 1 2
+ punpcklbw m1, m2, m3
+ punpckhbw m2, m3
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ mova [dstq+dsq*0], xm1
+ vextracti128 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ RET
+.v_w32:
+%macro PUT_BILIN_V_W32 0
+ movu m0, [srcq+ssq*0]
+%%loop:
+ movu m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ punpcklbw m1, m3, m0
+ punpckhbw m2, m3, m0
+ movu m0, [srcq+ssq*0]
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ mova [dstq+dsq*0], m1
+ punpcklbw m1, m0, m3
+ punpckhbw m2, m0, m3
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmulhrsw m1, m5
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ mova [dstq+dsq*1], m1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg %%loop
+%endmacro
+ PUT_BILIN_V_W32
+ RET
+.v_w64:
+ movu m0, [srcq+32*0]
+ movu m1, [srcq+32*1]
+.v_w64_loop:
+ add srcq, ssq
+ movu m3, [srcq+32*0]
+ punpcklbw m2, m3, m0
+ punpckhbw m0, m3, m0
+ pmaddubsw m2, m4
+ pmaddubsw m0, m4
+ pmulhrsw m2, m5
+ pmulhrsw m0, m5
+ packuswb m2, m0
+ mova m0, m3
+ movu m3, [srcq+32*1]
+ mova [dstq+32*0], m2
+ punpcklbw m2, m3, m1
+ punpckhbw m1, m3, m1
+ pmaddubsw m2, m4
+ pmaddubsw m1, m4
+ pmulhrsw m2, m5
+ pmulhrsw m1, m5
+ packuswb m2, m1
+ mova m1, m3
+ mova [dstq+32*1], m2
+ add dstq, dsq
+ dec hd
+ jg .v_w64_loop
+ RET
+.v_w128:
+ mov t0, dstq
+ mov t1, srcq
+ lea t2d, [hq+(3<<8)]
+.v_w128_loop:
+ PUT_BILIN_V_W32
+ movzx hd, t2b
+ add t0, 32
+ add t1, 32
+ mov dstq, t0
+ mov srcq, t1
+ sub t2d, 1<<8
+ jg .v_w128_loop
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
+ ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
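+ ; note: the vertical fraction is applied as (2*diff * (my << 11)) >> 16 via
+ ; pmulhw, i.e. (my * diff) >> 4, and the closing pmulhrsw with 2048
+ ; supplies the (x + 8) >> 4 rounding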
+ movzx wd, word [t2+wq*2+table_offset(put, _bilin_hv)]
+ WIN64_SPILL_XMM 8
+ shl mxyd, 11 ; can't shift by 12 due to signed overflow
+ vpbroadcastd m7, [pw_2048]
+ movd xm6, mxyd
+ add wq, t2
+ vpbroadcastw m6, xm6
+ jmp wq
+.hv_w2:
+ vpbroadcastd xm0, [srcq+ssq*0]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+.hv_w2_loop:
+ movd xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pinsrd xm1, [srcq+ssq*0], 1
+ pshufb xm1, xm4
+ pmaddubsw xm1, xm5 ; 1 _ 2 _
+ shufps xm2, xm0, xm1, q1032 ; 0 _ 1 _
+ mova xm0, xm1
+ psubw xm1, xm2
+ paddw xm1, xm1
+ pmulhw xm1, xm6
+ paddw xm1, xm2
+ pmulhrsw xm1, xm7
+ packuswb xm1, xm1
+ pextrw [dstq+dsq*0], xm1, 0
+ pextrw [dstq+dsq*1], xm1, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova xm4, [bilin_h_shuf4]
+ movddup xm0, [srcq+ssq*0]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm5
+.hv_w4_loop:
+ movq xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movhps xm1, [srcq+ssq*0]
+ pshufb xm1, xm4
+ pmaddubsw xm1, xm5 ; 1 2
+ shufps xm2, xm0, xm1, q1032 ; 0 1
+ mova xm0, xm1
+ psubw xm1, xm2
+ paddw xm1, xm1
+ pmulhw xm1, xm6
+ paddw xm1, xm2
+ pmulhrsw xm1, xm7
+ packuswb xm1, xm1
+ movd [dstq+dsq*0], xm1
+ pextrd [dstq+dsq*1], xm1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti128 m0, [srcq+ssq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m1, [srcq+ssq*0], 1
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2
+ vperm2i128 m2, m0, m1, 0x21 ; 0 1
+ mova m0, m1
+ psubw m1, m2
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m2
+ pmulhrsw m1, m7
+ vextracti128 xm2, m1, 1
+ packuswb xm1, xm2
+ movq [dstq+dsq*0], xm1
+ movhps [dstq+dsq*1], xm1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ movu m0, [srcq+ssq*0+8*0]
+ vinserti128 m0, [srcq+ssq*0+8*1], 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu xm2, [srcq+ssq*1+8*0]
+ vinserti128 m2, [srcq+ssq*1+8*1], 1
+ lea srcq, [srcq+ssq*2]
+ movu xm3, [srcq+ssq*0+8*0]
+ vinserti128 m3, [srcq+ssq*0+8*1], 1
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m2, m5
+ psubw m1, m2, m0
+ paddw m1, m1
+ pmulhw m1, m6
+ paddw m1, m0
+ pmaddubsw m0, m3, m5
+ psubw m3, m0, m2
+ paddw m3, m3
+ pmulhw m3, m6
+ paddw m3, m2
+ pmulhrsw m1, m7
+ pmulhrsw m3, m7
+ packuswb m1, m3
+ vpermq m1, m1, q3120
+ mova [dstq+dsq*0], xm1
+ vextracti128 [dstq+dsq*1], m1, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ xor t2d, t2d
+.hv_w32gt:
+ mov t0, dstq
+ mov t1, srcq
+%if WIN64
+ movaps r4m, xmm8
+%endif
+.hv_w32_loop0:
+ movu m0, [srcq+8*0]
+ vinserti128 m0, [srcq+8*2], 1
+ movu m1, [srcq+8*1]
+ vinserti128 m1, [srcq+8*3], 1
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w32_loop:
+ add srcq, ssq
+ movu xm2, [srcq+8*1]
+ vinserti128 m2, [srcq+8*3], 1
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ psubw m3, m2, m1
+ paddw m3, m3
+ pmulhw m3, m6
+ paddw m3, m1
+ mova m1, m2
+ pmulhrsw m8, m3, m7
+ movu xm2, [srcq+8*0]
+ vinserti128 m2, [srcq+8*2], 1
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ psubw m3, m2, m0
+ paddw m3, m3
+ pmulhw m3, m6
+ paddw m3, m0
+ mova m0, m2
+ pmulhrsw m3, m7
+ packuswb m3, m8
+ mova [dstq], m3
+ add dstq, dsq
+ dec hd
+ jg .hv_w32_loop
+ movzx hd, t2b
+ add t0, 32
+ add t1, 32
+ mov dstq, t0
+ mov srcq, t1
+ sub t2d, 1<<8
+ jg .hv_w32_loop0
+%if WIN64
+ movaps xmm8, r4m
+%endif
+ RET
+.hv_w64:
+ lea t2d, [hq+(1<<8)]
+ jmp .hv_w32gt
+.hv_w128:
+ lea t2d, [hq+(3<<8)]
+ jmp .hv_w32gt
+
+DECLARE_REG_TMP 3, 5, 6
+cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ lea t2, [prep%+SUFFIX]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ movzx wd, word [t2+wq*2+table_offset(prep,)]
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movd xm0, [srcq+strideq*0]
+ pinsrd xm0, [srcq+strideq*1], 1
+ pinsrd xm0, [srcq+strideq*2], 2
+ pinsrd xm0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw m0, xm0
+ psllw m0, 4
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movq xm0, [srcq+strideq*0]
+ movhps xm0, [srcq+strideq*1]
+ movq xm1, [srcq+strideq*2]
+ movhps xm1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw m0, xm0
+ pmovzxbw m1, xm1
+ psllw m0, 4
+ psllw m1, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ pmovzxbw m0, [srcq+strideq*0]
+ pmovzxbw m1, [srcq+strideq*1]
+ pmovzxbw m2, [srcq+strideq*2]
+ pmovzxbw m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 4
+ jg .prep_w16
+ RET
+.prep_w32:
+ pmovzxbw m0, [srcq+strideq*0+16*0]
+ pmovzxbw m1, [srcq+strideq*0+16*1]
+ pmovzxbw m2, [srcq+strideq*1+16*0]
+ pmovzxbw m3, [srcq+strideq*1+16*1]
+ lea srcq, [srcq+strideq*2]
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 2
+ jg .prep_w32
+ RET
+.prep_w64:
+ pmovzxbw m0, [srcq+16*0]
+ pmovzxbw m1, [srcq+16*1]
+ pmovzxbw m2, [srcq+16*2]
+ pmovzxbw m3, [srcq+16*3]
+ add srcq, strideq
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ dec hd
+ jg .prep_w64
+ RET
+.prep_w128:
+ pmovzxbw m0, [srcq+16*0]
+ pmovzxbw m1, [srcq+16*1]
+ pmovzxbw m2, [srcq+16*2]
+ pmovzxbw m3, [srcq+16*3]
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ pmovzxbw m0, [srcq+16*4]
+ pmovzxbw m1, [srcq+16*5]
+ pmovzxbw m2, [srcq+16*6]
+ pmovzxbw m3, [srcq+16*7]
+ add tmpq, 32*8
+ add srcq, strideq
+ psllw m0, 4
+ psllw m1, 4
+ psllw m2, 4
+ psllw m3, 4
+ mova [tmpq-32*4], m0
+ mova [tmpq-32*3], m1
+ mova [tmpq-32*2], m2
+ mova [tmpq-32*1], m3
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
+ ; = (16 - mx) * src[x] + mx * src[x + 1]
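+ ; note: same (16-mx, mx) byte-pair packing as in put_bilin (mx*0xff01 +
+ ; (16 << 8)); prep stores the pmaddubsw result directly as the 16-bit
+ ; intermediate, so no rounding shift follows here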
+ imul mxyd, 0xff01
+ vbroadcasti128 m4, [bilin_h_shuf8]
+ add mxyd, 16 << 8
+ movd xm5, mxyd
+ mov mxyd, r6m ; my
+ vpbroadcastw m5, xm5
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4:
+ vbroadcasti128 m4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xm0, [srcq+strideq*0]
+ movhps xm0, [srcq+strideq*1]
+ movq xm1, [srcq+strideq*2]
+ movhps xm1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m0, xm1, 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+.h_w8_loop:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ movu xm1, [srcq+strideq*2]
+ vinserti128 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ add tmpq, 32*2
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+.h_w16_loop:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, [srcq+strideq*1+8*1], 1
+ movu xm2, [srcq+strideq*2+8*0]
+ vinserti128 m2, [srcq+strideq*2+8*1], 1
+ movu xm3, [srcq+stride3q +8*0]
+ vinserti128 m3, [srcq+stride3q +8*1], 1
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 4
+ jg .h_w16_loop
+ RET
+.h_w32:
+.h_w32_loop:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ movu xm1, [srcq+strideq*0+8*2]
+ vinserti128 m1, [srcq+strideq*0+8*3], 1
+ movu xm2, [srcq+strideq*1+8*0]
+ vinserti128 m2, [srcq+strideq*1+8*1], 1
+ movu xm3, [srcq+strideq*1+8*2]
+ vinserti128 m3, [srcq+strideq*1+8*3], 1
+ lea srcq, [srcq+strideq*2]
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ sub hd, 2
+ jg .h_w32_loop
+ RET
+.h_w64:
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, [srcq+8*3], 1
+ movu xm2, [srcq+8*4]
+ vinserti128 m2, [srcq+8*5], 1
+ movu xm3, [srcq+8*6]
+ vinserti128 m3, [srcq+8*7], 1
+ add srcq, strideq
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ add tmpq, 32*4
+ dec hd
+ jg .h_w64
+ RET
+.h_w128:
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, [srcq+8*3], 1
+ movu xm2, [srcq+8*4]
+ vinserti128 m2, [srcq+8*5], 1
+ movu xm3, [srcq+8*6]
+ vinserti128 m3, [srcq+8*7], 1
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+32*0], m0
+ mova [tmpq+32*1], m1
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m3
+ movu xm0, [srcq+8* 8]
+ vinserti128 m0, [srcq+8* 9], 1
+ movu xm1, [srcq+8*10]
+ vinserti128 m1, [srcq+8*11], 1
+ movu xm2, [srcq+8*12]
+ vinserti128 m2, [srcq+8*13], 1
+ movu xm3, [srcq+8*14]
+ vinserti128 m3, [srcq+8*15], 1
+ add tmpq, 32*8
+ add srcq, strideq
+ pshufb m0, m4
+ pshufb m1, m4
+ pshufb m2, m4
+ pshufb m3, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq-32*4], m0
+ mova [tmpq-32*3], m1
+ mova [tmpq-32*2], m2
+ mova [tmpq-32*1], m3
+ dec hd
+ jg .h_w128
+ RET
+.v:
+ WIN64_SPILL_XMM 7
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
+ imul mxyd, 0xff01
+ add mxyd, 16 << 8
+ add wq, t2
+ lea stride3q, [strideq*3]
+ movd xm6, mxyd
+ vpbroadcastw m6, xm6
+ jmp wq
+.v_w4:
+ movd xm0, [srcq+strideq*0]
+.v_w4_loop:
+ vpbroadcastd m1, [srcq+strideq*2]
+ vpbroadcastd xm2, [srcq+strideq*1]
+ vpbroadcastd m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m0, 0x05 ; 0 2 2 2
+ vpbroadcastd m0, [srcq+strideq*0]
+ vpblendd m3, m2, 0x0f ; 1 1 3 3
+ vpblendd m2, m1, m0, 0xa0 ; 0 2 2 4
+ vpblendd m1, m3, 0xaa ; 0 1 2 3
+ vpblendd m2, m3, 0x55 ; 1 2 3 4
+ punpcklbw m2, m1
+ pmaddubsw m2, m6
+ mova [tmpq], m2
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm0, [srcq+strideq*0]
+.v_w8_loop:
+ vpbroadcastq m1, [srcq+strideq*2]
+ vpbroadcastq m2, [srcq+strideq*1]
+ vpbroadcastq m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m0, 0x03 ; 0 2 2 2
+ vpbroadcastq m0, [srcq+strideq*0]
+ vpblendd m3, m2, 0x33 ; 1 3 1 3
+ vpblendd m2, m1, m3, 0x0f ; 1 3 2 2
+ vpblendd m1, m3, 0xf0 ; 0 2 1 3
+ vpblendd m2, m0, 0xc0 ; 1 3 2 4
+ punpcklbw m3, m2, m1
+ punpckhbw m2, m1
+ pmaddubsw m3, m6
+ pmaddubsw m2, m6
+ mova [tmpq+32*0], m3
+ mova [tmpq+32*1], m2
+ add tmpq, 32*2
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ vbroadcasti128 m0, [srcq+strideq*0]
+.v_w16_loop:
+ vbroadcasti128 m1, [srcq+strideq*2]
+ vbroadcasti128 m2, [srcq+strideq*1]
+ vbroadcasti128 m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ shufpd m4, m0, m1, 0x0c ; 0 2 ; 0l2l 0h2h
+ vbroadcasti128 m0, [srcq+strideq*0]
+ shufpd m2, m2, m3, 0x0c ; 1 3 ; 1l3l 1h3h
+ shufpd m1, m1, m0, 0x0c ; 2 4 ; 2l4l 2h4h
+ punpcklbw m3, m2, m4
+ punpcklbw m5, m1, m2
+ punpckhbw m1, m2
+ punpckhbw m2, m4
+ pmaddubsw m3, m6
+ pmaddubsw m5, m6
+ pmaddubsw m2, m6
+ pmaddubsw m1, m6
+ mova [tmpq+32*0], m3
+ mova [tmpq+32*1], m5
+ mova [tmpq+32*2], m2
+ mova [tmpq+32*3], m1
+ add tmpq, 32*4
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ vpermq m0, [srcq+strideq*0], q3120
+.v_w32_loop:
+ vpermq m1, [srcq+strideq*1], q3120
+ vpermq m2, [srcq+strideq*2], q3120
+ vpermq m3, [srcq+stride3q ], q3120
+ lea srcq, [srcq+strideq*4]
+ punpcklbw m4, m1, m0
+ punpckhbw m5, m1, m0
+ vpermq m0, [srcq+strideq*0], q3120
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ mova [tmpq+32*0], m4
+ mova [tmpq+32*1], m5
+ punpcklbw m4, m2, m1
+ punpckhbw m5, m2, m1
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ mova [tmpq+32*2], m4
+ mova [tmpq+32*3], m5
+ add tmpq, 32*8
+ punpcklbw m4, m3, m2
+ punpckhbw m5, m3, m2
+ punpcklbw m1, m0, m3
+ punpckhbw m2, m0, m3
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ pmaddubsw m1, m6
+ pmaddubsw m2, m6
+ mova [tmpq-32*4], m4
+ mova [tmpq-32*3], m5
+ mova [tmpq-32*2], m1
+ mova [tmpq-32*1], m2
+ sub hd, 4
+ jg .v_w32_loop
+ RET
+.v_w64:
+ vpermq m0, [srcq+strideq*0+32*0], q3120
+ vpermq m1, [srcq+strideq*0+32*1], q3120
+.v_w64_loop:
+ vpermq m2, [srcq+strideq*1+32*0], q3120
+ vpermq m3, [srcq+strideq*1+32*1], q3120
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m2, m0
+ punpckhbw m5, m2, m0
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ mova [tmpq+32*0], m4
+ mova [tmpq+32*1], m5
+ punpcklbw m4, m3, m1
+ punpckhbw m5, m3, m1
+ vpermq m0, [srcq+strideq*0+32*0], q3120
+ vpermq m1, [srcq+strideq*0+32*1], q3120
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ mova [tmpq+32*2], m4
+ mova [tmpq+32*3], m5
+ add tmpq, 32*8
+ punpcklbw m4, m0, m2
+ punpckhbw m5, m0, m2
+ punpcklbw m2, m1, m3
+ punpckhbw m3, m1, m3
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ mova [tmpq-32*4], m4
+ mova [tmpq-32*3], m5
+ mova [tmpq-32*2], m2
+ mova [tmpq-32*1], m3
+ sub hd, 2
+ jg .v_w64_loop
+ RET
+.v_w128:
+ mov t0, tmpq
+ mov t1, srcq
+ lea t2d, [hq+(3<<8)]
+.v_w128_loop0:
+ vpermq m0, [srcq+strideq*0], q3120
+.v_w128_loop:
+ vpermq m1, [srcq+strideq*1], q3120
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m2, m1, m0
+ punpckhbw m3, m1, m0
+ vpermq m0, [srcq+strideq*0], q3120
+ punpcklbw m4, m0, m1
+ punpckhbw m5, m0, m1
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ pmaddubsw m4, m6
+ pmaddubsw m5, m6
+ mova [tmpq+32*0], m2
+ mova [tmpq+32*1], m3
+ mova [tmpq+32*8], m4
+ mova [tmpq+32*9], m5
+ add tmpq, 32*16
+ sub hd, 2
+ jg .v_w128_loop
+ movzx hd, t2b
+ add t0, 64
+ add t1, 32
+ mov tmpq, t0
+ mov srcq, t1
+ sub t2d, 1<<8
+ jg .v_w128_loop0
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+ ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
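+ ; note: with the coefficient preshifted by 11, pmulhrsw computes
+ ; ((my * diff) + 8) >> 4 in one step, leaving a single paddw with the
+ ; horizontal intermediate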
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 7
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
+ shl mxyd, 11
+ movd xm6, mxyd
+ vpbroadcastw m6, xm6
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.hv_w4:
+ vbroadcasti128 m4, [bilin_h_shuf4]
+ vpbroadcastq m0, [srcq+strideq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w4_loop:
+ movq xm1, [srcq+strideq*1]
+ movhps xm1, [srcq+strideq*2]
+ movq xm2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ movhps xm2, [srcq+strideq*0]
+ vinserti128 m1, xm2, 1
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2 3 4
+ vpblendd m2, m1, m0, 0xc0
+ vpermq m2, m2, q2103 ; 0 1 2 3
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti128 m0, [srcq+strideq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu xm1, [srcq+strideq*1]
+ vinserti128 m1, [srcq+strideq*2], 1
+ movu xm2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m2, [srcq+strideq*0], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5 ; 1 2
+ vperm2i128 m3, m0, m1, 0x21 ; 0 1
+ pmaddubsw m0, m2, m5 ; 3 4
+ vperm2i128 m2, m1, m0, 0x21 ; 2 3
+ psubw m1, m3
+ pmulhrsw m1, m6
+ paddw m1, m3
+ psubw m3, m0, m2
+ pmulhrsw m3, m6
+ paddw m3, m2
+ mova [tmpq+32*0], m1
+ mova [tmpq+32*1], m3
+ add tmpq, 32*2
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu xm2, [srcq+strideq*0+8*0]
+ vinserti128 m2, [srcq+strideq*0+8*1], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+32*0], m3
+ mova [tmpq+32*1], m2
+ add tmpq, 32*2
+ sub hd, 2
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ movu xm0, [srcq+8*0]
+ vinserti128 m0, [srcq+8*1], 1
+ movu xm1, [srcq+8*2]
+ vinserti128 m1, [srcq+8*3], 1
+ pshufb m0, m4
+ pshufb m1, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w32_loop:
+ add srcq, strideq
+ movu xm2, [srcq+8*0]
+ vinserti128 m2, [srcq+8*1], 1
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ psubw m3, m2, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ mova m0, m2
+ movu xm2, [srcq+8*2]
+ vinserti128 m2, [srcq+8*3], 1
+ pshufb m2, m4
+ pmaddubsw m2, m5
+ mova [tmpq+32*0], m3
+ psubw m3, m2, m1
+ pmulhrsw m3, m6
+ paddw m3, m1
+ mova m1, m2
+ mova [tmpq+32*1], m3
+ add tmpq, 32*2
+ dec hd
+ jg .hv_w32_loop
+ RET
+.hv_w64:
+ mov t0, tmpq
+ mov t1, srcq
+ lea t2d, [hq+(3<<8)]
+.hv_w64_loop0:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w64_loop:
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu xm2, [srcq+strideq*0+8*0]
+ vinserti128 m2, [srcq+strideq*0+8*1], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+32*0], m3
+ add tmpq, 32*8
+ mova [tmpq-32*4], m2
+ sub hd, 2
+ jg .hv_w64_loop
+ movzx hd, t2b
+ add t0, 32
+ add t1, 16
+ mov tmpq, t0
+ mov srcq, t1
+ sub t2d, 1<<8
+ jg .hv_w64_loop0
+ RET
+.hv_w128:
+ mov t0, tmpq
+ mov t1, srcq
+ lea t2d, [hq+(7<<8)]
+.hv_w128_loop0:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w128_loop:
+ movu xm1, [srcq+strideq*1+8*0]
+ vinserti128 m1, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu xm2, [srcq+strideq*0+8*0]
+ vinserti128 m2, [srcq+strideq*0+8*1], 1
+ pshufb m1, m4
+ pshufb m2, m4
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+32*0], m3
+ mova [tmpq+32*8], m2
+ add tmpq, 32*16
+ sub hd, 2
+ jg .hv_w128_loop
+ movzx hd, t2b
+ add t0, 32
+ add t1, 16
+ mov tmpq, t0
+ mov srcq, t1
+ sub t2d, 1<<8
+ jg .hv_w128_loop0
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
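+; note: each constant packs (8tap filter set << 16) | 4tap filter set, the
+; *15 converting a set index into a row offset into subpel_filters[]; the
+; low half is the 4-tap variant selected for small block dimensions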
+
+%macro FN 4 ; fn, type, type_h, type_v
+cglobal %1_%2
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1 %+ SUFFIX)
+%endif
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 4, 5
+%else
+DECLARE_REG_TMP 7, 8
+%endif
+
+%define PUT_8TAP_FN FN put_8tap,
+
+PUT_8TAP_FN sharp, SHARP, SHARP
+PUT_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal put_8tap, 4, 9, 0, dst, ds, src, ss, w, h, mx, my, ss3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r8, [put_avx2]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r8+wq*2+table_offset(put,)]
+ add wq, r8
+ lea r6, [ssq*3]
+ lea r7, [dsq*3]
+%if WIN64
+ pop r8
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m5, [pw_34] ; 2 + (8 << 2)
+ WIN64_SPILL_XMM 11
+ cmp wd, 4
+ jl .h_w2
+ vbroadcasti128 m6, [subpel_h_shufA]
+ je .h_w4
+ tzcnt wd, wd
+ vbroadcasti128 m7, [subpel_h_shufB]
+ vbroadcasti128 m8, [subpel_h_shufC]
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r8+wq*2+table_offset(put, _8tap_h)]
+ vpbroadcastd m9, [r8+mxq*8+subpel_filters-put_avx2+0]
+ vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+4]
+ add wq, r8
+ jmp wq
+.h_w2:
+ movzx mxd, mxb
+ dec srcq
+ mova xm4, [subpel_h_shuf4]
+ vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+.h_w2_loop:
+ movq xm0, [srcq+ssq*0]
+ movhps xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm4
+ pmaddubsw xm0, xm3
+ phaddw xm0, xm0
+ paddw xm0, xm5
+ psraw xm0, 6
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w2_loop
+ RET
+.h_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
+.h_w4_loop:
+ movq xm0, [srcq+ssq*0]
+ movq xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm0, xm6
+ pshufb xm1, xm6
+ pmaddubsw xm0, xm3
+ pmaddubsw xm1, xm3
+ phaddw xm0, xm1
+ paddw xm0, xm5
+ psraw xm0, 6
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w4_loop
+ RET
+.h_w8:
+%macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
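+ ; note: filters a register's worth of source pixels with the 8-tap
+ ; horizontal kernel: subpel_h_shufA/B/C (m6-m8) build the overlapping
+ ; source pairs, pmaddubsw applies the filter halves in m9/m10, and the sum
+ ; is rounded with +34 (pw_34) and shifted right by 6; callers pack to bytes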
+ pshufb m%2, m%1, m7
+ pshufb m%3, m%1, m8
+ pshufb m%1, m6
+ pmaddubsw m%4, m%2, m9
+ pmaddubsw m%2, m10
+ pmaddubsw m%3, m10
+ pmaddubsw m%1, m9
+ paddw m%3, m%4
+ paddw m%1, m%2
+ phaddw m%1, m%3
+ paddw m%1, m5
+ psraw m%1, 6
+%endmacro
+ movu xm0, [srcq+ssq*0]
+ vinserti128 m0, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 0, 1, 2, 3
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+ssq*0+8*0]
+ vinserti128 m0, [srcq+ssq*1+8*0], 1
+ movu xm1, [srcq+ssq*0+8*1]
+ vinserti128 m1, [srcq+ssq*1+8*1], 1
+ PUT_8TAP_H 0, 2, 3, 4
+ lea srcq, [srcq+ssq*2]
+ PUT_8TAP_H 1, 2, 3, 4
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ xor r6d, r6d
+ jmp .h_start
+.h_w64:
+ mov r6, -32*1
+ jmp .h_start
+.h_w128:
+ mov r6, -32*3
+.h_start:
+ sub srcq, r6
+ sub dstq, r6
+ mov r4, r6
+.h_loop:
+ movu m0, [srcq+r6+8*0]
+ movu m1, [srcq+r6+8*1]
+ PUT_8TAP_H 0, 2, 3, 4
+ PUT_8TAP_H 1, 2, 3, 4
+ packuswb m0, m1
+ mova [dstq+r6], m0
+ add r6, 32
+ jle .h_loop
+ add srcq, ssq
+ add dstq, dsq
+ mov r6, r4
+ dec hd
+ jg .h_loop
+ RET
+.v:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ tzcnt r6d, wd
+ movzx r6d, word [r8+r6*2+table_offset(put, _8tap_v)]
+ vpbroadcastd m7, [pw_512]
+ lea myq, [r8+myq*8+subpel_filters-put_avx2]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
+ add r6, r8
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ jmp r6
+.v_w2:
+ movd xm2, [srcq+ssq*0]
+ pinsrw xm2, [srcq+ssq*1], 2
+ pinsrw xm2, [srcq+ssq*2], 4
+ pinsrw xm2, [srcq+ss3q ], 6 ; 0 1 2 3
+ lea srcq, [srcq+ssq*4]
+ movd xm3, [srcq+ssq*0]
+ vpbroadcastd xm1, [srcq+ssq*1]
+ vpbroadcastd xm0, [srcq+ssq*2]
+ add srcq, ss3q
+ vpblendd xm3, xm1, 0x02 ; 4 5
+ vpblendd xm1, xm0, 0x02 ; 5 6
+ palignr xm4, xm3, xm2, 4 ; 1 2 3 4
+ punpcklbw xm3, xm1 ; 45 56
+ punpcklbw xm1, xm2, xm4 ; 01 12
+ punpckhbw xm2, xm4 ; 23 34
+.v_w2_loop:
+ pmaddubsw xm5, xm1, xm8 ; a0 b0
+ mova xm1, xm2
+ pmaddubsw xm2, xm9 ; a1 b1
+ paddw xm5, xm2
+ mova xm2, xm3
+ pmaddubsw xm3, xm10 ; a2 b2
+ paddw xm5, xm3
+ vpbroadcastd xm4, [srcq+ssq*0]
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm4, xm0, 0x02 ; 7 8
+ punpcklbw xm3, xm4 ; 67 78
+ pmaddubsw xm4, xm3, xm11 ; a3 b3
+ paddw xm5, xm4
+ pmulhrsw xm5, xm7
+ packuswb xm5, xm5
+ pextrw [dstq+dsq*0], xm5, 0
+ pextrw [dstq+dsq*1], xm5, 2
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w2_loop
+ RET
+.v_w4:
+ movd xm2, [srcq+ssq*0]
+ pinsrd xm2, [srcq+ssq*1], 1
+ pinsrd xm2, [srcq+ssq*2], 2
+ pinsrd xm2, [srcq+ss3q ], 3 ; 0 1 2 3
+ lea srcq, [srcq+ssq*4]
+ movd xm3, [srcq+ssq*0]
+ vpbroadcastd xm1, [srcq+ssq*1]
+ vpbroadcastd xm0, [srcq+ssq*2]
+ add srcq, ss3q
+ vpblendd xm3, xm1, 0x02 ; 4 5
+ vpblendd xm1, xm0, 0x02 ; 5 6
+ palignr xm4, xm3, xm2, 4 ; 1 2 3 4
+ punpcklbw xm3, xm1 ; 45 56
+ punpcklbw xm1, xm2, xm4 ; 01 12
+ punpckhbw xm2, xm4 ; 23 34
+.v_w4_loop:
+ pmaddubsw xm5, xm1, xm8 ; a0 b0
+ mova xm1, xm2
+ pmaddubsw xm2, xm9 ; a1 b1
+ paddw xm5, xm2
+ mova xm2, xm3
+ pmaddubsw xm3, xm10 ; a2 b2
+ paddw xm5, xm3
+ vpbroadcastd xm4, [srcq+ssq*0]
+ vpblendd xm3, xm0, xm4, 0x02 ; 6 7
+ vpbroadcastd xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd xm4, xm0, 0x02 ; 7 8
+ punpcklbw xm3, xm4 ; 67 78
+ pmaddubsw xm4, xm3, xm11 ; a3 b3
+ paddw xm5, xm4
+ pmulhrsw xm5, xm7
+ packuswb xm5, xm5
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm1, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ vpbroadcastq m2, [srcq+ssq*2]
+ vpbroadcastq m5, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ vpbroadcastq m3, [srcq+ssq*0]
+ vpbroadcastq m6, [srcq+ssq*1]
+ vpbroadcastq m0, [srcq+ssq*2]
+ add srcq, ss3q
+ vpblendd m1, m4, 0x30
+ vpblendd m4, m2, 0x30
+ punpcklbw m1, m4 ; 01 12
+ vpblendd m2, m5, 0x30
+ vpblendd m5, m3, 0x30
+ punpcklbw m2, m5 ; 23 34
+ vpblendd m3, m6, 0x30
+ vpblendd m6, m0, 0x30
+ punpcklbw m3, m6 ; 45 56
+.v_w8_loop:
+ pmaddubsw m5, m1, m8 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, m9 ; a1 b1
+ paddw m5, m2
+ mova m2, m3
+ pmaddubsw m3, m10 ; a2 b2
+ paddw m5, m3
+ vpbroadcastq m4, [srcq+ssq*0]
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd m4, m0, 0x30
+ punpcklbw m3, m4 ; 67 78
+ pmaddubsw m4, m3, m11 ; a3 b3
+ paddw m5, m4
+ pmulhrsw m5, m7
+ vextracti128 xm4, m5, 1
+ packuswb xm5, xm4
+ movq [dstq+dsq*0], xm5
+ movhps [dstq+dsq*1], xm5
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w8_loop
+ RET
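+; .v_w16+: processed in strips of 16 columns. r6d packs the number of
+; remaining strips (bits 8 and up) with the row count in the low byte so a
+; single register survives the inner loop; r4/r7 hold the per-strip dst/src.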
+.v_w16:
+.v_w32:
+.v_w64:
+.v_w128:
+ lea r6d, [wq-16]
+ mov r4, dstq
+ mov r7, srcq
+ shl r6d, 4
+ mov r6b, hb
+.v_w16_loop0:
+ vbroadcasti128 m4, [srcq+ssq*0]
+ vbroadcasti128 m5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti128 m0, [srcq+ssq*1]
+ vbroadcasti128 m6, [srcq+ssq*0]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti128 m1, [srcq+ssq*0]
+ vbroadcasti128 m2, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti128 m3, [srcq+ssq*0]
+ shufpd m4, m4, m0, 0x0c
+ shufpd m5, m5, m1, 0x0c
+ punpcklbw m1, m4, m5 ; 01
+ punpckhbw m4, m5 ; 34
+ shufpd m6, m6, m2, 0x0c
+ punpcklbw m2, m5, m6 ; 12
+ punpckhbw m5, m6 ; 45
+ shufpd m0, m0, m3, 0x0c
+ punpcklbw m3, m6, m0 ; 23
+ punpckhbw m6, m0 ; 56
+.v_w16_loop:
+ vbroadcasti128 m12, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vbroadcasti128 m13, [srcq+ssq*0]
+ pmaddubsw m14, m1, m8 ; a0
+ pmaddubsw m15, m2, m8 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddubsw m3, m9 ; a1
+ pmaddubsw m4, m9 ; b1
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddubsw m5, m10 ; a2
+ pmaddubsw m6, m10 ; b2
+ paddw m14, m5
+ paddw m15, m6
+ shufpd m6, m0, m12, 0x0d
+ shufpd m0, m12, m13, 0x0c
+ punpcklbw m5, m6, m0 ; 67
+ punpckhbw m6, m0 ; 78
+ pmaddubsw m12, m5, m11 ; a3
+ pmaddubsw m13, m6, m11 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ packuswb m14, m15
+ vpermq m14, m14, q3120
+ mova [dstq+dsq*0], xm14
+ vextracti128 [dstq+dsq*1], m14, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .v_w16_loop
+ movzx hd, r6b
+ add r4, 16
+ add r7, 16
+ mov dstq, r4
+ mov srcq, r7
+ sub r6d, 1<<8
+ jg .v_w16_loop0
+ RET
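+; .hv: w2/w4 use the 4-tap horizontal filter, keeping >>2-rounded words
+; (pw_8192) as the intermediate, then an 8-tap vertical pass in dwords with a
+; final (x+512)>>10 back to 8-bit; w8 and up take the full 8-tap horizontal
+; path at .hv_w8.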
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ cmp wd, 4
+ jg .hv_w8
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m7, [r8+mxq*8+subpel_filters-put_avx2+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ vpbroadcastd m8, [pw_8192]
+ vpbroadcastd m9, [pd_512]
+ pshufd m10, m0, q0000
+ pshufd m11, m0, q1111
+ pshufd m12, m0, q2222
+ pshufd m13, m0, q3333
+ cmp wd, 4
+ je .hv_w4
+ vbroadcasti128 m6, [subpel_h_shuf4]
+ movq xm2, [srcq+ssq*0]
+ movhps xm2, [srcq+ssq*1]
+ movq xm0, [srcq+ssq*2]
+ movhps xm0, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ vpbroadcastq m3, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ vpbroadcastq m1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpblendd m2, m3, 0x30
+ vpblendd m0, m1, 0x30
+ vpblendd m2, m4, 0xc0
+ pshufb m2, m6
+ pshufb m0, m6
+ pmaddubsw m2, m7
+ pmaddubsw m0, m7
+ phaddw m2, m0
+ pmulhrsw m2, m8
+ vextracti128 xm3, m2, 1
+ palignr xm4, xm3, xm2, 4
+ punpcklwd xm1, xm2, xm4 ; 01 12
+ punpckhwd xm2, xm4 ; 23 34
+ pshufd xm0, xm3, q2121
+ punpcklwd xm3, xm0 ; 45 56
+.hv_w2_loop:
+ pmaddwd xm5, xm1, xm10 ; a0 b0
+ mova xm1, xm2
+ pmaddwd xm2, xm11 ; a1 b1
+ paddd xm5, xm2
+ mova xm2, xm3
+ pmaddwd xm3, xm12 ; a2 b2
+ paddd xm5, xm3
+ movq xm4, [srcq+ssq*0]
+ movhps xm4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pshufb xm4, xm6
+ pmaddubsw xm4, xm7
+ phaddw xm4, xm4
+ pmulhrsw xm4, xm8
+ palignr xm3, xm4, xm0, 12
+ mova xm0, xm4
+ punpcklwd xm3, xm0 ; 67 78
+ pmaddwd xm4, xm3, xm13 ; a3 b3
+ paddd xm5, xm9
+ paddd xm5, xm4
+ psrad xm5, 10
+ packssdw xm5, xm5
+ packuswb xm5, xm5
+ pextrw [dstq+dsq*0], xm5, 0
+ pextrw [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w2_loop
+ RET
+.hv_w4:
+ mova m6, [subpel_h_shuf4]
+ vpbroadcastq m2, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ vpbroadcastq m0, [srcq+ssq*2]
+ vpbroadcastq m5, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ vpbroadcastq m3, [srcq+ssq*0]
+ vpblendd m2, m4, 0xcc ; 0 1
+ vpbroadcastq m4, [srcq+ssq*1]
+ vpbroadcastq m1, [srcq+ssq*2]
+ add srcq, ss3q
+ vpblendd m0, m5, 0xcc ; 2 3
+ vpblendd m3, m4, 0xcc ; 4 5
+ pshufb m2, m6
+ pshufb m0, m6
+ pshufb m3, m6
+ pshufb m1, m6
+ pmaddubsw m2, m7
+ pmaddubsw m0, m7
+ pmaddubsw m3, m7
+ pmaddubsw m1, m7
+ phaddw m2, m0
+ phaddw m3, m1
+ pmulhrsw m2, m8
+ pmulhrsw m3, m8
+ palignr m4, m3, m2, 4
+ punpcklwd m1, m2, m4 ; 01 12
+ punpckhwd m2, m4 ; 23 34
+ pshufd m0, m3, q2121
+ punpcklwd m3, m0 ; 45 56
+.hv_w4_loop:
+ pmaddwd m5, m1, m10 ; a0 b0
+ mova m1, m2
+ pmaddwd m2, m11 ; a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, m12 ; a2 b2
+ paddd m5, m3
+ vpbroadcastq m4, [srcq+ssq*0]
+ vpbroadcastq m3, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd m4, m3, 0xcc ; 7 8
+ pshufb m4, m6
+ pmaddubsw m4, m7
+ phaddw m4, m4
+ pmulhrsw m4, m8
+ palignr m3, m4, m0, 12
+ mova m0, m4
+ punpcklwd m3, m0 ; 67 78
+ pmaddwd m4, m3, m13 ; a3 b3
+ paddd m5, m9
+ paddd m5, m4
+ psrad m5, 10
+ vextracti128 xm4, m5, 1
+ packssdw xm5, xm4
+ packuswb xm5, xm5
+ pshuflw xm5, xm5, q3120
+ movd [dstq+dsq*0], xm5
+ pextrd [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0]
+ vpbroadcastd m11, [r8+mxq*8+subpel_filters-put_avx2+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 6
+ cmovs myd, mxd
+ vpbroadcastq m0, [r8+myq*8+subpel_filters-put_avx2]
+ lea ss3q, [ssq*3]
+ sub srcq, ss3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ lea r6d, [wq-8]
+ mov r4, dstq
+ mov r7, srcq
+ shl r6d, 5
+ mov r6b, hb
+.hv_w8_loop0:
+ vbroadcasti128 m7, [subpel_h_shufA]
+ vbroadcasti128 m8, [subpel_h_shufB]
+ vbroadcasti128 m9, [subpel_h_shufC]
+ movu xm4, [srcq+ssq*0]
+ movu xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ movu xm6, [srcq+ssq*0]
+ vbroadcasti128 m0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vpblendd m4, m0, 0xf0 ; 0 3
+ vinserti128 m5, [srcq+ssq*0], 1 ; 1 4
+ vinserti128 m6, [srcq+ssq*1], 1 ; 2 5
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m0, [srcq+ssq*0], 1 ; 3 6
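+; 8-tap horizontal filter for one register of packed rows: the three shuffles
+; present each output pixel's byte window to the coefficient pairs in m10
+; (taps 0-3) and m11 (taps 4-7), and the products are summed into words in %1.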
+%macro HV_H_W8 4-7 ; src/dst, tmp[1-3], shuf[1-3]
+ pshufb %3, %1, %6
+ pshufb %4, %1, %7
+ pshufb %1, %5
+ pmaddubsw %2, %3, m10
+ pmaddubsw %4, m11
+ pmaddubsw %3, m11
+ pmaddubsw %1, m10
+ paddw %2, %4
+ paddw %1, %3
+ phaddw %1, %2
+%endmacro
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m0, m1, m2, m3, m7, m8, m9
+ vpbroadcastd m7, [pw_8192]
+ vpermq m4, m4, q3120
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ pmulhrsw m0, m7
+ pmulhrsw m4, m7
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ vpermq m7, m0, q3120
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vextracti128 r6m, m0, 1 ; not enough registers
+ movu xm0, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ vinserti128 m0, [srcq+ssq*0], 1 ; 7 8
+ pmaddwd m8, m1, m12 ; a0
+ pmaddwd m9, m2, m12 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m13 ; a1
+ pmaddwd m4, m13 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m14 ; a2
+ pmaddwd m6, m14 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ vbroadcasti128 m5, [subpel_h_shufA]
+ HV_H_W8 m0, m5, m6, m7, m5, m6, m7
+ vpbroadcastd m5, [pw_8192]
+ vpbroadcastd m7, [pd_512]
+ vbroadcasti128 m6, r6m
+ pmulhrsw m0, m5
+ paddd m8, m7
+ paddd m9, m7
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, m15 ; a3
+ paddd m8, m7
+ pmaddwd m7, m6, m15 ; b3
+ paddd m7, m9
+ psrad m8, 10
+ psrad m7, 10
+ packssdw m8, m7
+ vextracti128 xm7, m8, 1
+ packuswb xm8, xm7
+ pshufd xm7, xm8, q3120
+ movq [dstq+dsq*0], xm7
+ movhps [dstq+dsq*1], xm7
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .hv_w8_loop
+ movzx hd, r6b
+ add r4, 8
+ add r7, 8
+ mov dstq, r4
+ mov srcq, r7
+ sub r6d, 1<<8
+ jg .hv_w8_loop0
+ RET
+
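+; Horizontal 8-tap for prep: filters the two 16-byte source lanes in m0
+; (8 output pixels per lane) using the shuffle masks in m5-m7 and the
+; coefficient pairs in m8 (taps 0-3) and m9 (taps 4-7); pw_8192 in m4 gives
+; the rounded >>2 16-bit intermediate.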
+%macro PREP_8TAP_H 0
+ pshufb m1, m0, m5
+ pshufb m2, m0, m6
+ pshufb m3, m0, m7
+ pmaddubsw m1, m8
+ pmaddubsw m0, m2, m8
+ pmaddubsw m2, m9
+ pmaddubsw m3, m9
+ paddw m1, m2
+ paddw m0, m3
+ phaddw m0, m1, m0
+ pmulhrsw m0, m4
+%endmacro
+
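+; Vertical 8-tap for 4-pixel-wide prep: rows are broadcast and blended across
+; lanes so one register holds four row pairs, producing four 4-pixel output
+; rows (one 32-byte store) per loop iteration. %1 is the rounding-constant
+; register, %2-%5 the coefficient-pair registers.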
+%macro PREP_8TAP_V_W4 5 ; round, weights
+ movd xm0, [srcq+strideq*0]
+ vpbroadcastd m1, [srcq+strideq*2]
+ vpbroadcastd xm2, [srcq+strideq*1]
+ vpbroadcastd m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m0, 0x01 ; 0 2 2 _ 2 _ _ _
+ vpblendd m3, m2, 0x03 ; 1 1 3 3 3 3 _ _
+ vpbroadcastd m0, [srcq+strideq*0]
+ vpbroadcastd m2, [srcq+strideq*1]
+ vpblendd m1, m0, 0x68 ; 0 2 2 4 2 4 4 _
+ vpbroadcastd m0, [srcq+strideq*2]
+ vbroadcasti128 m5, [deint_shuf4]
+ vpblendd m3, m2, 0xc0 ; 1 1 3 3 3 3 5 5
+ vpblendd m2, m3, m1, 0x55 ; 0 1 2 3 2 3 4 5
+ vpblendd m3, m1, 0xaa ; 1 2 3 4 3 4 5 _
+ punpcklbw m1, m2, m3 ; 01 12 23 34
+ vpblendd m3, m0, 0x80 ; 1 2 3 4 3 4 5 6
+ punpckhbw m2, m3 ; 23 34 45 56
+.v_w4_loop:
+ pinsrd xm0, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastd m3, [srcq+strideq*0]
+ vpbroadcastd m4, [srcq+strideq*1]
+ vpblendd m3, m4, 0x20 ; _ _ 8 _ 8 9 _ _
+ vpblendd m3, m0, 0x03 ; 6 7 8 _ 8 9 _ _
+ vpbroadcastd m0, [srcq+strideq*2]
+ vpblendd m3, m0, 0x40 ; 6 7 8 _ 8 9 a _
+ pshufb m3, m5 ; 67 78 89 9a
+ pmaddubsw m4, m1, m%2
+ vperm2i128 m1, m2, m3, 0x21 ; 45 56 67 78
+ pmaddubsw m2, m%3
+ paddw m4, m2
+ mova m2, m3
+ pmaddubsw m3, m%5
+ paddw m3, m4
+ pmaddubsw m4, m1, m%4
+ paddw m3, m4
+ pmulhrsw m3, m%1
+ mova [tmpq], m3
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular, REGULAR, REGULAR
+
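+; mx/my are replicated across byte lanes and combined with the filter-type
+; words in t0d/t1d: the low byte then indexes the 4-tap filters, bits 16 and
+; up index the 8-tap filters, and bits 8-11 hold the raw subpel position used
+; below to decide between the h/v/hv paths.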
+cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ lea r7, [prep%+SUFFIX]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r7+wq*2+table_offset(prep,)]
+ add wq, r7
+ lea r6, [strideq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m4, [pw_8192]
+ vbroadcasti128 m5, [subpel_h_shufA]
+ WIN64_SPILL_XMM 10
+ cmp wd, 4
+ je .h_w4
+ tzcnt wd, wd
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
+ vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
+ add wq, r7
+ jmp wq
+.h_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
+ lea stride3q, [strideq*3]
+.h_w4_loop:
+ movq xm0, [srcq+strideq*0]
+ vpbroadcastq m2, [srcq+strideq*2]
+ movq xm1, [srcq+strideq*1]
+ vpblendd m0, m2, 0xf0
+ vpbroadcastq m2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd m1, m2, 0xf0
+ pshufb m0, m5
+ pshufb m1, m5
+ pmaddubsw m0, m6
+ pmaddubsw m1, m6
+ phaddw m0, m1
+ pmulhrsw m0, m4
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ movu xm0, [srcq+strideq*0]
+ vinserti128 m0, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ mova [tmpq], m0
+ add tmpq, 32
+ sub hd, 2
+ jg .h_w8
+ RET
+.h_w16:
+ movu xm0, [srcq+strideq*0+8*0]
+ vinserti128 m0, [srcq+strideq*0+8*1], 1
+ PREP_8TAP_H
+ mova [tmpq+32*0], m0
+ movu xm0, [srcq+strideq*1+8*0]
+ vinserti128 m0, [srcq+strideq*1+8*1], 1
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ mova [tmpq+32*1], m0
+ add tmpq, 32*2
+ sub hd, 2
+ jg .h_w16
+ RET
+.h_w32:
+ xor r6d, r6d
+ jmp .h_start
+.h_w64:
+ mov r6, -32*1
+ jmp .h_start
+.h_w128:
+ mov r6, -32*3
+.h_start:
+ sub srcq, r6
+ mov r5, r6
+.h_loop:
+ movu xm0, [srcq+r6+8*0]
+ vinserti128 m0, [srcq+r6+8*1], 1
+ PREP_8TAP_H
+ mova [tmpq+32*0], m0
+ movu xm0, [srcq+r6+8*2]
+ vinserti128 m0, [srcq+r6+8*3], 1
+ PREP_8TAP_H
+ mova [tmpq+32*1], m0
+ add tmpq, 32*2
+ add r6, 32
+ jle .h_loop
+ add srcq, strideq
+ mov r6, r5
+ dec hd
+ jg .h_loop
+ RET
+.v:
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
+ shr myd, 16 ; Note that the code is 8-tap only, having
+ cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4
+ cmove myd, mxd ; had a negligible effect on performance.
+ ; TODO: Would a 6-tap code path be worth it?
+ lea myq, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ vpbroadcastd m7, [pw_8192]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
+ cmp wd, 8
+ jg .v_w16
+ je .v_w8
+.v_w4:
+ PREP_8TAP_V_W4 7, 8, 9, 10, 11
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ movq xm1, [srcq+strideq*0]
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpbroadcastq m2, [srcq+strideq*2]
+ vpbroadcastq m5, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m6, [srcq+strideq*1]
+ vpbroadcastq m0, [srcq+strideq*2]
+ vpblendd m1, m4, 0x30
+ vpblendd m4, m2, 0x30
+ punpcklbw m1, m4 ; 01 12
+ vpblendd m2, m5, 0x30
+ vpblendd m5, m3, 0x30
+ punpcklbw m2, m5 ; 23 34
+ vpblendd m3, m6, 0x30
+ vpblendd m6, m0, 0x30
+ punpcklbw m3, m6 ; 45 56
+.v_w8_loop:
+ vpbroadcastq m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmaddubsw m5, m2, m9 ; a1
+ pmaddubsw m6, m2, m8 ; b0
+ vpblendd m2, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+strideq*0]
+ vpblendd m4, m0, 0x30
+ punpcklbw m2, m4 ; 67 78
+ pmaddubsw m1, m8 ; a0
+ pmaddubsw m4, m3, m9 ; b1
+ paddw m5, m1
+ mova m1, m3
+ pmaddubsw m3, m10 ; a2
+ paddw m6, m4
+ paddw m5, m3
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpblendd m3, m0, m4, 0x30
+ vpbroadcastq m0, [srcq+strideq*2]
+ vpblendd m4, m0, 0x30
+ punpcklbw m3, m4 ; 89 9a
+ pmaddubsw m4, m2, m11 ; a3
+ paddw m5, m4
+ pmaddubsw m4, m2, m10 ; b2
+ paddw m6, m4
+ pmaddubsw m4, m3, m11 ; b3
+ paddw m6, m4
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ mova [tmpq+32*0], m5
+ mova [tmpq+32*1], m6
+ add tmpq, 32*2
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ lea r6d, [wq-16]
+ mov r5, tmpq
+ mov r7, srcq
+ shl r6d, 4
+ mov r6b, hb
+.v_w16_loop0:
+ vbroadcasti128 m4, [srcq+strideq*0]
+ vbroadcasti128 m5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m0, [srcq+strideq*1]
+ vbroadcasti128 m6, [srcq+strideq*0]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m1, [srcq+strideq*0]
+ vbroadcasti128 m2, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m3, [srcq+strideq*0]
+ shufpd m4, m4, m0, 0x0c
+ shufpd m5, m5, m1, 0x0c
+ punpcklbw m1, m4, m5 ; 01
+ punpckhbw m4, m5 ; 34
+ shufpd m6, m6, m2, 0x0c
+ punpcklbw m2, m5, m6 ; 12
+ punpckhbw m5, m6 ; 45
+ shufpd m0, m0, m3, 0x0c
+ punpcklbw m3, m6, m0 ; 23
+ punpckhbw m6, m0 ; 56
+.v_w16_loop:
+ vbroadcasti128 m12, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vbroadcasti128 m13, [srcq+strideq*0]
+ pmaddubsw m14, m1, m8 ; a0
+ pmaddubsw m15, m2, m8 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddubsw m3, m9 ; a1
+ pmaddubsw m4, m9 ; b1
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddubsw m5, m10 ; a2
+ pmaddubsw m6, m10 ; b2
+ paddw m14, m5
+ paddw m15, m6
+ shufpd m6, m0, m12, 0x0d
+ shufpd m0, m12, m13, 0x0c
+ punpcklbw m5, m6, m0 ; 67
+ punpckhbw m6, m0 ; 78
+ pmaddubsw m12, m5, m11 ; a3
+ pmaddubsw m13, m6, m11 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ mova [tmpq+wq*0], m14
+ mova [tmpq+wq*2], m15
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_w16_loop
+ movzx hd, r6b
+ add r5, 32
+ add r7, 16
+ mov tmpq, r5
+ mov srcq, r7
+ sub r6d, 1<<8
+ jg .v_w16_loop0
+ RET
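+; .hv: w4 uses the 4-tap horizontal filter (>>2-rounded words) and an 8-tap
+; vertical pass in dwords with (x+32)>>6; w8 and up share HV_H_W8 for the
+; horizontal pass, write two rows per iteration at a stride of w words, and
+; spill one register to [tmpq] between iterations.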
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ %assign stack_size_padded 0
+ WIN64_SPILL_XMM 16
+ cmp wd, 4
+ je .hv_w4
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep%+SUFFIX+0]
+ vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep%+SUFFIX+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ jmp .hv_w8
+.hv_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep%+SUFFIX]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ mova m7, [subpel_h_shuf4]
+ pmovzxbd m9, [deint_shuf4]
+ vpbroadcastd m10, [pw_8192]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ vpbroadcastd m11, [pd_32]
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ vpbroadcastq m2, [srcq+strideq*0]
+ vpbroadcastq m4, [srcq+strideq*1]
+ vpbroadcastq m0, [srcq+strideq*2]
+ vpbroadcastq m5, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m6, [srcq+strideq*1]
+ vpbroadcastq m1, [srcq+strideq*2]
+ vpblendd m2, m4, 0xcc ; 0 1
+ vpblendd m0, m5, 0xcc ; 2 3
+ vpblendd m3, m6, 0xcc ; 4 5
+ pshufb m2, m7 ; 00 01 10 11 02 03 12 13
+ pshufb m0, m7 ; 20 21 30 31 22 23 32 33
+ pshufb m3, m7 ; 40 41 50 51 42 43 52 53
+ pshufb m1, m7 ; 60 61 60 61 62 63 62 63
+ pmaddubsw m2, m8
+ pmaddubsw m0, m8
+ pmaddubsw m3, m8
+ pmaddubsw m1, m8
+ phaddw m2, m0 ; 0a 1a 2a 3a 0b 1b 2b 3b
+ phaddw m3, m1 ; 4a 5a 6a __ 4b 5b 6b __
+ pmulhrsw m2, m10
+ pmulhrsw m3, m10
+ palignr m4, m3, m2, 4 ; 1a 2a 3a 4a 1b 2b 3b 4b
+ punpcklwd m1, m2, m4 ; 01 12
+ punpckhwd m2, m4 ; 23 34
+ pshufd m0, m3, q2121
+ punpcklwd m3, m0 ; 45 56
+.hv_w4_loop:
+ pmaddwd m5, m1, m12 ; a0 b0
+ pmaddwd m6, m2, m12 ; c0 d0
+ pmaddwd m2, m13 ; a1 b1
+ pmaddwd m4, m3, m13 ; c1 d1
+ mova m1, m3
+ pmaddwd m3, m14 ; a2 b2
+ paddd m5, m2
+ vpbroadcastq m2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ paddd m6, m4
+ paddd m5, m3
+ vpbroadcastq m4, [srcq+strideq*0]
+ vpbroadcastq m3, [srcq+strideq*1]
+ vpblendd m2, m4, 0xcc
+ vpbroadcastq m4, [srcq+strideq*2]
+ vpblendd m3, m4, 0xcc
+ pshufb m2, m7
+ pshufb m3, m7
+ pmaddubsw m2, m8
+ pmaddubsw m3, m8
+ phaddw m2, m3
+ pmulhrsw m2, m10
+ palignr m3, m2, m0, 12
+ mova m0, m2
+ punpcklwd m2, m3, m0 ; 67 78
+ punpckhwd m3, m0 ; 89 9a
+ pmaddwd m4, m2, m14 ; c2 d2
+ paddd m6, m11
+ paddd m5, m11
+ paddd m6, m4
+ pmaddwd m4, m2, m15 ; a3 b3
+ paddd m5, m4
+ pmaddwd m4, m3, m15 ; c3 d3
+ paddd m6, m4
+ psrad m5, 6
+ psrad m6, 6
+ packssdw m5, m6
+ vpermd m5, m9, m5
+ mova [tmpq], m5
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ lea r6d, [wq-8]
+ mov r5, tmpq
+ mov r7, srcq
+ shl r6d, 5
+ mov r6b, hb
+.hv_w8_loop0:
+ vbroadcasti128 m7, [subpel_h_shufA]
+ vbroadcasti128 m8, [subpel_h_shufB]
+ vbroadcasti128 m9, [subpel_h_shufC]
+ movu xm4, [srcq+strideq*0]
+ movu xm5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu xm6, [srcq+strideq*0]
+ vbroadcasti128 m0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpblendd m4, m0, 0xf0 ; 0 3
+ vinserti128 m5, [srcq+strideq*0], 1 ; 1 4
+ vinserti128 m6, [srcq+strideq*1], 1 ; 2 5
+ lea srcq, [srcq+strideq*2]
+ vinserti128 m0, [srcq+strideq*0], 1 ; 3 6
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9
+ HV_H_W8 m0, m1, m2, m3, m7, m8, m9
+ vpbroadcastd m7, [pw_8192]
+ vpermq m4, m4, q3120
+ vpermq m5, m5, q3120
+ vpermq m6, m6, q3120
+ pmulhrsw m0, m7
+ pmulhrsw m4, m7
+ pmulhrsw m5, m7
+ pmulhrsw m6, m7
+ vpermq m7, m0, q3120
+ punpcklwd m1, m4, m5 ; 01
+ punpckhwd m4, m5 ; 34
+ punpcklwd m2, m5, m6 ; 12
+ punpckhwd m5, m6 ; 45
+ punpcklwd m3, m6, m7 ; 23
+ punpckhwd m6, m7 ; 56
+.hv_w8_loop:
+ vextracti128 [tmpq], m0, 1 ; not enough registers
+ movu xm0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti128 m0, [srcq+strideq*0], 1 ; 7 8
+ pmaddwd m8, m1, m12 ; a0
+ pmaddwd m9, m2, m12 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddwd m3, m13 ; a1
+ pmaddwd m4, m13 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddwd m5, m14 ; a2
+ pmaddwd m6, m14 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ vbroadcasti128 m5, [subpel_h_shufA]
+ HV_H_W8 m0, m5, m6, m7, m5, m6, m7
+ vpbroadcastd m5, [pw_8192]
+ vpbroadcastd m7, [pd_32]
+ vbroadcasti128 m6, [tmpq]
+ pmulhrsw m0, m5
+ paddd m8, m7
+ paddd m9, m7
+ vpermq m7, m0, q3120 ; 7 8
+ shufpd m6, m6, m7, 0x04 ; 6 7
+ punpcklwd m5, m6, m7 ; 67
+ punpckhwd m6, m7 ; 78
+ pmaddwd m7, m5, m15 ; a3
+ paddd m8, m7
+ pmaddwd m7, m6, m15 ; b3
+ paddd m7, m9
+ psrad m8, 6
+ psrad m7, 6
+ packssdw m8, m7
+ vpermq m7, m8, q3120
+ mova [tmpq+wq*0], xm7
+ vextracti128 [tmpq+wq*2], m7, 1
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .hv_w8_loop
+ movzx hd, r6b
+ add r5, 16
+ add r7, 8
+ mov tmpq, r5
+ mov srcq, r7
+ sub r6d, 1<<8
+ jg .hv_w8_loop0
+ RET
+
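+; Helpers for the scaled MC functions below. prep_8tap_scaled has one fewer
+; leading pointer argument than put_8tap_scaled (tmp vs dst+ds), so REMAP_REG
+; shifts every numbered register down by one while assembling the prep
+; variant, letting both share the same body; MC_8TAP_SCALED_RET restores the
+; default mapping around the actual RET.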
+%macro movifprep 2
+ %if isprep
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro REMAP_REG 2
+ %xdefine r%1 r%2
+ %xdefine r%1q r%2q
+ %xdefine r%1d r%2d
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
+ %if isprep
+ %xdefine r14_save r14
+ %assign %%i 14
+ %rep 14
+ %assign %%j %%i-1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i-1
+ %endrep
+ %endif
+%endmacro
+
+%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
+ %if isprep
+ %assign %%i 1
+ %rep 13
+ %assign %%j %%i+1
+ REMAP_REG %%i, %%j
+ %assign %%i %%i+1
+ %endrep
+ %xdefine r14 r14_save
+ %undef r14_save
+ %endif
+%endmacro
+
+%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
+ MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
+ RET
+ %if %1
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %endif
+%endmacro
+
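+; Horizontal pass for the scaled functions: gathers the 8 per-column source
+; windows (byte offsets precomputed in r4/r6/r7/r9/r10/r11/r13/rX) for two
+; consecutive rows, filters them with the per-column coefficients in m15/m10
+; and rounds >>2 via pw_8192 in m12; the call sites note the row layout.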
+%macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
+ movq xm%1, [srcq+ r4]
+ movq xm%2, [srcq+ r6]
+ movhps xm%1, [srcq+ r7]
+ movhps xm%2, [srcq+ r9]
+ vinserti128 m%1, [srcq+r10], 1
+ vinserti128 m%2, [srcq+r11], 1
+ vpbroadcastq m%5, [srcq+r13]
+ vpbroadcastq m%6, [srcq+ rX]
+ add srcq, ssq
+ movq xm%3, [srcq+ r4]
+ movq xm%4, [srcq+ r6]
+ movhps xm%3, [srcq+ r7]
+ movhps xm%4, [srcq+ r9]
+ vinserti128 m%3, [srcq+r10], 1
+ vinserti128 m%4, [srcq+r11], 1
+ vpbroadcastq m%7, [srcq+r13]
+ vpbroadcastq m%8, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m%1, m%5, 0xc0
+ vpblendd m%2, m%6, 0xc0
+ vpblendd m%3, m%7, 0xc0
+ vpblendd m%4, m%8, 0xc0
+ pmaddubsw m%1, m15
+ pmaddubsw m%2, m10
+ pmaddubsw m%3, m15
+ pmaddubsw m%4, m10
+ phaddw m%1, m%2
+ phaddw m%3, m%4
+ phaddw m%1, m%3
+ pmulhrsw m%1, m12
+%endmacro
+
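+; MC_8TAP_SCALED generates put_8tap_scaled (output (x+512)>>10, packed to
+; 8-bit) and prep_8tap_scaled (output (x+32)>>6, kept as words). dx/dy are
+; the per-pixel position steps and mx/my the start positions, all with a
+; 10-bit fraction (1/1024 pel); dy == 1024 and dy == 2048 dispatch to the
+; specialized .dy1/.dy2 paths.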
+%macro MC_8TAP_SCALED 1
+%ifidn %1, put
+ %assign isprep 0
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal put_8tap_scaled, 4, 15, 16, 96, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %else
+cglobal put_8tap_scaled, 4, 14, 16, 112, dst, ds, src, ss, w, h, mx, my, dx, dy
+ %endif
+ %xdefine base_reg r12
+ %define rndshift 10
+%else
+ %assign isprep 1
+ %if required_stack_alignment <= STACK_ALIGNMENT
+cglobal prep_8tap_scaled, 4, 15, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
+ %xdefine tmp_stridem r14q
+ %else
+cglobal prep_8tap_scaled, 4, 14, 16, 112, tmp, src, ss, w, h, mx, my, dx, dy
+ %define tmp_stridem qword [rsp+104]
+ %endif
+ %xdefine base_reg r11
+ %define rndshift 6
+%endif
+ lea base_reg, [%1_8tap_scaled_avx2]
+%define base base_reg-%1_8tap_scaled_avx2
+ tzcnt wd, wm
+ vpbroadcastd m8, dxm
+%if isprep && UNIX64
+ movd xm14, mxd
+ vpbroadcastd m14, xm14
+ mov r5d, t0d
+ DECLARE_REG_TMP 5, 7
+%else
+ vpbroadcastd m14, mxm
+%endif
+ mov dyd, dym
+%ifidn %1, put
+ %if WIN64
+ mov r8d, hm
+ DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r5m
+ %define dxm r8m
+ %else
+ DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm r6m
+ %endif
+ %if required_stack_alignment > STACK_ALIGNMENT
+ %define dsm [rsp+96]
+ %define rX r1
+ %define rXd r1d
+ %else
+ %define dsm dsq
+ %define rX r14
+ %define rXd r14d
+ %endif
+%else ; prep
+ %if WIN64
+ mov r7d, hm
+ DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
+ %define hm r4m
+ %define dxm r7m
+ %else
+ DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
+ %define hm [rsp+96]
+ %endif
+ MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
+ %define rX r14
+ %define rXd r14d
+%endif
+ vpbroadcastd m10, [base+pd_0x3ff]
+ vpbroadcastd m12, [base+pw_8192]
+%ifidn %1, put
+ vpbroadcastd m13, [base+pd_512]
+%else
+ vpbroadcastd m13, [base+pd_32]
+%endif
+ pxor m9, m9
+ lea ss3q, [ssq*3]
+ movzx r7d, t1b
+ shr t1d, 16
+ cmp hd, 6
+ cmovs t1d, r7d
+ sub srcq, ss3q
+ cmp dyd, 1024
+ je .dy1
+ cmp dyd, 2048
+ je .dy2
+ movzx wd, word [base+%1_8tap_scaled_avx2_table+wq*2]
+ add wq, base_reg
+ jmp wq
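+; Generic (arbitrary dx/dy) paths. Each output column n has its own
+; horizontal phase mx+n*dx; its 4-bit subpel index selects the filter, and
+; columns with index 0 get a unit filter (a single 64 tap, substituted via
+; pd_0x4000/pq_0x40000000 and pblendvb) since subpel_filters has no entry
+; for that case. Vertically, the fractional y in myd selects the filter per
+; output row (64 << 24 as the unit-filter fallback), and the bits above the
+; 10-bit fraction after adding dy give how many source rows (one or two) to
+; shift into the window.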
+%ifidn %1, put
+.w2:
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m14, m8 ; mx+dx*[0-1]
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movq xm0, [srcq+ssq*0]
+ movq xm1, [srcq+ssq*2]
+ movhps xm0, [srcq+ssq*1]
+ movhps xm1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m0, [srcq+ssq*0], 1
+ vinserti128 m1, [srcq+ssq*2], 1
+ vpbroadcastq m2, [srcq+ssq*1]
+ vpbroadcastq m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ vpblendd m15, m7, 0xaa
+ vpblendd m0, m2, 0xc0 ; 0 1 4 5
+ vpblendd m1, m3, 0xc0 ; 2 3 6 7
+ pblendvb m15, m11, m8
+ pshufb m0, m14
+ pshufb m1, m14
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ phaddw m0, m1
+ pmulhrsw m0, m12 ; 0 1 2 3 4 5 6 7
+ vextracti128 xm1, m0, 1 ; 4 5 6 7
+ palignr xm2, xm1, xm0, 4 ; 1 2 3 4
+ punpcklwd xm3, xm0, xm2 ; 01 12
+ punpckhwd xm0, xm2 ; 23 34
+ pshufd xm4, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm4 ; 45 56
+ punpckhwd xm4, xm1, xm4 ; 67 __
+.w2_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm11, r6q
+ punpcklbw xm11, xm11
+ psraw xm11, 8
+ pshufd xm8, xm11, q0000
+ pshufd xm9, xm11, q1111
+ pshufd xm10, xm11, q2222
+ pshufd xm11, xm11, q3333
+ pmaddwd xm5, xm3, xm8
+ pmaddwd xm6, xm0, xm9
+ pmaddwd xm7, xm2, xm10
+ pmaddwd xm8, xm4, xm11
+ paddd xm5, xm6
+ paddd xm7, xm8
+ paddd xm5, xm13
+ paddd xm5, xm7
+ psrad xm5, 10
+ packssdw xm5, xm5
+ packuswb xm5, xm5
+ pextrw [dstq], xm5, 0
+ add dstq, dsq
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w2_loop
+ movq xm5, [srcq]
+ test myd, 0x400
+ jz .w2_skip_line
+ add srcq, ssq
+ shufps xm3, xm0, q1032 ; 01 12
+ shufps xm0, xm2, q1032 ; 23 34
+ shufps xm2, xm4, q1032 ; 45 56
+ pshufb xm5, xm14
+ pmaddubsw xm5, xm15
+ phaddw xm5, xm5
+ pmulhrsw xm5, xm12
+ palignr xm1, xm5, xm1, 12
+ punpcklqdq xm1, xm1 ; 6 7 6 7
+ punpcklwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+.w2_skip_line:
+ movhps xm5, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ mova xm3, xm0 ; 01 12
+ mova xm0, xm2 ; 23 34
+ pshufb xm5, xm14
+ pmaddubsw xm5, xm15
+ phaddw xm5, xm5
+ pmulhrsw xm5, xm12 ; 6 7 6 7
+ palignr xm1, xm5, xm1, 8 ; 4 5 6 7
+ pshufd xm5, xm1, q0321 ; 5 6 7 _
+ punpcklwd xm2, xm1, xm5 ; 45 56
+ punpckhwd xm4, xm1, xm5 ; 67 __
+ jmp .w2_loop
+%endif
+.w4:
+ mov myd, mym
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m0, m14, m10
+ psrld m0, 6
+ paddd xm15, xm0
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ movd xm15, [base+subpel_filters+r4*8+2]
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vpbroadcastq m6, [base+subpel_s_shuf2]
+ pinsrd xm15, [base+subpel_filters+r6*8+2], 1
+ pcmpeqd m0, m9
+ psrld m14, 10
+ movu xm7, [srcq+ssq*0]
+ movu xm9, [srcq+ssq*1]
+ pinsrd xm15, [base+subpel_filters+r11*8+2], 2
+ movu xm8, [srcq+ssq*2]
+ movu xm10, [srcq+ss3q ]
+ pinsrd xm15, [base+subpel_filters+r13*8+2], 3
+ lea srcq, [srcq+ssq*4]
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m7, [srcq+ssq*0], 1
+ vinserti128 m9, [srcq+ssq*1], 1
+ vinserti128 m15, xm15, 1
+ vinserti128 m8, [srcq+ssq*2], 1
+ vinserti128 m10, [srcq+ss3q ], 1
+ lea srcq, [srcq+ssq*4]
+ pblendvb m15, m11, m0
+ pshufb m7, m14
+ pshufb m9, m14
+ pshufb m8, m14
+ pshufb m10, m14
+ pmaddubsw m7, m15
+ pmaddubsw m9, m15
+ pmaddubsw m8, m15
+ pmaddubsw m10, m15
+ phaddw m7, m9
+ phaddw m8, m10
+ pmulhrsw m7, m12 ; 0 1 4 5
+ pmulhrsw m8, m12 ; 2 3 6 7
+ vextracti128 xm9, m7, 1 ; 4 5
+ vextracti128 xm3, m8, 1 ; 6 7
+ shufps xm4, xm7, xm8, q1032 ; 1 2
+ shufps xm5, xm8, xm9, q1032 ; 3 4
+ shufps xm6, xm9, xm3, q1032 ; 5 6
+ psrldq xm11, xm3, 8 ; 7 _
+ punpcklwd xm0, xm7, xm4 ; 01
+ punpckhwd xm7, xm4 ; 12
+ punpcklwd xm1, xm8, xm5 ; 23
+ punpckhwd xm8, xm5 ; 34
+ punpcklwd xm2, xm9, xm6 ; 45
+ punpckhwd xm9, xm6 ; 56
+ punpcklwd xm3, xm11 ; 67
+ mova [rsp+0x00], xm7
+ mova [rsp+0x10], xm8
+ mova [rsp+0x20], xm9
+.w4_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm10, r6q
+ punpcklbw xm10, xm10
+ psraw xm10, 8
+ pshufd xm7, xm10, q0000
+ pshufd xm8, xm10, q1111
+ pshufd xm9, xm10, q2222
+ pshufd xm10, xm10, q3333
+ pmaddwd xm4, xm0, xm7
+ pmaddwd xm5, xm1, xm8
+ pmaddwd xm6, xm2, xm9
+ pmaddwd xm7, xm3, xm10
+ paddd xm4, xm5
+ paddd xm6, xm7
+ paddd xm4, xm13
+ paddd xm4, xm6
+ psrad xm4, rndshift
+ packssdw xm4, xm4
+%ifidn %1, put
+ packuswb xm4, xm4
+ movd [dstq], xm4
+ add dstq, dsq
+%else
+ movq [tmpq], xm4
+ add tmpq, 8
+%endif
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w4_loop
+ movu xm4, [srcq]
+ test myd, 0x400
+ jz .w4_skip_line
+ mova xm0, [rsp+0x00]
+ mova [rsp+0x00], xm1
+ mova xm1, [rsp+0x10]
+ mova [rsp+0x10], xm2
+ mova xm2, [rsp+0x20]
+ mova [rsp+0x20], xm3
+ pshufb xm4, xm14
+ pmaddubsw xm4, xm15
+ phaddw xm4, xm4
+ pmulhrsw xm4, xm12
+ punpcklwd xm3, xm11, xm4
+ mova xm11, xm4
+ add srcq, ssq
+ jmp .w4_loop
+.w4_skip_line:
+ movu xm5, [srcq+ssq*1]
+ movu m6, [rsp+0x10]
+ pshufb xm4, xm14
+ pshufb xm5, xm14
+ pmaddubsw xm4, xm15
+ pmaddubsw xm5, xm15
+ movu [rsp+0x00], m6
+ phaddw xm4, xm5
+ pmulhrsw xm4, xm12
+ punpcklwd xm9, xm11, xm4
+ mova [rsp+0x20], xm9
+ psrldq xm11, xm4, 8
+ mova xm0, xm1
+ mova xm1, xm2
+ mova xm2, xm3
+ punpcklwd xm3, xm4, xm11
+ lea srcq, [srcq+ssq*2]
+ jmp .w4_loop
+.w8:
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ movd xm15, t0d
+ pmaddwd m8, [base+rescale_mul]
+ vpbroadcastq m11, [base+pq_0x40000000]
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movq xm15, [base+subpel_filters+r4*8]
+ movq xm10, [base+subpel_filters+r6*8]
+ movhps xm15, [base+subpel_filters+r7*8]
+ movhps xm10, [base+subpel_filters+r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+rX*8]
+ psrld m14, 10
+ mova [rsp], xm14
+ vextracti128 xm7, m14, 1
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ mov myd, mym
+ mov dyd, dym
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ vbroadcasti128 m14, [base+wswap]
+.w8_loop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm11, r6q
+ punpcklbw xm11, xm11
+ psraw xm11, 8
+ vinserti128 m11, xm11, 1
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pshufd m8, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m6, m2, m8
+ pmaddwd m7, m3, m11
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ dec hd
+ jz .ret
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .w8_loop
+ test myd, 0x400
+ mov [rsp+16], myd
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ jz .w8_skip_line
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ add srcq, ssq
+ mov myd, [rsp+16]
+ mov dyd, dym
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, m10
+ phaddw m4, m5
+ pslld m5, m4, 16
+ paddw m4, m5
+ pmulhrsw m4, m12
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .w8_loop
+.w8_skip_line:
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ vpbroadcastq m7, [srcq+r13]
+ vpbroadcastq m8, [srcq+ rX]
+ movq xm3, [srcq+ r4]
+ movq xm4, [srcq+ r6]
+ movhps xm3, [srcq+ r7]
+ movhps xm4, [srcq+ r9]
+ vinserti128 m3, [srcq+r10], 1
+ vinserti128 m4, [srcq+r11], 1
+ add srcq, ssq
+ movq xm5, [srcq+ r4]
+ movq xm6, [srcq+ r6]
+ movhps xm5, [srcq+ r7]
+ movhps xm6, [srcq+ r9]
+ vinserti128 m5, [srcq+r10], 1
+ vinserti128 m6, [srcq+r11], 1
+ vpbroadcastq m9, [srcq+r13]
+ vpbroadcastq m11, [srcq+ rX]
+ add srcq, ssq
+ mov myd, [rsp+16]
+ mov dyd, dym
+ vpblendd m3, m7, 0xc0
+ vpblendd m4, m8, 0xc0
+ vpblendd m5, m9, 0xc0
+ vpblendd m6, m11, 0xc0
+ pmaddubsw m3, m15
+ pmaddubsw m4, m10
+ pmaddubsw m5, m15
+ pmaddubsw m6, m10
+ phaddw m3, m4
+ phaddw m5, m6
+ psrld m4, m3, 16
+ pslld m6, m5, 16
+ paddw m3, m4
+ paddw m5, m6
+ pblendw m3, m5, 0xaa
+ pmulhrsw m3, m12
+ jmp .w8_loop
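+; w16 and up are processed in strips of 8 columns: dword [rsp+48] counts the
+; strips, the dst/tmp pointer saved at [rsp+64] steps by 8 pixels per strip,
+; and dx is pre-scaled by 8 so each strip's column phases are the previous
+; strip's plus 8*dx (for prep, tmp_stridem is the intermediate row stride).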
+.w16:
+ mov dword [rsp+48], 2
+ movifprep tmp_stridem, 32
+ jmp .w_start
+.w32:
+ mov dword [rsp+48], 4
+ movifprep tmp_stridem, 64
+ jmp .w_start
+.w64:
+ mov dword [rsp+48], 8
+ movifprep tmp_stridem, 128
+ jmp .w_start
+.w128:
+ mov dword [rsp+48], 16
+ movifprep tmp_stridem, 256
+.w_start:
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ pmaddwd m8, [base+rescale_mul]
+ movd xm15, t0d
+ mov [rsp+72], t0d
+ mov [rsp+56], srcq
+ mov [rsp+64], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ jmp .hloop
+.hloop_prep:
+ dec dword [rsp+48]
+ jz .ret
+ add qword [rsp+64], 8*(isprep+1)
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m10, [base+pd_0x3ff]
+ paddd m14, m8, [rsp+16]
+ vpbroadcastd m15, [rsp+72]
+ pxor m9, m9
+ mov srcq, [rsp+56]
+ mov r0q, [rsp+64] ; dstq / tmpq
+.hloop:
+ vpbroadcastq m11, [base+pq_0x40000000]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movu [rsp+16], m14
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ mova [rsp], xm14
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ mov myd, mym
+ mov dyd, dym
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ vbroadcasti128 m14, [base+wswap]
+.vloop:
+ and myd, 0x3ff
+ mov r6d, 64 << 24
+ mov r4d, myd
+ shr r4d, 6
+ lea r4d, [t1+r4]
+ cmovnz r6q, [base+subpel_filters+r4*8]
+ movq xm11, r6q
+ punpcklbw xm11, xm11
+ psraw xm11, 8
+ vinserti128 m11, xm11, 1
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pshufd m8, m11, q2222
+ pshufd m11, m11, q3333
+ pmaddwd m6, m2, m8
+ pmaddwd m7, m3, m11
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .hloop_prep
+ add myd, dyd
+ test myd, ~0x3ff
+ jz .vloop
+ test myd, 0x400
+ mov [rsp+52], myd
+ mov r4d, [rsp+ 0]
+ mov r6d, [rsp+ 8]
+ mov r7d, [rsp+ 4]
+ mov r9d, [rsp+12]
+ jz .skip_line
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ add srcq, ssq
+ mov myd, [rsp+52]
+ mov dyd, dym
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, m10
+ phaddw m4, m5
+ pslld m5, m4, 16
+ paddw m4, m5
+ pmulhrsw m4, m12
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .vloop
+.skip_line:
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ vpbroadcastq m7, [srcq+r13]
+ vpbroadcastq m8, [srcq+ rX]
+ movq xm3, [srcq+ r4]
+ movq xm4, [srcq+ r6]
+ movhps xm3, [srcq+ r7]
+ movhps xm4, [srcq+ r9]
+ vinserti128 m3, [srcq+r10], 1
+ vinserti128 m4, [srcq+r11], 1
+ add srcq, ssq
+ movq xm5, [srcq+ r4]
+ movq xm6, [srcq+ r6]
+ movhps xm5, [srcq+ r7]
+ movhps xm6, [srcq+ r9]
+ vinserti128 m5, [srcq+r10], 1
+ vinserti128 m6, [srcq+r11], 1
+ vpbroadcastq m9, [srcq+r13]
+ vpbroadcastq m11, [srcq+ rX]
+ add srcq, ssq
+ mov myd, [rsp+52]
+ mov dyd, dym
+ vpblendd m3, m7, 0xc0
+ vpblendd m4, m8, 0xc0
+ vpblendd m5, m9, 0xc0
+ vpblendd m6, m11, 0xc0
+ pmaddubsw m3, m15
+ pmaddubsw m4, m10
+ pmaddubsw m5, m15
+ pmaddubsw m6, m10
+ phaddw m3, m4
+ phaddw m5, m6
+ psrld m4, m3, 16
+ pslld m6, m5, 16
+ paddw m3, m4
+ paddw m5, m6
+ pblendw m3, m5, 0xaa
+ pmulhrsw m3, m12
+ jmp .vloop
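+; .dy1: dy == 1024, i.e. exactly one source row per output row. The vertical
+; filter phase is constant, so it is computed once per column strip instead
+; of per row, and each iteration shifts a single new filtered row into the
+; 8-row window.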
+.dy1:
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.dy1_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m14, m8 ; mx+dx*[0-1]
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movq xm0, [srcq+ssq*0]
+ movq xm1, [srcq+ssq*2]
+ movhps xm0, [srcq+ssq*1]
+ movhps xm1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m0, [srcq+ssq*0], 1
+ vinserti128 m1, [srcq+ssq*2], 1
+ vpbroadcastq m2, [srcq+ssq*1]
+ add srcq, ss3q
+ movq xm10, r4q
+ punpcklbw xm10, xm10
+ psraw xm10, 8
+ vpblendd m15, m7, 0xaa
+ pblendvb m15, m11, m8
+ pshufd xm8, xm10, q0000
+ pshufd xm9, xm10, q1111
+ pshufd xm11, xm10, q3333
+ pshufd xm10, xm10, q2222
+ vpblendd m0, m2, 0xc0
+ pshufb m1, m14
+ pshufb m0, m14
+ pmaddubsw m1, m15
+ pmaddubsw m0, m15
+ phaddw m0, m1
+ pmulhrsw m0, m12
+ vextracti128 xm1, m0, 1
+ palignr xm2, xm1, xm0, 4
+ pshufd xm4, xm1, q2121
+ punpcklwd xm3, xm0, xm2 ; 01 12
+ punpckhwd xm0, xm2 ; 23 34
+ punpcklwd xm2, xm1, xm4 ; 45 56
+.dy1_w2_loop:
+ movq xm1, [srcq+ssq*0]
+ movhps xm1, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ pmaddwd xm5, xm3, xm8
+ pmaddwd xm6, xm0, xm9
+ pmaddwd xm7, xm2, xm10
+ mova xm3, xm0
+ mova xm0, xm2
+ paddd xm5, xm13
+ paddd xm6, xm7
+ pshufb xm1, xm14
+ pmaddubsw xm1, xm15
+ phaddw xm1, xm1
+ pmulhrsw xm1, xm12
+ palignr xm7, xm1, xm4, 12
+ punpcklwd xm2, xm7, xm1 ; 67 78
+ pmaddwd xm7, xm2, xm11
+ mova xm4, xm1
+ paddd xm5, xm6
+ paddd xm5, xm7
+ psrad xm5, rndshift
+ packssdw xm5, xm5
+ packuswb xm5, xm5
+ pextrw [dstq+dsq*0], xm5, 0
+ pextrw [dstq+dsq*1], xm5, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy1_w2_loop
+ RET
+%endif
+.dy1_w4:
+ mov myd, mym
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ vpermq m8, m8, q3120
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r11d, xm15, 1
+ pextrd r13d, xm15, 3
+ movd xm15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ movu xm2, [srcq+ssq*0]
+ movu xm3, [srcq+ssq*2]
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vpbroadcastq m6, [base+subpel_s_shuf2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ pinsrd xm15, [base+subpel_filters+r11*8+2], 1
+ vpblendd m7, [base+subpel_filters+r13*8+2-20], 0x20
+ vinserti128 m2, [srcq+ssq*1], 1
+ vinserti128 m3, [srcq+ss3q ], 1
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ pshufb m14, m5
+ paddb m14, m6
+ movu xm4, [srcq+ssq*0]
+ movu xm5, [srcq+ssq*2]
+ vinserti128 m4, [srcq+ssq*1], 1
+ add srcq, ss3q
+ vpblendd m15, m7, 0x30
+ punpcklqdq m15, m15
+ pblendvb m15, m11, m8
+ movq xm10, r4q
+ punpcklbw xm10, xm10
+ psraw xm10, 8
+ vinserti128 m10, xm10, 1
+ pshufb m2, m14
+ pshufb m3, m14
+ pshufb m4, m14
+ pshufb xm5, xm14
+ vpermq m2, m2, q3120
+ vpermq m3, m3, q3120
+ vpermq m4, m4, q3120
+ vpermq m5, m5, q3120
+ pshufd m7, m10, q0000
+ pshufd m8, m10, q1111
+ pshufd m9, m10, q2222
+ pshufd m10, m10, q3333
+ pmaddubsw m2, m15
+ pmaddubsw m3, m15
+ pmaddubsw m4, m15
+ pmaddubsw m5, m15
+ phaddw m2, m3
+ phaddw m4, m5
+ pmulhrsw m2, m12
+ pmulhrsw m4, m12
+ palignr m5, m4, m2, 4
+ pshufd m3, m4, q2121
+ punpcklwd m0, m2, m5 ; 01 12
+ punpckhwd m1, m2, m5 ; 23 34
+ punpcklwd m2, m4, m3 ; 45 56
+.dy1_w4_loop:
+ movu xm11, [srcq+ssq*0]
+ vinserti128 m11, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pmaddwd m4, m0, m7
+ pmaddwd m5, m1, m8
+ pmaddwd m6, m2, m9
+ mova m0, m1
+ mova m1, m2
+ paddd m4, m13
+ paddd m5, m6
+ pshufb m11, m14
+ vpermq m11, m11, q3120
+ pmaddubsw m11, m15
+ phaddw m11, m11
+ pmulhrsw m11, m12
+ palignr m6, m11, m3, 12
+ punpcklwd m2, m6, m11 ; 67 78
+ mova m3, m11
+ pmaddwd m6, m2, m10
+ paddd m4, m5
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ pshuflw xm4, xm4, q3120
+ movd [dstq+dsq*0], xm4
+ pextrd [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+%else
+ pshufd xm4, xm4, q3120
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy1_w4_loop
+ MC_8TAP_SCALED_RET
+.dy1_w8:
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ movd xm15, t0d
+ pmaddwd m8, [base+rescale_mul]
+ vpbroadcastq m11, [base+pq_0x40000000]
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ mov [rsp+32], r7d
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ mov myd, mym
+ movu [rsp], m10
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ shr myd, 6
+ lea myd, [t1+myq]
+ mov t1d, 64 << 24
+ cmovnz t1q, [base+subpel_filters+myq*8]
+ vbroadcasti128 m14, [base+wswap]
+ movq xm11, t1q
+ punpcklbw xm11, xm11
+ psraw xm11, 8
+ vinserti128 m11, xm11, 1
+ mov r7d, [rsp+32]
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+.dy1_w8_loop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m10
+ pmaddwd m7, m3, m11
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ dec hd
+ jz .ret
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ add srcq, ssq
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, [rsp]
+ phaddw m4, m5
+ pslld m5, m4, 16
+ paddw m4, m5
+ pmulhrsw m4, m12
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .dy1_w8_loop
+.dy1_w16:
+ mov dword [rsp+72], 2
+ movifprep tmp_stridem, 32
+ jmp .dy1_w_start
+.dy1_w32:
+ mov dword [rsp+72], 4
+ movifprep tmp_stridem, 64
+ jmp .dy1_w_start
+.dy1_w64:
+ mov dword [rsp+72], 8
+ movifprep tmp_stridem, 128
+ jmp .dy1_w_start
+.dy1_w128:
+ mov dword [rsp+72], 16
+ movifprep tmp_stridem, 256
+.dy1_w_start:
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ pmaddwd m8, [base+rescale_mul]
+ movd xm15, t0d
+ mov [rsp+76], t0d
+ mov [rsp+80], srcq
+ mov [rsp+88], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ jmp .dy1_hloop
+.dy1_hloop_prep:
+ dec dword [rsp+72]
+ jz .ret
+ add qword [rsp+88], 8*(isprep+1)
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m10, [base+pd_0x3ff]
+ paddd m14, m8, [rsp+32]
+ vpbroadcastd m15, [rsp+76]
+ pxor m9, m9
+ mov srcq, [rsp+80]
+ mov r0q, [rsp+88] ; dstq / tmpq
+.dy1_hloop:
+ vpbroadcastq m11, [base+pq_0x40000000]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movu [rsp+32], m14
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ movq [rsp+64], xm14
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ mov myd, mym
+ movu [rsp], m10
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ vbroadcasti128 m14, [base+wswap]
+ movq xm11, r4q
+ punpcklbw xm11, xm11
+ psraw xm11, 8
+ vinserti128 m11, xm11, 1
+ mov r4d, [rsp+64]
+ mov r7d, [rsp+68]
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+.dy1_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m10
+ pmaddwd m7, m3, m11
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy1_hloop_prep
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ add srcq, ssq
+ pshufb m0, m14
+ pshufb m1, m14
+ pshufb m2, m14
+ pshufb m3, m14
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, [rsp]
+ phaddw m4, m5
+ pslld m5, m4, 16
+ paddw m4, m5
+ pmulhrsw m4, m12
+ pblendw m0, m1, 0xaa
+ pblendw m1, m2, 0xaa
+ pblendw m2, m3, 0xaa
+ pblendw m3, m4, 0xaa
+ jmp .dy1_vloop
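+; .dy2: dy == 2048, i.e. two source rows per output row (2:1 vertical
+; downscale), so two new rows are filtered and shifted into the window for
+; every row written.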
+.dy2:
+ movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
+ add wq, base_reg
+ jmp wq
+%ifidn %1, put
+.dy2_w2:
+ mov myd, mym
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ punpckldq m8, m9, m8
+ paddd m14, m8 ; mx+dx*[0-1]
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vbroadcasti128 m6, [base+subpel_s_shuf2]
+ vpbroadcastd m15, [base+subpel_filters+r4*8+2]
+ vpbroadcastd m7, [base+subpel_filters+r6*8+2]
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movq xm0, [srcq+ssq*0]
+ vpbroadcastq m2, [srcq+ssq*1]
+ movhps xm0, [srcq+ssq*2]
+ vpbroadcastq m3, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pshufb m14, m5
+ paddb m14, m6
+ vpblendd m15, m7, 0xaa
+ pblendvb m15, m11, m8
+ movhps xm1, [srcq+ssq*0]
+ vpbroadcastq m4, [srcq+ssq*1]
+ lea srcq, [srcq+ssq*2]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ vpblendd m0, m2, 0x30
+ vpblendd m1, m4, 0xc0
+ vpblendd m0, m3, 0xc0
+ pshufb m0, m14
+ pshufb m1, m14
+ pmaddubsw m0, m15
+ pmaddubsw m1, m15
+ movq xm11, r4q
+ punpcklbw xm11, xm11
+ psraw xm11, 8
+ phaddw m0, m1
+ pmulhrsw m0, m12 ; 0 2 _ 4 1 3 _ 5
+ pshufd xm8, xm11, q0000
+ pshufd xm9, xm11, q1111
+ pshufd xm10, xm11, q2222
+ pshufd xm11, xm11, q3333
+ pshufd m2, m0, q3110 ; 0 2 2 4 1 3 3 5
+ vextracti128 xm1, m2, 1
+ punpcklwd xm3, xm2, xm1 ; 01 23
+ punpckhwd xm2, xm1 ; 23 45
+.dy2_w2_loop:
+ movq xm6, [srcq+ssq*0]
+ vpbroadcastq m7, [srcq+ssq*1]
+ movhps xm6, [srcq+ssq*2]
+ vpbroadcastq m1, [srcq+ss3q ]
+ lea srcq, [srcq+ssq*4]
+ pmaddwd xm4, xm3, xm8
+ pmaddwd xm5, xm2, xm9
+ vpblendd m6, m7, 0x30
+ vpblendd m6, m1, 0xc0
+ pshufb m6, m14
+ pmaddubsw m6, m15
+ phaddw m6, m6
+ pmulhrsw m6, m12
+ palignr m0, m6, m0, 8
+ pshufd m2, m0, q3221
+ vextracti128 xm1, m2, 1
+ punpcklwd xm3, xm2, xm1 ; 45 67
+ punpckhwd xm2, xm1 ; 67 89
+ pmaddwd xm6, xm3, xm10
+ pmaddwd xm7, xm2, xm11
+ paddd xm4, xm5
+ paddd xm4, xm13
+ paddd xm6, xm7
+ paddd xm4, xm6
+ psrad xm4, rndshift
+ packssdw xm4, xm4
+ packuswb xm4, xm4
+ pextrw [dstq+dsq*0], xm4, 0
+ pextrw [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .dy2_w2_loop
+ RET
+%endif
+.dy2_w4:
+ mov myd, mym
+ vbroadcasti128 m7, [base+rescale_mul]
+ movzx t0d, t0b
+ dec srcq
+ movd xm15, t0d
+ pmaddwd m8, m7
+ vpbroadcastd m11, [base+pd_0x4000]
+ vpbroadcastd xm15, xm15
+ paddd m14, m8 ; mx+dx*[0-3]
+ pand m8, m14, m10
+ psrld m8, 6
+ paddd xm15, xm8
+ movd r4d, xm15
+ pextrd r6d, xm15, 1
+ pextrd r11d, xm15, 2
+ pextrd r13d, xm15, 3
+ movd xm15, [base+subpel_filters+r4*8+2]
+ vbroadcasti128 m5, [base+bdct_lb_dw]
+ vpbroadcastq m6, [base+subpel_s_shuf2]
+ pinsrd xm15, [base+subpel_filters+r6*8+2], 1
+ pcmpeqd m8, m9
+ psrld m14, 10
+ movu xm0, [srcq+ssq*0]
+ movu xm2, [srcq+ssq*2]
+ pinsrd xm15, [base+subpel_filters+r11*8+2], 2
+ movu xm1, [srcq+ssq*1]
+ movu xm3, [srcq+ss3q ]
+ pinsrd xm15, [base+subpel_filters+r13*8+2], 3
+ lea srcq, [srcq+ssq*4]
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ vinserti128 m15, xm15, 1
+ pshufb m14, m5
+ paddb m14, m6
+ vinserti128 m2, [srcq+ssq*0], 1
+ vinserti128 m3, [srcq+ssq*1], 1
+ lea srcq, [srcq+ssq*2]
+ pblendvb m15, m11, m8
+ pshufb xm0, xm14
+ pshufb m2, m14
+ pshufb xm1, xm14
+ pshufb m3, m14
+ pmaddubsw xm0, xm15
+ pmaddubsw m2, m15
+ pmaddubsw xm1, xm15
+ pmaddubsw m3, m15
+ movq xm11, r4q
+ punpcklbw xm11, xm11
+ psraw xm11, 8
+ vinserti128 m11, xm11, 1
+ phaddw m0, m2
+ phaddw m1, m3
+ pmulhrsw m0, m12 ; 0 2 _ 4
+ pmulhrsw m1, m12 ; 1 3 _ 5
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m10, m11, q2222
+ pshufd m11, m11, q3333
+ punpcklwd xm2, xm0, xm1
+ punpckhwd m1, m0, m1 ; 23 45
+ vinserti128 m0, m2, xm1, 1 ; 01 23
+.dy2_w4_loop:
+ movu xm6, [srcq+ssq*0]
+ movu xm7, [srcq+ssq*1]
+ vinserti128 m6, [srcq+ssq*2], 1
+ vinserti128 m7, [srcq+ss3q ], 1
+ lea srcq, [srcq+ssq*4]
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pshufb m6, m14
+ pshufb m7, m14
+ pmaddubsw m6, m15
+ pmaddubsw m7, m15
+ psrld m2, m6, 16
+ pslld m3, m7, 16
+ paddw m6, m2
+ paddw m7, m3
+ pblendw m6, m7, 0xaa ; 67 89
+ pmulhrsw m6, m12
+ paddd m4, m5
+ vpblendd m0, m1, m6, 0x0f
+ mova m1, m6
+ vpermq m0, m0, q1032 ; 45 67
+ pmaddwd m6, m0, m10
+ pmaddwd m7, m1, m11
+ paddd m4, m13
+ paddd m6, m7
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movd [dstq+dsq*0], xm4
+ pextrd [dstq+dsq*1], xm4, 1
+ lea dstq, [dstq+dsq*2]
+%else
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ sub hd, 2
+ jg .dy2_w4_loop
+ MC_8TAP_SCALED_RET
+.dy2_w8:
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ movd xm15, t0d
+ pmaddwd m8, [base+rescale_mul]
+ vpbroadcastq m11, [base+pq_0x40000000]
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ mov [rsp], r7d
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ mov myd, mym
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ shr myd, 6
+ lea myd, [t1+myq]
+ mov t1d, 64 << 24
+ cmovnz t1q, [base+subpel_filters+myq*8]
+ movq xm11, t1q
+ punpcklbw xm11, xm11
+ psraw xm11, 8
+ vinserti128 m11, xm11, 1
+ mov r7d, [rsp]
+ pshufd m8, m11, q0000
+ pshufd m9, m11, q1111
+ pshufd m14, m11, q2222
+ pshufd m11, m11, q3333
+.dy2_w8_loop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m14
+ pmaddwd m7, m3, m11
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, 16
+%endif
+ dec hd
+ jz .ret
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ movq xm3, [srcq+ r4]
+ movq xm4, [srcq+ r6]
+ movhps xm3, [srcq+ r7]
+ movhps xm4, [srcq+ r9]
+ vinserti128 m3, [srcq+r10], 1
+ vinserti128 m4, [srcq+r11], 1
+ vpbroadcastq m5, [srcq+r13]
+ vpbroadcastq m6, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m3, m5, 0xc0
+ vpblendd m4, m6, 0xc0
+ pmaddubsw m3, m15
+ pmaddubsw m4, m10
+ phaddw m3, m4
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, m10
+ phaddw m4, m5
+ psrld m5, m3, 16
+ pslld m6, m4, 16
+ paddw m3, m5
+ paddw m4, m6
+ pblendw m3, m4, 0xaa
+ pmulhrsw m3, m12
+ jmp .dy2_w8_loop
+.dy2_w16:
+ mov dword [rsp+40], 2
+ movifprep tmp_stridem, 32
+ jmp .dy2_w_start
+.dy2_w32:
+ mov dword [rsp+40], 4
+ movifprep tmp_stridem, 64
+ jmp .dy2_w_start
+.dy2_w64:
+ mov dword [rsp+40], 8
+ movifprep tmp_stridem, 128
+ jmp .dy2_w_start
+.dy2_w128:
+ mov dword [rsp+40], 16
+ movifprep tmp_stridem, 256
+.dy2_w_start:
+%ifidn %1, put
+ movifnidn dsm, dsq
+%endif
+ shr t0d, 16
+ sub srcq, 3
+ pmaddwd m8, [base+rescale_mul]
+ movd xm15, t0d
+ mov [rsp+64], t0d
+ mov [rsp+48], srcq
+ mov [rsp+56], r0q ; dstq / tmpq
+%if UNIX64
+ mov hm, hd
+%endif
+ shl dword dxm, 3 ; dx*8
+ vpbroadcastd m15, xm15
+ paddd m14, m8 ; mx+dx*[0-7]
+ jmp .dy2_hloop
+.dy2_hloop_prep:
+ dec dword [rsp+40]
+ jz .ret
+ add qword [rsp+56], 8*(isprep+1)
+ mov hd, hm
+ vpbroadcastd m8, dxm
+ vpbroadcastd m10, [base+pd_0x3ff]
+ paddd m14, m8, [rsp]
+ vpbroadcastd m15, [rsp+64]
+ pxor m9, m9
+ mov srcq, [rsp+48]
+ mov r0q, [rsp+56] ; dstq / tmpq
+.dy2_hloop:
+ vpbroadcastq m11, [base+pq_0x40000000]
+ pand m6, m14, m10
+ psrld m6, 6
+ paddd m15, m6
+ pcmpeqd m6, m9
+ vextracti128 xm7, m15, 1
+ movd r4d, xm15
+ pextrd r6d, xm15, 2
+ pextrd r7d, xm15, 1
+ pextrd r9d, xm15, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ movu [rsp], m14
+ movq xm15, [base+subpel_filters+ r4*8]
+ movq xm10, [base+subpel_filters+ r6*8]
+ movhps xm15, [base+subpel_filters+ r7*8]
+ movhps xm10, [base+subpel_filters+ r9*8]
+ vinserti128 m15, [base+subpel_filters+r10*8], 1
+ vinserti128 m10, [base+subpel_filters+r11*8], 1
+ vpbroadcastq m9, [base+subpel_filters+r13*8]
+ vpbroadcastq m8, [base+subpel_filters+ rX*8]
+ psrld m14, 10
+ vextracti128 xm7, m14, 1
+ movq [rsp+32], xm14
+ movd r4d, xm14
+ pextrd r6d, xm14, 2
+ pextrd r7d, xm14, 1
+ pextrd r9d, xm14, 3
+ movd r10d, xm7
+ pextrd r11d, xm7, 2
+ pextrd r13d, xm7, 1
+ pextrd rXd, xm7, 3
+ pshufd m5, m6, q1100
+ pshufd m6, m6, q3322
+ vpblendd m15, m9, 0xc0
+ vpblendd m10, m8, 0xc0
+ pblendvb m15, m11, m5
+ pblendvb m10, m11, m6
+ vbroadcasti128 m14, [base+subpel_s_shuf8]
+ MC_8TAP_SCALED_H 0, 1, 2, 3, 4, 5, 6, 7 ; 0a 1a 0b 1b
+ MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 7, 8 ; 2a 3a 2b 3b
+ MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 7, 8, 9 ; 4a 5a 4b 5b
+ MC_8TAP_SCALED_H 3, 4, 5, 6, 7, 8, 9, 11 ; 6a 7a 6b 7b
+ mov myd, mym
+ pshufb m0, m14 ; 01a 01b
+ pshufb m1, m14 ; 23a 23b
+ pshufb m2, m14 ; 45a 45b
+ pshufb m3, m14 ; 67a 67b
+ shr myd, 6
+ mov r4d, 64 << 24
+ lea myd, [t1+myq]
+ cmovnz r4q, [base+subpel_filters+myq*8]
+ movq xm14, r4q
+ punpcklbw xm14, xm14
+ psraw xm14, 8
+ vinserti128 m14, xm14, 1
+ mov r4d, [rsp+32]
+ mov r7d, [rsp+36]
+ pshufd m8, m14, q0000
+ pshufd m9, m14, q1111
+ pshufd m11, m14, q2222
+ pshufd m14, m14, q3333
+.dy2_vloop:
+ pmaddwd m4, m0, m8
+ pmaddwd m5, m1, m9
+ pmaddwd m6, m2, m11
+ pmaddwd m7, m3, m14
+ paddd m4, m5
+ paddd m6, m7
+ paddd m4, m13
+ paddd m4, m6
+ psrad m4, rndshift
+ vextracti128 xm5, m4, 1
+ packssdw xm4, xm5
+%ifidn %1, put
+ packuswb xm4, xm4
+ movq [dstq], xm4
+ add dstq, dsm
+%else
+ mova [tmpq], xm4
+ add tmpq, tmp_stridem
+%endif
+ dec hd
+ jz .dy2_hloop_prep
+ mova m0, m1
+ mova m1, m2
+ mova m2, m3
+ movq xm3, [srcq+ r4]
+ movq xm4, [srcq+ r6]
+ movhps xm3, [srcq+ r7]
+ movhps xm4, [srcq+ r9]
+ vinserti128 m3, [srcq+r10], 1
+ vinserti128 m4, [srcq+r11], 1
+ vpbroadcastq m5, [srcq+r13]
+ vpbroadcastq m6, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m3, m5, 0xc0
+ vpblendd m4, m6, 0xc0
+ pmaddubsw m3, m15
+ pmaddubsw m4, m10
+ phaddw m3, m4
+ movq xm4, [srcq+ r4]
+ movq xm5, [srcq+ r6]
+ movhps xm4, [srcq+ r7]
+ movhps xm5, [srcq+ r9]
+ vinserti128 m4, [srcq+r10], 1
+ vinserti128 m5, [srcq+r11], 1
+ vpbroadcastq m6, [srcq+r13]
+ vpbroadcastq m7, [srcq+ rX]
+ add srcq, ssq
+ vpblendd m4, m6, 0xc0
+ vpblendd m5, m7, 0xc0
+ pmaddubsw m4, m15
+ pmaddubsw m5, m10
+ phaddw m4, m5
+ psrld m5, m3, 16
+ pslld m6, m4, 16
+ paddw m3, m5
+ paddw m4, m6
+ pblendw m3, m4, 0xaa
+ pmulhrsw m3, m12
+ jmp .dy2_vloop
+.ret:
+ MC_8TAP_SCALED_RET 0
+%undef isprep
+%endmacro
+
+%macro BILIN_SCALED_FN 1
+cglobal %1_bilin_scaled
+ mov t0d, (5*15 << 16) | 5*15
+ mov t1d, t0d
+ jmp mangle(private_prefix %+ _%1_8tap_scaled %+ SUFFIX)
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 5
+%else
+DECLARE_REG_TMP 6, 8
+%endif
+
+%define PUT_8TAP_SCALED_FN FN put_8tap_scaled,
+%define PREP_8TAP_SCALED_FN FN prep_8tap_scaled,
+
+BILIN_SCALED_FN put
+PUT_8TAP_SCALED_FN sharp, SHARP, SHARP
+PUT_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+PUT_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PUT_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PUT_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PUT_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PUT_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PUT_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PUT_8TAP_SCALED_FN regular, REGULAR, REGULAR
+MC_8TAP_SCALED put
+
+%if WIN64
+DECLARE_REG_TMP 5, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+BILIN_SCALED_FN prep
+PREP_8TAP_SCALED_FN sharp, SHARP, SHARP
+PREP_8TAP_SCALED_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_SCALED_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_SCALED_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_SCALED_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_SCALED_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_SCALED_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_SCALED_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_SCALED_FN regular, REGULAR, REGULAR
+MC_8TAP_SCALED prep
+
+%macro WARP_V 5 ; dst, 02, 46, 13, 57
+ ; Can be done using gathers, but that's terribly slow on many CPUs
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq xm8, [filterq+myq *8]
+ vinserti128 m8, [filterq+tmp1q*8], 1 ; a e
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+deltaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1 ; b f
+ lea tmp1d, [myq+deltaq*4]
+ lea tmp2d, [myq+deltaq*1]
+ shr myd, 10
+ shr tmp1d, 10
+ movq xm9, [filterq+myq *8]
+ vinserti128 m9, [filterq+tmp1q*8], 1 ; c g
+ lea tmp1d, [tmp2q+deltaq*4]
+ lea myd, [tmp2q+gammaq] ; my += gamma
+ shr tmp2d, 10
+ shr tmp1d, 10
+ punpcklwd m8, m0
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1 ; d h
+ punpcklwd m0, m9, m0
+ punpckldq m9, m8, m0
+ punpckhdq m0, m8, m0
+ punpcklbw m8, m11, m9 ; a0 a2 b0 b2 c0 c2 d0 d2 << 8
+ punpckhbw m9, m11, m9 ; a4 a6 b4 b6 c4 c6 d4 d6 << 8
+ pmaddwd m%2, m8
+ pmaddwd m9, m%3
+ punpcklbw m8, m11, m0 ; a1 a3 b1 b3 c1 c3 d1 d3 << 8
+ punpckhbw m0, m11, m0 ; a5 a7 b5 b7 c5 c7 d5 d7 << 8
+ pmaddwd m8, m%4
+ pmaddwd m0, m%5
+ paddd m%2, m9
+ paddd m0, m8
+ paddd m%1, m0, m%2
+%endmacro
+
+cglobal warp_affine_8x8t, 0, 14, 0, tmp, ts
+%if WIN64
+ sub rsp, 0xa0
+%endif
+ call mangle(private_prefix %+ _warp_affine_8x8_avx2).main
+.loop:
+ psrad m7, 13
+ psrad m0, 13
+ packssdw m7, m0
+ pmulhrsw m7, m14 ; (x + (1 << 6)) >> 7
+ vpermq m7, m7, q3120
+ mova [tmpq+tsq*0], xm7
+ vextracti128 [tmpq+tsq*2], m7, 1
+ dec r4d
+ jz mangle(private_prefix %+ _warp_affine_8x8_avx2).end
+ call mangle(private_prefix %+ _warp_affine_8x8_avx2).main2
+ lea tmpq, [tmpq+tsq*4]
+ jmp .loop
+
+cglobal warp_affine_8x8, 0, 14, 0, dst, ds, src, ss, abcd, mx, tmp2, alpha, \
+ beta, filter, tmp1, delta, my, gamma
+%if WIN64
+ sub rsp, 0xa0
+ %assign xmm_regs_used 16
+ %assign stack_size_padded 0xa0
+ %assign stack_offset stack_offset+stack_size_padded
+%endif
+ call .main
+ jmp .start
+.loop:
+ call .main2
+ lea dstq, [dstq+dsq*2]
+.start:
+ psrad m7, 18
+ psrad m0, 18
+ packusdw m7, m0
+ pavgw m7, m11 ; (x + (1 << 10)) >> 11
+ vextracti128 xm0, m7, 1
+ packuswb xm7, xm0
+ pshufd xm7, xm7, q3120
+ movq [dstq+dsq*0], xm7
+ movhps [dstq+dsq*1], xm7
+ dec r4d
+ jg .loop
+.end:
+ RET
+ALIGN function_align
+.main:
+ ; Stack args offset by one (r4m -> r5m etc.) due to call
+%if WIN64
+ mov abcdq, r5m
+ mov mxd, r6m
+ movaps [rsp+stack_offset+0x10], xmm6
+ movaps [rsp+stack_offset+0x20], xmm7
+ movaps [rsp+0x28], xmm8
+ movaps [rsp+0x38], xmm9
+ movaps [rsp+0x48], xmm10
+ movaps [rsp+0x58], xmm11
+ movaps [rsp+0x68], xmm12
+ movaps [rsp+0x78], xmm13
+ movaps [rsp+0x88], xmm14
+ movaps [rsp+0x98], xmm15
+%endif
+ movsx alphad, word [abcdq+2*0]
+ movsx betad, word [abcdq+2*1]
+ mova m12, [warp_8x8_shufA]
+ mova m13, [warp_8x8_shufB]
+ vpbroadcastd m14, [pw_8192]
+ vpbroadcastd m15, [pd_32768]
+ pxor m11, m11
+ lea filterq, [mc_warp_filter]
+ lea tmp1q, [ssq*3+3]
+ add mxd, 512+(64<<10)
+ lea tmp2d, [alphaq*3]
+ sub srcq, tmp1q ; src -= src_stride*3 + 3
+ sub betad, tmp2d ; beta -= alpha*3
+ mov myd, r7m
+ call .h
+ psrld m1, m0, 16
+ call .h
+ psrld m4, m0, 16
+ call .h
+ pblendw m1, m0, 0xaa ; 02
+ call .h
+ pblendw m4, m0, 0xaa ; 13
+ call .h
+ psrld m2, m1, 16
+ pblendw m2, m0, 0xaa ; 24
+ call .h
+ psrld m5, m4, 16
+ pblendw m5, m0, 0xaa ; 35
+ call .h
+ psrld m3, m2, 16
+ pblendw m3, m0, 0xaa ; 46
+ movsx deltad, word [abcdq+2*2]
+ movsx gammad, word [abcdq+2*3]
+ add myd, 512+(64<<10)
+ mov r4d, 4
+ lea tmp1d, [deltaq*3]
+ sub gammad, tmp1d ; gamma -= delta*3
+.main2:
+ call .h
+ psrld m6, m5, 16
+ pblendw m6, m0, 0xaa ; 57
+ WARP_V 7, 1, 3, 4, 6
+ call .h
+ mova m1, m2
+ mova m2, m3
+ psrld m3, 16
+ pblendw m3, m0, 0xaa ; 68
+ WARP_V 0, 4, 6, 1, 3
+ mova m4, m5
+ mova m5, m6
+ ret
+ALIGN function_align
+.h:
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ vbroadcasti128 m10, [srcq]
+ shr mxd, 10
+ shr tmp1d, 10
+ movq xm8, [filterq+mxq *8]
+ vinserti128 m8, [filterq+tmp1q*8], 1
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+alphaq*1]
+ shr tmp2d, 10
+ shr tmp1d, 10
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1
+ lea tmp1d, [mxq+alphaq*4]
+ lea tmp2d, [mxq+alphaq*1]
+ shr mxd, 10
+ shr tmp1d, 10
+ movq xm9, [filterq+mxq *8]
+ vinserti128 m9, [filterq+tmp1q*8], 1
+ lea tmp1d, [tmp2q+alphaq*4]
+ lea mxd, [tmp2q+betaq] ; mx += beta
+ shr tmp2d, 10
+ shr tmp1d, 10
+ punpcklqdq m8, m0 ; 0 1 4 5
+ movq xm0, [filterq+tmp2q*8]
+ vinserti128 m0, [filterq+tmp1q*8], 1
+ punpcklqdq m9, m0 ; 2 3 6 7
+ pshufb m0, m10, m12
+ pmaddubsw m0, m8
+ pshufb m10, m13
+ pmaddubsw m10, m9
+ add srcq, ssq
+ phaddw m0, m10
+ pmaddwd m0, m14 ; 17-bit intermediate, upshifted by 13
+ paddd m0, m15 ; rounded 14-bit result in upper 16 bits of dword
+ ret
+
+%macro BIDIR_FN 1 ; op
+ %1 0
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq ], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ cmp hd, 4
+ je .ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq ], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ cmp hd, 8
+ je .ret
+ %1 2
+ lea dstq, [dstq+strideq*4]
+ vextracti128 xm1, m0, 1
+ movd [dstq ], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq ], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+.ret:
+ RET
+.w8_loop:
+ %1_INC_PTR 2
+ %1 0
+ lea dstq, [dstq+strideq*4]
+.w8:
+ vextracti128 xm1, m0, 1
+ movq [dstq ], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ %1_INC_PTR 4
+ %1 0
+ lea dstq, [dstq+strideq*4]
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq ], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*2], xm0
+ vextracti128 [dstq+stride3q ], m0, 1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ %1_INC_PTR 4
+ %1 0
+ lea dstq, [dstq+strideq*2]
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], m0
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*1], m0
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ %1_INC_PTR 4
+ %1 0
+ add dstq, strideq
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq], m0
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+32], m0
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ %1 0
+ add dstq, strideq
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+0*32], m0
+ %1 2
+ vpermq m0, m0, q3120
+ mova [dstq+1*32], m0
+ %1_INC_PTR 8
+ %1 -4
+ vpermq m0, m0, q3120
+ mova [dstq+2*32], m0
+ %1 -2
+ vpermq m0, m0, q3120
+ mova [dstq+3*32], m0
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%macro AVG 1 ; src_offset
+ mova m0, [tmp1q+(%1+0)*32]
+ paddw m0, [tmp2q+(%1+0)*32]
+ mova m1, [tmp1q+(%1+1)*32]
+ paddw m1, [tmp2q+(%1+1)*32]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ packuswb m0, m1
+%endmacro
+
+%macro AVG_INC_PTR 1
+ add tmp1q, %1*32
+ add tmp2q, %1*32
+%endmacro
+
+cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-avg %+ SUFFIX %+ _table
+ lea r6, [avg %+ SUFFIX %+ _table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m2, [base+pw_1024]
+ add wq, r6
+ BIDIR_FN AVG
+
+%macro W_AVG 1 ; src_offset
+ ; (a * weight + b * (16 - weight) + 128) >> 8
+ ; = ((a - b) * weight + (b << 4) + 128) >> 8
+ ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
+ ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
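+ ; sanity check of the identity with example values a=100, b=200, weight=4:
+ ; (100*4 + 200*12 + 128) >> 8 = 2928 >> 8 = 11
+ ; ((((200-100) * (-4<<12)) >> 16) + 200 + 8) >> 4 = (-25 + 208) >> 4 = 11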
+ mova m0, [tmp1q+(%1+0)*32]
+ psubw m2, m0, [tmp2q+(%1+0)*32]
+ mova m1, [tmp1q+(%1+1)*32]
+ psubw m3, m1, [tmp2q+(%1+1)*32]
+ pmulhw m2, m4
+ pmulhw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-w_avg %+ SUFFIX %+ _table
+ lea r6, [w_avg %+ SUFFIX %+ _table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m4, r6m ; weight
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m5, [base+pw_2048]
+ psllw m4, 12 ; (weight-16) << 12 when interpreted as signed
+ add wq, r6
+ cmp dword r6m, 7
+ jg .weight_gt7
+ mov r6, tmp1q
+ pxor m0, m0
+ mov tmp1q, tmp2q
+ psubw m4, m0, m4 ; -weight
+ mov tmp2q, r6
+.weight_gt7:
+ BIDIR_FN W_AVG
+
+%macro MASK 1 ; src_offset
+ ; (a * m + b * (64 - m) + 512) >> 10
+ ; = ((a - b) * m + (b << 6) + 512) >> 10
+ ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
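+ ; sanity check with example values a=100, b=200, m=16:
+ ; (100*16 + 200*48 + 512) >> 10 = 11712 >> 10 = 11
+ ; ((((200-100) * (-16<<10)) >> 16) + 200 + 8) >> 4 = (-25 + 208) >> 4 = 11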
+ vpermq m3, [maskq+%1*16], q3120
+ mova m0, [tmp2q+(%1+0)*32]
+ psubw m1, m0, [tmp1q+(%1+0)*32]
+ psubb m3, m4, m3
+ paddw m1, m1 ; (b - a) << 1
+ paddb m3, m3
+ punpcklbw m2, m4, m3 ; -m << 9
+ pmulhw m1, m2
+ paddw m0, m1
+ mova m1, [tmp2q+(%1+1)*32]
+ psubw m2, m1, [tmp1q+(%1+1)*32]
+ paddw m2, m2
+ punpckhbw m3, m4, m3
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
+
+%macro MASK_INC_PTR 1
+ add maskq, %1*16
+ add tmp2q, %1*32
+ add tmp1q, %1*32
+%endmacro
+
+cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-mask %+ SUFFIX %+ _table
+ lea r7, [mask %+ SUFFIX %+ _table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m5, [base+pw_2048]
+ pxor m4, m4
+ add wq, r7
+ BIDIR_FN MASK
+
+%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
+ mova m%1, [tmp1q+32*%3]
+ mova m1, [tmp2q+32*%3]
+ psubw m1, m%1
+ pabsw m%2, m1
+ psubusw m%2, m6, m%2
+ psrlw m%2, 8 ; 64 - m
+ psllw m2, m%2, 10
+ pmulhw m1, m2
+ paddw m%1, m1
+ mova m1, [tmp1q+32*%4]
+ mova m2, [tmp2q+32*%4]
+ psubw m2, m1
+ pabsw m3, m2
+ psubusw m3, m6, m3
+ psrlw m3, 8
+%if %5
+ packuswb m%2, m3
+ psubb m%2, m5, m%2
+ vpermq m%2, m%2, q3120
+%else
+ phaddw m%2, m3
+%endif
+ psllw m3, 10
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m%1, m7
+ pmulhrsw m1, m7
+ packuswb m%1, m1
+%endmacro
+
+cglobal blend, 3, 7, 7, dst, ds, tmp, w, h, mask
+%define base r6-blend_avx2_table
+ lea r6, [blend_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movifnidn maskq, maskmp
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m4, [base+pb_64]
+ vpbroadcastd m5, [base+pw_512]
+ add wq, r6
+ lea r6, [dsq*3]
+ jmp wq
+.w4:
+ movd xm0, [dstq+dsq*0]
+ pinsrd xm0, [dstq+dsq*1], 1
+ vpbroadcastd xm1, [dstq+dsq*2]
+ pinsrd xm1, [dstq+r6 ], 3
+ mova xm6, [maskq]
+ psubb xm3, xm4, xm6
+ punpcklbw xm2, xm3, xm6
+ punpckhbw xm3, xm6
+ mova xm6, [tmpq]
+ add maskq, 4*4
+ add tmpq, 4*4
+ punpcklbw xm0, xm6
+ punpckhbw xm1, xm6
+ pmaddubsw xm0, xm2
+ pmaddubsw xm1, xm3
+ pmulhrsw xm0, xm5
+ pmulhrsw xm1, xm5
+ packuswb xm0, xm1
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ pextrd [dstq+dsq*2], xm0, 2
+ pextrd [dstq+r6 ], xm0, 3
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w4
+ RET
+ALIGN function_align
+.w8:
+ movq xm1, [dstq+dsq*0]
+ movhps xm1, [dstq+dsq*1]
+ vpbroadcastq m2, [dstq+dsq*2]
+ vpbroadcastq m3, [dstq+r6 ]
+ mova m0, [maskq]
+ mova m6, [tmpq]
+ add maskq, 8*4
+ add tmpq, 8*4
+ vpblendd m1, m2, 0x30
+ vpblendd m1, m3, 0xc0
+ psubb m3, m4, m0
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ punpcklbw m0, m1, m6
+ punpckhbw m1, m6
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ vextracti128 xm1, m0, 1
+ movq [dstq+dsq*0], xm0
+ movhps [dstq+dsq*1], xm0
+ movq [dstq+dsq*2], xm1
+ movhps [dstq+r6 ], xm1
+ lea dstq, [dstq+dsq*4]
+ sub hd, 4
+ jg .w8
+ RET
+ALIGN function_align
+.w16:
+ mova m0, [maskq]
+ mova xm1, [dstq+dsq*0]
+ vinserti128 m1, [dstq+dsq*1], 1
+ psubb m3, m4, m0
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ mova m6, [tmpq]
+ add maskq, 16*2
+ add tmpq, 16*2
+ punpcklbw m0, m1, m6
+ punpckhbw m1, m6
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16
+ RET
+ALIGN function_align
+.w32:
+ mova m0, [maskq]
+ mova m1, [dstq]
+ mova m6, [tmpq]
+ add maskq, 32
+ add tmpq, 32
+ psubb m3, m4, m0
+ punpcklbw m2, m3, m0
+ punpckhbw m3, m0
+ punpcklbw m0, m1, m6
+ punpckhbw m1, m6
+ pmaddubsw m0, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .w32
+ RET
+
+cglobal blend_v, 3, 6, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_v_avx2_table
+ lea r5, [blend_v_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r5+wq*4]
+ vpbroadcastd m5, [base+pw_512]
+ add wq, r5
+ add maskq, obmc_masks-blend_v_avx2_table
+ jmp wq
+.w2:
+ vpbroadcastd xm2, [maskq+2*2]
+.w2_s0_loop:
+ movd xm0, [dstq+dsq*0]
+ pinsrw xm0, [dstq+dsq*1], 1
+ movd xm1, [tmpq]
+ add tmpq, 2*2
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w2_s0_loop
+ RET
+ALIGN function_align
+.w4:
+ vpbroadcastq xm2, [maskq+4*2]
+.w4_loop:
+ movd xm0, [dstq+dsq*0]
+ pinsrd xm0, [dstq+dsq*1], 1
+ movq xm1, [tmpq]
+ add tmpq, 4*2
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ vbroadcasti128 m4, [maskq+8*2]
+.w8_loop:
+ vpbroadcastq m2, [dstq+dsq*0]
+ movq xm0, [dstq+dsq*1]
+ vpblendd m0, m2, 0x30
+ movq xm1, [tmpq+8*1]
+ vinserti128 m1, [tmpq+8*0], 1
+ add tmpq, 8*2
+ punpcklbw m0, m1
+ pmaddubsw m0, m4
+ pmulhrsw m0, m5
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ movhps [dstq+dsq*0], xm0
+ movq [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ vbroadcasti128 m3, [maskq+16*2]
+ vbroadcasti128 m4, [maskq+16*3]
+.w16_loop:
+ mova xm1, [dstq+dsq*0]
+ vinserti128 m1, [dstq+dsq*1], 1
+ mova m2, [tmpq]
+ add tmpq, 16*2
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m4
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ sub hd, 2
+ jg .w16_loop
+ RET
+ALIGN function_align
+.w32:
+ mova xm3, [maskq+16*4]
+ vinserti128 m3, [maskq+16*6], 1
+ mova xm4, [maskq+16*5]
+ vinserti128 m4, [maskq+16*7], 1
+.w32_loop:
+ mova m1, [dstq]
+ mova m2, [tmpq]
+ add tmpq, 32
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m4
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, dsq
+ dec hd
+ jg .w32_loop
+ RET
+
+cglobal blend_h, 4, 7, 6, dst, ds, tmp, w, h, mask
+%define base r5-blend_h_avx2_table
+ lea r5, [blend_h_avx2_table]
+ mov r6d, wd
+ tzcnt wd, wd
+ mov hd, hm
+ movsxd wq, dword [r5+wq*4]
+ vpbroadcastd m5, [base+pw_512]
+ add wq, r5
+ lea maskq, [base+obmc_masks+hq*2]
+ lea hd, [hq*3]
+ shr hd, 2 ; h * 3/4
+ lea maskq, [maskq+hq*2]
+ neg hq
+ jmp wq
+.w2:
+ movd xm0, [dstq+dsq*0]
+ pinsrw xm0, [dstq+dsq*1], 1
+ movd xm2, [maskq+hq*2]
+ movd xm1, [tmpq]
+ add tmpq, 2*2
+ punpcklwd xm2, xm2
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ pextrw [dstq+dsq*0], xm0, 0
+ pextrw [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w2
+ RET
+ALIGN function_align
+.w4:
+ mova xm3, [blend_shuf]
+.w4_loop:
+ movd xm0, [dstq+dsq*0]
+ pinsrd xm0, [dstq+dsq*1], 1
+ movd xm2, [maskq+hq*2]
+ movq xm1, [tmpq]
+ add tmpq, 4*2
+ pshufb xm2, xm3
+ punpcklbw xm0, xm1
+ pmaddubsw xm0, xm2
+ pmulhrsw xm0, xm5
+ packuswb xm0, xm0
+ movd [dstq+dsq*0], xm0
+ pextrd [dstq+dsq*1], xm0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w4_loop
+ RET
+ALIGN function_align
+.w8:
+ vbroadcasti128 m4, [blend_shuf]
+ shufpd m4, m4, 0x03
+.w8_loop:
+ vpbroadcastq m1, [dstq+dsq*0]
+ movq xm0, [dstq+dsq*1]
+ vpblendd m0, m1, 0x30
+ vpbroadcastd m3, [maskq+hq*2]
+ movq xm1, [tmpq+8*1]
+ vinserti128 m1, [tmpq+8*0], 1
+ add tmpq, 8*2
+ pshufb m3, m4
+ punpcklbw m0, m1
+ pmaddubsw m0, m3
+ pmulhrsw m0, m5
+ vextracti128 xm1, m0, 1
+ packuswb xm0, xm1
+ movhps [dstq+dsq*0], xm0
+ movq [dstq+dsq*1], xm0
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w8_loop
+ RET
+ALIGN function_align
+.w16:
+ vbroadcasti128 m4, [blend_shuf]
+ shufpd m4, m4, 0x0c
+.w16_loop:
+ mova xm1, [dstq+dsq*0]
+ vinserti128 m1, [dstq+dsq*1], 1
+ vpbroadcastd m3, [maskq+hq*2]
+ mova m2, [tmpq]
+ add tmpq, 16*2
+ pshufb m3, m4
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq+dsq*0], xm0
+ vextracti128 [dstq+dsq*1], m0, 1
+ lea dstq, [dstq+dsq*2]
+ add hq, 2
+ jl .w16_loop
+ RET
+ALIGN function_align
+.w32: ; w32/w64/w128
+ sub dsq, r6
+.w32_loop0:
+ vpbroadcastw m3, [maskq+hq*2]
+ mov wd, r6d
+.w32_loop:
+ mova m1, [dstq]
+ mova m2, [tmpq]
+ add tmpq, 32
+ punpcklbw m0, m1, m2
+ punpckhbw m1, m2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+ mova [dstq], m0
+ add dstq, 32
+ sub wd, 32
+ jg .w32_loop
+ add dstq, dsq
+ inc hq
+ jl .w32_loop0
+ RET
+
+cglobal emu_edge, 10, 13, 1, bw, bh, iw, ih, x, y, dst, dstride, src, sstride, \
+ bottomext, rightext
+ ; we assume that the buffer (stride) is larger than width, so we can
+ ; safely overwrite by a few bytes
+
+ ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
+ xor r12d, r12d
+ lea r10, [ihq-1]
+ cmp yq, ihq
+ cmovs r10, yq
+ test yq, yq
+ cmovs r10, r12
+ imul r10, sstrideq
+ add srcq, r10
+
+ ; ref += iclip(x, 0, iw - 1)
+ lea r10, [iwq-1]
+ cmp xq, iwq
+ cmovs r10, xq
+ test xq, xq
+ cmovs r10, r12
+ add srcq, r10
+
+ ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
+ lea bottomextq, [yq+bhq]
+ sub bottomextq, ihq
+ lea r3, [bhq-1]
+ cmovs bottomextq, r12
+
+ DEFINE_ARGS bw, bh, iw, ih, x, topext, dst, dstride, src, sstride, \
+ bottomext, rightext
+
+ ; top_ext = iclip(-y, 0, bh - 1)
+ neg topextq
+ cmovs topextq, r12
+ cmp bottomextq, bhq
+ cmovns bottomextq, r3
+ cmp topextq, bhq
+ cmovg topextq, r3
+
+ ; right_ext = iclip(x + bw - iw, 0, bw - 1)
+ lea rightextq, [xq+bwq]
+ sub rightextq, iwq
+ lea r2, [bwq-1]
+ cmovs rightextq, r12
+
+ DEFINE_ARGS bw, bh, iw, ih, leftext, topext, dst, dstride, src, sstride, \
+ bottomext, rightext
+
+ ; left_ext = iclip(-x, 0, bw - 1)
+ neg leftextq
+ cmovs leftextq, r12
+ cmp rightextq, bwq
+ cmovns rightextq, r2
+ cmp leftextq, bwq
+ cmovns leftextq, r2
+
+ DEFINE_ARGS bw, centerh, centerw, dummy, leftext, topext, \
+ dst, dstride, src, sstride, bottomext, rightext
+
+ ; center_h = bh - top_ext - bottom_ext
+ lea r3, [bottomextq+topextq]
+ sub centerhq, r3
+
+ ; blk += top_ext * PXSTRIDE(dst_stride)
+ mov r2, topextq
+ imul r2, dstrideq
+ add dstq, r2
+ mov r9m, dstq
+
+ ; center_w = bw - left_ext - right_ext
+ mov centerwq, bwq
+ lea r3, [rightextq+leftextq]
+ sub centerwq, r3
+
+%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
+.v_loop_%3:
+%if %1
+ ; left extension
+ xor r3, r3
+ vpbroadcastb m0, [srcq]
+.left_loop_%3:
+ mova [dstq+r3], m0
+ add r3, 32
+ cmp r3, leftextq
+ jl .left_loop_%3
+
+ ; body
+ lea r12, [dstq+leftextq]
+%endif
+ xor r3, r3
+.body_loop_%3:
+ movu m0, [srcq+r3]
+%if %1
+ movu [r12+r3], m0
+%else
+ movu [dstq+r3], m0
+%endif
+ add r3, 32
+ cmp r3, centerwq
+ jl .body_loop_%3
+
+%if %2
+ ; right extension
+%if %1
+ add r12, centerwq
+%else
+ lea r12, [dstq+centerwq]
+%endif
+ xor r3, r3
+ vpbroadcastb m0, [srcq+centerwq-1]
+.right_loop_%3:
+ movu [r12+r3], m0
+ add r3, 32
+ cmp r3, rightextq
+ jl .right_loop_%3
+
+%endif
+ add dstq, dstrideq
+ add srcq, sstrideq
+ dec centerhq
+ jg .v_loop_%3
+%endmacro
+
+ test leftextq, leftextq
+ jnz .need_left_ext
+ test rightextq, rightextq
+ jnz .need_right_ext
+ v_loop 0, 0, 0
+ jmp .body_done
+
+.need_left_ext:
+ test rightextq, rightextq
+ jnz .need_left_right_ext
+ v_loop 1, 0, 1
+ jmp .body_done
+
+.need_left_right_ext:
+ v_loop 1, 1, 2
+ jmp .body_done
+
+.need_right_ext:
+ v_loop 0, 1, 3
+
+.body_done:
+ ; bottom edge extension
+ test bottomextq, bottomextq
+ jz .top
+ mov srcq, dstq
+ sub srcq, dstrideq
+ xor r1, r1
+.bottom_x_loop:
+ mova m0, [srcq+r1]
+ lea r3, [dstq+r1]
+ mov r4, bottomextq
+.bottom_y_loop:
+ mova [r3], m0
+ add r3, dstrideq
+ dec r4
+ jg .bottom_y_loop
+ add r1, 32
+ cmp r1, bwq
+ jl .bottom_x_loop
+
+.top:
+ ; top edge extension
+ test topextq, topextq
+ jz .end
+ mov srcq, r9m
+ mov dstq, dstm
+ xor r1, r1
+.top_x_loop:
+ mova m0, [srcq+r1]
+ lea r3, [dstq+r1]
+ mov r4, topextq
+.top_y_loop:
+ mova [r3], m0
+ add r3, dstrideq
+ dec r4
+ jg .top_y_loop
+ add r1, 32
+ cmp r1, bwq
+ jl .top_x_loop
+
+.end:
+ RET
+
+cglobal resize, 6, 14, 16, dst, dst_stride, src, src_stride, \
+ dst_w, h, src_w, dx, mx0
+ sub dword mx0m, 4<<14
+ sub dword src_wm, 8
+ vpbroadcastd m5, dxm
+ vpbroadcastd m8, mx0m
+ vpbroadcastd m6, src_wm
+
+ DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, picptr
+ LEA r7, $$
+%define base r7-$$
+
+ vpbroadcastd m3, [base+pw_m256]
+ vpbroadcastd m7, [base+pd_63]
+ vbroadcasti128 m15, [base+pb_8x0_8x8]
+ pmaddwd m2, m5, [base+rescale_mul] ; dx*[0,1,2,3,4,5,6,7]
+ pslld m5, 3 ; dx*8
+ pslld m6, 14
+ paddd m8, m2 ; mx+[0..7]*dx
+ pxor m2, m2
+
+ ; m2 = 0, m3 = pmulhrsw constant for x=(x+64)>>7
+ ; m8 = mx+[0..7]*dx, m5 = dx*8, m6 = src_w, m7 = 0x3f, m15=0,8
+
+.loop_y:
+ xor xd, xd
+ mova m4, m8 ; per-line working version of mx
+
+.loop_x:
+ pmaxsd m0, m4, m2
+ psrad m9, m4, 8 ; filter offset (unmasked)
+ pminsd m0, m6 ; iclip(mx, 0, src_w-8)
+ psubd m1, m4, m0 ; pshufb offset
+ psrad m0, 14 ; clipped src_x offset
+ psrad m1, 14 ; pshufb edge_emu offset
+ pand m9, m7 ; filter offset (masked)
+
+ ; load source pixels - this ugly code is vpgatherdq emulation since
+ ; directly using vpgatherdq on Haswell is quite a bit slower :(
+ movd r8d, xm0
+ pextrd r9d, xm0, 1
+ pextrd r10d, xm0, 2
+ pextrd r11d, xm0, 3
+ vextracti128 xm0, m0, 1
+ movq xm12, [srcq+r8]
+ movq xm13, [srcq+r10]
+ movhps xm12, [srcq+r9]
+ movhps xm13, [srcq+r11]
+ movd r8d, xm0
+ pextrd r9d, xm0, 1
+ pextrd r10d, xm0, 2
+ pextrd r11d, xm0, 3
+ vinserti128 m12, [srcq+r8], 1
+ vinserti128 m13, [srcq+r10], 1
+ vpbroadcastq m10, [srcq+r9]
+ vpbroadcastq m11, [srcq+r11]
+ vpblendd m12, m10, 11000000b
+ vpblendd m13, m11, 11000000b
+
+ ; if no emulation is required, we don't need to shuffle or emulate edges
+ ; this also saves 2 quasi-vpgatherdqs
+ vptest m1, m1
+ jz .filter
+
+ movd r8d, xm1
+ pextrd r9d, xm1, 1
+ pextrd r10d, xm1, 2
+ pextrd r11d, xm1, 3
+ movsxd r8, r8d
+ movsxd r9, r9d
+ movsxd r10, r10d
+ movsxd r11, r11d
+ vextracti128 xm1, m1, 1
+ movq xm14, [base+resize_shuf+4+r8]
+ movq xm0, [base+resize_shuf+4+r10]
+ movhps xm14, [base+resize_shuf+4+r9]
+ movhps xm0, [base+resize_shuf+4+r11]
+ movd r8d, xm1
+ pextrd r9d, xm1, 1
+ pextrd r10d, xm1, 2
+ pextrd r11d, xm1, 3
+ movsxd r8, r8d
+ movsxd r9, r9d
+ movsxd r10, r10d
+ movsxd r11, r11d
+ vinserti128 m14, [base+resize_shuf+4+r8], 1
+ vinserti128 m0, [base+resize_shuf+4+r10], 1
+ vpbroadcastq m10, [base+resize_shuf+4+r9]
+ vpbroadcastq m11, [base+resize_shuf+4+r11]
+ vpblendd m14, m10, 11000000b
+ vpblendd m0, m11, 11000000b
+
+ paddb m14, m15
+ paddb m0, m15
+ pshufb m12, m14
+ pshufb m13, m0
+
+.filter:
+ movd r8d, xm9
+ pextrd r9d, xm9, 1
+ pextrd r10d, xm9, 2
+ pextrd r11d, xm9, 3
+ vextracti128 xm9, m9, 1
+ movq xm10, [base+resize_filter+r8*8]
+ movq xm11, [base+resize_filter+r10*8]
+ movhps xm10, [base+resize_filter+r9*8]
+ movhps xm11, [base+resize_filter+r11*8]
+ movd r8d, xm9
+ pextrd r9d, xm9, 1
+ pextrd r10d, xm9, 2
+ pextrd r11d, xm9, 3
+ vinserti128 m10, [base+resize_filter+r8*8], 1
+ vinserti128 m11, [base+resize_filter+r10*8], 1
+ vpbroadcastq m14, [base+resize_filter+r9*8]
+ vpbroadcastq m1, [base+resize_filter+r11*8]
+ vpblendd m10, m14, 11000000b
+ vpblendd m11, m1, 11000000b
+
+ pmaddubsw m12, m10
+ pmaddubsw m13, m11
+ phaddw m12, m13
+ vextracti128 xm13, m12, 1
+ phaddsw xm12, xm13
+ pmulhrsw xm12, xm3 ; x=(x+64)>>7
+ packuswb xm12, xm12
+ movq [dstq+xq], xm12
+
+ paddd m4, m5
+ add xd, 8
+ cmp xd, dst_wd
+ jl .loop_x
+
+ add dstq, dst_strideq
+ add srcq, src_strideq
+ dec hd
+ jg .loop_y
+ RET
+
+cglobal w_mask_420, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx2_table
+ lea r7, [w_mask_420_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ pmovzxbd m9, [base+deint_shuf4]
+ vpbroadcastd m8, [base+wm_420_sign+r6*4] ; 258 - sign
+ add wq, r7
+ W_MASK 0, 4, 0, 1
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ jg .w4_h16
+.w4_end:
+ vextracti128 xm0, m4, 1
+ vpblendd xm1, xm4, xm0, 0x05
+ vpblendd xm4, xm0, 0x0a
+ pshufd xm1, xm1, q2301
+ psubw xm4, xm8, xm4
+ psubw xm4, xm1
+ psrlw xm4, 2
+ packuswb xm4, xm4
+ movq [maskq], xm4
+ RET
+.w4_h16:
+ W_MASK 0, 5, 2, 3
+ lea dstq, [dstq+strideq*4]
+ phaddd m4, m5
+ vextracti128 xm1, m0, 1
+ psubw m4, m8, m4
+ psrlw m4, 2
+ vpermd m4, m9, m4
+ vextracti128 xm5, m4, 1
+ packuswb xm4, xm5
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ mova [maskq], xm4
+ RET
+.w8_loop:
+ add tmp1q, 2*32
+ add tmp2q, 2*32
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 8
+.w8:
+ vextracti128 xm2, m4, 1
+ vextracti128 xm1, m0, 1
+ psubw xm4, xm8, xm4
+ psubw xm4, xm2
+ psrlw xm4, 2
+ packuswb xm4, xm4
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ movq [maskq], xm4
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 4*32
+ add tmp2q, 4*32
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ W_MASK 0, 5, 2, 3
+ punpckhqdq m1, m4, m5
+ punpcklqdq m4, m5
+ psubw m1, m8, m1
+ psubw m1, m4
+ psrlw m1, 2
+ vpermq m0, m0, q3120
+ packuswb m1, m1
+ vpermd m1, m9, m1
+ mova [dstq+strideq*2], xm0
+ vextracti128 [dstq+stride3q ], m0, 1
+ mova [maskq], xm1
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ add tmp1q, 4*32
+ add tmp2q, 4*32
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*2]
+ add maskq, 16
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], m0
+ W_MASK 0, 5, 2, 3
+ psubw m4, m8, m4
+ psubw m4, m5
+ psrlw m4, 2
+ vpermq m0, m0, q3120
+ packuswb m4, m4
+ vpermd m4, m9, m4
+ mova [dstq+strideq*1], m0
+ mova [maskq], xm4
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop_even:
+ psubw m10, m8, m4
+ psubw m11, m8, m5
+ dec hd
+.w64_loop:
+ add tmp1q, 4*32
+ add tmp2q, 4*32
+ W_MASK 0, 4, 0, 1
+ add dstq, strideq
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 0, 5, 2, 3
+ vpermq m0, m0, q3120
+ mova [dstq+32*1], m0
+ test hd, 1
+ jz .w64_loop_even
+ psubw m4, m10, m4
+ psubw m5, m11, m5
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m9, m4
+ mova [maskq], m4
+ add maskq, 32
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop_even:
+ psubw m12, m8, m4
+ psubw m13, m8, m5
+ dec hd
+.w128_loop:
+ W_MASK 0, 4, 0, 1
+ add dstq, strideq
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 0, 5, 2, 3
+ vpermq m0, m0, q3120
+ mova [dstq+32*1], m0
+ add tmp1q, 8*32
+ add tmp2q, 8*32
+ test hd, 1
+ jz .w128_even
+ psubw m4, m10, m4
+ psubw m5, m11, m5
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m9, m4
+ mova [maskq+32*0], m4
+ jmp .w128_odd
+.w128_even:
+ psubw m10, m8, m4
+ psubw m11, m8, m5
+.w128_odd:
+ W_MASK 0, 4, -4, -3
+ vpermq m0, m0, q3120
+ mova [dstq+32*2], m0
+ W_MASK 0, 5, -2, -1
+ vpermq m0, m0, q3120
+ mova [dstq+32*3], m0
+ test hd, 1
+ jz .w128_loop_even
+ psubw m4, m12, m4
+ psubw m5, m13, m5
+ psrlw m4, 2
+ psrlw m5, 2
+ packuswb m4, m5
+ vpermd m4, m9, m4
+ mova [maskq+32*1], m4
+ add maskq, 64
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal w_mask_422, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx2_table
+ lea r7, [w_mask_422_avx2_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ pxor m9, m9
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ pmovzxbd m10, [base+deint_shuf4]
+ vpbroadcastd m8, [base+wm_422_sign+r6*4] ; 128 - sign
+ add wq, r7
+ mov maskq, maskmp
+ W_MASK 0, 4, 0, 1
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ jg .w4_h16
+.w4_end:
+ vextracti128 xm5, m4, 1
+ packuswb xm4, xm5
+ psubb xm5, xm8, xm4
+ pavgb xm5, xm9
+ pshufd xm5, xm5, q3120
+ mova [maskq], xm5
+ RET
+.w4_h16:
+ W_MASK 0, 5, 2, 3
+ lea dstq, [dstq+strideq*4]
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermd m5, m10, m5
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ mova [maskq], m5
+ RET
+.w8_loop:
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 16
+.w8:
+ vextracti128 xm5, m4, 1
+ vextracti128 xm1, m0, 1
+ packuswb xm4, xm5
+ psubb xm5, xm8, xm4
+ pavgb xm5, xm9
+ pshufd xm5, xm5, q3120
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ mova [maskq], xm5
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 32
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ W_MASK 0, 5, 2, 3
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+strideq*2], xm0
+ vextracti128 [dstq+stride3q ], m0, 1
+ mova [maskq], m5
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32_loop:
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ W_MASK 0, 4, 0, 1
+ lea dstq, [dstq+strideq*2]
+ add maskq, 32
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], m0
+ W_MASK 0, 5, 2, 3
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+strideq*1], m0
+ mova [maskq], m5
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64_loop:
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ W_MASK 0, 4, 0, 1
+ add dstq, strideq
+ add maskq, 32
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 0, 5, 2, 3
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+32*1], m0
+ mova [maskq], m5
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ add tmp1q, 32*8
+ add tmp2q, 32*8
+ W_MASK 0, 4, 0, 1
+ add dstq, strideq
+ add maskq, 32*2
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ W_MASK 0, 5, 2, 3
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+32*1], m0
+ mova [maskq+32*0], m5
+ W_MASK 0, 4, 4, 5
+ vpermq m0, m0, q3120
+ mova [dstq+32*2], m0
+ W_MASK 0, 5, 6, 7
+ packuswb m4, m5
+ psubb m5, m8, m4
+ pavgb m5, m9
+ vpermq m0, m0, q3120
+ vpermd m5, m10, m5
+ mova [dstq+32*3], m0
+ mova [maskq+32*1], m5
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal w_mask_444, 4, 8, 8, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_444_avx2_table
+ lea r7, [w_mask_444_avx2_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m5, [base+pb_64]
+ vpbroadcastd m7, [base+pw_2048]
+ add wq, r7
+ W_MASK 0, 4, 0, 1, 1
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ mova [maskq+32*0], m4
+ cmp hd, 8
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ je .w4_end
+ W_MASK 0, 4, 2, 3, 1
+ lea dstq, [dstq+strideq*4]
+ vextracti128 xm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xm1
+ pextrd [dstq+stride3q ], xm1, 1
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xm1, 2
+ pextrd [dstq+stride3q ], xm1, 3
+ mova [maskq+32*1], m4
+.w4_end:
+ RET
+.w8_loop:
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ W_MASK 0, 4, 0, 1, 1
+ lea dstq, [dstq+strideq*4]
+ add maskq, 32
+.w8:
+ vextracti128 xm1, m0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xm1
+ mova [maskq], m4
+ sub hd, 4
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ W_MASK 0, 4, 0, 1, 1
+ lea dstq, [dstq+strideq*2]
+ add maskq, 32
+.w16:
+ vpermq m0, m0, q3120
+ mova [dstq+strideq*0], xm0
+ vextracti128 [dstq+strideq*1], m0, 1
+ mova [maskq], m4
+ sub hd, 2
+ jg .w16_loop
+ RET
+.w32_loop:
+ add tmp1q, 32*2
+ add tmp2q, 32*2
+ W_MASK 0, 4, 0, 1, 1
+ add dstq, strideq
+ add maskq, 32
+.w32:
+ vpermq m0, m0, q3120
+ mova [dstq], m0
+ mova [maskq], m4
+ dec hd
+ jg .w32_loop
+ RET
+.w64_loop:
+ add tmp1q, 32*4
+ add tmp2q, 32*4
+ W_MASK 0, 4, 0, 1, 1
+ add dstq, strideq
+ add maskq, 32*2
+.w64:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ mova [maskq+32*0], m4
+ W_MASK 0, 4, 2, 3, 1
+ vpermq m0, m0, q3120
+ mova [dstq+32*1], m0
+ mova [maskq+32*1], m4
+ dec hd
+ jg .w64_loop
+ RET
+.w128_loop:
+ add tmp1q, 32*8
+ add tmp2q, 32*8
+ W_MASK 0, 4, 0, 1, 1
+ add dstq, strideq
+ add maskq, 32*4
+.w128:
+ vpermq m0, m0, q3120
+ mova [dstq+32*0], m0
+ mova [maskq+32*0], m4
+ W_MASK 0, 4, 2, 3, 1
+ vpermq m0, m0, q3120
+ mova [dstq+32*1], m0
+ mova [maskq+32*1], m4
+ W_MASK 0, 4, 4, 5, 1
+ vpermq m0, m0, q3120
+ mova [dstq+32*2], m0
+ mova [maskq+32*2], m4
+ W_MASK 0, 4, 6, 7, 1
+ vpermq m0, m0, q3120
+ mova [dstq+32*3], m0
+ mova [maskq+32*3], m4
+ dec hd
+ jg .w128_loop
+ RET
+
+%endif ; ARCH_X86_64
--- /dev/null
+++ b/src/x86/mc_avx512.asm
@@ -1,0 +1,2394 @@
+; Copyright © 2020, VideoLAN and dav1d authors
+; Copyright © 2020, Two Orioles, LLC
+; All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions are met:
+;
+; 1. Redistributions of source code must retain the above copyright notice, this
+; list of conditions and the following disclaimer.
+;
+; 2. Redistributions in binary form must reproduce the above copyright notice,
+; this list of conditions and the following disclaimer in the documentation
+; and/or other materials provided with the distribution.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+%include "ext/x86/x86inc.asm"
+
+%if HAVE_AVX512ICL && ARCH_X86_64
+
+SECTION_RODATA 64
+
+bidir_sctr_w4: dd 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
+wm_420_perm4: db 1, 3, 9, 11, 5, 7, 13, 15, 17, 19, 25, 27, 21, 23, 29, 31
+ db 33, 35, 41, 43, 37, 39, 45, 47, 49, 51, 57, 59, 53, 55, 61, 63
+ db 0, 2, 8, 10, 4, 6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
+ db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
+wm_420_perm8: db 1, 3, 17, 19, 5, 7, 21, 23, 9, 11, 25, 27, 13, 15, 29, 31
+ db 33, 35, 49, 51, 37, 39, 53, 55, 41, 43, 57, 59, 45, 47, 61, 63
+ db 0, 2, 16, 18, 4, 6, 20, 22, 8, 10, 24, 26, 12, 14, 28, 30
+ db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
+wm_420_perm16: db 1, 3, 33, 35, 5, 7, 37, 39, 9, 11, 41, 43, 13, 15, 45, 47
+ db 17, 19, 49, 51, 21, 23, 53, 55, 25, 27, 57, 59, 29, 31, 61, 63
+ db 0, 2, 32, 34, 4, 6, 36, 38, 8, 10, 40, 42, 12, 14, 44, 46
+ db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
+wm_420_mask: db 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63
+ db 67, 71, 75, 79, 83, 87, 91, 95, 99,103,107,111,115,119,123,127
+ db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+ db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
+wm_422_mask: db 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62
+ db 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
+ db 66, 70, 74, 78, 82, 86, 90, 94, 98,102,106,110,114,118,122,126
+ db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
+wm_444_mask: db 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+ db 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63
+ db 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+ db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
+bilin_h_perm16: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+ db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
+ db 33, 32, 34, 33, 35, 34, 36, 35, 37, 36, 38, 37, 39, 38, 40, 39
+ db 41, 40, 42, 41, 43, 42, 44, 43, 45, 44, 46, 45, 47, 46, 48, 47
+bilin_h_perm32: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+ db 9, 8, 10, 9, 11, 10, 12, 11, 13, 12, 14, 13, 15, 14, 16, 15
+ db 17, 16, 18, 17, 19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23
+ db 25, 24, 26, 25, 27, 26, 28, 27, 29, 28, 30, 29, 31, 30, 32, 31
+bilin_v_perm8: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
+ db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
+ db 32, 80, 33, 81, 34, 82, 35, 83, 36, 84, 37, 85, 38, 86, 39, 87
+ db 64, 32, 65, 33, 66, 34, 67, 35, 68, 36, 69, 37, 70, 38, 71, 39
+bilin_v_perm16: db 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7
+ db 24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15
+ db 64, 16, 65, 17, 66, 18, 67, 19, 68, 20, 69, 21, 70, 22, 71, 23
+ db 72, 24, 73, 25, 74, 26, 75, 27, 76, 28, 77, 29, 78, 30, 79, 31
+bilin_v_perm32: db 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7
+ db 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15
+ db 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23
+ db 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31
+bilin_v_perm64: dq 0, 4, 1, 5, 2, 6, 3, 7
+spel_h_perm16a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+ db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
+ db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
+spel_h_perm16b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+ db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
+ db 36, 37, 38, 39, 37, 38, 39, 40, 38, 39, 40, 41, 39, 40, 41, 42
+ db 44, 45, 46, 47, 45, 46, 47, 48, 46, 47, 48, 49, 47, 48, 49, 50
+spel_h_perm16c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+ db 40, 41, 42, 43, 41, 42, 43, 44, 42, 43, 44, 45, 43, 44, 45, 46
+ db 48, 49, 50, 51, 49, 50, 51, 52, 50, 51, 52, 53, 51, 52, 53, 54
+spel_h_perm32a: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+ db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+ db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
+spel_h_perm32b: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+ db 12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18
+ db 20, 21, 22, 23, 21, 22, 23, 24, 22, 23, 24, 25, 23, 24, 25, 26
+ db 28, 29, 30, 31, 29, 30, 31, 32, 30, 31, 32, 33, 31, 32, 33, 34
+spel_h_perm32c: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+ db 16, 17, 18, 19, 17, 18, 19, 20, 18, 19, 20, 21, 19, 20, 21, 22
+ db 24, 25, 26, 27, 25, 26, 27, 28, 26, 27, 28, 29, 27, 28, 29, 30
+ db 32, 33, 34, 35, 33, 34, 35, 36, 34, 35, 36, 37, 35, 36, 37, 38
+spel_hv_perm4a: db 8, 9, 16, 17, 10, 11, 18, 19, 12, 13, 20, 21, 14, 15, 22, 23
+ db 16, 17, 24, 25, 18, 19, 26, 27, 20, 21, 28, 29, 22, 23, 30, 31
+spel_hv_perm4b: db 24, 25, 32, 33, 26, 27, 34, 35, 28, 29, 36, 37, 30, 31, 38, 39
+ db 32, 33, 40, 41, 34, 35, 42, 43, 36, 37, 44, 45, 38, 39, 46, 47
+spel_hv_perm4c: db 40, 41, 48, 49, 42, 43, 50, 51, 44, 45, 52, 53, 46, 47, 54, 55
+ db 48, 49, 56, 57, 50, 51, 58, 59, 52, 53, 60, 61, 54, 55, 62, 63
+deint_shuf4: db 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
+subpel_h_shufA: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+subpel_h_shufB: db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+subpel_h_shufC: db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+bilin_h_shuf4: db 1, 0, 2, 1, 3, 2, 4, 3, 9, 8, 10, 9, 11, 10, 12, 11
+bilin_h_shuf8: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+bilin_v_shuf4: db 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7
+pb_02461357: db 0, 2, 4, 6, 1, 3, 5, 7
+
+wm_420_perm64: dq 0xfedcba9876543210
+wm_sign: dd 0x40804080, 0xc0c0c0c0, 0x40404040
+
+pb_127: times 4 db 127
+pw_m128: times 2 dw -128
+pw_512: times 2 dw 512
+pw_1024: times 2 dw 1024
+pw_2048: times 2 dw 2048
+pw_6903: times 2 dw 6903
+pw_8192: times 2 dw 8192
+pd_2: dd 2
+pd_32: dd 32
+pd_32768: dd 32768
+
+%define pb_m64 (wm_sign+4)
+%define pb_64 (wm_sign+8)
+
+cextern mc_subpel_filters
+%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
+
+%macro BASE_JMP_TABLE 3-*
+ %xdefine %1_%2_table (%%table - %3)
+ %xdefine %%base %1_%2
+ %%table:
+ %rep %0 - 2
+ dw %%base %+ _w%3 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro HV_JMP_TABLE 5-*
+ %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
+ %xdefine %%base %1_%3
+ %assign %%types %4
+ %if %%types & 1
+ %xdefine %1_%2_h_%3_table (%%h - %5)
+ %%h:
+ %rep %0 - 4
+ dw %%prefix %+ .h_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 2
+ %xdefine %1_%2_v_%3_table (%%v - %5)
+ %%v:
+ %rep %0 - 4
+ dw %%prefix %+ .v_w%5 - %%base
+ %rotate 1
+ %endrep
+ %rotate 4
+ %endif
+ %if %%types & 4
+ %xdefine %1_%2_hv_%3_table (%%hv - %5)
+ %%hv:
+ %rep %0 - 4
+ dw %%prefix %+ .hv_w%5 - %%base
+ %rotate 1
+ %endrep
+ %endif
+%endmacro
+
+%macro BIDIR_JMP_TABLE 1-*
+ %xdefine %1_table (%%table - 2*%2)
+ %xdefine %%base %1_table
+ %xdefine %%prefix mangle(private_prefix %+ _%1)
+ %%table:
+ %rep %0 - 1
+ dd %%prefix %+ .w%2 - %%base
+ %rotate 1
+ %endrep
+%endmacro
+
+%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_avx512icl.prep)
+
+%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
+
+BASE_JMP_TABLE prep, avx512icl, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, avx512icl, 7, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, avx512icl, 7, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE avg_avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_avg_avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE mask_avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_420_avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_422_avx512icl, 4, 8, 16, 32, 64, 128
+BIDIR_JMP_TABLE w_mask_444_avx512icl, 4, 8, 16, 32, 64, 128
+
+SECTION .text
+
+%macro REPX 2-*
+ %xdefine %%f(x) %1
+%rep %0 - 1
+ %rotate 1
+ %%f(%1)
+%endrep
+%endmacro
+
+%macro WRAP_YMM 1+
+INIT_YMM cpuname
+ %1
+INIT_ZMM cpuname
+%endmacro
+
+DECLARE_REG_TMP 3, 5, 6
+
+INIT_ZMM avx512icl
+cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+ movifnidn mxyd, r5m ; mx
+ lea t2, [prep_avx512icl]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ test mxyd, mxyd
+ jnz .h
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .v
+.prep:
+ movzx wd, word [t2+wq*2+table_offset(prep,)]
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.prep_w4:
+ movd xmm0, [srcq+strideq*0]
+ pinsrd xmm0, [srcq+strideq*1], 1
+ pinsrd xmm0, [srcq+strideq*2], 2
+ pinsrd xmm0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw ym0, xmm0
+ psllw ym0, 4
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .prep_w4
+ RET
+.prep_w8:
+ movq xmm0, [srcq+strideq*0]
+ movq xmm1, [srcq+strideq*1]
+ vinserti128 ym0, ymm0, [srcq+strideq*2], 1
+ vinserti128 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ punpcklqdq ym0, ym1
+ pmovzxbw m0, ym0
+ psllw m0, 4
+ mova [tmpq], m0
+ add tmpq, 32*2
+ sub hd, 4
+ jg .prep_w8
+ RET
+.prep_w16:
+ movu xmm0, [srcq+strideq*0]
+ vinserti128 ym0, ymm0, [srcq+strideq*1], 1
+ movu xmm1, [srcq+strideq*2]
+ vinserti128 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ pmovzxbw m0, ym0
+ pmovzxbw m1, ym1
+ psllw m0, 4
+ psllw m1, 4
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 32*4
+ sub hd, 4
+ jg .prep_w16
+ RET
+.prep_w32:
+ pmovzxbw m0, [srcq+strideq*0]
+ pmovzxbw m1, [srcq+strideq*1]
+ pmovzxbw m2, [srcq+strideq*2]
+ pmovzxbw m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ REPX {psllw x, 4}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 4
+ jg .prep_w32
+ RET
+.prep_w64:
+ pmovzxbw m0, [srcq+strideq*0+32*0]
+ pmovzxbw m1, [srcq+strideq*0+32*1]
+ pmovzxbw m2, [srcq+strideq*1+32*0]
+ pmovzxbw m3, [srcq+strideq*1+32*1]
+ lea srcq, [srcq+strideq*2]
+ REPX {psllw x, 4}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 2
+ jg .prep_w64
+ RET
+.prep_w128:
+ pmovzxbw m0, [srcq+32*0]
+ pmovzxbw m1, [srcq+32*1]
+ pmovzxbw m2, [srcq+32*2]
+ pmovzxbw m3, [srcq+32*3]
+ REPX {psllw x, 4}, m0, m1, m2, m3
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ add srcq, strideq
+ dec hd
+ jg .prep_w128
+ RET
+.h:
+ ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
+ ; = (16 - mx) * src[x] + mx * src[x + 1]
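+ ; e.g. mx=8: the low word of 8*0xff01 + (16<<8) is 0x0808, i.e. the packed
+ ; byte pair ((16-mx) << 8) | mx used as the pmaddubsw coefficients below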
+ imul mxyd, 0xff01
+ add mxyd, 16 << 8
+ vpbroadcastw m5, mxyd
+ mov mxyd, r6m ; my
+ test mxyd, mxyd
+ jnz .hv
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.h_w4:
+ vbroadcasti32x4 ym4, [bilin_h_shuf4]
+.h_w4_loop:
+ movq xmm0, [srcq+strideq*0]
+ movq xmm1, [srcq+strideq*1]
+ vinserti32x4 ym0, ymm0, [srcq+strideq*2], 1
+ vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ punpcklqdq ym0, ym1
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ vbroadcasti32x4 m4, [bilin_h_shuf8]
+.h_w8_loop:
+ movu xmm0, [srcq+strideq*0]
+ vinserti32x4 ym0, ymm0, [srcq+strideq*1], 1
+ vinserti32x4 m0, [srcq+strideq*2], 2
+ vinserti32x4 m0, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+ mova m4, [bilin_h_perm16]
+.h_w16_loop:
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ movu ym1, [srcq+strideq*2]
+ vinserti32x8 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ vpermb m0, m4, m0
+ vpermb m1, m4, m1
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ add tmpq, 64*2
+ sub hd, 4
+ jg .h_w16_loop
+ RET
+.h_w32:
+ mova m4, [bilin_h_perm32]
+.h_w32_loop:
+ vpermb m0, m4, [srcq+strideq*0]
+ vpermb m1, m4, [srcq+strideq*1]
+ vpermb m2, m4, [srcq+strideq*2]
+ vpermb m3, m4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 4
+ jg .h_w32_loop
+ RET
+.h_w64:
+ mova m4, [bilin_h_perm32]
+.h_w64_loop:
+ vpermb m0, m4, [srcq+strideq*0+32*0]
+ vpermb m1, m4, [srcq+strideq*0+32*1]
+ vpermb m2, m4, [srcq+strideq*1+32*0]
+ vpermb m3, m4, [srcq+strideq*1+32*1]
+ lea srcq, [srcq+strideq*2]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ sub hd, 2
+ jg .h_w64_loop
+ RET
+.h_w128:
+ mova m4, [bilin_h_perm32]
+.h_w128_loop:
+ vpermb m0, m4, [srcq+32*0]
+ vpermb m1, m4, [srcq+32*1]
+ vpermb m2, m4, [srcq+32*2]
+ vpermb m3, m4, [srcq+32*3]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+ mova [tmpq+64*2], m2
+ mova [tmpq+64*3], m3
+ add tmpq, 64*4
+ add srcq, strideq
+ dec hd
+ jg .h_w128_loop
+ RET
+.v:
+ WIN64_SPILL_XMM 7
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
+ imul mxyd, 0xff01
+ add mxyd, 16 << 8
+ add wq, t2
+ lea stride3q, [strideq*3]
+ vpbroadcastw m6, mxyd
+ jmp wq
+.v_w4:
+ vpbroadcastd xm0, [srcq+strideq*0]
+ mov r3d, 0x29
+ vbroadcasti32x4 ym3, [bilin_v_shuf4]
+ kmovb k1, r3d
+.v_w4_loop:
+ vpblendmd xm1{k1}, xm0, [srcq+strideq*1] {1to4} ; __01 ____
+ vpbroadcastd ym2, [srcq+strideq*2]
+ vpbroadcastd ym2{k1}, [srcq+stride3q ] ; __2_ 23__
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastd ym0, [srcq+strideq*0]
+ punpckhqdq ym2{k1}, ym1, ym0 ; 012_ 234_
+ pshufb ym2, ym3
+ pmaddubsw ym2, ym6
+ mova [tmpq], ym2
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ RET
+.v_w8:
+ mova m5, [bilin_v_perm8]
+ vbroadcasti32x4 ym0, [srcq+strideq*0]
+.v_w8_loop:
+ vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
+ vpbroadcastq ym0, [srcq+strideq*2]
+ vinserti32x4 m1, [srcq+stride3q ], 2
+ lea srcq, [srcq+strideq*4]
+ vinserti32x4 ym0, [srcq+strideq*0], 0
+ vpermt2b m1, m5, m0
+ pmaddubsw m1, m6
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ mova m5, [bilin_v_perm16]
+ movu xm0, [srcq+strideq*0]
+.v_w16_loop:
+ movu xm2, [srcq+strideq*2]
+ vinserti32x4 ym1, ym0, [srcq+strideq*1], 1
+ vpermt2b m1, m5, m2
+ vinserti32x4 ym2, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ movu xm0, [srcq+strideq*0]
+ vpermt2b m2, m5, m0
+ pmaddubsw m1, m6
+ pmaddubsw m2, m6
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ mova m5, [bilin_v_perm32]
+ movu ym0, [srcq+strideq*0]
+.v_w32_loop:
+ movu ym2, [srcq+strideq*1]
+ movu ym3, [srcq+strideq*2]
+ movu ym4, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpermt2b m0, m5, m2
+ vpermt2b m2, m5, m3
+ vpermt2b m3, m5, m4
+ pmaddubsw m1, m0, m6
+ movu ym0, [srcq+strideq*0]
+ vpermt2b m4, m5, m0
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ pmaddubsw m4, m6
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m2
+ mova [tmpq+64*2], m3
+ mova [tmpq+64*3], m4
+ add tmpq, 64*4
+ sub hd, 4
+ jg .v_w32_loop
+ RET
+.v_w64:
+ mova m5, [bilin_v_perm64]
+ vpermq m0, m5, [srcq+strideq*0]
+.v_w64_loop:
+ vpermq m1, m5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m1, m0
+ punpckhbw m2, m1, m0
+ vpermq m0, m5, [srcq+strideq*0]
+ punpcklbw m3, m0, m1
+ punpckhbw m1, m0, m1
+ pmaddubsw m4, m6
+ pmaddubsw m2, m6
+ pmaddubsw m3, m6
+ pmaddubsw m1, m6
+ mova [tmpq+64*0], m4
+ mova [tmpq+64*1], m2
+ mova [tmpq+64*2], m3
+ mova [tmpq+64*3], m1
+ add tmpq, 64*4
+ sub hd, 2
+ jg .v_w64_loop
+ RET
+.v_w128:
+ mova m5, [bilin_v_perm64]
+ vpermq m0, m5, [srcq+strideq*0+ 0]
+ vpermq m1, m5, [srcq+strideq*0+64]
+.v_w128_loop:
+ vpermq m2, m5, [srcq+strideq*1+ 0]
+ vpermq m3, m5, [srcq+strideq*1+64]
+ lea srcq, [srcq+strideq*2]
+ punpcklbw m4, m2, m0
+ punpckhbw m0, m2, m0
+ pmaddubsw m4, m6
+ pmaddubsw m0, m6
+ mova [tmpq+64*0], m4
+ mova [tmpq+64*1], m0
+ punpcklbw m4, m3, m1
+ punpckhbw m1, m3, m1
+ pmaddubsw m4, m6
+ pmaddubsw m1, m6
+ mova [tmpq+64*2], m4
+ mova [tmpq+64*3], m1
+ vpermq m0, m5, [srcq+strideq*0+ 0]
+ vpermq m1, m5, [srcq+strideq*0+64]
+ punpcklbw m4, m0, m2
+ punpckhbw m2, m0, m2
+ pmaddubsw m4, m6
+ pmaddubsw m2, m6
+ mova [tmpq+64*4], m4
+ mova [tmpq+64*5], m2
+ punpcklbw m4, m1, m3
+ punpckhbw m3, m1, m3
+ pmaddubsw m4, m6
+ pmaddubsw m3, m6
+ mova [tmpq+64*6], m4
+ mova [tmpq+64*7], m3
+ add tmpq, 64*8
+ sub hd, 2
+ jg .v_w128_loop
+ RET
+.hv:
+ ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+ ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
+ %assign stack_offset stack_offset - stack_size_padded
+ WIN64_SPILL_XMM 7
+ movzx wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
+ shl mxyd, 11
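+ ; the left-shift by 11 makes pmulhrsw perform the rounding shift:
+ ; (d * (my << 11) + 0x4000) >> 15 == (d * my + 8) >> 4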
+ vpbroadcastw m6, mxyd
+ add wq, t2
+ lea stride3q, [strideq*3]
+ jmp wq
+.hv_w4:
+ vbroadcasti32x4 ym4, [bilin_h_shuf4]
+ vpbroadcastq ym0, [srcq+strideq*0]
+ pshufb ym0, ym4
+ pmaddubsw ym0, ym5
+.hv_w4_loop:
+ movq xmm1, [srcq+strideq*1]
+ movq xmm2, [srcq+strideq*2]
+ vinserti32x4 ym1, ymm1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ vinserti32x4 ym2, ymm2, [srcq+strideq*0], 1
+ punpcklqdq ym1, ym2
+ pshufb ym1, ym4
+ pmaddubsw ym1, ym5 ; 1 2 3 4
+ valignq ym2, ym1, ym0, 3 ; 0 1 2 3
+ mova ym0, ym1
+ psubw ym1, ym2
+ pmulhrsw ym1, ym6
+ paddw ym1, ym2
+ mova [tmpq], ym1
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ RET
+.hv_w8:
+ vbroadcasti32x4 m4, [bilin_h_shuf8]
+ vbroadcasti32x4 m0, [srcq+strideq*0]
+ pshufb m0, m4
+ pmaddubsw m0, m5
+.hv_w8_loop:
+ movu xmm1, [srcq+strideq*1]
+ vinserti128 ym1, ymm1, [srcq+strideq*2], 1
+ vinserti128 m1, [srcq+stride3q ], 2
+ lea srcq, [srcq+strideq*4]
+ vinserti128 m1, [srcq+strideq*0], 3
+ pshufb m1, m4
+ pmaddubsw m1, m5 ; 1 2 3 4
+ valignq m2, m1, m0, 6 ; 0 1 2 3
+ mova m0, m1
+ psubw m1, m2
+ pmulhrsw m1, m6
+ paddw m1, m2
+ mova [tmpq], m1
+ add tmpq, 64
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ mova m4, [bilin_h_perm16]
+ vbroadcasti32x8 m0, [srcq+strideq*0]
+ vpermb m0, m4, m0
+ pmaddubsw m0, m5
+.hv_w16_loop:
+ movu ym1, [srcq+strideq*1]
+ vinserti32x8 m1, [srcq+strideq*2], 1
+ movu ym2, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti32x8 m2, [srcq+strideq*0], 1
+ vpermb m1, m4, m1
+ vpermb m2, m4, m2
+ pmaddubsw m1, m5 ; 1 2
+ vshufi32x4 m3, m0, m1, q1032 ; 0 1
+ pmaddubsw m0, m2, m5 ; 3 4
+ vshufi32x4 m2, m1, m0, q1032 ; 2 3
+ psubw m1, m3
+ pmulhrsw m1, m6
+ paddw m1, m3
+ psubw m3, m0, m2
+ pmulhrsw m3, m6
+ paddw m3, m2
+ mova [tmpq+64*0], m1
+ mova [tmpq+64*1], m3
+ add tmpq, 64*2
+ sub hd, 4
+ jg .hv_w16_loop
+ RET
+.hv_w32:
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+strideq*0]
+ pmaddubsw m0, m5
+.hv_w32_loop:
+ vpermb m1, m4, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermb m2, m4, [srcq+strideq*0]
+ pmaddubsw m1, m5
+ psubw m3, m1, m0
+ pmulhrsw m3, m6
+ paddw m3, m0
+ pmaddubsw m0, m2, m5
+ psubw m2, m0, m1
+ pmulhrsw m2, m6
+ paddw m2, m1
+ mova [tmpq+64*0], m3
+ mova [tmpq+64*1], m2
+ add tmpq, 64*2
+ sub hd, 2
+ jg .hv_w32_loop
+ RET
+.hv_w64:
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+32*0]
+ vpermb m1, m4, [srcq+32*1]
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+.hv_w64_loop:
+ add srcq, strideq
+ vpermb m2, m4, [srcq+32*0]
+ vpermb m3, m4, [srcq+32*1]
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+ psubw m7, m2, m0
+ psubw m8, m3, m1
+ pmulhrsw m7, m6
+ pmulhrsw m8, m6
+ paddw m7, m0
+ mova m0, m2
+ paddw m8, m1
+ mova m1, m3
+ mova [tmpq+64*0], m7
+ mova [tmpq+64*1], m8
+ add tmpq, 64*2
+ dec hd
+ jg .hv_w64_loop
+ RET
+.hv_w128:
+ mova m4, [bilin_h_perm32]
+ vpermb m0, m4, [srcq+32*0]
+ vpermb m1, m4, [srcq+32*1]
+ vpermb m2, m4, [srcq+32*2]
+ vpermb m3, m4, [srcq+32*3]
+ REPX {pmaddubsw x, m5}, m0, m1, m2, m3
+.hv_w128_loop:
+ add srcq, strideq
+ vpermb m7, m4, [srcq+32*0]
+ vpermb m8, m4, [srcq+32*1]
+ vpermb m9, m4, [srcq+32*2]
+ vpermb m10, m4, [srcq+32*3]
+ REPX {pmaddubsw x, m5}, m7, m8, m9, m10
+ psubw m11, m7, m0
+ psubw m12, m8, m1
+ psubw m13, m9, m2
+ psubw m14, m10, m3
+ REPX {pmulhrsw x, m6}, m11, m12, m13, m14
+ paddw m11, m0
+ mova m0, m7
+ paddw m12, m1
+ mova m1, m8
+ paddw m13, m2
+ mova m2, m9
+ paddw m14, m3
+ mova m3, m10
+ mova [tmpq+64*0], m11
+ mova [tmpq+64*1], m12
+ mova [tmpq+64*2], m13
+ mova [tmpq+64*3], m14
+ add tmpq, 64*4
+ dec hd
+ jg .hv_w128_loop
+ RET
+
+; int8_t subpel_filters[5][15][8]
+%assign FILTER_REGULAR (0*15 << 16) | 3*15
+%assign FILTER_SMOOTH (1*15 << 16) | 4*15
+%assign FILTER_SHARP (2*15 << 16) | 3*15
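+ ; each constant packs two row offsets into subpel_filters[]: the high
+ ; word selects the 8-tap filter set, the low word the 4-tap set used
+ ; by the w4/h4 paths (extracted via shr/movzx in prep_8tap below)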
+
+%macro FN 4 ; fn, type, type_h, type_v
+cglobal %1_%2
+ mov t0d, FILTER_%3
+%ifidn %3, %4
+ mov t1d, t0d
+%else
+ mov t1d, FILTER_%4
+%endif
+%ifnidn %2, regular ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _%1 %+ SUFFIX)
+%endif
+%endmacro
+
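+ ; horizontal 8-tap via dot products: vpdpbusd sums 4 byte products per
+ ; dword, so each output needs two accumulations (m8 = taps 0-3, m9 =
+ ; taps 4-7) on source bytes gathered by the m5-m7 permutes; m4 holds
+ ; the pd_2 bias that rounds the final >> 2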
+%macro PREP_8TAP_H 0
+ vpermb m10, m5, m0
+ vpermb m11, m5, m1
+ vpermb m12, m6, m0
+ vpermb m13, m6, m1
+ vpermb m14, m7, m0
+ vpermb m15, m7, m1
+ mova m0, m4
+ vpdpbusd m0, m10, m8
+ mova m2, m4
+ vpdpbusd m2, m12, m8
+ mova m1, m4
+ vpdpbusd m1, m11, m8
+ mova m3, m4
+ vpdpbusd m3, m13, m8
+ vpdpbusd m0, m12, m9
+ vpdpbusd m2, m14, m9
+ vpdpbusd m1, m13, m9
+ vpdpbusd m3, m15, m9
+ packssdw m0, m2
+ packssdw m1, m3
+ psraw m0, 2
+ psraw m1, 2
+ mova [tmpq+64*0], m0
+ mova [tmpq+64*1], m1
+%endmacro
+
+%if WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+
+%define PREP_8TAP_FN FN prep_8tap,
+
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN regular, REGULAR, REGULAR
+
+cglobal prep_8tap, 3, 8, 0, tmp, src, stride, w, h, mx, my, stride3
+ imul mxd, mxm, 0x010101
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101
+ add myd, t1d ; 8tap_v, my, 4tap_v
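+ ; mx/my are replicated into three bytes; adding the packed offsets from
+ ; the FN macro lets the 4-tap row index be read from the low byte and
+ ; the 8-tap one from bits 16-23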
+ lea r7, [prep_avx512icl]
+ movsxd wq, wm
+ movifnidn hd, hm
+ test mxd, 0xf00
+ jnz .h
+ test myd, 0xf00
+ jnz .v
+ tzcnt wd, wd
+ movzx wd, word [r7+wq*2+table_offset(prep,)]
+ add wq, r7
+ lea r6, [strideq*3]
+%if WIN64
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00
+ jnz .hv
+ vpbroadcastd m4, [pd_2]
+ WIN64_SPILL_XMM 10
+ cmp wd, 4
+ je .h_w4
+ tzcnt wd, wd
+ shr mxd, 16
+ sub srcq, 3
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_h)]
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+0]
+ vpbroadcastd m9, [r7+mxq*8+subpel_filters-prep_avx512icl+4]
+ add wq, r7
+ jmp wq
+.h_w4:
+ movzx mxd, mxb
+ vbroadcasti128 ym5, [subpel_h_shufA]
+ mov r3d, 0x4
+ dec srcq
+ vpbroadcastd ym6, [r7+mxq*8+subpel_filters-prep_avx512icl+2]
+ kmovb k1, r3d
+ lea stride3q, [strideq*3]
+.h_w4_loop:
+ movq xm2, [srcq+strideq*0]
+ movq xm3, [srcq+strideq*1]
+ vpbroadcastq ym2{k1}, [srcq+strideq*2]
+ vpbroadcastq ym3{k1}, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ pshufb ym2, ym5
+ pshufb ym3, ym5
+ mova ym0, ym4
+ vpdpbusd ym0, ym2, ym6
+ mova ym1, ym4
+ vpdpbusd ym1, ym3, ym6
+ packssdw ym0, ym1
+ psraw ym0, 2
+ mova [tmpq], ym0
+ add tmpq, 32
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+.h_w8:
+ vbroadcasti128 m5, [subpel_h_shufA]
+ vbroadcasti128 m6, [subpel_h_shufB]
+ vbroadcasti128 m7, [subpel_h_shufC]
+ lea stride3q, [strideq*3]
+.h_w8_loop:
+ movu xmm3, [srcq+strideq*0]
+ vinserti128 ym3, ymm3, [srcq+strideq*1], 1
+ vinserti128 m3, [srcq+strideq*2], 2
+ vinserti128 m3, [srcq+stride3q ], 3
+ lea srcq, [srcq+strideq*4]
+ pshufb m1, m3, m5
+ pshufb m2, m3, m6
+ mova m0, m4
+ vpdpbusd m0, m1, m8
+ mova m1, m4
+ vpdpbusd m1, m2, m8
+ pshufb m3, m7
+ vpdpbusd m0, m2, m9
+ vpdpbusd m1, m3, m9
+ packssdw m0, m1
+ psraw m0, 2
+ mova [tmpq], m0
+ add tmpq, 64
+ sub hd, 4
+ jg .h_w8_loop
+ RET
+.h_w16:
+ mova m5, [spel_h_perm16a]
+ mova m6, [spel_h_perm16b]
+ mova m7, [spel_h_perm16c]
+ lea stride3q, [strideq*3]
+.h_w16_loop:
+ movu ym0, [srcq+strideq*0]
+ movu ym1, [srcq+strideq*2]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ vinserti32x8 m1, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ sub hd, 4
+ jg .h_w16_loop
+ RET
+.h_w32:
+ mova m5, [spel_h_perm32a]
+ mova m6, [spel_h_perm32b]
+ mova m7, [spel_h_perm32c]
+.h_w32_loop:
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ sub hd, 2
+ jg .h_w32_loop
+ RET
+.h_w64:
+ xor r6d, r6d
+ jmp .h_start
+.h_w128:
+ mov r6, -64*1
+.h_start:
+ mova m5, [spel_h_perm32a]
+ mova m6, [spel_h_perm32b]
+ mova m7, [spel_h_perm32c]
+ sub srcq, r6
+ mov r5, r6
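+ ; r6 runs from -(w-64) up to 0 in steps of 64, so the same loop body
+ ; covers both w64 (one iteration per row) and w128 (two)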
+.h_loop:
+ movu m0, [srcq+r6+32*0]
+ movu m1, [srcq+r6+32*1]
+ PREP_8TAP_H
+ add tmpq, 64*2
+ add r6, 64
+ jle .h_loop
+ add srcq, strideq
+ mov r6, r5
+ dec hd
+ jg .h_loop
+ RET
+.v:
+ movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
+ shr myd, 16 ; Note that the code is 8-tap only, having
+ tzcnt wd, wd
+ cmp hd, 4 ; a separate 4-tap code path for (4|8|16)x4
+ cmove myd, mxd ; had a negligible effect on performance.
+ ; TODO: Would a 6-tap code path be worth it?
+ lea myq, [r7+myq*8+subpel_filters-prep_avx512icl]
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_v)]
+ add wq, r7
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ vpbroadcastd m7, [pw_8192]
+ vpbroadcastw m8, [myq+0]
+ vpbroadcastw m9, [myq+2]
+ vpbroadcastw m10, [myq+4]
+ vpbroadcastw m11, [myq+6]
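+ ; m8-m11 = the four vertical tap pairs (0+1, 2+3, 4+5, 6+7), applied
+ ; to interleaved row pairs with pmaddubsw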
+ jmp wq
+.v_w4:
+ movd xmm0, [srcq+strideq*0]
+ vpbroadcastd ymm1, [srcq+strideq*2]
+ vpbroadcastd xmm2, [srcq+strideq*1]
+ vpbroadcastd ymm3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpblendd ymm1, ymm0, 0x01 ; 0 2 2 _ 2 _ _ _
+ vpblendd ymm3, ymm2, 0x03 ; 1 1 3 3 3 3 _ _
+ vpbroadcastd ymm0, [srcq+strideq*0]
+ vpbroadcastd ymm2, [srcq+strideq*1]
+ vpblendd ymm1, ymm0, 0x68 ; 0 2 2 4 2 4 4 _
+ vpbroadcastd ymm0, [srcq+strideq*2]
+ vbroadcasti128 ymm5, [deint_shuf4]
+ vpblendd ymm3, ymm2, 0xc0 ; 1 1 3 3 3 3 5 5
+ vpblendd ymm2, ymm3, ymm1, 0x55 ; 0 1 2 3 2 3 4 5
+ vpblendd ymm3, ymm1, 0xaa ; 1 2 3 4 3 4 5 _
+ punpcklbw ymm1, ymm2, ymm3 ; 01 12 23 34
+ vpblendd ymm3, ymm0, 0x80 ; 1 2 3 4 3 4 5 6
+ punpckhbw ymm2, ymm3 ; 23 34 45 56
+.v_w4_loop:
+ pinsrd xmm0, [srcq+stride3q ], 1
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastd ymm3, [srcq+strideq*0]
+ vpbroadcastd ymm4, [srcq+strideq*1]
+ vpblendd ymm3, ymm4, 0x20 ; _ _ 8 _ 8 9 _ _
+ vpblendd ymm3, ymm0, 0x03 ; 6 7 8 _ 8 9 _ _
+ vpbroadcastd ymm0, [srcq+strideq*2]
+ vpblendd ymm3, ymm0, 0x40 ; 6 7 8 _ 8 9 a _
+ pshufb ymm3, ymm5 ; 67 78 89 9a
+ pmaddubsw ymm4, ymm1, ym8
+ vperm2i128 ymm1, ymm2, ymm3, 0x21 ; 45 56 67 78
+ pmaddubsw ymm2, ym9
+ paddw ymm4, ymm2
+ mova ymm2, ymm3
+ pmaddubsw ymm3, ym11
+ paddw ymm3, ymm4
+ pmaddubsw ymm4, ymm1, ym10
+ paddw ymm3, ymm4
+ pmulhrsw ymm3, ym7
+ mova [tmpq], ymm3
+ add tmpq, 32
+ sub hd, 4
+ jg .v_w4_loop
+ vzeroupper
+ RET
+.v_w8:
+ mov r3d, 0xf044
+ kmovw k1, r3d
+ kshiftrw k2, k1, 8
+ movq xm0, [srcq+strideq*0]
+ vpbroadcastq ym1, [srcq+strideq*1]
+ vpbroadcastq m2, [srcq+strideq*2]
+ vpbroadcastq m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m4, [srcq+strideq*0]
+ vpbroadcastq m5, [srcq+strideq*1]
+ vpbroadcastq m6, [srcq+strideq*2]
+ vmovdqa64 ym0{k1}, ym1
+ vmovdqa64 ym1{k1}, ym2
+ vmovdqa64 m2{k1}, m3
+ vmovdqa64 m3{k1}, m4
+ vmovdqa64 m4{k1}, m5
+ vmovdqa64 m5{k1}, m6
+ punpcklbw ym0, ym1 ; 01 12 __ __
+ punpcklbw m2, m3 ; 23 34 23 34
+ punpcklbw m4, m5 ; 45 56 45 56
+ vmovdqa64 m0{k2}, m2 ; 01 12 23 34
+ vmovdqa64 m2{k2}, m4 ; 23 34 45 56
+.v_w8_loop:
+ vpbroadcastq m1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3, [srcq+strideq*0]
+ vpbroadcastq m5, [srcq+strideq*1]
+ pmaddubsw m14, m0, m8
+ pmaddubsw m15, m2, m9
+ vpblendmq m0{k1}, m6, m1
+ vpblendmq m2{k1}, m1, m3
+ vpbroadcastq m6, [srcq+strideq*2]
+ paddw m14, m15
+ punpcklbw m2, m0, m2 ; 67 78 67 78
+ vpblendmq m12{k1}, m3, m5
+ vpblendmq m13{k1}, m5, m6
+ vpblendmq m0{k2}, m4, m2 ; 45 56 67 78
+ punpcklbw m4, m12, m13 ; 89 9a 89 9a
+ vmovdqa64 m2{k2}, m4 ; 67 78 89 9a
+ pmaddubsw m12, m0, m10
+ pmaddubsw m13, m2, m11
+ paddw m14, m12
+ paddw m14, m13
+ pmulhrsw m14, m7
+ mova [tmpq], m14
+ add tmpq, 64
+ sub hd, 4
+ jg .v_w8_loop
+ RET
+.v_w16:
+ mov r3d, 0xf0
+ kmovb k1, r3d
+ vbroadcasti128 m0, [srcq+strideq*0]
+ vbroadcasti128 m1, [srcq+strideq*1]
+ vbroadcasti128 m2, [srcq+strideq*2]
+ vbroadcasti128 m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vbroadcasti128 m4, [srcq+strideq*0]
+ vbroadcasti128 m5, [srcq+strideq*1]
+ vbroadcasti128 m6, [srcq+strideq*2]
+ vmovdqa64 m0{k1}, m1
+ vmovdqa64 m1{k1}, m2
+ vmovdqa64 m2{k1}, m3
+ vmovdqa64 m3{k1}, m4
+ vmovdqa64 m4{k1}, m5
+ vmovdqa64 m5{k1}, m6
+ shufpd m0, m2, 0xcc ; 0a_2a 0b_2b 1a_3a 1b_3b
+ shufpd m1, m3, 0xcc ; 1a_3a 1b_3b 2a_4a 2b_4b
+ shufpd m4, m4, 0x44 ; 4a_-- 4b_-- 5a_-- 5b_--
+ shufpd m5, m5, 0x44 ; 5a_-- 5b_-- 6a_-- 6b_--
+ punpckhbw m2, m0, m1 ; 23a 23b 34a 34b
+ punpcklbw m0, m1 ; 01a 01b 12a 12b
+ punpcklbw m4, m5 ; 45a 45b 56a 56b
+.v_w16_loop:
+ vbroadcasti128 m3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vbroadcasti128 m5, [srcq+strideq*0]
+ vpblendmq m1{k1}, m6, m3
+ vmovdqa64 m3{k1}, m5
+ pmaddubsw m12, m0, m8
+ pmaddubsw m13, m2, m8
+ pmaddubsw m14, m2, m9
+ pmaddubsw m15, m4, m9
+ pmaddubsw m0, m4, m10
+ vbroadcasti128 m2, [srcq+strideq*1]
+ vbroadcasti128 m6, [srcq+strideq*2]
+ paddw m12, m14
+ paddw m13, m15
+ paddw m12, m0
+ vmovdqa64 m5{k1}, m2
+ vmovdqa64 m2{k1}, m6
+ mova m0, m4
+ shufpd m1, m5, 0xcc ; 6a_8a 6b_8b 7a_9a 7b_9b
+ shufpd m3, m2, 0xcc ; 7a_9a 7b_9b 8a_Aa 8b_Ab
+ punpcklbw m2, m1, m3 ; 67a 67b 78a 78b
+ punpckhbw m4, m1, m3 ; 89a 89b 9Aa 9Ab
+ pmaddubsw m14, m2, m10
+ pmaddubsw m15, m2, m11
+ paddw m13, m14
+ paddw m12, m15
+ pmaddubsw m14, m4, m11
+ paddw m13, m14
+ pmulhrsw m12, m7
+ pmulhrsw m13, m7
+ mova [tmpq+ 0], m12
+ mova [tmpq+64], m13
+ add tmpq, 64*2
+ sub hd, 4
+ jg .v_w16_loop
+ RET
+.v_w32:
+ mova m18, [bilin_v_perm64]
+ movu ym0, [srcq+strideq*0]
+ movu ym1, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym2, [srcq+strideq*0]
+ movu ym3, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym4, [srcq+strideq*0]
+ movu ym5, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym6, [srcq+strideq*0]
+ vpermq m0, m18, m0
+ vpermq m1, m18, m1
+ vpermq m2, m18, m2
+ vpermq m3, m18, m3
+ vpermq m4, m18, m4
+ vpermq m5, m18, m5
+ vpermq m6, m18, m6
+ punpcklbw m0, m1
+ punpcklbw m1, m2
+ punpcklbw m2, m3
+ punpcklbw m3, m4
+ punpcklbw m4, m5
+ punpcklbw m5, m6
+.v_w32_loop:
+ movu ym12, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ movu ym13, [srcq+strideq*0]
+ pmaddubsw m14, m0, m8
+ pmaddubsw m16, m2, m9
+ pmaddubsw m15, m1, m8
+ pmaddubsw m17, m3, m9
+ mova m0, m2
+ mova m1, m3
+ vpermq m12, m18, m12
+ vpermq m13, m18, m13
+ paddw m14, m16
+ paddw m15, m17
+ pmaddubsw m16, m4, m10
+ pmaddubsw m17, m5, m10
+ punpcklbw m6, m12
+ punpcklbw m12, m13
+ mova m2, m4
+ mova m3, m5
+ paddw m14, m16
+ paddw m15, m17
+ pmaddubsw m16, m6, m11
+ pmaddubsw m17, m12, m11
+ mova m4, m6
+ mova m5, m12
+ paddw m14, m16
+ paddw m15, m17
+ pmulhrsw m14, m7
+ pmulhrsw m15, m7
+ mova m6, m13
+ mova [tmpq+ 0], m14
+ mova [tmpq+64], m15
+ add tmpq, 64*2
+ sub hd, 2
+ jg .v_w32_loop
+ vzeroupper
+ RET
+.v_w64:
+ mov wd, 64
+ jmp .v_start
+.v_w128:
+ mov wd, 128
+.v_start:
+ WIN64_SPILL_XMM 27
+ mova m26, [bilin_v_perm64]
+ lea r6d, [hq+wq*2]
+ mov r5, srcq
+ mov r7, tmpq
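+ ; process the block in 64-pixel-wide columns; the low byte of r6d
+ ; restores h at the start of each new column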
+.v_loop0:
+ vpermq m0, m26, [srcq+strideq*0]
+ vpermq m1, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m2, m26, [srcq+strideq*0]
+ vpermq m3, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m4, m26, [srcq+strideq*0]
+ vpermq m5, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m6, m26, [srcq+strideq*0]
+ punpckhbw m12, m0, m1
+ punpcklbw m0, m1
+ punpckhbw m13, m1, m2
+ punpcklbw m1, m2
+ punpckhbw m14, m2, m3
+ punpcklbw m2, m3
+ punpckhbw m15, m3, m4
+ punpcklbw m3, m4
+ punpckhbw m16, m4, m5
+ punpcklbw m4, m5
+ punpckhbw m17, m5, m6
+ punpcklbw m5, m6
+.v_loop:
+ vpermq m18, m26, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vpermq m19, m26, [srcq+strideq*0]
+ pmaddubsw m20, m0, m8
+ pmaddubsw m21, m12, m8
+ pmaddubsw m22, m1, m8
+ pmaddubsw m23, m13, m8
+ mova m0, m2
+ mova m12, m14
+ mova m1, m3
+ mova m13, m15
+ pmaddubsw m2, m9
+ pmaddubsw m14, m9
+ pmaddubsw m3, m9
+ pmaddubsw m15, m9
+ punpckhbw m24, m6, m18
+ punpcklbw m6, m18
+ paddw m20, m2
+ paddw m21, m14
+ paddw m22, m3
+ paddw m23, m15
+ mova m2, m4
+ mova m14, m16
+ mova m3, m5
+ mova m15, m17
+ pmaddubsw m4, m10
+ pmaddubsw m16, m10
+ pmaddubsw m5, m10
+ pmaddubsw m17, m10
+ punpckhbw m25, m18, m19
+ punpcklbw m18, m19
+ paddw m20, m4
+ paddw m21, m16
+ paddw m22, m5
+ paddw m23, m17
+ mova m4, m6
+ mova m16, m24
+ mova m5, m18
+ mova m17, m25
+ pmaddubsw m6, m11
+ pmaddubsw m24, m11
+ pmaddubsw m18, m11
+ pmaddubsw m25, m11
+ paddw m20, m6
+ paddw m21, m24
+ paddw m22, m18
+ paddw m23, m25
+ pmulhrsw m20, m7
+ pmulhrsw m21, m7
+ pmulhrsw m22, m7
+ pmulhrsw m23, m7
+ mova m6, m19
+ mova [tmpq+wq*0+ 0], m20
+ mova [tmpq+wq*0+64], m21
+ mova [tmpq+wq*2+ 0], m22
+ mova [tmpq+wq*2+64], m23
+ lea tmpq, [tmpq+wq*4]
+ sub hd, 2
+ jg .v_loop
+ add r5, 64
+ add r7, 128
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+ jg .v_loop0
+ RET
+.hv:
+ %assign stack_offset stack_offset - stack_size_padded
+ %assign stack_size_padded 0
+ WIN64_SPILL_XMM 16
+ cmp wd, 4
+ je .hv_w4
+ shr mxd, 16
+ sub srcq, 3
+ vpbroadcastd m10, [r7+mxq*8+subpel_filters-prep_avx512icl+0]
+ vpbroadcastd m11, [r7+mxq*8+subpel_filters-prep_avx512icl+4]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ tzcnt wd, wd
+ vpbroadcastd m8, [pd_2]
+ movzx wd, word [r7+wq*2+table_offset(prep, _8tap_hv)]
+ vpbroadcastd m9, [pd_32]
+ add wq, r7
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ jmp wq
+.hv_w4:
+ movzx mxd, mxb
+ dec srcq
+ vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep_avx512icl+2]
+ movzx mxd, myb
+ shr myd, 16
+ cmp hd, 4
+ cmove myd, mxd
+ vpbroadcastq m0, [r7+myq*8+subpel_filters-prep_avx512icl]
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q
+ mov r3d, 0x04
+ kmovb k1, r3d
+ kshiftlb k2, k1, 2
+ kshiftlb k3, k1, 4
+ vpbroadcastd m10, [pd_2]
+ vbroadcasti128 m16, [subpel_h_shufA]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ vpbroadcastd m11, [pd_32]
+ pshufd m12, m0, q0000
+ pshufd m13, m0, q1111
+ pshufd m14, m0, q2222
+ pshufd m15, m0, q3333
+ movq xm3, [srcq+strideq*0]
+ vpbroadcastq ym2, [srcq+strideq*1]
+ vpbroadcastq ym3{k1}, [srcq+strideq*2]
+ vpbroadcastq m2{k2}, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vpbroadcastq m3{k2}, [srcq+strideq*0]
+ vpbroadcastq m2{k3}, [srcq+strideq*1]
+ vpbroadcastq m3{k3}, [srcq+strideq*2]
+ mova m17, [spel_hv_perm4a]
+ movu m18, [spel_hv_perm4b]
+ mova m0, m10
+ mova m1, m10
+ pshufb m2, m16
+ pshufb m3, m16
+ vpdpbusd m0, m2, m8
+ vpdpbusd m1, m3, m8
+ packssdw m0, m1 ; _ 0 1 2 3 4 5 6
+ psraw m0, 2
+ vpermb m1, m17, m0 ; 01 12 23 34
+ vpermb m2, m18, m0 ; 23 34 45 56
+.hv_w4_loop:
+ movq xm3, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ movq xm4, [srcq+strideq*0]
+ vpbroadcastq ym3{k1}, [srcq+strideq*1]
+ vpbroadcastq ym4{k1}, [srcq+strideq*2]
+ mova ym5, ym10
+ mova ym6, ym10
+ pshufb ym3, ym16
+ pshufb ym4, ym16
+ vpdpbusd ym5, ym3, ym8
+ vpdpbusd ym6, ym4, ym8
+ mova m7, m11
+ packssdw ym5, ym6 ; 7 8 9 a _ _ _ _
+ psraw ym5, 2
+ valignq m0, m5, m0, 4 ; _ 4 5 6 7 8 9 a
+ vpdpwssd m7, m1, m12
+ vpdpwssd m7, m2, m13
+ vpermb m1, m17, m0 ; 45 56 67 78
+ vpermb m2, m18, m0 ; 67 78 89 9a
+ vpdpwssd m7, m1, m14
+ vpdpwssd m7, m2, m15
+ psrad m7, 6
+ vpmovdw [tmpq], m7
+ add tmpq, 32
+ sub hd, 4
+ jg .hv_w4_loop
+ vzeroupper
+ RET
+.hv_w8:
+ WIN64_SPILL_XMM 24
+ vbroadcasti128 m16, [subpel_h_shufA]
+ vbroadcasti128 m17, [subpel_h_shufB]
+ vbroadcasti128 m18, [subpel_h_shufC]
+ vinserti128 ym0, [srcq+strideq*0], 1
+ vinserti128 m0, [srcq+strideq*1], 2
+ vinserti128 m0, [srcq+strideq*2], 3
+ movu xm1, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 ym1, [srcq+strideq*0], 1
+ vinserti128 m1, [srcq+strideq*1], 2
+ vinserti128 m1, [srcq+strideq*2], 3
+ mova m2, m8
+ mova m4, m8
+ mova m3, m8
+ mova m5, m8
+ pshufb m20, m0, m16
+ pshufb m21, m0, m17
+ pshufb m22, m0, m18
+ pshufb m23, m1, m16
+ pshufb m6, m1, m17
+ pshufb m7, m1, m18
+ vpdpbusd m2, m20, m10
+ vpdpbusd m4, m21, m10
+ vpdpbusd m2, m21, m11
+ vpdpbusd m4, m22, m11
+ vpdpbusd m3, m23, m10
+ vpdpbusd m5, m6, m10
+ vpdpbusd m3, m6, m11
+ vpdpbusd m5, m7, m11
+ packssdw m2, m4
+ packssdw m3, m5
+ psraw m2, 2 ; _ 0 1 2
+ psraw m3, 2 ; 3 4 5 6
+ valignq m0, m3, m2, 2 ; 0 1 2 3
+ valignq m1, m3, m2, 4 ; 1 2 3 4
+ valignq m2, m3, m2, 6 ; 2 3 4 5
+ punpcklwd m4, m0, m1 ; 01a 12a 23a 34a
+ punpckhwd m5, m0, m1 ; 01b 12b 23b 34b
+ punpcklwd m6, m2, m3 ; 23a 34a 45a 56a
+ punpckhwd m7, m2, m3 ; 23b 34b 45b 56b
+.hv_w8_loop:
+ movu xm19, [srcq+stride3q ]
+ lea srcq, [srcq+strideq*4]
+ vinserti128 ym19, [srcq+strideq*0], 1
+ vinserti128 m19, [srcq+strideq*1], 2
+ vinserti128 m19, [srcq+strideq*2], 3
+ mova m20, m9
+ mova m21, m9
+ mova m22, m8
+ mova m23, m8
+ vpdpwssd m20, m4, m12
+ vpdpwssd m21, m5, m12
+ vpdpwssd m20, m6, m13
+ vpdpwssd m21, m7, m13
+ pshufb m0, m19, m16
+ pshufb m1, m19, m17
+ pshufb m2, m19, m18
+ vpdpbusd m22, m0, m10
+ vpdpbusd m23, m1, m10
+ vpdpbusd m22, m1, m11
+ vpdpbusd m23, m2, m11
+ packssdw m22, m23
+ psraw m22, 2 ; 7 8 9 A
+ valignq m0, m22, m3, 2 ; 4 5 6 7
+ valignq m1, m22, m3, 4 ; 5 6 7 8
+ valignq m2, m22, m3, 6 ; 6 7 8 9
+ mova m3, m22
+ punpcklwd m4, m0, m1 ; 45a 56a 67a 78a
+ punpckhwd m5, m0, m1 ; 45b 56b 67b 78b
+ punpcklwd m6, m2, m3 ; 67a 78a 89a 9Aa
+ punpckhwd m7, m2, m3 ; 67b 78b 89b 9Ab
+ vpdpwssd m20, m4, m14
+ vpdpwssd m21, m5, m14
+ vpdpwssd m20, m6, m15
+ vpdpwssd m21, m7, m15
+ psrad m20, 6
+ psrad m21, 6
+ packssdw m20, m21
+ mova [tmpq], m20
+ add tmpq, 64
+ sub hd, 4
+ jg .hv_w8_loop
+ RET
+.hv_w16:
+ mov wd, 16*2
+ jmp .hv_start
+.hv_w32:
+ mov wd, 32*2
+ jmp .hv_start
+.hv_w64:
+ mov wd, 64*2
+ jmp .hv_start
+.hv_w128:
+ mov wd, 128*2
+.hv_start:
+ WIN64_SPILL_XMM 31
+ mova m16, [spel_h_perm16a]
+ mova m17, [spel_h_perm16b]
+ mova m18, [spel_h_perm16c]
+ lea r6d, [hq+wq*8-256]
+ mov r5, srcq
+ mov r7, tmpq
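+ ; same column-wise bookkeeping as .v_loop0, but with 16-pixel-wide
+ ; columns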
+.hv_loop0:
+ movu ym0, [srcq+strideq*0]
+ vinserti32x8 m0, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu ym1, [srcq+strideq*0]
+ vinserti32x8 m1, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu ym2, [srcq+strideq*0]
+ vinserti32x8 m2, [srcq+strideq*1], 1
+ lea srcq, [srcq+strideq*2]
+ movu ym3, [srcq+strideq*0]
+ mova m4, m8
+ mova m5, m8
+ mova m6, m8
+ mova m7, m8
+ vpermb m19, m16, m0
+ vpermb m20, m17, m0
+ vpermb m21, m18, m0
+ vpermb m22, m16, m1
+ vpermb m23, m17, m1
+ vpermb m24, m18, m1
+ vpermb m25, m16, m2
+ vpermb m26, m17, m2
+ vpermb m27, m18, m2
+ vpermb ym28, ym16, ym3
+ vpermb ym29, ym17, ym3
+ vpermb ym30, ym18, ym3
+ mova m0, m8
+ mova m1, m8
+ mova ym2, ym8
+ mova ym3, ym8
+ vpdpbusd m4, m19, m10
+ vpdpbusd m5, m20, m10
+ vpdpbusd m6, m22, m10
+ vpdpbusd m7, m23, m10
+ vpdpbusd m0, m25, m10
+ vpdpbusd m1, m26, m10
+ vpdpbusd ym2, ym28, ym10
+ vpdpbusd ym3, ym29, ym10
+ vpdpbusd m4, m20, m11
+ vpdpbusd m5, m21, m11
+ vpdpbusd m6, m23, m11
+ vpdpbusd m7, m24, m11
+ vpdpbusd m0, m26, m11
+ vpdpbusd m1, m27, m11
+ vpdpbusd ym2, ym29, ym11
+ vpdpbusd ym3, ym30, ym11
+ packssdw m4, m5
+ packssdw m6, m7
+ packssdw m0, m1
+ packssdw ym2, ym3
+ psraw m4, 2 ; 0a 0b 1a 1b
+ psraw m6, 2 ; 2a 2b 3a 3b
+ psraw m0, 2 ; 4a 4b 5a 5b
+ psraw ym2, 2 ; 6a 6b __ __
+ vshufi32x4 m5, m4, m6, q1032 ; 1a 1b 2a 2b
+ vshufi32x4 m7, m6, m0, q1032 ; 3a 3b 4a 4b
+ vshufi32x4 m1, m0, m2, q1032 ; 5a 5b 6a 6b
+ punpcklwd m2, m4, m5 ; 01a 01c 12a 12c
+ punpckhwd m3, m4, m5 ; 01b 01d 12b 12d
+ punpcklwd m4, m6, m7 ; 23a 23c 34a 34c
+ punpckhwd m5, m6, m7 ; 23b 23d 34b 34d
+ punpcklwd m6, m0, m1 ; 45a 45c 56a 56c
+ punpckhwd m7, m0, m1 ; 45b 45d 56b 56d
+.hv_loop:
+ movu ym19, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ vinserti32x8 m19, [srcq+strideq*0], 1
+ mova m20, m9
+ mova m21, m9
+ mova m22, m8
+ mova m23, m8
+ vpdpwssd m20, m2, m12
+ vpdpwssd m21, m3, m12
+ vpdpwssd m20, m4, m13
+ vpdpwssd m21, m5, m13
+ vpermb m24, m16, m19
+ vpermb m25, m17, m19
+ vpermb m26, m18, m19
+ vpdpbusd m22, m24, m10
+ vpdpbusd m23, m25, m10
+ vpdpbusd m22, m25, m11
+ vpdpbusd m23, m26, m11
+ packssdw m22, m23
+ psraw m22, 2 ; 7a 7b 8a 8b
+ vshufi32x4 m0, m1, m22, q1032 ; 6a 6b 7a 7b
+ mova m2, m4
+ mova m3, m5
+ mova m1, m22
+ mova m4, m6
+ mova m5, m7
+ punpcklwd m6, m0, m1 ; 67a 67c 78a 78c
+ punpckhwd m7, m0, m1 ; 67b 67d 78b 78d
+ vpdpwssd m20, m4, m14
+ vpdpwssd m21, m5, m14
+ vpdpwssd m20, m6, m15
+ vpdpwssd m21, m7, m15
+ psrad m20, 6
+ psrad m21, 6
+ packssdw m20, m21
+ mova [tmpq+wq*0], ym20
+ vextracti32x8 [tmpq+wq*1], m20, 1
+ lea tmpq, [tmpq+wq*2]
+ sub hd, 2
+ jg .hv_loop
+ add r5, 16
+ add r7, 32
+ movzx hd, r6b
+ mov srcq, r5
+ mov tmpq, r7
+ sub r6d, 1<<8
+ jg .hv_loop0
+ RET
+
+%macro BIDIR_FN 1 ; op
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM %1 0
+ vextracti32x4 xmm1, ym0, 1
+ movd [dstq ], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xmm1
+ pextrd [dstq+stride3q ], xmm1, 1
+ jl .w4_ret
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq ], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xmm1, 2
+ pextrd [dstq+stride3q ], xmm1, 3
+.w4_ret:
+ RET
+.w4_h16:
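+ ; for h > 8, store the whole 4x16 block with a single dword scatter,
+ ; using bidir_sctr_w4 scaled by the stride as per-lane offsets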
+ vpbroadcastd m7, strided
+ pmulld m7, [bidir_sctr_w4]
+ %1 0
+ kxnorw k1, k1, k1
+ vpscatterdd [dstq+m7]{k1}, m0
+ RET
+.w8:
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM %1 0
+ vextracti128 xmm1, ym0, 1
+ movq [dstq ], xm0
+ movq [dstq+strideq*1], xmm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xmm1
+ RET
+.w8_loop:
+ %1_INC_PTR 2
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ %1 0
+ vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xmm2, m0, 2
+ vextracti32x4 xmm3, m0, 3
+ movq [dstq ], xm0
+ movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*2], xmm2
+ movq [dstq+stride3q ], xmm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq ], xm0
+ movhps [dstq+strideq*1], xmm1
+ movhps [dstq+strideq*2], xmm2
+ movhps [dstq+stride3q ], xmm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16_loop:
+ %1_INC_PTR 2
+ lea dstq, [dstq+strideq*4]
+.w16:
+ %1 0
+ vpermq m0, m0, q3120
+ mova [dstq ], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m7, [pb_02461357]
+.w32_loop:
+ %1 0
+ %1_INC_PTR 2
+ vpermq m0, m7, m0
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m7, [pb_02461357]
+.w64_loop:
+ %1 0
+ %1_INC_PTR 2
+ vpermq m0, m7, m0
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m7, [pb_02461357]
+.w128_loop:
+ %1 0
+ vpermq m6, m7, m0
+ %1 2
+ mova [dstq+64*0], m6
+ %1_INC_PTR 4
+ vpermq m6, m7, m0
+ mova [dstq+64*1], m6
+ add dstq, strideq
+ dec hd
+ jg .w128_loop
+ RET
+%endmacro
+
+%macro AVG 1 ; src_offset
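+ ; (tmp1 + tmp2 + 16) >> 5, done as pmulhrsw with 1024:
+ ; ((a + b) * 1024 + 0x4000) >> 15 == (a + b + 16) >> 5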
+ mova m0, [tmp1q+(%1+0)*mmsize]
+ paddw m0, [tmp2q+(%1+0)*mmsize]
+ mova m1, [tmp1q+(%1+1)*mmsize]
+ paddw m1, [tmp2q+(%1+1)*mmsize]
+ pmulhrsw m0, m2
+ pmulhrsw m1, m2
+ packuswb m0, m1
+%endmacro
+
+%macro AVG_INC_PTR 1
+ add tmp1q, %1*mmsize
+ add tmp2q, %1*mmsize
+%endmacro
+
+cglobal avg, 4, 7, 3, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-avg_avx512icl_table
+ lea r6, [avg_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m2, [base+pw_1024]
+ add wq, r6
+ BIDIR_FN AVG
+
+%macro W_AVG 1 ; src_offset
+ ; (a * weight + b * (16 - weight) + 128) >> 8
+ ; = ((a - b) * weight + (b << 4) + 128) >> 8
+ ; = ((((a - b) * ((weight-16) << 12)) >> 16) + a + 8) >> 4
+ ; = ((((b - a) * (-weight << 12)) >> 16) + b + 8) >> 4
+ mova m0, [tmp1q+(%1+0)*mmsize]
+ psubw m2, m0, [tmp2q+(%1+0)*mmsize]
+ mova m1, [tmp1q+(%1+1)*mmsize]
+ psubw m3, m1, [tmp2q+(%1+1)*mmsize]
+ pmulhw m2, m4
+ pmulhw m3, m4
+ paddw m0, m2
+ paddw m1, m3
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
+
+%define W_AVG_INC_PTR AVG_INC_PTR
+
+cglobal w_avg, 4, 7, 6, dst, stride, tmp1, tmp2, w, h, stride3
+%define base r6-w_avg_avx512icl_table
+ lea r6, [w_avg_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ vpbroadcastw m4, r6m ; weight
+ movsxd wq, dword [r6+wq*4]
+ vpbroadcastd m5, [base+pw_2048]
+ psllw m4, 12 ; (weight-16) << 12 when interpreted as signed
+ add wq, r6
+ cmp dword r6m, 7
+ jg .weight_gt7
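+ ; weight < 8: (weight-16) << 12 would overflow a signed word, so swap
+ ; the operands and use -weight << 12 (the last form above) instead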
+ mov r6, tmp1q
+ pxor m0, m0
+ mov tmp1q, tmp2q
+ psubw m4, m0, m4 ; -weight
+ mov tmp2q, r6
+.weight_gt7:
+ BIDIR_FN W_AVG
+
+%macro MASK 1 ; src_offset
+ ; (a * m + b * (64 - m) + 512) >> 10
+ ; = ((a - b) * m + (b << 6) + 512) >> 10
+ ; = ((((b - a) * (-m << 10)) >> 16) + b + 8) >> 4
+%if mmsize == 64
+ vpermq m3, m8, [maskq+%1*32]
+%else
+ vpermq m3, [maskq+%1*16], q3120
+%endif
+ mova m0, [tmp2q+(%1+0)*mmsize]
+ psubw m1, m0, [tmp1q+(%1+0)*mmsize]
+ psubb m3, m4, m3
+ paddw m1, m1 ; (b - a) << 1
+ paddb m3, m3
+ punpcklbw m2, m4, m3 ; -m << 9
+ pmulhw m1, m2
+ paddw m0, m1
+ mova m1, [tmp2q+(%1+1)*mmsize]
+ psubw m2, m1, [tmp1q+(%1+1)*mmsize]
+ paddw m2, m2
+ punpckhbw m3, m4, m3
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m0, m5
+ pmulhrsw m1, m5
+ packuswb m0, m1
+%endmacro
+
+%macro MASK_INC_PTR 1
+ add maskq, %1*32
+ add tmp2q, %1*64
+ add tmp1q, %1*64
+%endmacro
+
+cglobal mask, 4, 8, 6, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-mask_avx512icl_table
+ lea r7, [mask_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ mov maskq, maskmp
+ movsxd wq, dword [r7+wq*4]
+ pxor m4, m4
+ mova m8, [base+bilin_v_perm64]
+ vpbroadcastd m5, [base+pw_2048]
+ add wq, r7
+ BIDIR_FN MASK
+
+%macro W_MASK 4-5 0 ; dst, mask, tmp_offset[1-2], 4:4:4
+ mova m%1, [tmp1q+mmsize*%3]
+ mova m1, [tmp2q+mmsize*%3]
+ psubw m1, m%1
+ pabsw m%2, m1
+ psubusw m%2, m6, m%2
+ psrlw m%2, 8 ; 64 - m
+ psllw m2, m%2, 10
+ pmulhw m1, m2
+ paddw m%1, m1
+ mova m1, [tmp1q+mmsize*%4]
+ mova m2, [tmp2q+mmsize*%4]
+ psubw m2, m1
+ pabsw m3, m2
+ psubusw m3, m6, m3
+ vpshldw m%2, m3, 8
+ psllw m3, m%2, 10
+%if %5
+ psubb m%2, m5, m%2
+%endif
+ pmulhw m2, m3
+ paddw m1, m2
+ pmulhrsw m%1, m7
+ pmulhrsw m1, m7
+ packuswb m%1, m1
+%endmacro
+
+cglobal w_mask_420, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_420_avx512icl_table
+ lea r7, [w_mask_420_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ movsxd wq, [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ vpbroadcastd m9, [base+pb_m64] ; -1 << 6
+ mova ym10, [base+wm_420_mask+32]
+ vpbroadcastd m8, [base+wm_sign+r6*8] ; (258 - sign) << 6
+ add wq, r7
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ mova m5, [wm_420_perm4]
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ vinserti128 ym5, [wm_420_perm4+32], 1
+ vpermb ym4, ym5, ym4
+ vpdpbusd ym8, ym4, ym9
+ vextracti128 xmm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xmm1
+ pextrd [dstq+stride3q ], xmm1, 1
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xmm1, 2
+ pextrd [dstq+stride3q ], xmm1, 3
+.w4_end:
+ vpermb ym8, ym10, ym8
+ movq [maskq], xm8
+ RET
+.w4_h16:
+ vpbroadcastd m11, strided
+ pmulld m11, [bidir_sctr_w4]
+ W_MASK 0, 4, 0, 1
+ vpermb m4, m5, m4
+ vpdpbusd m8, m4, m9
+ kxnorw k1, k1, k1
+ vpermb m8, m10, m8
+ mova [maskq], xm8
+ vpscatterdd [dstq+m11]{k1}, m0
+ RET
+.w8:
+ mova m5, [wm_420_perm8]
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ vinserti128 ym5, [wm_420_perm8+32], 1
+ vpermb ym4, ym5, ym4
+ vpdpbusd ym8, ym4, ym9
+ vpermb m8, m10, m8
+ mova [maskq], xm8
+ vextracti128 xmm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xmm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xmm1
+ RET
+.w8_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 16
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ W_MASK 0, 4, 0, 1
+ vpermb m4, m5, m4
+ mova m1, m8
+ vpdpbusd m1, m4, m9
+ vpermb m1, m10, m1
+ mova [maskq], xm1
+ vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xmm2, m0, 2
+ vextracti32x4 xmm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*2], xmm2
+ movq [dstq+stride3q ], xmm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xmm1
+ movhps [dstq+strideq*2], xmm2
+ movhps [dstq+stride3q ], xmm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16:
+ mova m5, [wm_420_perm16]
+.w16_loop:
+ W_MASK 0, 4, 0, 1
+ vpermb m4, m5, m4
+ mova m1, m8
+ vpdpbusd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m0, q3120
+ mova [maskq], xm1
+ add maskq, 16
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ lea dstq, [dstq+strideq*4]
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m5, [pb_02461357]
+.w32_loop:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpbusd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m5, m0
+ mova [maskq], xm1
+ add maskq, 16
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14
+ psrlq m13, m12, 4 ; 1, 3, 5, 7, 9, 11, 13, 15
+.w64_loop:
+ W_MASK 0, 4, 0, 2
+ W_MASK 11, 5, 1, 3
+ mova m2, m8
+ vpdpbusd m2, m4, m9
+ mova m3, m8
+ vpdpbusd m3, m5, m9
+ add tmp1q, 256
+ add tmp2q, 256
+ vpermt2b m2, m10, m3
+ mova m1, m0
+ vpermt2q m0, m12, m11
+ vpermt2q m1, m13, m11
+ mova [maskq], ym2
+ add maskq, 32
+ mova [dstq+strideq*0], m0
+ mova [dstq+strideq*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m14, [wm_420_perm64]
+ mova m10, [wm_420_mask]
+ psrlq m15, m14, 4
+.w128_loop:
+ W_MASK 0, 12, 0, 4
+ W_MASK 11, 13, 1, 5
+ mova m4, m8
+ vpdpbusd m4, m12, m9
+ mova m5, m8
+ vpdpbusd m5, m13, m9
+ mova m1, m0
+ vpermt2q m0, m14, m11
+ vpermt2q m1, m15, m11
+ mova [dstq+strideq*0+64*0], m0
+ mova [dstq+strideq*1+64*0], m1
+ W_MASK 0, 12, 2, 6
+ W_MASK 11, 13, 3, 7
+ vprold m4, 16
+ vprold m5, 16
+ vpdpbusd m4, m12, m9
+ vpdpbusd m5, m13, m9
+ add tmp1q, 512
+ add tmp2q, 512
+ vpermt2b m4, m10, m5
+ mova m1, m0
+ vpermt2q m0, m14, m11
+ vpermt2q m1, m15, m11
+ mova [maskq], m4
+ add maskq, 64
+ mova [dstq+strideq*0+64*1], m0
+ mova [dstq+strideq*1+64*1], m1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w128_loop
+ RET
+
+cglobal w_mask_422, 4, 8, 14, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_422_avx512icl_table
+ lea r7, [w_mask_422_avx512icl_table]
+ tzcnt wd, wm
+ mov r6d, r7m ; sign
+ movifnidn hd, hm
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m7, [base+pw_2048]
+ vpbroadcastd m9, [base+pw_m128]
+ mova m10, [base+wm_422_mask]
+ vpbroadcastd m11, [base+pb_127]
+ add wq, r7
+ vpbroadcastd m8, [base+wm_sign+4+r6*4]
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ movhps xm10, [wm_422_mask+16]
+ vpdpwssd ym8, ym4, ym9
+ vpermb ym8, ym10, ym8
+ vextracti128 xmm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xmm1
+ pextrd [dstq+stride3q ], xmm1, 1
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xmm1, 2
+ pextrd [dstq+stride3q ], xmm1, 3
+.w4_end:
+ pand xm8, xm11
+ mova [maskq], xm8
+ RET
+.w4_h16:
+ vpbroadcastd m5, strided
+ pmulld m5, [bidir_sctr_w4]
+ W_MASK 0, 4, 0, 1
+ vpdpwssd m8, m4, m9
+ kxnorw k1, k1, k1
+ vpermb m8, m10, m8
+ pand ym8, ym11
+ mova [maskq], ym8
+ vpscatterdd [dstq+m5]{k1}, m0
+ RET
+.w8:
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM W_MASK 0, 4, 0, 1
+ movhps xm10, [wm_422_mask+16]
+ vpdpwssd ym8, ym4, ym9
+ vpermb ym8, ym10, ym8
+ pand xm8, xm11
+ mova [maskq], xm8
+ vextracti128 xmm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xmm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xmm1
+ RET
+.w8_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 32
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ vpermb m1, m10, m1
+ pand ym1, ym11
+ mova [maskq], ym1
+ vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xmm2, m0, 2
+ vextracti32x4 xmm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*2], xmm2
+ movq [dstq+stride3q ], xmm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xmm1
+ movhps [dstq+strideq*2], xmm2
+ movhps [dstq+stride3q ], xmm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 32
+ lea dstq, [dstq+strideq*4]
+.w16:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ vpermb m1, m10, m1
+ vpermq m0, m0, q3120
+ pand ym1, ym11
+ mova [maskq], ym1
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m5, [pb_02461357]
+.w32_loop:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m5, m0
+ pand ym1, ym11
+ mova [maskq], ym1
+ add maskq, 32
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m5, [pb_02461357]
+.w64_loop:
+ W_MASK 0, 4, 0, 1
+ mova m1, m8
+ vpdpwssd m1, m4, m9
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermb m1, m10, m1
+ vpermq m0, m5, m0
+ pand ym1, ym11
+ mova [maskq], ym1
+ add maskq, 32
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m13, [pb_02461357]
+.w128_loop:
+ W_MASK 0, 4, 0, 1
+ W_MASK 12, 5, 2, 3
+ mova m2, m8
+ vpdpwssd m2, m4, m9
+ mova m3, m8
+ vpdpwssd m3, m5, m9
+ add tmp1q, 256
+ add tmp2q, 256
+ vpermt2b m2, m10, m3
+ vpermq m0, m13, m0
+ vpermq m1, m13, m12
+ pand m2, m11
+ mova [maskq], m2
+ add maskq, 64
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m1
+ add dstq, strideq
+ dec hd
+ jg .w128_loop
+ RET
+
+cglobal w_mask_444, 4, 8, 12, dst, stride, tmp1, tmp2, w, h, mask, stride3
+%define base r7-w_mask_444_avx512icl_table
+ lea r7, [w_mask_444_avx512icl_table]
+ tzcnt wd, wm
+ movifnidn hd, hm
+ movsxd wq, dword [r7+wq*4]
+ vpbroadcastd m6, [base+pw_6903] ; ((64 - 38) << 8) + 255 - 8
+ vpbroadcastd m5, [base+pb_64]
+ vpbroadcastd m7, [base+pw_2048]
+ mova m8, [base+wm_444_mask]
+ add wq, r7
+ mov maskq, maskmp
+ lea stride3q, [strideq*3]
+ jmp wq
+.w4:
+ cmp hd, 8
+ jg .w4_h16
+ WRAP_YMM W_MASK 0, 4, 0, 1, 1
+ vinserti128 ym8, [wm_444_mask+32], 1
+ vpermb ym4, ym8, ym4
+ mova [maskq], ym4
+ vextracti128 xmm1, m0, 1
+ movd [dstq+strideq*0], xm0
+ pextrd [dstq+strideq*1], xm0, 1
+ movd [dstq+strideq*2], xmm1
+ pextrd [dstq+stride3q ], xmm1, 1
+ jl .w4_end
+ lea dstq, [dstq+strideq*4]
+ pextrd [dstq+strideq*0], xm0, 2
+ pextrd [dstq+strideq*1], xm0, 3
+ pextrd [dstq+strideq*2], xmm1, 2
+ pextrd [dstq+stride3q ], xmm1, 3
+.w4_end:
+ RET
+.w4_h16:
+ vpbroadcastd m9, strided
+ pmulld m9, [bidir_sctr_w4]
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ kxnorw k1, k1, k1
+ mova [maskq], m4
+ vpscatterdd [dstq+m9]{k1}, m0
+ RET
+.w8:
+ cmp hd, 4
+ jne .w8_h8
+ WRAP_YMM W_MASK 0, 4, 0, 1, 1
+ vinserti128 ym8, [wm_444_mask+32], 1
+ vpermb ym4, ym8, ym4
+ mova [maskq], ym4
+ vextracti128 xmm1, ym0, 1
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xmm1
+ movhps [dstq+strideq*2], xm0
+ movhps [dstq+stride3q ], xmm1
+ RET
+.w8_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 64
+ lea dstq, [dstq+strideq*4]
+.w8_h8:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ mova [maskq], m4
+ vextracti32x4 xmm1, ym0, 1
+ vextracti32x4 xmm2, m0, 2
+ vextracti32x4 xmm3, m0, 3
+ movq [dstq+strideq*0], xm0
+ movq [dstq+strideq*1], xmm1
+ movq [dstq+strideq*2], xmm2
+ movq [dstq+stride3q ], xmm3
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq+strideq*0], xm0
+ movhps [dstq+strideq*1], xmm1
+ movhps [dstq+strideq*2], xmm2
+ movhps [dstq+stride3q ], xmm3
+ sub hd, 8
+ jg .w8_loop
+ RET
+.w16_loop:
+ add tmp1q, 128
+ add tmp2q, 128
+ add maskq, 64
+ lea dstq, [dstq+strideq*4]
+.w16:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ vpermq m0, m0, q3120
+ mova [maskq], m4
+ mova [dstq+strideq*0], xm0
+ vextracti32x4 [dstq+strideq*1], m0, 2
+ vextracti32x4 [dstq+strideq*2], ym0, 1
+ vextracti32x4 [dstq+stride3q ], m0, 3
+ sub hd, 4
+ jg .w16_loop
+ RET
+.w32:
+ pmovzxbq m9, [pb_02461357]
+.w32_loop:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermq m0, m9, m0
+ mova [maskq], m4
+ add maskq, 64
+ mova [dstq+strideq*0], ym0
+ vextracti32x8 [dstq+strideq*1], m0, 1
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w32_loop
+ RET
+.w64:
+ pmovzxbq m9, [pb_02461357]
+.w64_loop:
+ W_MASK 0, 4, 0, 1, 1
+ vpermb m4, m8, m4
+ add tmp1q, 128
+ add tmp2q, 128
+ vpermq m0, m9, m0
+ mova [maskq], m4
+ add maskq, 64
+ mova [dstq], m0
+ add dstq, strideq
+ dec hd
+ jg .w64_loop
+ RET
+.w128:
+ pmovzxbq m11, [pb_02461357]
+.w128_loop:
+ W_MASK 0, 4, 0, 1, 1
+ W_MASK 10, 9, 2, 3, 1
+ vpermb m4, m8, m4
+ vpermb m9, m8, m9
+ add tmp1q, 256
+ add tmp2q, 256
+ vpermq m0, m11, m0
+ vpermq m10, m11, m10
+ mova [maskq+64*0], m4
+ mova [maskq+64*1], m9
+ add maskq, 128
+ mova [dstq+64*0], m0
+ mova [dstq+64*1], m10
+ add dstq, strideq
+ dec hd
+ jg .w128_loop
+ RET
+
+%endif ; HAVE_AVX512ICL && ARCH_X86_64