ref: 0afec6b13fbacdc4fb25117c17ce4472945f901d
parent: 65ee1233cf86f03e029d0520f7cc5a3e152d3bbd
author: Francois Cartegnie <fcvlcdev@free.fr>
date: Fri Feb 15 07:17:54 EST 2019
x86: add SSSE3 mc prep_8tap implementation --------------------- x86_64: ------------------------------------------ mct_8tap_regular_w4_0_8bpc_c: 115.6 mct_8tap_regular_w4_0_8bpc_ssse3: 13.1 mct_8tap_regular_w4_0_8bpc_avx2: 13.3 ------------------------------------------ mct_8tap_regular_w4_h_8bpc_c: 363.0 mct_8tap_regular_w4_h_8bpc_ssse3: 19.1 mct_8tap_regular_w4_h_8bpc_avx2: 16.5 ------------------------------------------ mct_8tap_regular_w4_hv_8bpc_c: 832.2 mct_8tap_regular_w4_hv_8bpc_ssse3: 113.4 mct_8tap_regular_w4_hv_8bpc_avx2: 53.1 ------------------------------------------ mct_8tap_regular_w4_v_8bpc_c: 488.5 mct_8tap_regular_w4_v_8bpc_ssse3: 38.9 mct_8tap_regular_w4_v_8bpc_avx2: 26.0 ------------------------------------------ mct_8tap_regular_w8_0_8bpc_c: 259.3 mct_8tap_regular_w8_0_8bpc_ssse3: 20.4 mct_8tap_regular_w8_0_8bpc_avx2: 18.0 ------------------------------------------ mct_8tap_regular_w8_h_8bpc_c: 1124.3 mct_8tap_regular_w8_h_8bpc_ssse3: 67.7 mct_8tap_regular_w8_h_8bpc_avx2: 43.3 ------------------------------------------ mct_8tap_regular_w8_hv_8bpc_c: 2155.0 mct_8tap_regular_w8_hv_8bpc_ssse3: 340.8 mct_8tap_regular_w8_hv_8bpc_avx2: 151.3 ------------------------------------------ mct_8tap_regular_w8_v_8bpc_c: 1195.4 mct_8tap_regular_w8_v_8bpc_ssse3: 72.4 mct_8tap_regular_w8_v_8bpc_avx2: 39.8 ------------------------------------------ mct_8tap_regular_w16_0_8bpc_c: 158.3 mct_8tap_regular_w16_0_8bpc_ssse3: 52.9 mct_8tap_regular_w16_0_8bpc_avx2: 30.2 ------------------------------------------ mct_8tap_regular_w16_h_8bpc_c: 4267.4 mct_8tap_regular_w16_h_8bpc_ssse3: 211.9 mct_8tap_regular_w16_h_8bpc_avx2: 121.4 ------------------------------------------ mct_8tap_regular_w16_hv_8bpc_c: 5430.9 mct_8tap_regular_w16_hv_8bpc_ssse3: 986.8 mct_8tap_regular_w16_hv_8bpc_avx2: 428.4 ------------------------------------------ mct_8tap_regular_w16_v_8bpc_c: 4604.2 mct_8tap_regular_w16_v_8bpc_ssse3: 199.1 mct_8tap_regular_w16_v_8bpc_avx2: 100.7 
------------------------------------------ mct_8tap_regular_w32_0_8bpc_c: 372.9 mct_8tap_regular_w32_0_8bpc_ssse3: 231.9 mct_8tap_regular_w32_0_8bpc_avx2: 99.7 ------------------------------------------ mct_8tap_regular_w32_h_8bpc_c: 15975.0 mct_8tap_regular_w32_h_8bpc_ssse3: 802.9 mct_8tap_regular_w32_h_8bpc_avx2: 468.5 ------------------------------------------ mct_8tap_regular_w32_hv_8bpc_c: 18555.5 mct_8tap_regular_w32_hv_8bpc_ssse3: 3673.5 mct_8tap_regular_w32_hv_8bpc_avx2: 1587.6 ------------------------------------------ mct_8tap_regular_w32_v_8bpc_c: 16632.4 mct_8tap_regular_w32_v_8bpc_ssse3: 743.5 mct_8tap_regular_w32_v_8bpc_avx2: 337.8 ------------------------------------------ mct_8tap_regular_w64_0_8bpc_c: 675.9 mct_8tap_regular_w64_0_8bpc_ssse3: 513.6 mct_8tap_regular_w64_0_8bpc_avx2: 285.4 ------------------------------------------ mct_8tap_regular_w64_h_8bpc_c: 37161.3 mct_8tap_regular_w64_h_8bpc_ssse3: 1929.7 mct_8tap_regular_w64_h_8bpc_avx2: 1138.1 ------------------------------------------ mct_8tap_regular_w64_hv_8bpc_c: 42434.0 mct_8tap_regular_w64_hv_8bpc_ssse3: 8822.1 mct_8tap_regular_w64_hv_8bpc_avx2: 3853.5 ------------------------------------------ mct_8tap_regular_w64_v_8bpc_c: 37969.1 mct_8tap_regular_w64_v_8bpc_ssse3: 1805.6 mct_8tap_regular_w64_v_8bpc_avx2: 826.1 ------------------------------------------ mct_8tap_regular_w128_0_8bpc_c: 1532.7 mct_8tap_regular_w128_0_8bpc_ssse3: 1397.7 mct_8tap_regular_w128_0_8bpc_avx2: 813.8 ------------------------------------------ mct_8tap_regular_w128_h_8bpc_c: 91204.3 mct_8tap_regular_w128_h_8bpc_ssse3: 4783.0 mct_8tap_regular_w128_h_8bpc_avx2: 2767.2 ------------------------------------------ mct_8tap_regular_w128_hv_8bpc_c: 102396.0 mct_8tap_regular_w128_hv_8bpc_ssse3: 22202.3 mct_8tap_regular_w128_hv_8bpc_avx2: 9637.2 ------------------------------------------ mct_8tap_regular_w128_v_8bpc_c: 92294.3 mct_8tap_regular_w128_v_8bpc_ssse3: 4952.8 mct_8tap_regular_w128_v_8bpc_avx2: 2370.1 
------------------------------------------ --------------------- x86_32: ------------------------------------------ mct_8tap_regular_w4_0_8bpc_c: 131.3 mct_8tap_regular_w4_0_8bpc_ssse3: 18.7 ------------------------------------------ mct_8tap_regular_w4_h_8bpc_c: 422.0 mct_8tap_regular_w4_h_8bpc_ssse3: 27.3 ------------------------------------------ mct_8tap_regular_w4_hv_8bpc_c: 1012.6 mct_8tap_regular_w4_hv_8bpc_ssse3: 123.6 ------------------------------------------ mct_8tap_regular_w4_v_8bpc_c: 589.6 mct_8tap_regular_w4_v_8bpc_ssse3: 48.9 ------------------------------------------ mct_8tap_regular_w8_0_8bpc_c: 278.5 mct_8tap_regular_w8_0_8bpc_ssse3: 26.3 ------------------------------------------ mct_8tap_regular_w8_h_8bpc_c: 1129.3 mct_8tap_regular_w8_h_8bpc_ssse3: 80.6 ------------------------------------------ mct_8tap_regular_w8_hv_8bpc_c: 2556.4 mct_8tap_regular_w8_hv_8bpc_ssse3: 354.6 ------------------------------------------ mct_8tap_regular_w8_v_8bpc_c: 1460.2 mct_8tap_regular_w8_v_8bpc_ssse3: 103.8 ------------------------------------------ mct_8tap_regular_w16_0_8bpc_c: 218.9 mct_8tap_regular_w16_0_8bpc_ssse3: 58.4 ------------------------------------------ mct_8tap_regular_w16_h_8bpc_c: 4471.8 mct_8tap_regular_w16_h_8bpc_ssse3: 237.2 ------------------------------------------ mct_8tap_regular_w16_hv_8bpc_c: 5570.5 mct_8tap_regular_w16_hv_8bpc_ssse3: 1044.1 ------------------------------------------ mct_8tap_regular_w16_v_8bpc_c: 4885.5 mct_8tap_regular_w16_v_8bpc_ssse3: 268.3 ------------------------------------------ mct_8tap_regular_w32_0_8bpc_c: 495.6 mct_8tap_regular_w32_0_8bpc_ssse3: 236.6 ------------------------------------------ mct_8tap_regular_w32_h_8bpc_c: 15903.5 mct_8tap_regular_w32_h_8bpc_ssse3: 872.5 ------------------------------------------ mct_8tap_regular_w32_hv_8bpc_c: 19402.2 mct_8tap_regular_w32_hv_8bpc_ssse3: 3832.8 ------------------------------------------ mct_8tap_regular_w32_v_8bpc_c: 17119.5 
mct_8tap_regular_w32_v_8bpc_ssse3: 935.2 ------------------------------------------ mct_8tap_regular_w64_0_8bpc_c: 877.0 mct_8tap_regular_w64_0_8bpc_ssse3: 515.7 ------------------------------------------ mct_8tap_regular_w64_h_8bpc_c: 36832.1 mct_8tap_regular_w64_h_8bpc_ssse3: 2094.1 ------------------------------------------ mct_8tap_regular_w64_hv_8bpc_c: 43965.3 mct_8tap_regular_w64_hv_8bpc_ssse3: 9423.0 ------------------------------------------ mct_8tap_regular_w64_v_8bpc_c: 37041.2 mct_8tap_regular_w64_v_8bpc_ssse3: 2348.9 ------------------------------------------ mct_8tap_regular_w128_0_8bpc_c: 1929.9 mct_8tap_regular_w128_0_8bpc_ssse3: 1392.3 ------------------------------------------ mct_8tap_regular_w128_h_8bpc_c: 86022.5 mct_8tap_regular_w128_h_8bpc_ssse3: 5110.8 ------------------------------------------ mct_8tap_regular_w128_hv_8bpc_c: 105793.5 mct_8tap_regular_w128_hv_8bpc_ssse3: 23278.8 ------------------------------------------ mct_8tap_regular_w128_v_8bpc_c: 88223.5 mct_8tap_regular_w128_v_8bpc_ssse3: 7442.7 ------------------------------------------
--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -50,14 +50,23 @@
decl_mc_fn(dav1d_put_bilin_ssse3);
decl_mct_fn(dav1d_prep_8tap_regular_avx2);
+decl_mct_fn(dav1d_prep_8tap_regular_ssse3);
decl_mct_fn(dav1d_prep_8tap_regular_smooth_avx2);
+decl_mct_fn(dav1d_prep_8tap_regular_smooth_ssse3);
decl_mct_fn(dav1d_prep_8tap_regular_sharp_avx2);
+decl_mct_fn(dav1d_prep_8tap_regular_sharp_ssse3);
decl_mct_fn(dav1d_prep_8tap_smooth_avx2);
+decl_mct_fn(dav1d_prep_8tap_smooth_ssse3);
decl_mct_fn(dav1d_prep_8tap_smooth_regular_avx2);
+decl_mct_fn(dav1d_prep_8tap_smooth_regular_ssse3);
decl_mct_fn(dav1d_prep_8tap_smooth_sharp_avx2);
+decl_mct_fn(dav1d_prep_8tap_smooth_sharp_ssse3);
decl_mct_fn(dav1d_prep_8tap_sharp_avx2);
+decl_mct_fn(dav1d_prep_8tap_sharp_ssse3);
decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2);
+decl_mct_fn(dav1d_prep_8tap_sharp_regular_ssse3);
decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2);
+decl_mct_fn(dav1d_prep_8tap_sharp_smooth_ssse3);
decl_mct_fn(dav1d_prep_bilin_avx2);
decl_mct_fn(dav1d_prep_bilin_ssse3);
@@ -108,6 +117,15 @@
init_mc_fn (FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
init_mct_fn(FILTER_2D_BILINEAR, bilin, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR, 8tap_regular, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH, 8tap_smooth, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP, 8tap_smooth_sharp, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR, 8tap_sharp_regular, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH, 8tap_sharp_smooth, ssse3);
+ init_mct_fn(FILTER_2D_8TAP_SHARP, 8tap_sharp, ssse3);
c->avg = dav1d_avg_ssse3;
c->w_avg = dav1d_w_avg_ssse3;
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -64,6 +64,7 @@
pw_2048: times 8 dw 2048
pw_6903: times 8 dw 6903
pw_8192: times 8 dw 8192
+pd_32: times 4 dd 32 ; rounding term added before the >>6 in the prep_8tap hv paths
pd_512: times 4 dd 512
pw_258: times 2 dw 258
@@ -141,6 +142,7 @@
%endmacro
HV_JMP_TABLE put, 8tap, ssse3, 3, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, 8tap, ssse3, 1, 4, 8, 16, 32, 64, 128 ; prep_8tap only indexes the _8tap_h table (v/hv branch on w directly); presumably "1" = h-only - confirm against HV_JMP_TABLE
HV_JMP_TABLE put, bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE prep, bilin, ssse3, 7, 4, 8, 16, 32, 64, 128
@@ -2421,6 +2423,891 @@
mov srcq, r7
%endif
sub r6d, 1<<16
+ jg .hv_w8_loop0
+ RET
+
+%if ARCH_X86_32
+DECLARE_REG_TMP 1, 2 ; t0/t1 carry the h/v filter type from the stubs below into prep_8tap
+%elif WIN64
+DECLARE_REG_TMP 6, 4
+%else
+DECLARE_REG_TMP 6, 7
+%endif
+%macro PREP_8TAP_FN 3 ; type, type_h, type_v
+cglobal prep_8tap_%1
+ mov t0d, FILTER_%2 ; horizontal filter type, added to the replicated mx in prep_8tap
+ mov t1d, FILTER_%3 ; vertical filter type, added to the replicated my in prep_8tap
+%ifnidn %1, sharp_smooth ; skip the jump in the last filter
+ jmp mangle(private_prefix %+ _prep_8tap %+ SUFFIX)
+%endif
+%endmacro
+
+PREP_8TAP_FN regular, REGULAR, REGULAR
+PREP_8TAP_FN regular_sharp, REGULAR, SHARP
+PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
+PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR
+PREP_8TAP_FN smooth, SMOOTH, SMOOTH
+PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP
+PREP_8TAP_FN sharp_regular, SHARP, REGULAR
+PREP_8TAP_FN sharp, SHARP, SHARP
+PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH ; must stay last: no jmp is emitted, execution continues into prep_8tap declared next
+
+%if ARCH_X86_32
+ %define base_reg r2
+ %define base base_reg-prep_ssse3
+ %define W32_RESTORE_SSQ mov strideq, stridem
+%else
+ %define base_reg r7
+ %define base 0
+ %define W32_RESTORE_SSQ
+%endif
+
+cglobal prep_8tap, 1, 9, 0, tmp, src, stride, w, h, mx, my, stride3
+%assign org_stack_offset stack_offset
+ imul mxd, mxm, 0x010101 ; copy of mx in each of the three low bytes (for mx < 256)
+ add mxd, t0d ; 8tap_h, mx, 4tap_h
+ imul myd, mym, 0x010101 ; same replication for my
+ add myd, t1d ; 8tap_v, my, 4tap_v
+ movsxd wq, wm
+ movifnidn srcd, srcm
+ movifnidn hd, hm
+ LEA base_reg, prep_ssse3 ; base_reg = address of prep_ssse3, used for table/constant addressing
+ test mxd, 0xf00 ; bits 8-11 belong to the middle (plain mx) byte of the layout above
+ jnz .h
+ test myd, 0xf00 ; same check on the plain my byte
+ jnz .v
+ tzcnt wd, wd ; no subpel offset in either direction: index the plain prep table by tzcnt(w)
+ movzx wd, word [base_reg+wq*2+table_offset(prep,)]
+ add wq, base_reg ; table entry is a 16-bit offset relative to base_reg
+ movifnidn strided, stridem
+ lea r6, [strideq*3] ; r6 = stride*3, set up for the jump target - presumably expected by the plain prep code, confirm
+ %assign stack_offset org_stack_offset
+%if WIN64
+ pop r8 ; presumably balances the prologue's saves of r7/r8 (9 GPRs requested) before the tail jump - confirm
+ pop r7
+%endif
+ jmp wq
+.h:
+ test myd, 0xf00 ; vertical offset present as well -> combined h+v path
+ jnz .hv
+ WIN64_SPILL_XMM 12
+ cmp wd, 4
+ je .h_w4 ; w == 4 uses the 4-tap variant below
+ tzcnt wd, wd
+%if ARCH_X86_64
+ mova m10, [base+subpel_h_shufA]
+ mova m11, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+%endif
+ shr mxd, 16 ; keep the top (8tap_h) byte as filter index
+ sub srcq, 3 ; start the filter window 3 pixels to the left
+ movzx wd, word [base_reg+wq*2+table_offset(prep, _8tap_h)]
+ movd m5, [base_reg+mxq*8+subpel_filters-prep_ssse3+0] ; bytes 0-3 of the 8-byte filter entry
+ pshufd m5, m5, q0000
+ movd m6, [base_reg+mxq*8+subpel_filters-prep_ssse3+4] ; bytes 4-7 of the 8-byte filter entry
+ pshufd m6, m6, q0000
+ mova m7, [base+pw_8192] ; pmulhrsw by 8192 == (x + 2) >> 2
+ add wq, base_reg
+ jmp wq ; dispatch to .h_w8/.h_w16/... by width
+.h_w4:
+%if ARCH_X86_32
+ and mxd, 0xff ; keep the low (4tap_h) byte as filter index
+%else
+ movzx mxd, mxb ; keep the low (4tap_h) byte as filter index
+%endif
+ dec srcq ; start the filter window 1 pixel to the left
+ movd m4, [base_reg+mxq*8+subpel_filters-prep_ssse3+2] ; bytes 2-5 of the 8-byte filter entry
+ pshufd m4, m4, q0000
+ mova m6, [base+pw_8192]
+ mova m5, [base+subpel_h_shufA]
+ W32_RESTORE_SSQ
+%if ARCH_X86_64
+ lea stride3q, [strideq*3]
+%endif
+.h_w4_loop: ; 4 rows per iteration
+ movq m0, [srcq+strideq*0] ; 0
+ movq m1, [srcq+strideq*1] ; 1
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ movq m2, [srcq+strideq*0] ; 2
+ movq m3, [srcq+strideq*1] ; 3
+ lea srcq, [srcq+strideq*2]
+%else
+ movq m2, [srcq+strideq*2] ; 2
+ movq m3, [srcq+stride3q ] ; 3
+ lea srcq, [srcq+strideq*4]
+%endif
+ pshufb m0, m5 ; subpel_h_shufA
+ pshufb m1, m5
+ pshufb m2, m5
+ pshufb m3, m5
+ pmaddubsw m0, m4 ; subpel_filters + 2
+ pmaddubsw m1, m4
+ pmaddubsw m2, m4
+ pmaddubsw m3, m4
+ phaddw m0, m1 ; rows 0 and 1, 4 words each
+ phaddw m2, m3 ; rows 2 and 3, 4 words each
+ pmulhrsw m0, m6 ; pw_8192
+ pmulhrsw m2, m6 ; pw_8192
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m2
+ add tmpq, 32 ; 4 rows * 4 int16 written
+ sub hd, 4
+ jg .h_w4_loop
+ RET
+ ; PREP_8TAP_H: 8-tap horizontal filter giving 8 int16 results in %1; reads m5/m6 (filter bytes 0-3 / 4-7), m7 (pw_8192) and, on x86-64, m9-m11 (shuffles)
+%macro PREP_8TAP_H 4 ; dst/src, tmp[1-3]
+%if ARCH_X86_32
+ pshufb %2, %1, [base+subpel_h_shufB]
+ pshufb %3, %1, [base+subpel_h_shufC]
+ pshufb %1, [base+subpel_h_shufA]
+%else
+ pshufb %2, %1, m11; subpel_h_shufB
+ pshufb %3, %1, m9 ; subpel_h_shufC
+ pshufb %1, m10 ; subpel_h_shufA
+%endif
+ pmaddubsw %4, %2, m5 ; subpel +0 B0
+ pmaddubsw %2, m6 ; subpel +4 B4
+ pmaddubsw %3, m6 ; subpel +4 C4
+ pmaddubsw %1, m5 ; subpel +0 A0
+ paddw %3, %4
+ paddw %1, %2
+ phaddw %1, %3
+ pmulhrsw %1, m7 ; 8192
+%endmacro
+ ;
+.h_w8:
+%if ARCH_X86_32
+ mov r3, r2 ; keep the base pointer in r3; presumably strideq aliases r2 and is reloaded just below - confirm
+ %define base_reg r3
+ W32_RESTORE_SSQ
+%endif
+.h_w8_loop: ; 2 rows of 8 pixels per iteration
+ movu m0, [srcq+strideq*0]
+ movu m1, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ PREP_8TAP_H m0, m2, m3, m4
+ PREP_8TAP_H m1, m2, m3, m4
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 32
+ sub hd, 2
+ jg .h_w8_loop
+ RET
+.h_w16:
+ xor r6d, r6d ; r6 = -(w - 16): negative start offset of the per-row column loop
+ jmp .h_start
+.h_w32:
+ mov r6, -16*1
+ jmp .h_start
+.h_w64:
+ mov r6, -16*3
+ jmp .h_start
+.h_w128:
+ mov r6, -16*7
+.h_start:
+%if ARCH_X86_32
+ mov r3, r2 ; same base pointer relocation as in .h_w8
+ %define base_reg r3
+%endif
+ sub srcq, r6 ; srcq += w - 16, so srcq+r6 addresses the first column
+ mov r5, r6 ; r5 keeps the start offset for the next row
+ W32_RESTORE_SSQ
+.h_loop: ; 16 output pixels per iteration
+ movu m0, [srcq+r6+8*0]
+ movu m1, [srcq+r6+8*1]
+ PREP_8TAP_H m0, m2, m3, m4
+ PREP_8TAP_H m1, m2, m3, m4
+ mova [tmpq+16*0], m0
+ mova [tmpq+16*1], m1
+ add tmpq, 32
+ add r6, 16
+ jle .h_loop ; continue while the offset is still <= 0
+ add srcq, strideq ; next row
+ mov r6, r5
+ dec hd
+ jg .h_loop
+ RET
+%if ARCH_X86_32
+ %define base_reg r2
+%endif
+
+.v:
+%if ARCH_X86_32
+ mov mxd, myd
+ and mxd, 0xff ; mxd = low (4tap_v) byte of my
+%else
+ %assign stack_offset org_stack_offset
+ WIN64_SPILL_XMM 16
+ movzx mxd, myb ; mxd = low (4tap_v) byte of my
+%endif
+ shr myd, 16 ; myd = top (8tap_v) byte
+ cmp hd, 4
+ cmovle myd, mxd ; h <= 4: use the 4tap_v index instead
+ lea myq, [base_reg+myq*8+subpel_filters-prep_ssse3] ; myq -> selected 8-byte filter entry
+ mova m2, [base+pw_512]
+ psrlw m2, m2, 1 ; 0x0100
+ mova m7, [base+pw_8192] ; pmulhrsw by 8192 == (x + 2) >> 2
+%if ARCH_X86_32
+ %define subpel0 [rsp+mmsize*0]
+ %define subpel1 [rsp+mmsize*1]
+ %define subpel2 [rsp+mmsize*2]
+ %define subpel3 [rsp+mmsize*3]
+%assign regs_used 2 ; use r1 (src) as tmp for stack alignment if needed
+ ALLOC_STACK -mmsize*4
+%assign regs_used 7
+ movd m0, [myq+0]
+ pshufb m0, m2 ; broadcast filter bytes 0,1 to every word (m2 = 0x0100 per word)
+ mova subpel0, m0
+ movd m0, [myq+2]
+ pshufb m0, m2 ; filter bytes 2,3
+ mova subpel1, m0
+ movd m0, [myq+4]
+ pshufb m0, m2 ; filter bytes 4,5
+ mova subpel2, m0
+ movd m0, [myq+6]
+ pshufb m0, m2 ; filter bytes 6,7
+ mova subpel3, m0
+ mov strideq, [rstk+stack_offset+gprsize*3] ; stride from its stack slot
+ lea strideq, [strideq*3]
+ sub [rstk+stack_offset+gprsize*2], strideq ; src slot -= stride*3
+ mov strideq, [rstk+stack_offset+gprsize*3] ; reload plain stride
+ mov srcq, [rstk+stack_offset+gprsize*2] ; reload adjusted src
+%else
+ %define subpel0 m8
+ %define subpel1 m9
+ %define subpel2 m10
+ %define subpel3 m11
+ movd subpel0, [myq+0]
+ pshufb subpel0, m2 ; broadcast filter bytes 0,1 to every word (m2 = 0x0100 per word)
+ movd subpel1, [myq+2]
+ pshufb subpel1, m2 ; filter bytes 2,3
+ movd subpel2, [myq+4]
+ pshufb subpel2, m2 ; filter bytes 4,5
+ movd subpel3, [myq+6]
+ pshufb subpel3, m2 ; filter bytes 6,7
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q ; start 3 rows above
+ cmp wd, 8
+ jg .v_w16
+ je .v_w8
+%endif
+.v_w4: ; on x86-32 every width is handled here, in columns of 4 pixels
+%if ARCH_X86_32
+%if STACK_ALIGNMENT < mmsize
+ %define srcm [rsp+mmsize*4+gprsize*1]
+ %define tmpm [rsp+mmsize*4+gprsize*2]
+%endif
+ mov tmpm, tmpq
+ mov srcm, srcq
+ lea r5d, [wq - 4] ; horizontal loop
+ shl r5d, (16 - 2) ; ((w / 4) - 1) << 16: remaining 4-pixel columns in the high word
+ mov r5w, hw ; low word keeps h for the per-column reset
+.v_w4_loop0:
+%endif
+ movd m2, [srcq+strideq*0] ; 0
+ movhps m2, [srcq+strideq*2] ; 0 _ 2
+ movd m3, [srcq+strideq*1] ; 1
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ movhps m3, [srcq+strideq*1] ; 1 _ 3
+ lea srcq, [srcq+strideq*2]
+%else
+ movhps m3, [srcq+stride3q ] ; 1 _ 3
+ lea srcq, [srcq+strideq*4]
+%endif
+ pshufd m2, m2, q2020 ; 0 2 0 2
+ pshufd m3, m3, q2020 ; 1 3 1 3
+ punpckldq m2, m3 ; 0 1 2 3
+ movd m3, [srcq+strideq*0] ; 4
+ movd m1, [srcq+strideq*1] ; 5
+ movd m0, [srcq+strideq*2] ; 6
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ add srcq, strideq
+%else
+ add srcq, stride3q
+%endif
+ punpckldq m3, m1 ; 4 5 _ _
+ punpckldq m1, m0 ; 5 6 _ _
+ palignr m4, m3, m2, 4 ; 1 2 3 4
+ punpcklbw m3, m1 ; 45 56
+ punpcklbw m1, m2, m4 ; 01 12
+ punpckhbw m2, m4 ; 23 34
+.v_w4_loop: ; 2 output rows per iteration
+ pmaddubsw m5, m1, subpel0 ; a0 b0
+ mova m1, m2
+ pmaddubsw m2, subpel1 ; a1 b1
+ paddw m5, m2
+ mova m2, m3
+ pmaddubsw m3, subpel2 ; a2 b2
+ paddw m5, m3
+ movd m4, [srcq+strideq*0]
+ punpckldq m3, m0, m4 ; 6 7 _ _
+ movd m0, [srcq+strideq*1]
+ lea srcq, [srcq+strideq*2]
+ punpckldq m4, m0 ; 7 8 _ _
+ punpcklbw m3, m4 ; 67 78
+ pmaddubsw m4, m3, subpel3 ; a3 b3
+ paddw m5, m4
+ pmulhrsw m5, m7 ; pw_8192
+ movq [tmpq+wq*0], m5 ; row a
+ movhps [tmpq+wq*2], m5 ; row b, w int16 further
+ lea tmpq, [tmpq+wq*4] ; advance two output rows
+ sub hd, 2
+ jg .v_w4_loop
+%if ARCH_X86_32
+ mov hw, r5w ; reset vertical loop
+ mov tmpq, tmpm
+ mov srcq, srcm
+ add tmpq, 8 ; next column: 4 int16 in tmp
+ add srcq, 4 ; next column: 4 pixels in src
+ mov tmpm, tmpq
+ mov srcm, srcq
+ sub r5d, 1<<16 ; horizontal--
+ jg .v_w4_loop0
+%endif
+ RET
+
+%if ARCH_X86_64
+.v_w8:
+.v_w16:
+ lea r5d, [wq - 8] ; horizontal loop
+ mov r8, tmpq ; r8/r6 remember the start of the current column
+ mov r6, srcq
+ shl r5d, 8 - 3; ((w / 8) - 1) << 8: remaining 8-pixel columns above the low byte
+ mov r5b, hb ; low byte keeps h for the per-column reset
+.v_w8_loop0:
+ movq m4, [srcq+strideq*0] ; 0
+ movq m5, [srcq+strideq*1] ; 1
+ lea srcq, [srcq+strideq*2]
+ movq m6, [srcq+strideq*0] ; 2
+ movq m0, [srcq+strideq*1] ; 3
+ lea srcq, [srcq+strideq*2]
+ movq m1, [srcq+strideq*0] ; 4
+ movq m2, [srcq+strideq*1] ; 5
+ lea srcq, [srcq+strideq*2] ;
+ movq m3, [srcq+strideq*0] ; 6
+ shufpd m4, m0, 0x0c ; 0 | 3
+ shufpd m5, m1, 0x0c ; 1 | 4
+ punpcklbw m1, m4, m5 ; 01
+ punpckhbw m4, m5 ; 34
+ shufpd m6, m2, 0x0c ; 2 | 5
+ punpcklbw m2, m5, m6 ; 12
+ punpckhbw m5, m6 ; 45
+ shufpd m0, m3, 0x0c ; 3 | 6
+ punpcklbw m3, m6, m0 ; 23
+ punpckhbw m6, m0 ; 56
+.v_w8_loop: ; 2 output rows per iteration
+ movq m12, [srcq+strideq*1] ; 7 (next row; +2 on each further iteration)
+ lea srcq, [srcq+strideq*2]
+ movq m13, [srcq+strideq*0] ; 8
+ pmaddubsw m14, m1, subpel0 ; a0
+ pmaddubsw m15, m2, subpel0 ; b0
+ mova m1, m3
+ mova m2, m4
+ pmaddubsw m3, subpel1 ; a1
+ pmaddubsw m4, subpel1 ; b1
+ paddw m14, m3
+ paddw m15, m4
+ mova m3, m5
+ mova m4, m6
+ pmaddubsw m5, subpel2 ; a2
+ pmaddubsw m6, subpel2 ; b2
+ paddw m14, m5
+ paddw m15, m6
+ shufpd m6, m0, m12, 0x0d ; 6 | 7
+ shufpd m0, m12, m13, 0x0c ; 7 | 8
+ punpcklbw m5, m6, m0 ; 67
+ punpckhbw m6, m0 ; 78
+ pmaddubsw m12, m5, subpel3 ; a3
+ pmaddubsw m13, m6, subpel3 ; b3
+ paddw m14, m12
+ paddw m15, m13
+ pmulhrsw m14, m7 ; pw_8192
+ pmulhrsw m15, m7 ; pw_8192
+ movu [tmpq+wq*0], xm14 ; row a
+ movu [tmpq+wq*2], xm15 ; row b, w int16 further
+ lea tmpq, [tmpq+wq*4] ; advance two output rows
+ sub hd, 2
+ jg .v_w8_loop
+ movzx hd, r5b ; reset vertical loop
+ add r8, 16 ; next column: 8 int16 in tmp
+ add r6, 8 ; next column: 8 pixels in src
+ mov tmpq, r8
+ mov srcq, r6
+ sub r5d, 1<<8 ; horizontal--
+ jg .v_w8_loop0
+ RET
+%endif ;ARCH_X86_64
+%undef subpel0
+%undef subpel1
+%undef subpel2
+%undef subpel3
+
+.hv:
+ %assign stack_offset org_stack_offset
+ cmp wd, 4
+ jg .hv_w8 ; wider blocks use the 8-tap horizontal path
+ and mxd, 0xff ; low (4tap_h) byte as filter index
+ movd m1, [base_reg+mxq*8+subpel_filters-prep_ssse3+2] ; bytes 2-5 of the 8-byte filter entry
+%if ARCH_X86_32
+ mov mxd, myd
+ and mxd, 0xff ; low (4tap_v) byte of my
+ shr myd, 16 ; top (8tap_v) byte of my
+ cmp hd, 4
+ cmovle myd, mxd ; h <= 4: use the 4tap_v index
+ movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
+ mov r5, r2; use as new base
+ %define base_reg r5
+ %assign regs_used 2
+ ALLOC_STACK -mmsize*14
+ %assign regs_used 7
+ mov strideq, [rstk+stack_offset+gprsize*3] ; stride from its stack slot
+ lea strideq, [strideq*3 + 1]
+ sub [rstk+stack_offset+gprsize*2], strideq ; src slot -= stride*3 + 1
+ mov strideq, [rstk+stack_offset+gprsize*3] ; reload plain stride
+ mov srcq, [rstk+stack_offset+gprsize*2] ; reload adjusted src
+ %define subpelv0 [rsp+mmsize*0]
+ %define subpelv1 [rsp+mmsize*1]
+ %define subpelv2 [rsp+mmsize*2]
+ %define subpelv3 [rsp+mmsize*3]
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ pshufd m6, m0, q0000 ; vertical coefficients 0,1 as int16, broadcast
+ mova subpelv0, m6
+ pshufd m6, m0, q1111 ; coefficients 2,3
+ mova subpelv1, m6
+ pshufd m6, m0, q2222 ; coefficients 4,5
+ mova subpelv2, m6
+ pshufd m6, m0, q3333 ; coefficients 6,7
+ mova subpelv3, m6
+%else
+ movzx mxd, myb ; low (4tap_v) byte of my
+ shr myd, 16 ; top (8tap_v) byte of my
+ cmp hd, 4
+ cmovle myd, mxd ; h <= 4: use the 4tap_v index
+ movq m0, [base_reg+myq*8+subpel_filters-prep_ssse3]
+ ALLOC_STACK mmsize*14, 14
+ lea stride3q, [strideq*3]
+ sub srcq, stride3q ; start 3 rows above ...
+ dec srcq ; ... and 1 pixel to the left
+ %define subpelv0 m10
+ %define subpelv1 m11
+ %define subpelv2 m12
+ %define subpelv3 m13
+ punpcklbw m0, m0
+ psraw m0, 8 ; sign-extend
+ mova m8, [base+pw_8192]
+ mova m9, [base+pd_32]
+ pshufd m10, m0, q0000 ; vertical coefficients 0,1 as int16, broadcast
+ pshufd m11, m0, q1111 ; coefficients 2,3
+ pshufd m12, m0, q2222 ; coefficients 4,5
+ pshufd m13, m0, q3333 ; coefficients 6,7
+%endif
+ pshufd m7, m1, q0000 ; m7 = the 4 horizontal filter bytes, broadcast
+.hv_w4:
+%define hv4_line_0_0 4
+%define hv4_line_0_1 5
+%define hv4_line_0_2 6
+%define hv4_line_0_3 7
+%define hv4_line_0_4 8
+%define hv4_line_0_5 9
+%define hv4_line_1_0 10
+%define hv4_line_1_1 11
+%define hv4_line_1_2 12
+%define hv4_line_1_3 13
+ ; NOTE(review): hv4_line_<set>_<n> look like slot indices consumed by SAVELINE_W4/RESTORELINE_W4 (macros defined outside this hunk) - confirm
+ ;
+%if ARCH_X86_32
+ %define w8192reg [base+pw_8192]
+ %define d32reg [base+pd_32]
+%else
+ %define w8192reg m8
+ %define d32reg m9
+%endif
+ ; lower shuffle 0 1 2 3 4
+ mova m6, [base+subpel_h_shuf4]
+ movq m5, [srcq+strideq*0] ; 0 _ _ _
+ movhps m5, [srcq+strideq*1] ; 0 _ 1 _
+ movq m4, [srcq+strideq*2] ; 2 _ _ _
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ add srcq, strideq
+ movhps m4, [srcq+strideq*0] ; 2 _ 3 _
+ add srcq, strideq
+%else
+ movhps m4, [srcq+stride3q ] ; 2 _ 3 _
+ lea srcq, [srcq+strideq*4]
+%endif
+ pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
+ pmaddubsw m2, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m2, m0 ;H 0 1 2 3
+ pmulhrsw m2, w8192reg ;H pw_8192
+ SAVELINE_W4 m2, 2, 0
+ ; upper shuffle 2 3 4 5 6
+ mova m6, [base+subpel_h_shuf4+16]
+ pshufb m2, m5, m6 ;H subpel_h_shuf4 0 ~ 1 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 2 ~ 3 ~
+ pmaddubsw m2, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m2, m0 ;H 0 1 2 3
+ pmulhrsw m2, w8192reg ;H pw_8192
+ ;
+ ; lower shuffle
+ mova m6, [base+subpel_h_shuf4]
+ movq m5, [srcq+strideq*0] ; 4 _ _ _
+ movhps m5, [srcq+strideq*1] ; 4 _ 5 _
+ movq m4, [srcq+strideq*2] ; 6 _ _ _
+ pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
+ pmaddubsw m3, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m3, m0 ;H 4 5 6 7
+ pmulhrsw m3, w8192reg ;H pw_8192
+ SAVELINE_W4 m3, 3, 0
+ ; upper shuffle
+ mova m6, [base+subpel_h_shuf4+16]
+ pshufb m3, m5, m6 ;H subpel_h_shuf4 4 ~ 5 ~
+ pshufb m0, m4, m6 ;H subpel_h_shuf4 6 ~ 6 ~
+ pmaddubsw m3, m7 ;H subpel_filters
+ pmaddubsw m0, m7 ;H subpel_filters
+ phaddw m3, m0 ;H 4 5 6 7
+ pmulhrsw m3, w8192reg ;H pw_8192
+ ;
+%if ARCH_X86_32
+ lea srcq, [srcq+strideq*2]
+ add srcq, strideq
+%else
+ add srcq, stride3q
+%endif
+ ;process high
+ palignr m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ ;process low
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ palignr m4, m3, m2, 4;V 1 2 3 4
+ punpcklwd m1, m2, m4 ; V 01 12
+ punpckhwd m2, m4 ; V 23 34
+ pshufd m0, m3, q2121;V 5 6 5 6
+ punpcklwd m3, m0 ; V 45 56
+.hv_w4_loop: ; 2 output rows per iteration, low and high halves processed in turn
+ ;process low
+ pmaddwd m5, m1, subpelv0 ; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+ ;
+ mova m6, [base+subpel_h_shuf4]
+ movq m4, [srcq+strideq*0] ; 7
+ movhps m4, [srcq+strideq*1] ; 7 _ 8 _
+ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
+ pmaddubsw m4, m7 ;H subpel_filters
+ phaddw m4, m4 ;H 7 8 7 8
+ pmulhrsw m4, w8192reg ;H pw_8192
+ palignr m3, m4, m0, 12 ; 6 7 8 7
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+ paddd m5, d32reg ; pd_32
+ paddd m5, m4
+ psrad m5, 6 ; (sum + 32) >> 6
+ SAVELINE_W4 m0, 0, 0
+ SAVELINE_W4 m1, 1, 0
+ SAVELINE_W4 m2, 2, 0
+ SAVELINE_W4 m3, 3, 0
+ SAVELINE_W4 m5, 5, 0
+ ;process high
+ RESTORELINE_W4 m0, 0, 1
+ RESTORELINE_W4 m1, 1, 1
+ RESTORELINE_W4 m2, 2, 1
+ RESTORELINE_W4 m3, 3, 1
+ pmaddwd m5, m1, subpelv0; V a0 b0
+ mova m1, m2
+ pmaddwd m2, subpelv1; V a1 b1
+ paddd m5, m2
+ mova m2, m3
+ pmaddwd m3, subpelv2; V a2 b2
+ paddd m5, m3
+ ;
+ mova m6, [base+subpel_h_shuf4+16]
+ movq m4, [srcq+strideq*0] ; 7
+ movhps m4, [srcq+strideq*1] ; 7 _ 8 _
+ pshufb m4, m6 ;H subpel_h_shuf4 7 ~ 8 ~
+ pmaddubsw m4, m7 ;H subpel_filters
+ phaddw m4, m4 ;H 7 8 7 8
+ pmulhrsw m4, w8192reg ;H pw_8192
+ palignr m3, m4, m0, 12 ; 6 7 8 7
+ mova m0, m4
+ punpcklwd m3, m4 ; 67 78
+ pmaddwd m4, m3, subpelv3; a3 b3
+ paddd m5, d32reg ; pd_32
+ paddd m5, m4
+ psrad m4, m5, 6 ; (sum + 32) >> 6, high half result in m4
+ ;
+ RESTORELINE_W4 m5, 5, 0
+ packssdw m5, m4 ; low-half dwords | high-half dwords -> int16
+ pshufd m5, m5, q3120 ; swap the two middle dwords of the packed result
+ movu [tmpq], m5
+ lea srcq, [srcq+strideq*2]
+ add tmpq, 16 ; 2 rows * 4 int16 written
+ sub hd, 2 ; flags from here are consumed by the jg below (the SAVE/RESTORE macros in between must not alter them)
+ SAVELINE_W4 m0, 0, 1
+ SAVELINE_W4 m1, 1, 1
+ SAVELINE_W4 m2, 2, 1
+ SAVELINE_W4 m3, 3, 1
+ RESTORELINE_W4 m0, 0, 0
+ RESTORELINE_W4 m1, 1, 0
+ RESTORELINE_W4 m2, 2, 0
+ RESTORELINE_W4 m3, 3, 0
+ jg .hv_w4_loop
+ RET
+%undef subpelv0
+%undef subpelv1
+%undef subpelv2
+%undef subpelv3
+ ;
+
+
+.hv_w8:
+ %assign stack_offset org_stack_offset
+%define hv8_line_1 0
+%define hv8_line_2 1
+%define hv8_line_3 2
+%define hv8_line_4 3
+%define hv8_line_6 4
+ shr mxd, 16 ; top (8tap_h) byte as horizontal filter index
+%if ARCH_X86_32
+ %define base_reg r2
+ %define subpelh0 [rsp+mmsize*5]
+ %define subpelh1 [rsp+mmsize*6]
+ %define subpelv0 [rsp+mmsize*7]
+ %define subpelv1 [rsp+mmsize*8]
+ %define subpelv2 [rsp+mmsize*9]
+ %define subpelv3 [rsp+mmsize*10]
+ %define accuv0 [rsp+mmsize*11]
+ %define accuv1 [rsp+mmsize*12]
+ movq m1, [base_reg+mxq*8+subpel_filters-prep_ssse3]
+ movzx mxd, myw
+ and mxd, 0xff ; low (4tap_v) byte of my
+ shr myd, 16 ; top (8tap_v) byte of my
+ cmp hd, 4
+ cmovle myd, mxd ; h <= 4: use the 4tap_v index
+ movq m5, [base_reg+myq*8+subpel_filters-prep_ssse3]
+ ALLOC_STACK -mmsize*13
+%if STACK_ALIGNMENT < mmsize
+ mov rstk, r2m ; NOTE(review): rstk is used as scratch to copy the r2m argument into the redefined stridem slot - confirm it is free here
+ %define tmpm [rsp+mmsize*13+gprsize*1]
+ %define srcm [rsp+mmsize*13+gprsize*2]
+ %define stridem [rsp+mmsize*13+gprsize*3]
+ mov stridem, rstk
+%endif
+ mov r6, r2 ; keep the base pointer in r6 before strideq is reloaded below
+%define base_reg r6
+ pshufd m0, m1, q0000 ; horizontal filter bytes 0-3, broadcast
+ pshufd m1, m1, q1111 ; horizontal filter bytes 4-7, broadcast
+ punpcklbw m5, m5
+ psraw m5, 8 ; sign-extend
+ pshufd m2, m5, q0000 ; vertical coefficients 0,1 as int16
+ pshufd m3, m5, q1111 ; coefficients 2,3
+ pshufd m4, m5, q2222 ; coefficients 4,5
+ pshufd m5, m5, q3333 ; coefficients 6,7
+ mova subpelh0, m0
+ mova subpelh1, m1
+ mova subpelv0, m2
+ mova subpelv1, m3
+ mova subpelv2, m4
+ mova subpelv3, m5
+ W32_RESTORE_SSQ
+ lea strided, [strided*3]
+ sub srcd, strided ; start 3 rows above ...
+ sub srcd, 3 ; ... and 3 pixels to the left
+ mov srcm, srcd
+ W32_RESTORE_SSQ
+%else
+ ALLOC_STACK mmsize*5, 16
+ %define subpelh0 m10
+ %define subpelh1 m11
+ %define subpelv0 m12
+ %define subpelv1 m13
+ %define subpelv2 m14
+ %define subpelv3 m15
+ %define accuv0 m8
+ %define accuv1 m9
+ movq m0, [base_reg+mxq*8+subpel_filters-prep_ssse3]
+ movzx mxd, myb ; low (4tap_v) byte of my
+ shr myd, 16 ; top (8tap_v) byte of my
+ cmp hd, 4
+ cmovle myd, mxd ; h <= 4: use the 4tap_v index
+ movq m1, [base_reg+myq*8+subpel_filters-prep_ssse3]
+ pshufd subpelh0, m0, q0000 ; horizontal filter bytes 0-3, broadcast
+ pshufd subpelh1, m0, q1111 ; horizontal filter bytes 4-7, broadcast
+ punpcklbw m1, m1
+ psraw m1, 8 ; sign-extend
+ pshufd subpelv0, m1, q0000 ; vertical coefficients 0,1 as int16
+ pshufd subpelv1, m1, q1111 ; coefficients 2,3
+ pshufd subpelv2, m1, q2222 ; coefficients 4,5
+ pshufd subpelv3, m1, q3333 ; coefficients 6,7
+ lea stride3q, [strideq*3]
+ sub srcq, 3 ; start 3 pixels to the left ...
+ sub srcq, stride3q ; ... and 3 rows above
+ mov r6, srcq ; r6 remembers the start of the current column
+%endif
+ lea r5d, [wq-4]
+%if ARCH_X86_64
+ mov r8, tmpq ; r8 remembers the output start of the current column
+%else
+ mov tmpm, tmpq
+%endif
+ shl r5d, (16 - 2) ; ((w / 4) - 1) << 16: remaining 4-pixel columns in the high word
+ mov r5w, hw ; low word keeps h for the per-column reset
+.hv_w8_loop0:
+ movu m4, [srcq+strideq*0] ; 0 = _ _
+ movu m5, [srcq+strideq*1] ; 1 = _ _
+ lea srcq, [srcq+strideq*2]
+%if ARCH_X86_64
+ mova m7, [base+subpel_h_shufA]
+ mova m8, [base+subpel_h_shufB]
+ mova m9, [base+subpel_h_shufC]
+%endif
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 0 ~ ~ ~
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 1 ~ ~ ~
+ movu m6, [srcq+strideq*0] ; 2 = _ _
+ movu m0, [srcq+strideq*1] ; 3 = _ _
+ lea srcq, [srcq+strideq*2]
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 2 ~ ~ ~
+ HV_H_W8 m0, m1, m2, m3, m7, m8, m9 ; 3 ~ ~ ~
+ ;
+ mova m7, [base+pw_8192]
+ pmulhrsw m4, m7 ; H pw_8192
+ pmulhrsw m5, m7 ; H pw_8192
+ pmulhrsw m6, m7 ; H pw_8192
+ pmulhrsw m0, m7 ; H pw_8192
+ punpcklwd m1, m4, m5 ; 0 1 ~
+ punpcklwd m2, m5, m6 ; 1 2 ~
+ punpcklwd m3, m6, m0 ; 2 3 ~
+ SAVELINE_W8 1, m1
+ SAVELINE_W8 2, m2
+ SAVELINE_W8 3, m3
+ ;
+ mova m7, [base+subpel_h_shufA]
+ movu m4, [srcq+strideq*0] ; 4 = _ _
+ movu m5, [srcq+strideq*1] ; 5 = _ _
+ lea srcq, [srcq+strideq*2]
+ movu m6, [srcq+strideq*0] ; 6 = _ _
+ HV_H_W8 m4, m1, m2, m3, m7, m8, m9 ; 4 ~ ~ ~
+ HV_H_W8 m5, m1, m2, m3, m7, m8, m9 ; 5 ~ ~ ~
+ HV_H_W8 m6, m1, m2, m3, m7, m8, m9 ; 6 ~ ~ ~
+ mova m7, [base+pw_8192]
+ pmulhrsw m1, m4, m7 ; H pw_8192 4 ~
+ pmulhrsw m2, m5, m7 ; H pw_8192 5 ~
+ pmulhrsw m3, m6, m7 ; H pw_8192 6 ~
+ punpcklwd m4, m0, m1 ; 3 4 ~
+ punpcklwd m5, m1, m2 ; 4 5 ~
+ punpcklwd m6, m2, m3 ; 5 6 ~
+ ;
+ SAVELINE_W8 6, m3
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+.hv_w8_loop:
+ ; m8 accu for V a (x86-64 only; on x86-32 accuv0 is a stack slot)
+ ; m9 accu for V b (x86-64 only; on x86-32 accuv1 is a stack slot)
+ SAVELINE_W8 1, m3
+ SAVELINE_W8 2, m4
+ SAVELINE_W8 3, m5
+ SAVELINE_W8 4, m6
+%if ARCH_X86_32
+ pmaddwd m0, m1, subpelv0 ; a0
+ pmaddwd m7, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd m0, m3
+ paddd m7, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd m0, m5
+ paddd m7, m6
+ mova m5, [base+pd_32]
+ paddd m0, m5 ; pd_32
+ paddd m7, m5 ; pd_32
+ mova accuv0, m0
+ mova accuv1, m7
+%else
+ pmaddwd m8, m1, subpelv0 ; a0
+ pmaddwd m9, m2, subpelv0 ; b0
+ pmaddwd m3, subpelv1 ; a1
+ pmaddwd m4, subpelv1 ; b1
+ paddd m8, m3
+ paddd m9, m4
+ pmaddwd m5, subpelv2 ; a2
+ pmaddwd m6, subpelv2 ; b2
+ paddd m8, m5
+ paddd m9, m6
+ mova m7, [base+pd_32]
+ paddd m8, m7 ; pd_32
+ paddd m9, m7 ; pd_32
+ mova m7, [base+subpel_h_shufB]
+ mova m6, [base+subpel_h_shufC]
+ mova m5, [base+subpel_h_shufA]
+%endif
+ movu m0, [srcq+strideq*1] ; 7
+ movu m4, [srcq+strideq*2] ; 8
+ lea srcq, [srcq+strideq*2]
+ HV_H_W8 m0, m1, m2, m3, m5, m7, m6
+ HV_H_W8 m4, m1, m2, m3, m5, m7, m6
+ mova m5, [base+pw_8192]
+ pmulhrsw m0, m5 ; H pw_8192
+ pmulhrsw m4, m5 ; H pw_8192
+ RESTORELINE_W8 6, m6
+ punpcklwd m5, m6, m0 ; 6 7 ~
+ punpcklwd m6, m0, m4 ; 7 8 ~
+ pmaddwd m1, m5, subpelv3 ; a3
+ paddd m2, m1, accuv0
+ pmaddwd m1, m6, subpelv3 ; b3
+ paddd m1, m1, accuv1 ; H + V
+ psrad m2, 6 ; (sum + 32) >> 6
+ psrad m1, 6
+ packssdw m2, m1 ; d -> w
+ movq [tmpq+wq*0], m2 ; row a, 4 int16
+ movhps [tmpq+wq*2], m2 ; row b, w int16 further
+ lea tmpq, [tmpq+wq*4] ; advance two output rows
+ sub hd, 2
+ jle .hv_w8_outer
+ SAVELINE_W8 6, m4
+ RESTORELINE_W8 1, m1
+ RESTORELINE_W8 2, m2
+ RESTORELINE_W8 3, m3
+ RESTORELINE_W8 4, m4
+ jmp .hv_w8_loop
+.hv_w8_outer:
+ movzx hd, r5w ; reset vertical loop
+%if ARCH_X86_32
+ add dword tmpm, 8 ; next column: 4 int16 in tmp
+ mov tmpq, tmpm
+ mov srcq, srcm
+ add srcq, 4 ; next column: 4 pixels in src
+ mov srcm, srcq
+%else
+ add r8, 8 ; next column: 4 int16 in tmp
+ mov tmpq, r8
+ add r6, 4 ; next column: 4 pixels in src
+ mov srcq, r6
+%endif
+ sub r5d, 1<<16 ; horizontal--
jg .hv_w8_loop0
RET