shithub: dav1d

ref: 1bd078c2e5592fde8ba045a585398a5a2c1fb603
parent: efbdf7a0dc4c17bb393e9a67289760cf19c07405
author: Victorien Le Couviour--Tuffet <victorien@videolan.org>
date: Wed Feb 12 09:25:21 EST 2020

x86: add a separate fully edged case to cdef_filter_avx2

------------------------------------------
fully edged blocks perf
------------------------------------------
before: cdef_filter_4x4_8bpc_avx2:  91.0
 after: cdef_filter_4x4_8bpc_avx2:  75.7
------------------------------------------
before: cdef_filter_4x8_8bpc_avx2: 154.6
 after: cdef_filter_4x8_8bpc_avx2: 131.8
------------------------------------------
before: cdef_filter_8x8_8bpc_avx2: 214.1
 after: cdef_filter_8x8_8bpc_avx2: 195.9
------------------------------------------
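
The change in one sentence: cdef_filter_%1x%2 (the CDEF_FILTER macro below) now checks the edge mask on entry and, when all four block borders are available (edge mask 0xf, a "fully edged" block), runs a new fast path that filters 8-bit pixels loaded straight from the frame, dispatching the per-direction/per-tap loads to the .d0k0 ... .d7k1 handlers through a jump table; blocks with missing edges fall through to the pre-existing path (now labelled .border_block), which still widens pixels to words in a padded stack buffer. Below is a minimal C sketch of that dispatch, assuming dav1d's CdefEdgeFlags bit layout; the function names are illustrative stand-ins for the asm labels, not real entry points:

    #include <stddef.h>
    #include <stdint.h>

    enum {
        CDEF_HAVE_LEFT   = 1 << 0,
        CDEF_HAVE_RIGHT  = 1 << 1,
        CDEF_HAVE_TOP    = 1 << 2,
        CDEF_HAVE_BOTTOM = 1 << 3,
        CDEF_HAVE_ALL    = 0xf,     /* "fully edged" */
    };

    /* Hypothetical stand-ins for the two asm code paths. */
    static void filter_fully_edged(uint8_t *dst, ptrdiff_t stride, int dir)
    {
        (void)dst; (void)stride; (void)dir;   /* byte path, no staging buffer */
    }

    static void filter_border(uint8_t *dst, ptrdiff_t stride, int dir)
    {
        (void)dst; (void)stride; (void)dir;   /* word path with padded buffer */
    }

    static void cdef_filter(uint8_t *dst, ptrdiff_t stride, int dir, int edges)
    {
        if (edges == CDEF_HAVE_ALL) {
            /* No tap can read out of bounds, so the padded 16-bit staging
             * buffer is skipped and taps are loaded as bytes in place. */
            filter_fully_edged(dst, stride, dir);
        } else {
            /* .border_block: absent edges are filled with 0x8000, which is
             * both a large unsigned and a negative signed value. */
            filter_border(dst, stride, dir);
        }
    }

Note how the JMP_TABLE macro in the hunk below stores each handler as a 32-bit offset relative to the table itself (dd %%base %+ .%2 - %%table), so the table needs no load-time relocations, and how CDEF_FILTER_JMP_TABLE repeats the d6k0..d7k1 and d0k0..d1k1 entries at either end so that the dir-2 and dir+2 lookups in ACCUMULATE_TAP_BYTE can index past the nominal table bounds without wrap-around logic.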

--- a/src/x86/cdef.asm
+++ b/src/x86/cdef.asm
@@ -43,6 +43,24 @@
     %endrep
 %endmacro
 
+%macro JMP_TABLE 2-*
+ %xdefine %1_jmptable %%table
+ %xdefine %%base mangle(private_prefix %+ _%1_avx2)
+ %%table:
+ %rep %0 - 1
+    dd %%base %+ .%2 - %%table
+  %rotate 1
+ %endrep
+%endmacro
+
+%macro CDEF_FILTER_JMP_TABLE 1
+JMP_TABLE cdef_filter_%1, \
+    d6k0, d6k1, d7k0, d7k1, \
+    d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
+    d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
+    d0k0, d0k1, d1k0, d1k1
+%endmacro
+
 SECTION_RODATA 64
 
 lut_perm_4x4:  db 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79
@@ -67,8 +85,19 @@
 pri_tap:       db 64, 64, 32, 32, 48, 48, 48, 48         ; left-shifted by 4
 sec_tap:       db 32, 32, 16, 16
 pd_268435568:  dd 268435568
-div_table:     dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105
+blend_4x4:     dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00
+               dd 0x80, 0x00, 0x00
+blend_4x8_0:   dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+blend_4x8_1:   dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
+               dd 0x00, 0x00
+blend_4x8_2:   dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
+               dd 0x0000
+blend_4x8_3:   dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
+               dd 0x0000, 0x0000
+blend_8x8_0:   dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
+blend_8x8_1:   dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
 pd_47130256:   dd  4,  7,  1,  3,  0,  2,  5,  6
+div_table:     dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105
 shufw_6543210x:db 12, 13, 10, 11,  8,  9,  6,  7,  4,  5,  2,  3,  0,  1, 14, 15
 shufb_lohi:    db  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15
 pw_128:        times 2 dw 128
@@ -93,31 +122,94 @@
                db  1 * 16 + 1,  2 * 16 + 2
                db  1 * 16 + 0,  2 * 16 + 1
 
+CDEF_FILTER_JMP_TABLE 4x4
+CDEF_FILTER_JMP_TABLE 4x8
+CDEF_FILTER_JMP_TABLE 8x8
+
 SECTION .text
 
-%macro ACCUMULATE_TAP 7 ; tap_offset, shift, mask, strength, mul_tap, w, stride
+%macro ACCUMULATE_TAP_BYTE 7 ; tap_offset, shift, mask, strength, mul_tap, w, h
     ; load p0/p1
+    movsxd     dirjmpq, [dirq+kq*4+%1*2*4]
+    add        dirjmpq, tableq
+    call       dirjmpq
+
+    pmaxub          m7, m5
+    pminub          m8, m5
+    pmaxub          m7, m6
+    pminub          m8, m6
+
+    ; accumulate sum[m15] over p0/p1
+%if %7 == 4
+    punpcklbw       m5, m6
+    punpcklbw       m6, m4, m4
+    psubusb         m9, m5, m6
+    psubusb         m5, m6, m5
+    por             m9, m5     ; abs_diff_p01(p01 - px)
+    pcmpeqb         m5, m9
+    por             m5, m3
+    psignb          m6, %5, m5
+    psrlw           m5, m9, %2 ; emulate 8-bit shift
+    pand            m5, %3
+    psubusb         m5, %4, m5
+    pminub          m5, m9
+    pmaddubsw       m5, m6
+    paddw          m15, m5
+%else
+    psubusb         m9, m5, m4
+    psubusb         m5, m4, m5
+    psubusb        m11, m6, m4
+    psubusb         m6, m4, m6
+    por             m9, m5      ; abs_diff_p0(p0 - px)
+    por            m11, m6      ; abs_diff_p1(p1 - px)
+    pcmpeqb         m5, m9
+    pcmpeqb         m6, m11
+    punpckhbw      m10, m9, m11
+    punpcklbw       m9, m11
+    por             m5, m3
+    por            m11, m6, m3
+    punpckhbw       m6, m5, m11
+    punpcklbw       m5, m11
+    psignb         m11, %5, m6
+    psrlw           m6, m10, %2 ; emulate 8-bit shift
+    pand            m6, %3
+    psubusb         m6, %4, m6
+    pminub          m6, m10
+    pmaddubsw       m6, m11
+    paddw          m12, m6
+    psignb         m11, %5, m5
+    psrlw           m5, m9, %2  ; emulate 8-bit shift
+    pand            m5, %3
+    psubusb         m5, %4, m5
+    pminub          m5, m9
+    pmaddubsw       m5, m11
+    paddw          m15, m5
+%endif
+%endmacro
+
+%macro ACCUMULATE_TAP_WORD 6 ; tap_offset, shift, mask, strength, mul_tap, w
+    ; load p0/p1
     movsx         offq, byte [dirq+kq+%1]       ; off1
 %if %6 == 4
-    movq           xm5, [stkq+offq*2+%7*0]      ; p0
-    movq           xm6, [stkq+offq*2+%7*2]
-    movhps         xm5, [stkq+offq*2+%7*1]
-    movhps         xm6, [stkq+offq*2+%7*3]
+    movq           xm5, [stkq+offq*2+32*0]      ; p0
+    movq           xm6, [stkq+offq*2+32*2]
+    movhps         xm5, [stkq+offq*2+32*1]
+    movhps         xm6, [stkq+offq*2+32*3]
     vinserti128     m5, xm6, 1
 %else
-    movu           xm5, [stkq+offq*2+%7*0]      ; p0
-    vinserti128     m5, [stkq+offq*2+%7*1], 1
+    movu           xm5, [stkq+offq*2+32*0]      ; p0
+    vinserti128     m5, [stkq+offq*2+32*1], 1
 %endif
     neg           offq                          ; -off1
 %if %6 == 4
-    movq           xm6, [stkq+offq*2+%7*0]      ; p1
-    movq           xm9, [stkq+offq*2+%7*2]
-    movhps         xm6, [stkq+offq*2+%7*1]
-    movhps         xm9, [stkq+offq*2+%7*3]
+    movq           xm6, [stkq+offq*2+32*0]      ; p1
+    movq           xm9, [stkq+offq*2+32*2]
+    movhps         xm6, [stkq+offq*2+32*1]
+    movhps         xm9, [stkq+offq*2+32*3]
     vinserti128     m6, xm9, 1
 %else
-    movu           xm6, [stkq+offq*2+%7*0]      ; p1
-    vinserti128     m6, [stkq+offq*2+%7*1], 1
+    movu           xm6, [stkq+offq*2+32*0]      ; p1
+    vinserti128     m6, [stkq+offq*2+32*1], 1
 %endif
    ; out of bounds values are set to a value that is both a large unsigned
     ; value and a negative signed value.
@@ -150,19 +242,868 @@
     paddw          m15, m5
 %endmacro
 
-%macro CDEF_FILTER 3 ; w, h, stride
+%macro CDEF_FILTER 2 ; w, h
 INIT_YMM avx2
-%if %1 != 4 || %2 != 8
-cglobal cdef_filter_%1x%2, 4, 9, 16, 2 * 16 + (%2+4)*%3, \
-                           dst, stride, left, top, pri, sec, stride3, dst4, edge
+cglobal cdef_filter_%1x%2, 4, 9, 0, dst, stride, left, top, \
+                                    pri, sec, dir, damping, edge
+%assign stack_offset_entry stack_offset
+    mov          edged, edgem
+    cmp          edged, 0xf
+    jne .border_block
+
+    PUSH            r9
+    PUSH           r10
+    PUSH           r11
+%if %2 == 4
+ %assign regs_used 12
+ %if WIN64
+    PUSH  r%+regs_used
+  %assign regs_used regs_used+1
+ %endif
+    ALLOC_STACK 0x60, 16
+    pmovzxbw       xm0, [leftq+1]
+    vpermq          m0, m0, q0110
+    psrldq          m1, m0, 4
+    vpalignr        m2, m0, m0, 12
+    movu    [rsp+0x10], m0
+    movu    [rsp+0x28], m1
+    movu    [rsp+0x40], m2
 %else
-cglobal cdef_filter_%1x%2, 4, 10, 16, 2 * 16 + (%2+4)*%3, \
-                           dst, stride, left, top, pri, sec, stride3, dst4, edge
+    PUSH           r12
+ %if %1 == 4
+  %assign regs_used 13
+  %if WIN64
+    PUSH  r%+regs_used
+   %assign regs_used regs_used+1
+  %endif
+    ALLOC_STACK 8*2+%1*%2*1, 16
+    pmovzxwd        m0, [leftq]
+    mova    [rsp+0x10], m0
+ %else
+    PUSH           r13
+  %assign regs_used 14
+  %if WIN64
+    PUSH  r%+regs_used
+   %assign regs_used regs_used+1
+  %endif
+    ALLOC_STACK 8*2+%1*%2*2+32, 16
+    lea            r11, [strideq*3]
+    movu           xm4, [dstq+strideq*2]
+    pmovzxwq        m0, [leftq+0]
+    pmovzxwq        m1, [leftq+8]
+    vinserti128     m4, [dstq+r11], 1
+    pmovzxbd        m2, [leftq+1]
+    pmovzxbd        m3, [leftq+9]
+    mova    [rsp+0x10], m0
+    mova    [rsp+0x30], m1
+    mova    [rsp+0x50], m2
+    mova    [rsp+0x70], m3
+    mova    [rsp+0x90], m4
+ %endif
 %endif
-%define px rsp+2*16+2*%3
+
+ DEFINE_ARGS dst, stride, left, top, pri, secdmp, zero, pridmp, damping
+    movifnidn     prid, prim
+%if UNIX64
+    movd           xm0, prid
+    movd           xm1, secdmpd
+%endif
+    mov       dampingd, r7m
+    lzcnt      pridmpd, prid
+    lzcnt      secdmpd, secdmpm
+    sub       dampingd, 31
+    xor          zerod, zerod
+    add        pridmpd, dampingd
+    cmovs      pridmpd, zerod
+    add        secdmpd, dampingd
+    cmovs      secdmpd, zerod
+    mov        [rsp+0], pridmpq                 ; pri_shift
+    mov        [rsp+8], secdmpq                 ; sec_shift
+
+ DEFINE_ARGS dst, stride, left, top, pri, secdmp, table, pridmp
+    lea         tableq, [tap_table]
+    vpbroadcastb   m13, [tableq+pridmpq]        ; pri_shift_mask
+    vpbroadcastb   m14, [tableq+secdmpq]        ; sec_shift_mask
+
+    ; pri/sec_taps[k] [4 total]
+ DEFINE_ARGS dst, stride, left, top, pri, sec, table, dir
+%if UNIX64
+    vpbroadcastb    m0, xm0                     ; pri_strength
+    vpbroadcastb    m1, xm1                     ; sec_strength
+%else
+    vpbroadcastb    m0, prim
+    vpbroadcastb    m1, secm
+%endif
+    and           prid, 1
+    lea           priq, [tableq+priq*2+8]       ; pri_taps
+    lea           secq, [tableq+12]             ; sec_taps
+
+    ; off1/2/3[k] [6 total] from [tapq+12+(dir+0/2/6)*2+k]
+    mov           dird, r6m
+    lea         tableq, [cdef_filter_%1x%2_jmptable]
+    lea           dirq, [tableq+dirq*2*4]
+%if %1 == 4
+ %if %2 == 4
+  DEFINE_ARGS dst, stride, left, top, pri, sec, \
+              table, dir, dirjmp, dst4, stride3, k
+ %else
+  DEFINE_ARGS dst, stride, left, top, pri, sec, \
+              table, dir, dirjmp, dst4, dst8, stride3, k
+    lea          dst8q, [dstq+strideq*8]
+ %endif
+%else
+  DEFINE_ARGS dst, stride, h, top1, pri, sec, \
+              table, dir, dirjmp, top2, dst4, stride3, k
+    mov             hq, -8
+    lea          top1q, [top1q+strideq*0]
+    lea          top2q, [top1q+strideq*1]
+%endif
+    lea          dst4q, [dstq+strideq*4]
+%if %1 == 4
+    lea       stride3q, [strideq*3]
+%endif
+%if %1*%2 > mmsize
+.v_loop:
+%endif
+    mov             kd, 1
+    pxor           m15, m15                     ; sum
+%if %2 == 8
+    pxor           m12, m12
+ %if %1 == 4
+    movd           xm4, [dstq +strideq*0]
+    movd           xm6, [dstq +strideq*1]
+    movd           xm5, [dstq +strideq*2]
+    movd           xm7, [dstq +stride3q ]
+    vinserti128     m4, [dst4q+strideq*0], 1
+    vinserti128     m6, [dst4q+strideq*1], 1
+    vinserti128     m5, [dst4q+strideq*2], 1
+    vinserti128     m7, [dst4q+stride3q ], 1
+    punpckldq       m4, m6
+    punpckldq       m5, m7
+ %else
+    movq           xm4, [dstq+strideq*0]
+    movq           xm5, [dstq+strideq*1]
+    vinserti128     m4, [dstq+strideq*2], 1
+    vinserti128     m5, [dstq+stride3q ], 1
+ %endif
+    punpcklqdq      m4, m5
+%else
+    movd           xm4, [dstq+strideq*0]
+    movd           xm5, [dstq+strideq*1]
+    vinserti128     m4, [dstq+strideq*2], 1
+    vinserti128     m5, [dstq+stride3q ], 1
+    punpckldq       m4, m5
+%endif
+    mova            m7, m4                      ; max
+    mova            m8, m4                      ; min
+.k_loop:
+    vpbroadcastb    m2, [priq+kq]               ; pri_taps
+    vpbroadcastb    m3, [secq+kq]               ; sec_taps
+
+    ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
+    ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
+    ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
+    dec             kq
+    jge .k_loop
+
+    vpbroadcastd   m10, [pw_2048]
+    pxor            m9, m9
+%if %2 == 4
+    punpcklbw       m4, m9
+    pcmpgtw         m9, m15
+    paddw          m15, m9
+    pmulhrsw       m15, m10
+    paddw           m4, m15
+    packuswb        m4, m4 ; clip px in [0x0,0xff]
+    pminub          m4, m7
+    pmaxub          m4, m8
+    vextracti128   xm5, m4, 1
+    movd   [dstq+strideq*0], xm4
+    movd   [dstq+strideq*2], xm5
+    pextrd [dstq+strideq*1], xm4, 1
+    pextrd [dstq+stride3q ], xm5, 1
+%else
+    pcmpgtw         m6, m9, m12
+    pcmpgtw         m5, m9, m15
+    paddw          m12, m6
+    paddw          m15, m5
+    punpckhbw       m5, m4, m9
+    punpcklbw       m4, m9
+    pmulhrsw       m12, m10
+    pmulhrsw       m15, m10
+    paddw           m5, m12
+    paddw           m4, m15
+    packuswb        m4, m5 ; clip px in [0x0,0xff]
+    pminub          m4, m7
+    pmaxub          m4, m8
+    vextracti128   xm5, m4, 1
+ %if %1 == 4
+    movd   [dstq +strideq*0], xm4
+    movd   [dst4q+strideq*0], xm5
+    pextrd [dstq +strideq*1], xm4, 1
+    pextrd [dst4q+strideq*1], xm5, 1
+    pextrd [dstq +strideq*2], xm4, 2
+    pextrd [dst4q+strideq*2], xm5, 2
+    pextrd [dstq +stride3q ], xm4, 3
+    pextrd [dst4q+stride3q ], xm5, 3
+ %else
+    movq   [dstq+strideq*0], xm4
+    movq   [dstq+strideq*2], xm5
+    movhps [dstq+strideq*1], xm4
+    movhps [dstq+stride3q ], xm5
+ %endif
+%endif
+%if %1*%2 > mmsize
+    mov           dstq, dst4q
+    lea          top1q, [rsp+0x90]
+    lea          top2q, [rsp+0xA0]
+    lea          dst4q, [dst4q+strideq*4]
+    add             hq, 4
+    jl .v_loop
+%endif
+    RET
+
+.d0k0:
+%if %1 == 4
+ %if %2 == 4
+    vpbroadcastq    m6, [dstq+strideq*1-1]
+    vpbroadcastq   m10, [dstq+strideq*2-1]
+    movd           xm5, [topq+strideq*1+1]
+    movd           xm9, [dstq+strideq*0+1]
+    psrldq         m11, m6, 2
+    psrldq         m12, m10, 2
+    vinserti128     m6, [dstq+stride3q -1], 1
+    vinserti128    m10, [dstq+strideq*4-1], 1
+    vpblendd        m5, m11, 0x10
+    vpblendd        m9, m12, 0x10
+    movu           m11, [blend_4x4+16]
+    punpckldq       m6, m10
+    punpckldq       m5, m9
+    vpblendvb       m6, [rsp+gprsize+0x28], m11
+ %else
+    movd           xm5, [topq +strideq*1+1]
+    movq           xm6, [dstq +strideq*1-1]
+    movq          xm10, [dstq +stride3q -1]
+    movq          xm11, [dst4q+strideq*1-1]
+    pinsrd         xm5, [dstq +strideq*0+1], 1
+    movhps         xm6, [dstq +strideq*2-1]
+    movhps        xm10, [dst4q+strideq*0-1]
+    movhps        xm11, [dst4q+strideq*2-1]
+    psrldq         xm9, xm6, 2
+    shufps         xm5, xm9, q2010   ; -1 +0 +1 +2
+    shufps         xm6, xm10, q2020  ; +1 +2 +3 +4
+    psrldq         xm9, xm11, 2
+    psrldq        xm10, 2
+    shufps        xm10, xm9, q2020   ; +3 +4 +5 +6
+    movd           xm9, [dst4q+stride3q -1]
+    pinsrd         xm9, [dst4q+strideq*4-1], 1
+    shufps        xm11, xm9, q1020   ; +5 +6 +7 +8
+    pmovzxbw        m9, [leftq+3]
+    vinserti128     m6, xm11, 1
+    movu           m11, [blend_4x8_0+4]
+    vinserti128     m5, xm10, 1
+    vpblendvb       m6, m9, m11
+ %endif
+%else
+    lea            r13, [blend_8x8_0+16]
+    movq           xm5, [top2q         +1]
+    vbroadcasti128 m10, [dstq+strideq*1-1]
+    vbroadcasti128 m11, [dstq+strideq*2-1]
+    movhps         xm5, [dstq+strideq*0+1]
+    vinserti128     m6, m10, [dstq+stride3q -1], 1
+    vinserti128     m9, m11, [dstq+strideq*4-1], 1
+    psrldq         m10, 2
+    psrldq         m11, 2
+    punpcklqdq      m6, m9
+    movu            m9, [r13+hq*2*1+16*1]
+    punpcklqdq     m10, m11
+    vpblendd        m5, m10, 0xF0
+    vpblendvb       m6, [rsp+gprsize+80+hq*8+64+8*1], m9
+%endif
+    ret
+.d1k0:
+.d2k0:
+.d3k0:
+%if %1 == 4
+ %if %2 == 4
+    movq           xm6, [dstq+strideq*0-1]
+    movq           xm9, [dstq+strideq*1-1]
+    vinserti128     m6, [dstq+strideq*2-1], 1
+    vinserti128     m9, [dstq+stride3q -1], 1
+    movu           m11, [rsp+gprsize+0x10]
+    pcmpeqd        m12, m12
+    psrldq          m5, m6, 2
+    psrldq         m10, m9, 2
+    psrld          m12, 24
+    punpckldq       m6, m9
+    punpckldq       m5, m10
+    vpblendvb       m6, m11, m12
+ %else
+    movq           xm6, [dstq +strideq*0-1]
+    movq           xm9, [dstq +strideq*2-1]
+    movhps         xm6, [dstq +strideq*1-1]
+    movhps         xm9, [dstq +stride3q -1]
+    movq          xm10, [dst4q+strideq*0-1]
+    movhps        xm10, [dst4q+strideq*1-1]
+    psrldq         xm5, xm6, 2
+    psrldq        xm11, xm9, 2
+    shufps         xm5, xm11, q2020
+    movq          xm11, [dst4q+strideq*2-1]
+    movhps        xm11, [dst4q+stride3q -1]
+    shufps         xm6, xm9, q2020
+    shufps         xm9, xm10, xm11, q2020
+    vinserti128     m6, xm9, 1
+    pmovzxbw        m9, [leftq+1]
+    psrldq        xm10, 2
+    psrldq        xm11, 2
+    shufps        xm10, xm11, q2020
+    vpbroadcastd   m11, [blend_4x8_0+4]
+    vinserti128     m5, xm10, 1
+    vpblendvb       m6, m9, m11
+ %endif
+%else
+    movu           xm5, [dstq+strideq*0-1]
+    movu           xm9, [dstq+strideq*1-1]
+    vinserti128     m5, [dstq+strideq*2-1], 1
+    vinserti128     m9, [dstq+stride3q -1], 1
+    mova           m10, [blend_8x8_0+16]
+    punpcklqdq      m6, m5, m9
+    vpblendvb       m6, [rsp+gprsize+80+hq*8+64], m10
+    psrldq          m5, 2
+    psrldq          m9, 2
+    punpcklqdq      m5, m9
+%endif
+    ret
+.d4k0:
+%if %1 == 4
+ %if %2 == 4
+    vpbroadcastq   m10, [dstq+strideq*1-1]
+    vpbroadcastq   m11, [dstq+strideq*2-1]
+    movd           xm6, [topq+strideq*1-1]
+    movd           xm9, [dstq+strideq*0-1]
+    psrldq          m5, m10, 2
+    psrldq         m12, m11, 2
+    vpblendd        m6, m10, 0x10
+    vpblendd        m9, m11, 0x10
+    movu           m10, [blend_4x4]
+    vinserti128     m5, [dstq+stride3q +1], 1
+    vinserti128    m12, [dstq+strideq*4+1], 1
+    punpckldq       m6, m9
+    punpckldq       m5, m12
+    vpblendvb       m6, [rsp+gprsize+0x40], m10
+ %else
+    movd           xm6, [topq +strideq*1-1]
+    movq           xm9, [dstq +strideq*1-1]
+    movq          xm10, [dstq +stride3q -1]
+    movq          xm11, [dst4q+strideq*1-1]
+    pinsrd         xm6, [dstq +strideq*0-1], 1
+    movhps         xm9, [dstq +strideq*2-1]
+    movhps        xm10, [dst4q+strideq*0-1]
+    movhps        xm11, [dst4q+strideq*2-1]
+    psrldq         xm5, xm9, 2
+    shufps         xm6, xm9, q2010
+    psrldq         xm9, xm10, 2
+    shufps         xm5, xm9, q2020
+    shufps        xm10, xm11, q2020
+    movd           xm9, [dst4q+stride3q +1]
+    vinserti128     m6, xm10, 1
+    pinsrd         xm9, [dst4q+strideq*4+1], 1
+    psrldq        xm11, 2
+    pmovzxbw       m10, [leftq-1]
+    shufps        xm11, xm9, q1020
+    movu            m9, [blend_4x8_0]
+    vinserti128     m5, xm11, 1
+    vpblendvb       m6, m10, m9
+ %endif
+%else
+    lea            r13, [blend_8x8_0+8]
+    movq           xm6, [top2q         -1]
+    vbroadcasti128  m5, [dstq+strideq*1-1]
+    vbroadcasti128  m9, [dstq+strideq*2-1]
+    movhps         xm6, [dstq+strideq*0-1]
+    movu           m11, [r13+hq*2*1+16*1]
+    punpcklqdq     m10, m5, m9
+    vinserti128     m5, [dstq+stride3q -1], 1
+    vinserti128     m9, [dstq+strideq*4-1], 1
+    vpblendd        m6, m10, 0xF0
+    vpblendvb       m6, [rsp+gprsize+80+hq*8+64-8*1], m11
+    psrldq          m5, 2
+    psrldq          m9, 2
+    punpcklqdq      m5, m9
+%endif
+    ret
+.d5k0:
+.d6k0:
+.d7k0:
+%if %1 == 4
+ %if %2 == 4
+    movd           xm6, [topq+strideq*1  ]
+    vpbroadcastd    m5, [dstq+strideq*1  ]
+    vpbroadcastd    m9, [dstq+strideq*2  ]
+    vpblendd       xm6, [dstq+strideq*0-4], 0x2
+    vpblendd        m5, m9, 0x22
+    vpblendd        m6, m5, 0x30
+    vinserti128     m5, [dstq+stride3q    ], 1
+    vpblendd        m5, [dstq+strideq*4-20], 0x20
+ %else
+    movd           xm6, [topq +strideq*1]
+    movd           xm5, [dstq +strideq*1]
+    movd           xm9, [dstq +stride3q ]
+    movd          xm10, [dst4q+strideq*1]
+    movd          xm11, [dst4q+stride3q ]
+    pinsrd         xm6, [dstq +strideq*0], 1
+    pinsrd         xm5, [dstq +strideq*2], 1
+    pinsrd         xm9, [dst4q+strideq*0], 1
+    pinsrd        xm10, [dst4q+strideq*2], 1
+    pinsrd        xm11, [dst4q+strideq*4], 1
+    punpcklqdq     xm6, xm5
+    punpcklqdq     xm5, xm9
+    punpcklqdq     xm9, xm10
+    punpcklqdq    xm10, xm11
+    vinserti128     m6, xm9, 1
+    vinserti128     m5, xm10, 1
+ %endif
+%else
+    movq           xm6, [top2q         ]
+    movq           xm5, [dstq+strideq*1]
+    movq           xm9, [dstq+stride3q ]
+    movhps         xm6, [dstq+strideq*0]
+    movhps         xm5, [dstq+strideq*2]
+    movhps         xm9, [dstq+strideq*4]
+    vinserti128     m6, xm5, 1
+    vinserti128     m5, xm9, 1
+%endif
+    ret
+.d0k1:
+%if %1 == 4
+ %if %2 == 4
+    movd           xm6, [dstq +strideq*2-2]
+    movd           xm9, [dstq +stride3q -2]
+    movd           xm5, [topq +strideq*0+2]
+    movd          xm10, [topq +strideq*1+2]
+    pinsrw         xm6, [leftq+4], 0
+    pinsrw         xm9, [leftq+6], 0
+    vinserti128     m5, [dstq +strideq*0+2], 1
+    vinserti128    m10, [dstq +strideq*1+2], 1
+    vinserti128     m6, [dst4q+strideq*0-2], 1
+    vinserti128     m9, [dst4q+strideq*1-2], 1
+    punpckldq       m5, m10
+    punpckldq       m6, m9
+ %else
+    movq           xm6, [dstq +strideq*2-2]
+    movd          xm10, [dst4q+strideq*2-2]
+    movd           xm5, [topq +strideq*0+2]
+    movq           xm9, [dst4q+strideq*0-2]
+    movhps         xm6, [dstq +stride3q -2]
+    pinsrw        xm10, [dst4q+stride3q   ], 3
+    pinsrd         xm5, [topq +strideq*1+2], 1
+    movhps         xm9, [dst4q+strideq*1-2]
+    pinsrd        xm10, [dst8q+strideq*0-2], 2
+    pinsrd         xm5, [dstq +strideq*0+2], 2
+    pinsrd        xm10, [dst8q+strideq*1-2], 3
+    pinsrd         xm5, [dstq +strideq*1+2], 3
+    shufps        xm11, xm6, xm9, q3131
+    shufps         xm6, xm9, q2020
+    movu            m9, [blend_4x8_3+8]
+    vinserti128     m6, xm10, 1
+    vinserti128     m5, xm11, 1
+    vpblendvb       m6, [rsp+gprsize+16+8], m9
+ %endif
+%else
+    lea            r13, [blend_8x8_1+16]
+    movq           xm6, [dstq +strideq*2-2]
+    movq           xm9, [dstq +stride3q -2]
+    movq           xm5, [top1q          +2]
+    movq          xm10, [top2q          +2]
+    movu           m11, [r13+hq*2*2+16*2]
+    vinserti128     m6, [dst4q+strideq*0-2], 1
+    vinserti128     m9, [dst4q+strideq*1-2], 1
+    vinserti128     m5, [dstq +strideq*0+2], 1
+    vinserti128    m10, [dstq +strideq*1+2], 1
+    punpcklqdq      m6, m9
+    punpcklqdq      m5, m10
+    vpblendvb       m6, [rsp+gprsize+16+hq*8+64+8*2], m11
+%endif
+    ret
+.d1k1:
+%if %1 == 4
+ %if %2 == 4
+    vpbroadcastq    m6, [dstq+strideq*1-2]
+    vpbroadcastq    m9, [dstq+strideq*2-2]
+    movd           xm5, [topq+strideq*1+2]
+    movd          xm10, [dstq+strideq*0+2]
+    psrldq         m11, m6, 4
+    psrldq         m12, m9, 4
+    vpblendd        m5, m11, 0x10
+    movq          xm11, [leftq+2]
+    vinserti128     m6, [dstq+stride3q -2], 1
+    punpckldq     xm11, xm11
+    vpblendd       m10, m12, 0x10
+    pcmpeqd        m12, m12
+    pmovzxwd       m11, xm11
+    psrld          m12, 16
+    punpckldq       m6, m9
+    vpbroadcastd    m9, [dstq+strideq*4-2]
+    vpblendvb       m6, m11, m12
+    punpckldq       m5, m10
+    vpblendd        m6, m9, 0x20
+ %else
+    movd           xm5, [topq +strideq*1+2]
+    movq           xm6, [dstq +strideq*1-2]
+    movq           xm9, [dstq +stride3q -2]
+    movq          xm10, [dst4q+strideq*1-2]
+    movd          xm11, [dst4q+stride3q -2]
+    pinsrd         xm5, [dstq +strideq*0+2], 1
+    movhps         xm6, [dstq +strideq*2-2]
+    movhps         xm9, [dst4q+strideq*0-2]
+    movhps        xm10, [dst4q+strideq*2-2]
+    pinsrd        xm11, [dst4q+strideq*4-2], 1
+    shufps         xm5, xm6, q3110
+    shufps         xm6, xm9, q2020
+    shufps         xm9, xm10, q3131
+    shufps        xm10, xm11, q1020
+    movu           m11, [blend_4x8_2+4]
+    vinserti128     m6, xm10, 1
+    vinserti128     m5, xm9, 1
+    vpblendvb       m6, [rsp+gprsize+16+4], m11
+ %endif
+%else
+    lea            r13, [blend_8x8_1+16]
+    movq           xm5, [top2q         +2]
+    vbroadcasti128  m6, [dstq+strideq*1-2]
+    vbroadcasti128  m9, [dstq+strideq*2-2]
+    movhps         xm5, [dstq+strideq*0+2]
+    shufps         m10, m6, m9, q2121
+    vinserti128     m6, [dstq+stride3q -2], 1
+    vinserti128     m9, [dstq+strideq*4-2], 1
+    movu           m11, [r13+hq*2*1+16*1]
+    vpblendd        m5, m10, 0xF0
+    punpcklqdq      m6, m9
+    vpblendvb       m6, [rsp+gprsize+16+hq*8+64+8*1], m11
+%endif
+    ret
+.d2k1:
+%if %1 == 4
+ %if %2 == 4
+    movq          xm11, [leftq]
+    movq           xm6, [dstq+strideq*0-2]
+    movq           xm9, [dstq+strideq*1-2]
+    vinserti128     m6, [dstq+strideq*2-2], 1
+    vinserti128     m9, [dstq+stride3q -2], 1
+    punpckldq     xm11, xm11
+    psrldq          m5, m6, 4
+    psrldq         m10, m9, 4
+    pmovzxwd       m11, xm11
+    punpckldq       m6, m9
+    punpckldq       m5, m10
+    pblendw         m6, m11, 0x05
+ %else
+    movq           xm5, [dstq +strideq*0-2]
+    movq           xm9, [dstq +strideq*2-2]
+    movq          xm10, [dst4q+strideq*0-2]
+    movq          xm11, [dst4q+strideq*2-2]
+    movhps         xm5, [dstq +strideq*1-2]
+    movhps         xm9, [dstq +stride3q -2]
+    movhps        xm10, [dst4q+strideq*1-2]
+    movhps        xm11, [dst4q+stride3q -2]
+    shufps         xm6, xm5, xm9, q2020
+    shufps         xm5, xm9, q3131
+    shufps         xm9, xm10, xm11, q2020
+    shufps        xm10, xm11, q3131
+    pmovzxwd       m11, [leftq]
+    vinserti128     m6, xm9, 1
+    vinserti128     m5, xm10, 1
+    pblendw         m6, m11, 0x55
+ %endif
+%else
+    mova           m11, [rsp+gprsize+16+hq*8+64]
+    movu           xm5, [dstq+strideq*0-2]
+    movu           xm9, [dstq+strideq*1-2]
+    vinserti128     m5, [dstq+strideq*2-2], 1
+    vinserti128     m9, [dstq+stride3q -2], 1
+    shufps          m6, m5, m9, q1010
+    shufps          m5, m9, q2121
+    pblendw         m6, m11, 0x11
+%endif
+    ret
+.d3k1:
+%if %1 == 4
+ %if %2 == 4
+    vpbroadcastq   m11, [dstq+strideq*1-2]
+    vpbroadcastq   m12, [dstq+strideq*2-2]
+    movd           xm6, [topq+strideq*1-2]
+    movd           xm9, [dstq+strideq*0-2]
+    pblendw        m11, [leftq-16+2], 0x01
+    pblendw        m12, [leftq-16+4], 0x01
+    pinsrw         xm9, [leftq- 0+0], 0
+    psrldq          m5, m11, 4
+    psrldq         m10, m12, 4
+    vinserti128     m5, [dstq+stride3q +2], 1
+    vinserti128    m10, [dstq+strideq*4+2], 1
+    vpblendd        m6, m11, 0x10
+    vpblendd        m9, m12, 0x10
+    punpckldq       m6, m9
+    punpckldq       m5, m10
+ %else
+    movd           xm6, [topq +strideq*1-2]
+    movq           xm5, [dstq +strideq*1-2]
+    movq           xm9, [dstq +stride3q -2]
+    movq          xm10, [dst4q+strideq*1-2]
+    movd          xm11, [dst4q+stride3q +2]
+    pinsrw         xm6, [dstq +strideq*0  ], 3
+    movhps         xm5, [dstq +strideq*2-2]
+    movhps         xm9, [dst4q+strideq*0-2]
+    movhps        xm10, [dst4q+strideq*2-2]
+    pinsrd        xm11, [dst4q+strideq*4+2], 1
+    shufps         xm6, xm5, q2010
+    shufps         xm5, xm9, q3131
+    shufps         xm9, xm10, q2020
+    shufps        xm10, xm11, q1031
+    movu           m11, [blend_4x8_2]
+    vinserti128     m6, xm9, 1
+    vinserti128     m5, xm10, 1
+    vpblendvb       m6, [rsp+gprsize+16-4], m11
+ %endif
+%else
+    lea            r13, [blend_8x8_1+8]
+    movq           xm6, [top2q         -2]
+    vbroadcasti128  m5, [dstq+strideq*1-2]
+    vbroadcasti128 m10, [dstq+strideq*2-2]
+    movhps         xm6, [dstq+strideq*0-2]
+    punpcklqdq      m9, m5, m10
+    vinserti128     m5, [dstq+stride3q -2], 1
+    vinserti128    m10, [dstq+strideq*4-2], 1
+    movu           m11, [r13+hq*2*1+16*1]
+    vpblendd        m6, m9, 0xF0
+    shufps          m5, m10, q2121
+    vpblendvb       m6, [rsp+gprsize+16+hq*8+64-8*1], m11
+%endif
+    ret
+.d4k1:
+%if %1 == 4
+ %if %2 == 4
+    vinserti128     m6, [dstq +strideq*0-2], 1
+    vinserti128     m9, [dstq +strideq*1-2], 1
+    movd           xm5, [dstq +strideq*2+2]
+    movd          xm10, [dstq +stride3q +2]
+    pblendw         m6, [leftq-16+0], 0x01
+    pblendw         m9, [leftq-16+2], 0x01
+    vinserti128     m5, [dst4q+strideq*0+2], 1
+    vinserti128    m10, [dst4q+strideq*1+2], 1
+    vpblendd        m6, [topq +strideq*0-2], 0x01
+    vpblendd        m9, [topq +strideq*1-2], 0x01
+    punpckldq       m5, m10
+    punpckldq       m6, m9
+ %else
+    movd           xm6, [topq +strideq*0-2]
+    movq           xm5, [dstq +strideq*2-2]
+    movq           xm9, [dst4q+strideq*0-2]
+    movd          xm10, [dst4q+strideq*2+2]
+    pinsrd         xm6, [topq +strideq*1-2], 1
+    movhps         xm5, [dstq +stride3q -2]
+    movhps         xm9, [dst4q+strideq*1-2]
+    pinsrd        xm10, [dst4q+stride3q +2], 1
+    pinsrd         xm6, [dstq +strideq*0-2], 2
+    pinsrd        xm10, [dst8q+strideq*0+2], 2
+    pinsrd         xm6, [dstq +strideq*1-2], 3
+    pinsrd        xm10, [dst8q+strideq*1+2], 3
+    shufps        xm11, xm5, xm9, q2020
+    shufps         xm5, xm9, q3131
+    movu            m9, [blend_4x8_3]
+    vinserti128     m6, xm11, 1
+    vinserti128     m5, xm10, 1
+    vpblendvb       m6, [rsp+gprsize+16-8], m9
+ %endif
+%else
+    lea            r13, [blend_8x8_1]
+    movu           m11, [r13+hq*2*2+16*2]
+    movq           xm6, [top1q          -2]
+    movq           xm9, [top2q          -2]
+    movq           xm5, [dstq +strideq*2+2]
+    movq          xm10, [dstq +stride3q +2]
+    vinserti128     m6, [dstq +strideq*0-2], 1
+    vinserti128     m9, [dstq +strideq*1-2], 1
+    vinserti128     m5, [dst4q+strideq*0+2], 1
+    vinserti128    m10, [dst4q+strideq*1+2], 1
+    punpcklqdq      m6, m9
+    vpblendvb       m6, [rsp+gprsize+16+hq*8+64-8*2], m11
+    punpcklqdq      m5, m10
+%endif
+    ret
+.d5k1:
+%if %1 == 4
+ %if %2 == 4
+    movd           xm6, [topq +strideq*0-1]
+    movd           xm9, [topq +strideq*1-1]
+    movd           xm5, [dstq +strideq*2+1]
+    movd          xm10, [dstq +stride3q +1]
+    pcmpeqd        m12, m12
+    pmovzxbw       m11, [leftq-8+1]
+    psrld          m12, 24
+    vinserti128     m6, [dstq +strideq*0-1], 1
+    vinserti128     m9, [dstq +strideq*1-1], 1
+    vinserti128     m5, [dst4q+strideq*0+1], 1
+    vinserti128    m10, [dst4q+strideq*1+1], 1
+    punpckldq       m6, m9
+    pxor            m9, m9
+    vpblendd       m12, m9, 0x0F
+    punpckldq       m5, m10
+    vpblendvb       m6, m11, m12
+ %else
+    movd           xm6, [topq +strideq*0-1]
+    movq           xm5, [dstq +strideq*2-1]
+    movq           xm9, [dst4q+strideq*0-1]
+    movd          xm10, [dst4q+strideq*2+1]
+    pinsrd         xm6, [topq +strideq*1-1], 1
+    movhps         xm5, [dstq +stride3q -1]
+    movhps         xm9, [dst4q+strideq*1-1]
+    pinsrd        xm10, [dst4q+stride3q +1], 1
+    pinsrd         xm6, [dstq +strideq*0-1], 2
+    pinsrd        xm10, [dst8q+strideq*0+1], 2
+    pinsrd         xm6, [dstq +strideq*1-1], 3
+    pinsrd        xm10, [dst8q+strideq*1+1], 3
+    shufps        xm11, xm5, xm9, q2020
+    vinserti128     m6, xm11, 1
+    pmovzxbw       m11, [leftq-3]
+    psrldq         xm5, 2
+    psrldq         xm9, 2
+    shufps         xm5, xm9, q2020
+    movu            m9, [blend_4x8_1]
+    vinserti128     m5, xm10, 1
+    vpblendvb       m6, m11, m9
+ %endif
+%else
+    lea            r13, [blend_8x8_0]
+    movu           m11, [r13+hq*2*2+16*2]
+    movq           xm6, [top1q          -1]
+    movq           xm9, [top2q          -1]
+    movq           xm5, [dstq +strideq*2+1]
+    movq          xm10, [dstq +stride3q +1]
+    vinserti128     m6, [dstq +strideq*0-1], 1
+    vinserti128     m9, [dstq +strideq*1-1], 1
+    vinserti128     m5, [dst4q+strideq*0+1], 1
+    vinserti128    m10, [dst4q+strideq*1+1], 1
+    punpcklqdq      m6, m9
+    punpcklqdq      m5, m10
+    vpblendvb       m6, [rsp+gprsize+80+hq*8+64-8*2], m11
+%endif
+    ret
+.d6k1:
+%if %1 == 4
+ %if %2 == 4
+    movd           xm6, [topq +strideq*0]
+    movd           xm9, [topq +strideq*1]
+    movd           xm5, [dstq +strideq*2]
+    movd          xm10, [dstq +stride3q ]
+    vinserti128     m6, [dstq +strideq*0], 1
+    vinserti128     m9, [dstq +strideq*1], 1
+    vinserti128     m5, [dst4q+strideq*0], 1
+    vinserti128    m10, [dst4q+strideq*1], 1
+    punpckldq       m6, m9
+    punpckldq       m5, m10
+ %else
+    movd           xm5, [dstq +strideq*2]
+    movd           xm6, [topq +strideq*0]
+    movd           xm9, [dst4q+strideq*2]
+    pinsrd         xm5, [dstq +stride3q ], 1
+    pinsrd         xm6, [topq +strideq*1], 1
+    pinsrd         xm9, [dst4q+stride3q ], 1
+    pinsrd         xm5, [dst4q+strideq*0], 2
+    pinsrd         xm6, [dstq +strideq*0], 2
+    pinsrd         xm9, [dst8q+strideq*0], 2
+    pinsrd         xm5, [dst4q+strideq*1], 3
+    pinsrd         xm6, [dstq +strideq*1], 3
+    pinsrd         xm9, [dst8q+strideq*1], 3
+    vinserti128     m6, xm5, 1
+    vinserti128     m5, xm9, 1
+ %endif
+%else
+    movq           xm5, [dstq +strideq*2]
+    movq           xm9, [dst4q+strideq*0]
+    movq           xm6, [top1q          ]
+    movq          xm10, [dstq +strideq*0]
+    movhps         xm5, [dstq +stride3q ]
+    movhps         xm9, [dst4q+strideq*1]
+    movhps         xm6, [top2q          ]
+    movhps        xm10, [dstq +strideq*1]
+    vinserti128     m5, xm9, 1
+    vinserti128     m6, xm10, 1
+%endif
+    ret
+.d7k1:
+%if %1 == 4
+ %if %2 == 4
+    movd           xm5, [dstq +strideq*2-1]
+    movd           xm9, [dstq +stride3q -1]
+    movd           xm6, [topq +strideq*0+1]
+    movd          xm10, [topq +strideq*1+1]
+    pinsrb         xm5, [leftq+ 5], 0
+    pinsrb         xm9, [leftq+ 7], 0
+    vinserti128     m6, [dstq +strideq*0+1], 1
+    vinserti128    m10, [dstq +strideq*1+1], 1
+    vinserti128     m5, [dst4q+strideq*0-1], 1
+    vinserti128     m9, [dst4q+strideq*1-1], 1
+    punpckldq       m6, m10
+    punpckldq       m5, m9
+ %else
+    movd           xm6, [topq +strideq*0+1]
+    movq           xm9, [dstq +strideq*2-1]
+    movq          xm10, [dst4q+strideq*0-1]
+    movd          xm11, [dst4q+strideq*2-1]
+    pinsrd         xm6, [topq +strideq*1+1], 1
+    movhps         xm9, [dstq +stride3q -1]
+    movhps        xm10, [dst4q+strideq*1-1]
+    pinsrd        xm11, [dst4q+stride3q -1], 1
+    pinsrd         xm6, [dstq +strideq*0+1], 2
+    pinsrd        xm11, [dst8q+strideq*0-1], 2
+    pinsrd         xm6, [dstq +strideq*1+1], 3
+    pinsrd        xm11, [dst8q+strideq*1-1], 3
+    shufps         xm5, xm9, xm10, q2020
+    vinserti128     m5, xm11, 1
+    pmovzxbw       m11, [leftq+5]
+    psrldq         xm9, 2
+    psrldq        xm10, 2
+    shufps         xm9, xm10, q2020
+    movu           m10, [blend_4x8_1+8]
+    vinserti128     m6, xm9, 1
+    vpblendvb       m5, m11, m10
+ %endif
+%else
+    lea            r13, [blend_8x8_0+16]
+    movq           xm5, [dstq +strideq*2-1]
+    movq           xm9, [dst4q+strideq*0-1]
+    movq           xm6, [top1q          +1]
+    movq          xm10, [dstq +strideq*0+1]
+    movhps         xm5, [dstq +stride3q -1]
+    movhps         xm9, [dst4q+strideq*1-1]
+    movhps         xm6, [top2q          +1]
+    movhps        xm10, [dstq +strideq*1+1]
+    movu           m11, [r13+hq*2*2+16*2]
+    vinserti128     m5, xm9, 1
+    vinserti128     m6, xm10, 1
+    vpblendvb       m5, [rsp+gprsize+80+hq*8+64+8*2], m11
+%endif
+    ret
+
+.border_block:
+ DEFINE_ARGS dst, stride, left, top, pri, sec, stride3, dst4, edge
+%define rstk rsp
+%assign stack_offset stack_offset_entry
+%if %1 == 4 && %2 == 8
+    PUSH            r9
+ %assign regs_used 10
+%else
+ %assign regs_used 9
+%endif
+%if WIN64
+    PUSH  r%+regs_used
+ %assign regs_used regs_used+1
+%endif
+    ALLOC_STACK 2*16+(%2+4)*32, 16
+%define px rsp+2*16+2*32
+
     pcmpeqw        m14, m14
     psllw          m14, 15                  ; 0x8000
-    mov          edged, r8m
 
     ; prepare pixel buffers - body/right
 %if %1 == 4
@@ -178,19 +1119,19 @@
     pmovzxbw        m2, [dstq+strideq*1]
     pmovzxbw        m3, [dstq+strideq*2]
     pmovzxbw        m4, [dstq+stride3q]
-    mova     [px+0*%3], m1
-    mova     [px+1*%3], m2
-    mova     [px+2*%3], m3
-    mova     [px+3*%3], m4
+    mova     [px+0*32], m1
+    mova     [px+1*32], m2
+    mova     [px+2*32], m3
+    mova     [px+3*32], m4
 %if %2 == 8
     pmovzxbw        m1, [dst4q+strideq*0]
     pmovzxbw        m2, [dst4q+strideq*1]
     pmovzxbw        m3, [dst4q+strideq*2]
     pmovzxbw        m4, [dst4q+stride3q]
-    mova     [px+4*%3], m1
-    mova     [px+5*%3], m2
-    mova     [px+6*%3], m3
-    mova     [px+7*%3], m4
+    mova     [px+4*32], m1
+    mova     [px+5*32], m2
+    mova     [px+6*32], m3
+    mova     [px+7*32], m4
 %endif
     jmp .body_done
 .no_right:
@@ -203,24 +1144,24 @@
     pmovzxbw       xm2, xm2
     pmovzxbw       xm3, xm3
     pmovzxbw       xm4, xm4
-    movq     [px+0*%3], xm1
-    movq     [px+1*%3], xm2
-    movq     [px+2*%3], xm3
-    movq     [px+3*%3], xm4
+    movq     [px+0*32], xm1
+    movq     [px+1*32], xm2
+    movq     [px+2*32], xm3
+    movq     [px+3*32], xm4
 %else
     pmovzxbw       xm1, [dstq+strideq*0]
     pmovzxbw       xm2, [dstq+strideq*1]
     pmovzxbw       xm3, [dstq+strideq*2]
     pmovzxbw       xm4, [dstq+stride3q]
-    mova     [px+0*%3], xm1
-    mova     [px+1*%3], xm2
-    mova     [px+2*%3], xm3
-    mova     [px+3*%3], xm4
+    mova     [px+0*32], xm1
+    mova     [px+1*32], xm2
+    mova     [px+2*32], xm3
+    mova     [px+3*32], xm4
 %endif
-    movd [px+0*%3+%1*2], xm14
-    movd [px+1*%3+%1*2], xm14
-    movd [px+2*%3+%1*2], xm14
-    movd [px+3*%3+%1*2], xm14
+    movd [px+0*32+%1*2], xm14
+    movd [px+1*32+%1*2], xm14
+    movd [px+2*32+%1*2], xm14
+    movd [px+3*32+%1*2], xm14
 %if %2 == 8
  %if %1 == 4
     movd           xm1, [dst4q+strideq*0]
@@ -231,24 +1172,24 @@
     pmovzxbw       xm2, xm2
     pmovzxbw       xm3, xm3
     pmovzxbw       xm4, xm4
-    movq     [px+4*%3], xm1
-    movq     [px+5*%3], xm2
-    movq     [px+6*%3], xm3
-    movq     [px+7*%3], xm4
+    movq     [px+4*32], xm1
+    movq     [px+5*32], xm2
+    movq     [px+6*32], xm3
+    movq     [px+7*32], xm4
  %else
     pmovzxbw       xm1, [dst4q+strideq*0]
     pmovzxbw       xm2, [dst4q+strideq*1]
     pmovzxbw       xm3, [dst4q+strideq*2]
     pmovzxbw       xm4, [dst4q+stride3q]
-    mova     [px+4*%3], xm1
-    mova     [px+5*%3], xm2
-    mova     [px+6*%3], xm3
-    mova     [px+7*%3], xm4
+    mova     [px+4*32], xm1
+    mova     [px+5*32], xm2
+    mova     [px+6*32], xm3
+    mova     [px+7*32], xm4
  %endif
-    movd [px+4*%3+%1*2], xm14
-    movd [px+5*%3+%1*2], xm14
-    movd [px+6*%3+%1*2], xm14
-    movd [px+7*%3+%1*2], xm14
+    movd [px+4*32+%1*2], xm14
+    movd [px+5*32+%1*2], xm14
+    movd [px+6*32+%1*2], xm14
+    movd [px+7*32+%1*2], xm14
 %endif
 .body_done:
 
@@ -261,16 +1202,16 @@
     jz .top_no_right
     pmovzxbw        m1, [topq+strideq*0-(%1/2)]
     pmovzxbw        m2, [topq+strideq*1-(%1/2)]
-    movu  [px-2*%3-%1], m1
-    movu  [px-1*%3-%1], m2
+    movu  [px-2*32-%1], m1
+    movu  [px-1*32-%1], m2
     jmp .top_done
 .top_no_right:
     pmovzxbw        m1, [topq+strideq*0-%1]
     pmovzxbw        m2, [topq+strideq*1-%1]
-    movu [px-2*%3-%1*2], m1
-    movu [px-1*%3-%1*2], m2
-    movd [px-2*%3+%1*2], xm14
-    movd [px-1*%3+%1*2], xm14
+    movu [px-2*32-%1*2], m1
+    movu [px-1*32-%1*2], m2
+    movd [px-2*32+%1*2], xm14
+    movd [px-1*32+%1*2], xm14
     jmp .top_done
 .top_no_left:
     test         edgeb, 2                   ; have_right
@@ -277,10 +1218,10 @@
     jz .top_no_left_right
     pmovzxbw        m1, [topq+strideq*0]
     pmovzxbw        m2, [topq+strideq*1]
-    mova   [px-2*%3+0], m1
-    mova   [px-1*%3+0], m2
-    movd   [px-2*%3-4], xm14
-    movd   [px-1*%3-4], xm14
+    mova   [px-2*32+0], m1
+    mova   [px-1*32+0], m2
+    movd   [px-2*32-4], xm14
+    movd   [px-1*32-4], xm14
     jmp .top_done
 .top_no_left_right:
 %if %1 == 4
@@ -287,22 +1228,22 @@
     movd           xm1, [topq+strideq*0]
     pinsrd         xm1, [topq+strideq*1], 1
     pmovzxbw       xm1, xm1
-    movq   [px-2*%3+0], xm1
-    movhps [px-1*%3+0], xm1
+    movq   [px-2*32+0], xm1
+    movhps [px-1*32+0], xm1
 %else
     pmovzxbw       xm1, [topq+strideq*0]
     pmovzxbw       xm2, [topq+strideq*1]
-    mova   [px-2*%3+0], xm1
-    mova   [px-1*%3+0], xm2
+    mova   [px-2*32+0], xm1
+    mova   [px-1*32+0], xm2
 %endif
-    movd   [px-2*%3-4], xm14
-    movd   [px-1*%3-4], xm14
-    movd [px-2*%3+%1*2], xm14
-    movd [px-1*%3+%1*2], xm14
+    movd   [px-2*32-4], xm14
+    movd   [px-1*32-4], xm14
+    movd [px-2*32+%1*2], xm14
+    movd [px-1*32+%1*2], xm14
     jmp .top_done
 .no_top:
-    movu   [px-2*%3-%1], m14
-    movu   [px-1*%3-%1], m14
+    movu   [px-2*32-%1], m14
+    movu   [px-1*32-%1], m14
 .top_done:
 
     ; left
@@ -312,27 +1253,27 @@
 %if %2 == 8
     pmovzxbw       xm2, [leftq+ 8]
 %endif
-    movd   [px+0*%3-4], xm1
-    pextrd [px+1*%3-4], xm1, 1
-    pextrd [px+2*%3-4], xm1, 2
-    pextrd [px+3*%3-4], xm1, 3
+    movd   [px+0*32-4], xm1
+    pextrd [px+1*32-4], xm1, 1
+    pextrd [px+2*32-4], xm1, 2
+    pextrd [px+3*32-4], xm1, 3
 %if %2 == 8
-    movd   [px+4*%3-4], xm2
-    pextrd [px+5*%3-4], xm2, 1
-    pextrd [px+6*%3-4], xm2, 2
-    pextrd [px+7*%3-4], xm2, 3
+    movd   [px+4*32-4], xm2
+    pextrd [px+5*32-4], xm2, 1
+    pextrd [px+6*32-4], xm2, 2
+    pextrd [px+7*32-4], xm2, 3
 %endif
     jmp .left_done
 .no_left:
-    movd   [px+0*%3-4], xm14
-    movd   [px+1*%3-4], xm14
-    movd   [px+2*%3-4], xm14
-    movd   [px+3*%3-4], xm14
+    movd   [px+0*32-4], xm14
+    movd   [px+1*32-4], xm14
+    movd   [px+2*32-4], xm14
+    movd   [px+3*32-4], xm14
 %if %2 == 8
-    movd   [px+4*%3-4], xm14
-    movd   [px+5*%3-4], xm14
-    movd   [px+6*%3-4], xm14
-    movd   [px+7*%3-4], xm14
+    movd   [px+4*32-4], xm14
+    movd   [px+5*32-4], xm14
+    movd   [px+6*32-4], xm14
+    movd   [px+7*32-4], xm14
 %endif
 .left_done:
 
@@ -347,19 +1288,19 @@
     jz .bottom_no_right
     pmovzxbw        m1, [dst8q-(%1/2)]
     pmovzxbw        m2, [dst8q+strideq-(%1/2)]
-    movu   [px+(%2+0)*%3-%1], m1
-    movu   [px+(%2+1)*%3-%1], m2
+    movu   [px+(%2+0)*32-%1], m1
+    movu   [px+(%2+1)*32-%1], m2
     jmp .bottom_done
 .bottom_no_right:
     pmovzxbw        m1, [dst8q-%1]
     pmovzxbw        m2, [dst8q+strideq-%1]
-    movu  [px+(%2+0)*%3-%1*2], m1
-    movu  [px+(%2+1)*%3-%1*2], m2
+    movu  [px+(%2+0)*32-%1*2], m1
+    movu  [px+(%2+1)*32-%1*2], m2
 %if %1 == 8
-    movd  [px+(%2-1)*%3+%1*2], xm14                ; overwritten by previous movu
+    movd  [px+(%2-1)*32+%1*2], xm14                ; overwritten by previous movu
 %endif
-    movd  [px+(%2+0)*%3+%1*2], xm14
-    movd  [px+(%2+1)*%3+%1*2], xm14
+    movd  [px+(%2+0)*32+%1*2], xm14
+    movd  [px+(%2+1)*32+%1*2], xm14
     jmp .bottom_done
 .bottom_no_left:
     test          edgeb, 2                  ; have_right
@@ -366,10 +1307,10 @@
     jz .bottom_no_left_right
     pmovzxbw        m1, [dst8q]
     pmovzxbw        m2, [dst8q+strideq]
-    mova   [px+(%2+0)*%3+0], m1
-    mova   [px+(%2+1)*%3+0], m2
-    movd   [px+(%2+0)*%3-4], xm14
-    movd   [px+(%2+1)*%3-4], xm14
+    mova   [px+(%2+0)*32+0], m1
+    mova   [px+(%2+1)*32+0], m2
+    movd   [px+(%2+0)*32-4], xm14
+    movd   [px+(%2+1)*32-4], xm14
     jmp .bottom_done
 .bottom_no_left_right:
 %if %1 == 4
@@ -376,22 +1317,22 @@
     movd           xm1, [dst8q]
     pinsrd         xm1, [dst8q+strideq], 1
     pmovzxbw       xm1, xm1
-    movq   [px+(%2+0)*%3+0], xm1
-    movhps [px+(%2+1)*%3+0], xm1
+    movq   [px+(%2+0)*32+0], xm1
+    movhps [px+(%2+1)*32+0], xm1
 %else
     pmovzxbw       xm1, [dst8q]
     pmovzxbw       xm2, [dst8q+strideq]
-    mova   [px+(%2+0)*%3+0], xm1
-    mova   [px+(%2+1)*%3+0], xm2
+    mova   [px+(%2+0)*32+0], xm1
+    mova   [px+(%2+1)*32+0], xm2
 %endif
-    movd   [px+(%2+0)*%3-4], xm14
-    movd   [px+(%2+1)*%3-4], xm14
-    movd  [px+(%2+0)*%3+%1*2], xm14
-    movd  [px+(%2+1)*%3+%1*2], xm14
+    movd   [px+(%2+0)*32-4], xm14
+    movd   [px+(%2+1)*32-4], xm14
+    movd  [px+(%2+0)*32+%1*2], xm14
+    movd  [px+(%2+1)*32+%1*2], xm14
     jmp .bottom_done
 .no_bottom:
-    movu   [px+(%2+0)*%3-%1], m14
-    movu   [px+(%2+1)*%3-%1], m14
+    movu   [px+(%2+0)*32-%1], m14
+    movu   [px+(%2+1)*32-%1], m14
 .bottom_done:
 
     ; actual filter
@@ -452,32 +1393,32 @@
     lea           stkq, [px]
     pxor           m11, m11
 %if %1*%2*2/mmsize > 1
-.v_loop:
+.border_v_loop:
 %endif
     mov             kd, 1
 %if %1 == 4
-    movq           xm4, [stkq+%3*0]
-    movhps         xm4, [stkq+%3*1]
-    movq           xm5, [stkq+%3*2]
-    movhps         xm5, [stkq+%3*3]
+    movq           xm4, [stkq+32*0]
+    movhps         xm4, [stkq+32*1]
+    movq           xm5, [stkq+32*2]
+    movhps         xm5, [stkq+32*3]
     vinserti128     m4, xm5, 1
 %else
-    mova           xm4, [stkq+%3*0]             ; px
-    vinserti128     m4, [stkq+%3*1], 1
+    mova           xm4, [stkq+32*0]             ; px
+    vinserti128     m4, [stkq+32*1], 1
 %endif
     pxor           m15, m15                     ; sum
     mova            m7, m4                      ; max
     mova            m8, m4                      ; min
-.k_loop:
+.border_k_loop:
     vpbroadcastb    m2, [priq+kq]               ; pri_taps
     vpbroadcastb    m3, [secq+kq]               ; sec_taps
 
-    ACCUMULATE_TAP 0*2, [rsp+0], m13, m0, m2, %1, %3
-    ACCUMULATE_TAP 2*2, [rsp+8], m14, m1, m3, %1, %3
-    ACCUMULATE_TAP 6*2, [rsp+8], m14, m1, m3, %1, %3
+    ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
+    ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
+    ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
 
     dec             kq
-    jge .k_loop
+    jge .border_k_loop
 
     vpbroadcastd   m10, [pw_2048]
     pcmpgtw         m9, m11, m15
@@ -501,17 +1442,17 @@
 %if %1*%2*2/mmsize > 1
  %define vloop_lines (mmsize/(%1*2))
     lea           dstq, [dstq+strideq*vloop_lines]
-    add           stkq, %3*vloop_lines
+    add           stkq, 32*vloop_lines
     dec             hd
-    jg .v_loop
+    jg .border_v_loop
 %endif
 
     RET
 %endmacro
 
-CDEF_FILTER 8, 8, 32
-CDEF_FILTER 4, 8, 32
-CDEF_FILTER 4, 4, 32
+CDEF_FILTER 8, 8
+CDEF_FILTER 4, 8
+CDEF_FILTER 4, 4
 
 INIT_YMM avx2
 cglobal cdef_dir, 3, 4, 15, src, stride, var, stride3
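
A closing note on the "; emulate 8-bit shift" comments in ACCUMULATE_TAP_BYTE above: AVX2 has no per-byte logical right shift, so the code shifts 16-bit lanes with psrlw and then masks with pand against a byte mask of 0xff >> shift (pri_shift_mask/sec_shift_mask, looked up in tap_table from the lzcnt-derived shift and broadcast with vpbroadcastb) to clear the bits dragged in from each lane's upper byte. A scalar C model of the same trick, with assumed names:

    #include <stdint.h>

    /* One 16-bit lane holds two pixels, hi:lo. (lane >> s) spills the low
     * s bits of hi into the top bits of lo; ANDing every byte with
     * 0xff >> s clears the spill, leaving an independent >> s per byte. */
    static inline uint16_t srl_per_byte(uint16_t lane, unsigned s)
    {
        const uint16_t mask = (uint16_t)(0x0101 * (0xffu >> s)); /* per-byte mask */
        return (uint16_t)((lane >> s) & mask);
    }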