shithub: dav1d

Download patch

ref: d67e3476c91d408cab8e5d7fb3f29120dadb15ed
parent: 18ef9556b71e3b6b839c35ae614ef0bb5b6a2179
author: Victorien Le Couviour--Tuffet <victorien.lecouviour.tuffet@gmail.com>
date: Mon Mar 4 10:26:52 EST 2019

x86: add SSSE3 cdef dir implementation

---------------------
x86_64:
------------------------------------------
cdef_dir_8bpc_c: 1023.1
cdef_dir_8bpc_ssse3: 110.3
cdef_dir_8bpc_avx2: 71.1
------------------------------------------

---------------------
x86_32:
------------------------------------------
cdef_dir_8bpc_c: 1074.8
cdef_dir_8bpc_ssse3: 120.6
------------------------------------------

Thanks to Ronald for the AVX2 XMM version which was a very good starting
point.

--- a/src/x86/cdef_init_tmpl.c
+++ b/src/x86/cdef_init_tmpl.c
@@ -38,6 +38,7 @@
 decl_cdef_fn(dav1d_cdef_filter_4x4_ssse3);
 
 decl_cdef_dir_fn(dav1d_cdef_dir_avx2);
+decl_cdef_dir_fn(dav1d_cdef_dir_ssse3);
 
 void bitfn(dav1d_cdef_dsp_init_x86)(Dav1dCdefDSPContext *const c) {
     const unsigned flags = dav1d_get_cpu_flags();
@@ -45,6 +46,7 @@
     if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
 
 #if BITDEPTH ==8
+    c->dir = dav1d_cdef_dir_ssse3;
     c->fb[0] = dav1d_cdef_filter_8x8_ssse3;
     c->fb[1] = dav1d_cdef_filter_4x8_ssse3;
     c->fb[2] = dav1d_cdef_filter_4x4_ssse3;
--- a/src/x86/cdef_ssse3.asm
+++ b/src/x86/cdef_ssse3.asm
@@ -29,10 +29,17 @@
 
 SECTION_RODATA 16
 
+%if ARCH_X86_32 ; pb_0 is only assembled for x86_32 builds
 pb_0: times 16 db 0
+%endif
+pw_128: times 8 dw 128 ; cdef_dir subtracts this from every pixel after widening to words
 pw_256: times 8 dw 256
 pw_2048: times 8 dw 2048
 pw_0x7FFF: times 8 dw 0x7FFF
+pd_0to7: dd 0, 4, 2, 6, 1, 5, 3, 7 ; direction indices in the lane order of cdef_dir's cost[0,4,2,6] / cost[1,5,3,7] vectors
+div_table: dw 840, 840, 420, 420, 280, 280, 210, 210, 168, 168, 140, 140, 120, 120, 105, 105 ; 840/n for n = 1..8; each value stored twice so both words of a dword lane hold it (MULLD multiplies per word)
+           dw 420, 420, 210, 210, 140, 140, 105, 105, 105, 105, 105, 105, 105, 105, 105, 105 ; read at div_table+32 and div_table+48 by cdef_dir
+shufw_6543210x: db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15 ; pshufb control: reverses words 0-6, leaves word 7 in place
 tap_table: dw 4, 2, 3, 3, 2, 1
            db -1 * 16 + 1, -2 * 16 + 2
            db  0 * 16 + 1, -1 * 16 + 2
@@ -711,3 +718,589 @@
 cdef_filter_fn 8, 8, 32
 cdef_filter_fn 4, 8, 32
 cdef_filter_fn 4, 4, 32
+
+%macro MULLD 2-3 0 ; %3 = is_constant (optional, defaults to 0; not referenced anywhere in the body)
+ %if ARCH_X86_32 ; only xmm0-7 exist on x86_32: the scratch register m15 is aliased to m1, so m1 is clobbered there
+  %define m15 m1
+ %endif
+    pmulhuw        m15, %1, %2              ; high 16 bits of every unsigned word product (must read %1 before pmullw overwrites it)
+    pmullw          %1, %2                  ; low 16 bits of every word product
+    pslld          m15, 16                  ; carry of the low-word product moves into the upper word of each dword
+    paddd           %1, m15                 ; per dword lane: %1 * c mod 2^32, provided %2 holds the 16-bit c in both words of that lane
+%endmacro
+
+%if ARCH_X86_64 ; x86_64 variant: declares 16 XMM registers and no stack space, everything stays in registers
+cglobal cdef_dir, 3, 4, 16, src, stride, var, stride3
+    lea       stride3q, [strideq*3]
+    movq            m1, [srcq+strideq*0]    ; load 8 rows of 8 bytes, two rows per register (low/high qword)
+    movhps          m1, [srcq+strideq*1]
+    movq            m3, [srcq+strideq*2]
+    movhps          m3, [srcq+stride3q]
+    lea           srcq, [srcq+strideq*4]    ; advance to row 4
+    movq            m5, [srcq+strideq*0]
+    movhps          m5, [srcq+strideq*1]
+    movq            m7, [srcq+strideq*2]
+    movhps          m7, [srcq+stride3q]
+
+    pxor            m8, m8
+    psadbw          m0, m1, m8              ; SAD against zero = sum of the 8 bytes of each row
+    psadbw          m2, m3, m8
+    psadbw          m4, m5, m8
+    psadbw          m6, m7, m8
+    packssdw        m0, m2
+    packssdw        m4, m6
+    packssdw        m0, m4                  ; the 8 row sums as 8 words
+    SWAP            m0, m9                  ; row sums are now named m9; the name m0 is free for reuse
+
+    punpcklbw       m0, m1, m8              ; widen bytes to words: m0..m7 = rows 0..7
+    punpckhbw       m1, m8
+    punpcklbw       m2, m3, m8
+    punpckhbw       m3, m8
+    punpcklbw       m4, m5, m8
+    punpckhbw       m5, m8
+    punpcklbw       m6, m7, m8
+    punpckhbw       m7, m8
+
+    mova            m8, [pw_128]
+    psubw           m0, m8                  ; subtract 128 from every pixel
+    psubw           m1, m8
+    psubw           m2, m8
+    psubw           m3, m8
+    psubw           m4, m8
+    psubw           m5, m8
+    psubw           m6, m8
+    psubw           m7, m8
+    psllw           m8, 3                   ; 128 * 8: the same offset applied to a sum of 8 pixels
+    psubw           m9, m8                  ; partial_sum_hv[0]
+
+    paddw           m8, m0, m1              ; add all 8 rows lane-wise -> per-column sums
+    paddw          m10, m2, m3
+    paddw           m8, m4
+    paddw          m10, m5
+    paddw           m8, m6
+    paddw          m10, m7
+    paddw           m8, m10                 ; partial_sum_hv[1]
+
+    pmaddwd         m8, m8                  ; square each word and add adjacent pairs
+    pmaddwd         m9, m9
+    phaddd          m9, m8
+    SWAP            m8, m9                  ; the phaddd result is now named m8
+    MULLD           m8, [div_table+48], 1   ; scale by 105 (all entries at div_table+48); clobbers m15
+
+    pslldq          m9, m1, 2               ; row i shifted left by i words; words shifted out are collected via psrldq
+    psrldq         m10, m1, 14
+    pslldq         m11, m2, 4
+    psrldq         m12, m2, 12
+    pslldq         m13, m3, 6
+    psrldq         m14, m3, 10
+    paddw           m9, m0
+    paddw          m10, m12
+    paddw          m11, m13
+    paddw          m10, m14                 ; partial_sum_diag[0] top/right half
+    paddw           m9, m11                 ; partial_sum_diag[0] top/left half
+    pslldq         m11, m4, 8
+    psrldq         m12, m4, 8
+    pslldq         m13, m5, 10
+    psrldq         m14, m5, 6
+    paddw           m9, m11
+    paddw          m10, m12
+    paddw           m9, m13
+    paddw          m10, m14
+    pslldq         m11, m6, 12
+    psrldq         m12, m6, 4
+    pslldq         m13, m7, 14
+    psrldq         m14, m7, 2
+    paddw           m9, m11
+    paddw          m10, m12
+    paddw           m9, m13                 ; partial_sum_diag[0][0-7]
+    paddw          m10, m14                 ; partial_sum_diag[0][8-14,zero]
+    pshufb         m10, [shufw_6543210x]    ; reverse words 0-6 of the upper half, word 7 stays in place
+    punpckhwd      m11, m9, m10             ; interleave lower and reversed upper half so pmaddwd squares and sums them pairwise
+    punpcklwd       m9, m10
+    pmaddwd        m11, m11
+    pmaddwd         m9, m9
+    MULLD          m11, [div_table+16]      ; per-lane multipliers; m15 is the macro's scratch register
+    MULLD           m9, [div_table+0]
+    paddd           m9, m11                 ; cost[0a-d]
+
+    pslldq         m10, m0, 14              ; opposite diagonal: row i shifted left by (7 - i) words
+    psrldq         m11, m0, 2
+    pslldq         m12, m1, 12
+    psrldq         m13, m1, 4
+    pslldq         m14, m2, 10
+    psrldq         m15, m2, 6
+    paddw          m10, m12
+    paddw          m11, m13
+    paddw          m10, m14
+    paddw          m11, m15
+    pslldq         m12, m3, 8
+    psrldq         m13, m3, 8
+    pslldq         m14, m4, 6
+    psrldq         m15, m4, 10
+    paddw          m10, m12
+    paddw          m11, m13
+    paddw          m10, m14
+    paddw          m11, m15
+    pslldq         m12, m5, 4
+    psrldq         m13, m5, 12
+    pslldq         m14, m6, 2
+    psrldq         m15, m6, 14
+    paddw          m10, m12
+    paddw          m11, m13
+    paddw          m10, m14
+    paddw          m11, m15                 ; partial_sum_diag[1][8-14,zero]
+    paddw          m10, m7                  ; partial_sum_diag[1][0-7]
+    pshufb         m11, [shufw_6543210x]
+    punpckhwd      m12, m10, m11
+    punpcklwd      m10, m11
+    pmaddwd        m12, m12
+    pmaddwd        m10, m10
+    MULLD          m12, [div_table+16]
+    MULLD          m10, [div_table+0]
+    paddd          m10, m12                 ; cost[4a-d]
+    phaddd          m9, m10                 ; cost[0a/b,4a/b]
+
+    paddw          m10, m0, m1              ; sums of adjacent row pairs
+    paddw          m11, m2, m3
+    paddw          m12, m4, m5
+    paddw          m13, m6, m7
+    phaddw          m0, m4                  ; sums of adjacent column pairs: rows 0/4, 1/5, 2/6, 3/7 share a register
+    phaddw          m1, m5
+    phaddw          m2, m6
+    phaddw          m3, m7
+
+    ; m0-3 are horizontal sums (x >> 1), m10-13 are vertical sums (y >> 1)
+    pslldq          m4, m11, 2
+    psrldq          m5, m11, 14
+    pslldq          m6, m12, 4
+    psrldq          m7, m12, 12
+    pslldq         m14, m13, 6
+    psrldq         m15, m13, 10
+    paddw           m4, m10
+    paddw           m5, m7
+    paddw           m4, m6
+    paddw           m5, m15                 ; partial_sum_alt[3] right
+    paddw           m4, m14                 ; partial_sum_alt[3] left
+    pshuflw         m5, m5, q3012
+    punpckhwd       m6, m4, m5
+    punpcklwd       m4, m5
+    pmaddwd         m6, m6
+    pmaddwd         m4, m4
+    MULLD           m6, [div_table+48], 1
+    MULLD           m4, [div_table+32]
+    paddd           m4, m6                  ; cost[7a-d]
+
+    pslldq          m5, m10, 6              ; same row-pair sums, shifts applied in the opposite order
+    psrldq          m6, m10, 10
+    pslldq          m7, m11, 4
+    psrldq         m10, m11, 12
+    pslldq         m11, m12, 2
+    psrldq         m12, 14
+    paddw           m5, m7
+    paddw           m6, m10
+    paddw           m5, m11
+    paddw           m6, m12
+    paddw           m5, m13
+    pshuflw         m6, m6, q3012
+    punpckhwd       m7, m5, m6
+    punpcklwd       m5, m6
+    pmaddwd         m7, m7
+    pmaddwd         m5, m5
+    MULLD           m7, [div_table+48], 1
+    MULLD           m5, [div_table+32]
+    paddd           m5, m7                  ; cost[5a-d]
+
+    pslldq          m6, m1, 2               ; now the column-pair sums (low qwords hold rows 0-3)
+    psrldq          m7, m1, 14
+    pslldq         m10, m2, 4
+    psrldq         m11, m2, 12
+    pslldq         m12, m3, 6
+    psrldq         m13, m3, 10
+    paddw           m6, m0
+    paddw           m7, m11
+    paddw           m6, m10
+    paddw           m7, m13                 ; partial_sum_alt[3] right -- NOTE(review): label matches the cost[7] section, but this feeds cost[1]; confirm the index
+    paddw           m6, m12                 ; partial_sum_alt[3] left -- NOTE(review): same label as the cost[7] section; confirm the index
+    pshuflw         m7, m7, q3012
+    punpckhwd      m10, m6, m7
+    punpcklwd       m6, m7
+    pmaddwd        m10, m10
+    pmaddwd         m6, m6
+    MULLD          m10, [div_table+48], 1
+    MULLD           m6, [div_table+32]
+    paddd           m6, m10                 ; cost[1a-d]
+
+    pshufd          m0, m0, q1032           ; swap qwords so the other row of each pair sits in the low half
+    pshufd          m1, m1, q1032
+    pshufd          m2, m2, q1032
+    pshufd          m3, m3, q1032
+
+    pslldq         m10, m0, 6
+    psrldq         m11, m0, 10
+    pslldq         m12, m1, 4
+    psrldq         m13, m1, 12
+    pslldq         m14, m2, 2
+    psrldq          m2, 14
+    paddw          m10, m12
+    paddw          m11, m13
+    paddw          m10, m14
+    paddw          m11, m2
+    paddw          m10, m3
+    pshuflw        m11, m11, q3012
+    punpckhwd      m12, m10, m11
+    punpcklwd      m10, m11
+    pmaddwd        m12, m12
+    pmaddwd        m10, m10
+    MULLD          m12, [div_table+48], 1
+    MULLD          m10, [div_table+32]
+    paddd          m10, m12                 ; cost[3a-d]
+
+    phaddd          m0, m9, m8              ; cost[0,4,2,6]
+    phaddd          m6, m5
+    phaddd         m10, m4
+    phaddd          m1, m6, m10             ; cost[1,5,3,7]
+
+    pcmpgtd         m2, m1, m0              ; [1/5/3/7] > [0/4/2/6]
+    pand            m3, m2, m1
+    pandn           m4, m2, m0
+    por             m3, m4                  ; higher 4 values
+    pshufd          m1, m1, q2301
+    pshufd          m0, m0, q2301
+    pand            m1, m2, m1
+    pandn           m4, m2, m0
+    por             m0, m4, m1              ; 4 values at idx^4 offset
+    pand           m14, m2, [pd_0to7+16]    ; select the matching direction index with the same mask: 1/5/3/7 where the mask is set
+    pandn          m15, m2, [pd_0to7]       ; ... 0/4/2/6 elsewhere
+    por            m15, m14                 ; m15 = candidate direction index per lane
+
+    punpckhqdq      m4, m3, m0
+    punpcklqdq      m3, m0
+    pcmpgtd         m5, m4, m3              ; [2or3-6or7] > [0or1/4or5]
+    punpcklqdq      m5, m5
+    pand            m6, m5, m4
+    pandn           m7, m5, m3
+    por             m6, m7                  ; { highest 2 values, complements at idx^4 }
+    movhlps        m14, m15                 ; apply the same selection to the index vector
+    pand           m14, m5, m14
+    pandn          m13, m5, m15
+    por            m15, m13, m14
+
+    pshufd          m7, m6, q3311
+    pcmpgtd         m8, m7, m6              ; [4or5or6or7] > [0or1or2or3]
+    punpcklqdq      m8, m8
+    pand            m9, m8, m7
+    pandn          m10, m8, m6
+    por             m9, m10                 ; max
+    movhlps        m10, m9                  ; complement at idx^4
+    psubd           m9, m10
+    psrld           m9, 10
+    movd        [varq], m9                  ; *var = (max cost - cost at idx^4) >> 10
+    pshufd         m14, m15, q1111          ; final selection on the index vector with the same mask
+    pand           m14, m8, m14
+    pandn          m13, m8, m15
+    por            m15, m13, m14
+    movd           eax, m15                 ; selected direction index is left in eax for the shared RET
+%else
+cglobal cdef_dir, 3, 5, 16, 96, src, stride, var, stride3
+ %define PIC_reg r4
+    LEA        PIC_reg, PIC_base_offset
+
+    pxor            m0, m0
+    mova            m1, [PIC_sym(pw_128)]
+
+    lea       stride3q, [strideq*3]
+    movq            m5, [srcq+strideq*0]    ; rows 0-3, two rows per register
+    movhps          m5, [srcq+strideq*1]
+    movq            m7, [srcq+strideq*2]
+    movhps          m7, [srcq+stride3q]
+    psadbw          m2, m5, m0              ; SAD against zero = sum of the 8 bytes of each row
+    psadbw          m3, m7, m0
+    packssdw        m2, m3
+    punpcklbw       m4, m5, m0              ; widen bytes to words: m4..m7 = rows 0..3
+    punpckhbw       m5, m0
+    punpcklbw       m6, m7, m0
+    punpckhbw       m7, m0
+    psubw           m4, m1                  ; subtract 128 from every pixel
+    psubw           m5, m1
+    psubw           m6, m1
+    psubw           m7, m1
+
+    mova    [esp+0x00], m4                  ; spill rows 0-3: 0x00 = row 0, 0x10 = row 1, 0x20 = row 2, 0x50 = row 3
+    mova    [esp+0x10], m5
+    mova    [esp+0x20], m6
+    mova    [esp+0x50], m7
+
+    lea           srcq, [srcq+strideq*4]    ; advance to row 4
+    movq            m5, [srcq+strideq*0]
+    movhps          m5, [srcq+strideq*1]
+    movq            m7, [srcq+strideq*2]
+    movhps          m7, [srcq+stride3q]
+    psadbw          m3, m5, m0
+    psadbw          m0, m7, m0              ; overwrites the zero register, re-zeroed two lines below
+    packssdw        m3, m0
+    pxor            m0, m0
+    packssdw        m2, m3                  ; the 8 row sums as 8 words
+    punpcklbw       m4, m5, m0              ; m4..m7 = rows 4..7 as words
+    punpckhbw       m5, m0
+    punpcklbw       m6, m7, m0
+    punpckhbw       m7, m0
+    psubw           m4, m1
+    psubw           m5, m1
+    psubw           m6, m1
+    psubw           m7, m1
+
+    psllw           m1, 3                   ; 128 * 8: the same offset applied to a sum of 8 pixels
+    psubw           m2, m1                  ; partial_sum_hv[0]
+    pmaddwd         m2, m2
+
+    mova            m3, [esp+0x50]          ; m3 = row 3
+    mova            m0, [esp+0x00]
+    paddw           m0, [esp+0x10]
+    paddw           m1, m3, [esp+0x20]
+    paddw           m0, m4
+    paddw           m1, m5
+    paddw           m0, m6
+    paddw           m1, m7
+    paddw           m0, m1                  ; partial_sum_hv[1]
+    pmaddwd         m0, m0
+
+    phaddd          m2, m0
+    MULLD           m2, [PIC_sym(div_table)+48], 1 ; scale by 105; MULLD uses m1 as scratch on x86_32
+    mova    [esp+0x30], m2                  ; 0x30 = scaled hv sums until the phaddd that forms cost[0,4,2,6]
+
+    mova            m1, [esp+0x10]
+    pslldq          m0, m1, 2               ; row i shifted left by i words; words shifted out are collected via psrldq
+    psrldq          m1, 14
+    paddw           m0, [esp+0x00]
+    pslldq          m2, m3, 6
+    psrldq          m3, 10
+    paddw           m0, m2
+    paddw           m1, m3
+    mova            m3, [esp+0x20]
+    pslldq          m2, m3, 4
+    psrldq          m3, 12
+    paddw           m0, m2                  ; partial_sum_diag[0] top/left half
+    paddw           m1, m3                  ; partial_sum_diag[0] top/right half
+    pslldq          m2, m4, 8
+    psrldq          m3, m4, 8
+    paddw           m0, m2
+    paddw           m1, m3
+    pslldq          m2, m5, 10
+    psrldq          m3, m5, 6
+    paddw           m0, m2
+    paddw           m1, m3
+    pslldq          m2, m6, 12
+    psrldq          m3, m6, 4
+    paddw           m0, m2
+    paddw           m1, m3
+    pslldq          m2, m7, 14
+    psrldq          m3, m7, 2
+    paddw           m0, m2                  ; partial_sum_diag[0][0-7]
+    paddw           m1, m3                  ; partial_sum_diag[0][8-14,zero]
+    mova            m3, [esp+0x50]          ; reload row 3, m3 was used as a temporary above
+    pshufb          m1, [PIC_sym(shufw_6543210x)]
+    punpckhwd       m2, m0, m1
+    punpcklwd       m0, m1
+    pmaddwd         m2, m2
+    pmaddwd         m0, m0
+    MULLD           m2, [PIC_sym(div_table)+16] ; m1 is dead here, so the macro's scratch use is safe
+    MULLD           m0, [PIC_sym(div_table)+0]
+    paddd           m0, m2                  ; cost[0a-d]
+    mova    [esp+0x40], m0
+
+    mova            m1, [esp+0x00]
+    pslldq          m0, m1, 14              ; opposite diagonal: row i shifted left by (7 - i) words
+    psrldq          m1, 2
+    paddw           m0, m7
+    pslldq          m2, m3, 8
+    psrldq          m3, 8
+    paddw           m0, m2
+    paddw           m1, m3
+    mova            m3, [esp+0x20]
+    pslldq          m2, m3, 10
+    psrldq          m3, 6
+    paddw           m0, m2
+    paddw           m1, m3
+    mova            m3, [esp+0x10]
+    pslldq          m2, m3, 12
+    psrldq          m3, 4
+    paddw           m0, m2
+    paddw           m1, m3
+    pslldq          m2, m4, 6
+    psrldq          m3, m4, 10
+    paddw           m0, m2
+    paddw           m1, m3
+    pslldq          m2, m5, 4
+    psrldq          m3, m5, 12
+    paddw           m0, m2
+    paddw           m1, m3
+    pslldq          m2, m6, 2
+    psrldq          m3, m6, 14
+    paddw           m0, m2                  ; partial_sum_diag[1][0-7]
+    paddw           m1, m3                  ; partial_sum_diag[1][8-14,zero]
+    mova            m3, [esp+0x50]          ; reload row 3 again for the phaddw/paddw stage below
+    pshufb          m1, [PIC_sym(shufw_6543210x)]
+    punpckhwd       m2, m0, m1
+    punpcklwd       m0, m1
+    pmaddwd         m2, m2
+    pmaddwd         m0, m0
+    MULLD           m2, [PIC_sym(div_table)+16]
+    MULLD           m0, [PIC_sym(div_table)+0]
+    paddd           m0, m2                  ; cost[4a-d]
+    phaddd          m1, [esp+0x40], m0      ; cost[0a/b,4a/b]
+    phaddd          m1, [esp+0x30]          ; cost[0,4,2,6]
+    mova    [esp+0x30], m1
+
+    phaddw          m0, [esp+0x00], m4      ; column-pair sums of rows 0/4
+    phaddw          m1, [esp+0x10], m5      ; column-pair sums of rows 1/5
+    paddw           m4, m5                  ; row-pair sum 4+5
+    mova            m2, [esp+0x20]
+    paddw           m5, m2, m3              ; row-pair sum 2+3
+    phaddw          m2, m6                  ; column-pair sums of rows 2/6
+    paddw           m6, m7                  ; row-pair sum 6+7
+    phaddw          m3, m7                  ; column-pair sums of rows 3/7, kept in m3 until the cost[3] section
+    mova            m7, [esp+0x00]
+    paddw           m7, [esp+0x10]          ; row-pair sum 0+1
+    mova    [esp+0x00], m0                  ; slots 0x00/0x10/0x20 now hold the column-pair sums instead of raw rows
+    mova    [esp+0x10], m1
+    mova    [esp+0x20], m2
+
+    pslldq          m1, m4, 4
+    pslldq          m2, m6, 6
+    pslldq          m0, m5, 2
+    paddw           m1, m2
+    paddw           m0, m7
+    psrldq          m2, m5, 14
+    paddw           m0, m1                  ; partial_sum_alt[3] left
+    psrldq          m1, m4, 12
+    paddw           m1, m2
+    psrldq          m2, m6, 10
+    paddw           m1, m2                  ; partial_sum_alt[3] right
+    pshuflw         m1, m1, q3012
+    punpckhwd       m2, m0, m1
+    punpcklwd       m0, m1
+    pmaddwd         m2, m2
+    pmaddwd         m0, m0
+    MULLD           m2, [PIC_sym(div_table)+48], 1
+    MULLD           m0, [PIC_sym(div_table)+32]
+    paddd           m0, m2                  ; cost[7a-d]
+    mova    [esp+0x40], m0
+
+    pslldq          m0, m7, 6               ; same row-pair sums, shifts applied in the opposite order
+    psrldq          m7, 10
+    pslldq          m1, m5, 4
+    psrldq          m5, 12
+    pslldq          m2, m4, 2
+    psrldq          m4, 14
+    paddw           m0, m6
+    paddw           m7, m5
+    paddw           m0, m1
+    paddw           m7, m4
+    paddw           m0, m2
+    pshuflw         m7, m7, q3012
+    punpckhwd       m2, m0, m7
+    punpcklwd       m0, m7
+    pmaddwd         m2, m2
+    pmaddwd         m0, m0
+    MULLD           m2, [PIC_sym(div_table)+48], 1
+    MULLD           m0, [PIC_sym(div_table)+32]
+    paddd           m0, m2                  ; cost[5a-d]
+    mova    [esp+0x50], m0
+
+    mova            m1, [esp+0x10]          ; column-pair sums from the stack (m3 still holds the rows 3/7 sums)
+    mova            m2, [esp+0x20]
+    pslldq          m0, m1, 2
+    psrldq          m1, 14
+    pslldq          m4, m2, 4
+    psrldq          m2, 12
+    pslldq          m5, m3, 6
+    psrldq          m6, m3, 10
+    paddw           m0, [esp+0x00]
+    paddw           m1, m2
+    paddw           m4, m5
+    paddw           m1, m6                  ; partial_sum_alt[3] right -- NOTE(review): label matches the cost[7] section, but this feeds cost[1]; confirm the index
+    paddw           m0, m4                  ; partial_sum_alt[3] left -- NOTE(review): same label as the cost[7] section; confirm the index
+    pshuflw         m1, m1, q3012
+    punpckhwd       m2, m0, m1
+    punpcklwd       m0, m1
+    pmaddwd         m2, m2
+    pmaddwd         m0, m0
+    MULLD           m2, [PIC_sym(div_table)+48], 1
+    MULLD           m0, [PIC_sym(div_table)+32]
+    paddd           m0, m2                  ; cost[1a-d]
+    phaddd          m0, [esp+0x50]          ; pair up with cost[5a-d]
+    mova    [esp+0x50], m0
+
+    pshufd          m0, [esp+0x00], q1032   ; swap qwords so the other row of each pair sits in the low half
+    pshufd          m1, [esp+0x10], q1032
+    pshufd          m2, [esp+0x20], q1032
+    pshufd          m3, m3, q1032
+
+    pslldq          m4, m0, 6
+    psrldq          m0, 10
+    pslldq          m5, m1, 4
+    psrldq          m1, 12
+    pslldq          m6, m2, 2
+    psrldq          m2, 14
+    paddw           m4, m3
+    paddw           m0, m1
+    paddw           m5, m6
+    paddw           m0, m2
+    paddw           m4, m5
+    pshuflw         m0, m0, q3012
+    punpckhwd      m2, m4, m0
+    punpcklwd      m4, m0
+    pmaddwd        m2, m2
+    pmaddwd        m4, m4
+    MULLD          m2, [PIC_sym(div_table)+48], 1
+    MULLD          m4, [PIC_sym(div_table)+32]
+    paddd          m4, m2                   ; cost[3a-d]
+    phaddd         m4, [esp+0x40]           ; pair up with cost[7a-d]
+
+    mova            m1, [esp+0x50]
+    mova            m0, [esp+0x30]          ; cost[0,4,2,6]
+    phaddd          m1, m4                  ; cost[1,5,3,7]
+
+    pcmpgtd         m2, m1, m0              ; [1/5/3/7] > [0/4/2/6]
+    pand            m3, m2, m1
+    pandn           m4, m2, m0
+    por             m3, m4                  ; higher 4 values
+    pshufd          m1, m1, q2301
+    pshufd          m0, m0, q2301
+    pand            m1, m2, m1
+    pandn           m4, m2, m0
+    por             m0, m4, m1              ; 4 values at idx^4 offset
+    pand            m5, m2, [PIC_sym(pd_0to7)+16] ; select the matching direction index with the same mask: 1/5/3/7 where set
+    pandn           m6, m2, [PIC_sym(pd_0to7)] ; ... 0/4/2/6 elsewhere
+    por             m6, m5                  ; m6 = candidate direction index per lane
+
+    punpckhqdq      m4, m3, m0
+    punpcklqdq      m3, m0
+    pcmpgtd         m0, m4, m3              ; [2or3-6or7] > [0or1/4or5]
+    punpcklqdq      m0, m0
+    pand            m1, m0, m4
+    pandn           m7, m0, m3
+    por             m1, m7                  ; { highest 2 values, complements at idx^4 }
+    movhlps         m5, m6                  ; apply the same selection to the index vector
+    pand            m5, m0, m5
+    pandn           m3, m0, m6
+    por             m6, m3, m5
+
+    pshufd          m7, m1, q3311
+    pcmpgtd         m2, m7, m1              ; [4or5or6or7] > [0or1or2or3]
+    punpcklqdq      m2, m2
+    pand            m0, m2, m7
+    pandn           m7, m2, m1
+    por             m0, m7                  ; max
+    movhlps         m7, m0                  ; complement at idx^4
+    psubd           m0, m7
+    psrld           m0, 10
+    movd        [varq], m0                  ; *var = (max cost - cost at idx^4) >> 10
+    pshufd          m5, m6, q1111           ; final selection on the index vector with the same mask
+    pand            m5, m2, m5
+    pandn           m3, m2, m6
+    por             m6, m3, m5
+    movd           eax, m6                  ; selected direction index is left in eax
+%endif
+
+    RET