shithub: dav1d

ref: e94dafeaf7c82fb1109909a7b4dd0a9219f5a126
parent: c9c445acbeeb444d352f862ba4bdcf9646f22b42
author: Francois Cartegnie <fcvlcdev@free.fr>
date: Sat Feb 9 13:08:38 EST 2019

add SSSE3 prep_bilin
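
prep_bilin fills the 16-bit intermediate buffer consumed by the compound
(mct) paths. As a rough scalar sketch of the semantics the SSSE3 code below
implements, matching the formula comments in the asm (the reference function
and its signature are illustrative only, not part of dav1d's API):

    #include <stddef.h>
    #include <stdint.h>

    static void prep_bilin_ref(int16_t *tmp, const uint8_t *src,
                               ptrdiff_t stride, int w, int h,
                               int mx, int my)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++) {
                if (!mx && !my) {            /* plain copy */
                    tmp[x] = src[x] << 4;
                } else if (mx && !my) {      /* horizontal only */
                    tmp[x] = 16 * src[x] + mx * (src[x + 1] - src[x]);
                } else if (!mx && my) {      /* vertical only */
                    tmp[x] = 16 * src[x] + my * (src[x + stride] - src[x]);
                } else {                     /* 2D: h pass, then rounded v blend */
                    const int a = 16 * src[x] + mx * (src[x + 1] - src[x]);
                    const int b = 16 * src[x + stride] +
                                  mx * (src[x + stride + 1] - src[x + stride]);
                    tmp[x] = a + ((my * (b - a) + 8) >> 4);
                }
            }
            tmp += w;
            src += stride;
        }
    }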

--- a/src/x86/mc_init_tmpl.c
+++ b/src/x86/mc_init_tmpl.c
@@ -50,6 +50,7 @@
 decl_mct_fn(dav1d_prep_8tap_sharp_regular_avx2);
 decl_mct_fn(dav1d_prep_8tap_sharp_smooth_avx2);
 decl_mct_fn(dav1d_prep_bilin_avx2);
+decl_mct_fn(dav1d_prep_bilin_ssse3);
 
 decl_avg_fn(dav1d_avg_avx2);
 decl_avg_fn(dav1d_avg_ssse3);
@@ -87,6 +88,8 @@
 
 #if BITDEPTH == 8
     init_mc_fn (FILTER_2D_BILINEAR,            bilin,               ssse3);
+
+    init_mct_fn(FILTER_2D_BILINEAR,            bilin,               ssse3);
 
     c->avg = dav1d_avg_ssse3;
     c->w_avg = dav1d_w_avg_ssse3;
--- a/src/x86/mc_ssse3.asm
+++ b/src/x86/mc_ssse3.asm
@@ -90,8 +90,10 @@
 %endmacro
 
 %xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_ssse3.put)
+%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_ssse3.prep)
 
 BASE_JMP_TABLE put,  ssse3, 2, 4, 8, 16, 32, 64, 128
+BASE_JMP_TABLE prep, ssse3,    4, 8, 16, 32, 64, 128
 
 %macro HV_JMP_TABLE 5-*
     %xdefine %%prefix mangle(private_prefix %+ _%1_%2_%3)
@@ -126,6 +128,7 @@
 %endmacro
 
 HV_JMP_TABLE put,  bilin, ssse3, 7, 2, 4, 8, 16, 32, 64, 128
+HV_JMP_TABLE prep, bilin, ssse3, 7,    4, 8, 16, 32, 64, 128
 
 %define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
 
@@ -710,6 +713,592 @@
 .hv_w128:
     lea                 t0d, [hq+(7<<16)]
     jmp .hv_w16gt
+
+DECLARE_REG_TMP 3, 5, 6
+%if ARCH_X86_32
+ %define base        t2-prep_ssse3
+%else
+ %define base        0
+%endif
+cglobal prep_bilin, 3, 7, 0, tmp, src, stride, w, h, mxy, stride3
+    movifnidn          mxyd, r5m ; mx
+    LEA                  t2, prep_ssse3
+    tzcnt                wd, wm
+    movifnidn            hd, hm
+    test               mxyd, mxyd
+    jnz .h
+    mov                mxyd, r6m ; my
+    test               mxyd, mxyd
+    jnz .v
+.prep:
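+    ; no filtering: plain copy, widening each pixel to the 16-bit
+    ; intermediate format (tmp[x] = src[x] << 4)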
+    movzx                wd, word [t2+wq*2+table_offset(prep,)]
+    add                  wq, t2
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.prep_w4:
+    movd                 m0, [srcq+strideq*0]
+    movd                 m1, [srcq+strideq*1]
+    movd                 m2, [srcq+strideq*2]
+    movd                 m3, [srcq+stride3q ]
+    punpckldq            m0, m1
+    punpckldq            m2, m3
+    lea                srcq, [srcq+strideq*4]
+    pxor                 m1, m1
+    punpcklbw            m0, m1
+    punpcklbw            m2, m1
+    psllw                m0, 4
+    psllw                m2, 4
+    mova    [tmpq+mmsize*0], m0
+    mova    [tmpq+mmsize*1], m2
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .prep_w4
+    RET
+.prep_w8:
+    movq                 m0, [srcq+strideq*0]
+    movq                 m1, [srcq+strideq*1]
+    movq                 m2, [srcq+strideq*2]
+    movq                 m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    pxor                 m4, m4
+    punpcklbw            m0, m4
+    punpcklbw            m1, m4
+    punpcklbw            m2, m4
+    punpcklbw            m3, m4
+    psllw                m0, 4
+    psllw                m1, 4
+    psllw                m2, 4
+    psllw                m3, 4
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m1
+    mova        [tmpq+16*2], m2
+    mova        [tmpq+16*3], m3
+    add                tmpq, 16*4
+    sub                  hd, 4
+    jg .prep_w8
+    RET
+.prep_w16:
+    movq                 m0, [srcq+strideq*0+8*0]
+    movq                 m1, [srcq+strideq*0+8*1]
+    movq                 m2, [srcq+strideq*1+8*0]
+    movq                 m3, [srcq+strideq*1+8*1]
+    lea                srcq, [srcq+strideq*2]
+    pxor                 m4, m4
+    punpcklbw            m0, m4
+    punpcklbw            m1, m4
+    punpcklbw            m2, m4
+    punpcklbw            m3, m4
+    psllw                m0, 4
+    psllw                m1, 4
+    psllw                m2, 4
+    psllw                m3, 4
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m1
+    mova        [tmpq+16*2], m2
+    mova        [tmpq+16*3], m3
+    add                tmpq, 16*4
+    sub                  hd, 2
+    jg .prep_w16
+    RET
+.prep_w16gt:
+    mov                 t1q, srcq
+    mov                 r3q, t2q
+.prep_w16gt_hloop:
+    movq                 m0, [t1q+8*0]
+    movq                 m1, [t1q+8*1]
+    movq                 m2, [t1q+8*2]
+    movq                 m3, [t1q+8*3]
+    pxor                 m4, m4
+    punpcklbw            m0, m4
+    punpcklbw            m1, m4
+    punpcklbw            m2, m4
+    punpcklbw            m3, m4
+    psllw                m0, 4
+    psllw                m1, 4
+    psllw                m2, 4
+    psllw                m3, 4
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m1
+    mova        [tmpq+16*2], m2
+    mova        [tmpq+16*3], m3
+    add                tmpq, 16*4
+    add                 t1q, 32
+    sub                 r3q, 1
+    jg .prep_w16gt_hloop
+    lea                srcq, [srcq+strideq]
+    sub                  hd, 1
+    jg .prep_w16gt
+    RET
+.prep_w32:
+    mov                 t2q, 1
+    jmp .prep_w16gt
+.prep_w64:
+    mov                 t2q, 2
+    jmp .prep_w16gt
+.prep_w128:
+    mov                 t2q, 4
+    jmp .prep_w16gt
+.h:
+    ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
+    ; = (16 - mx) * src[x] + mx * src[x + 1]
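+    ; 0xff01*mx + (16<<8) packs the byte pair (mx, 16-mx) into the low word;
+    ; pmaddubsw on the (src[x+1], src[x]) pairs produced by the shuffle then
+    ; yields mx*src[x+1] + (16-mx)*src[x] per output word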
+    imul               mxyd, 0xff01
+    mova                 m4, [base+bilin_h_shuf8]
+    add                mxyd, 16 << 8
+    movd                xm5, mxyd
+    mov                mxyd, r6m ; my
+    pshuflw              m5, m5, q0000
+    punpcklqdq           m5, m5
+    test               mxyd, mxyd
+    jnz .hv
+%if ARCH_X86_32
+    mov                  t1, t2 ; save base reg for w4
+%endif
+    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_h)]
+    add                  wq, t2
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.h_w4:
+%if ARCH_X86_32
+    mova                 m4, [t1-prep_ssse3+bilin_h_shuf4]
+%else
+    mova                 m4, [bilin_h_shuf4]
+%endif
+.h_w4_loop:
+    movq                 m0, [srcq+strideq*0]
+    movhps               m0, [srcq+strideq*1]
+    movq                 m1, [srcq+strideq*2]
+    movhps               m1, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+    pshufb               m1, m4
+    pmaddubsw            m1, m5
+    mova          [tmpq+0 ], m0
+    mova          [tmpq+16], m1
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .h_w4_loop
+    RET
+.h_w8:
+    movu                 m0, [srcq+strideq*0]
+    movu                 m1, [srcq+strideq*1]
+    movu                 m2, [srcq+strideq*2]
+    movu                 m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pshufb               m3, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmaddubsw            m3, m5
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m1
+    mova        [tmpq+16*2], m2
+    mova        [tmpq+16*3], m3
+    add                tmpq, 16*4
+    sub                  hd, 4
+    jg .h_w8
+    RET
+.h_w16:
+    movu                 m0, [srcq+strideq*0+8*0]
+    movu                 m1, [srcq+strideq*0+8*1]
+    movu                 m2, [srcq+strideq*1+8*0]
+    movu                 m3, [srcq+strideq*1+8*1]
+    lea                srcq, [srcq+strideq*2]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pshufb               m3, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmaddubsw            m3, m5
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m1
+    mova        [tmpq+16*2], m2
+    mova        [tmpq+16*3], m3
+    add                tmpq, 16*4
+    sub                  hd, 2
+    jg .h_w16
+    RET
+.h_w16gt:
+    mov                 t1q, srcq
+    mov                 r3q, t2q
+.h_w16gt_hloop:
+    movu                 m0, [t1q+8*0]
+    movu                 m1, [t1q+8*1]
+    movu                 m2, [t1q+8*2]
+    movu                 m3, [t1q+8*3]
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pshufb               m3, m4
+    pmaddubsw            m0, m5
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    pmaddubsw            m3, m5
+    mova        [tmpq+16*0], m0
+    mova        [tmpq+16*1], m1
+    mova        [tmpq+16*2], m2
+    mova        [tmpq+16*3], m3
+    add                tmpq, 16*4
+    add                 t1q, 32
+    sub                 r3q, 1
+    jg .h_w16gt_hloop
+    lea                srcq, [srcq+strideq]
+    sub                  hd, 1
+    jg .h_w16gt
+    RET
+.h_w32:
+    mov                 t2q, 1
+    jmp .h_w16gt
+.h_w64:
+    mov                 t2q, 2
+    jmp .h_w16gt
+.h_w128:
+    mov                 t2q, 4
+    jmp .h_w16gt
+.v:
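+    ; 16 * src[x] + (my * (src[x + src_stride] - src[x]))
+    ; = (16 - my) * src[x] + my * src[x + src_stride]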
+    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
+    imul               mxyd, 0xff01
+    add                mxyd, 16 << 8
+    add                  wq, t2
+    lea            stride3q, [strideq*3]
+    movd                 m5, mxyd
+    pshuflw              m5, m5, q0000
+    punpcklqdq           m5, m5
+    jmp                  wq
+.v_w4:
+    movd                 m0, [srcq+strideq*0]
+.v_w4_loop:
+    movd                 m1, [srcq+strideq*1]
+    movd                 m2, [srcq+strideq*2]
+    movd                 m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    punpcklwd            m0, m1  ; 0 1 _ _
+    punpcklwd            m1, m2  ; 1 2 _ _
+    punpcklbw            m1, m0
+    pmaddubsw            m1, m5
+    pshufd               m1, m1, q3120
+    mova        [tmpq+16*0], m1
+    movd                 m0, [srcq+strideq*0]
+    punpcklwd            m2, m3  ; 2 3 _ _
+    punpcklwd            m3, m0  ; 3 4 _ _
+    punpcklbw            m3, m2
+    pmaddubsw            m3, m5
+    pshufd               m3, m3, q3120
+    mova        [tmpq+16*1], m3
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .v_w4_loop
+    RET
+.v_w8:
+    movq                 m0, [srcq+strideq*0]
+.v_w8_loop:
+    movq                 m1, [srcq+strideq*2]
+    movq                 m2, [srcq+strideq*1]
+    movq                 m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    shufpd               m4, m0, m1, 0x0c ; 0 2
+    movq                 m0, [srcq+strideq*0]
+    shufpd               m2, m3, 0x0c ; 1 3
+    shufpd               m1, m0, 0x0c ; 2 4
+    punpcklbw            m3, m2, m4
+    pmaddubsw            m3, m5
+    mova        [tmpq+16*0], m3
+    punpckhbw            m3, m2, m4
+    pmaddubsw            m3, m5
+    mova        [tmpq+16*2], m3
+    punpcklbw            m3, m1, m2
+    punpckhbw            m1, m2
+    pmaddubsw            m3, m5
+    pmaddubsw            m1, m5
+    mova        [tmpq+16*1], m3
+    mova        [tmpq+16*3], m1
+    add                tmpq, 16*4
+    sub                  hd, 4
+    jg .v_w8_loop
+    RET
+.v_w16:
+    movu                 m0, [srcq+strideq*0]
+.v_w16_loop:
+    movu                 m1, [srcq+strideq*1]
+    movu                 m2, [srcq+strideq*2]
+    punpcklbw            m3, m1, m0
+    punpckhbw            m4, m1, m0
+    pmaddubsw            m3, m5
+    pmaddubsw            m4, m5
+    mova        [tmpq+16*0], m3
+    mova        [tmpq+16*1], m4
+    punpcklbw            m3, m2, m1
+    punpckhbw            m4, m2, m1
+    pmaddubsw            m3, m5
+    pmaddubsw            m4, m5
+    mova        [tmpq+16*2], m3
+    mova        [tmpq+16*3], m4
+    movu                 m3, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    movu                 m0, [srcq+strideq*0]
+    add                tmpq, 16*8
+    punpcklbw            m1, m3, m2
+    punpckhbw            m4, m3, m2
+    pmaddubsw            m1, m5
+    pmaddubsw            m4, m5
+    mova        [tmpq-16*4], m1
+    mova        [tmpq-16*3], m4
+    punpcklbw            m1, m0, m3
+    punpckhbw            m2, m0, m3
+    pmaddubsw            m1, m5
+    pmaddubsw            m2, m5
+    mova        [tmpq-16*2], m1
+    mova        [tmpq-16*1], m2
+    sub                  hd, 4
+    jg .v_w16_loop
+    RET
+.v_w32:
+    lea                 t2d, [hq+(0<<16)]
+    mov                 t0d, 64
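+    ; t0 = output row stride in bytes (2*w); high word of t2 counts the
+    ; extra 32-pixel column passes (0 here, 1 for w64, 3 for w128)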
+.v_w32_start:
+%if ARCH_X86_64
+ %if WIN64
+    PUSH                 r7
+ %endif
+    mov                  r7, tmpq
+%endif
+    mov                  t1, srcq
+.v_w32_loop_h:
+    movu                 m0, [srcq+strideq*0+16*0] ; 0L
+    movu                 m1, [srcq+strideq*0+16*1] ; 0U
+.v_w32_loop_v:
+    movu                 m2, [srcq+strideq*1+16*0] ; 1L
+    movu                 m3, [srcq+strideq*1+16*1] ; 1U
+    lea                srcq, [srcq+strideq*2]
+    punpcklbw            m4, m2, m0
+    pmaddubsw            m4, m5
+    mova        [tmpq+16*0], m4
+    punpckhbw            m4, m2, m0
+    pmaddubsw            m4, m5
+    mova        [tmpq+16*1], m4
+    punpcklbw            m4, m3, m1
+    pmaddubsw            m4, m5
+    mova        [tmpq+16*2], m4
+    punpckhbw            m4, m3, m1
+    pmaddubsw            m4, m5
+    mova        [tmpq+16*3], m4
+    add                tmpq, t0q
+    movu                 m0, [srcq+strideq*0+16*0] ; 2L
+    movu                 m1, [srcq+strideq*0+16*1] ; 2U
+    punpcklbw            m4, m0, m2
+    pmaddubsw            m4, m5
+    mova        [tmpq+16*0], m4
+    punpckhbw            m4, m0, m2
+    pmaddubsw            m4, m5
+    mova        [tmpq+16*1], m4
+    punpcklbw            m4, m1, m3
+    pmaddubsw            m4, m5
+    mova        [tmpq+16*2], m4
+    punpckhbw            m4, m1, m3
+    pmaddubsw            m4, m5
+    mova        [tmpq+16*3], m4
+    add                tmpq, t0q
+    sub                  hd, 2
+    jg .v_w32_loop_v
+    movzx                hd, t2w
+    add                  t1, 32
+    mov                srcq, t1
+%if ARCH_X86_64
+    add                  r7, 2*16*2
+    mov                tmpq, r7
+%else
+    mov                tmpq, tmpmp
+    add                tmpq, 2*16*2
+    mov               tmpmp, tmpq
+%endif
+    sub                 t2d, 1<<16
+    jg .v_w32_loop_h
+%if WIN64
+    POP                  r7
+%endif
+    RET
+.v_w64:
+    lea                 t2d, [hq+(1<<16)]
+    mov                 t0d, 128
+    jmp .v_w32_start
+.v_w128:
+    lea                 t2d, [hq+(3<<16)]
+    mov                 t0d, 256
+    jmp .v_w32_start
+.hv:
+    ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
+    ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
+    %assign stack_offset stack_offset - stack_size_padded
+    WIN64_SPILL_XMM       8
+    movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_hv)]
+    shl                mxyd, 11
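+    ; my << 11 so that pmulhrsw(diff, m6) computes (diff * my + 8) >> 4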
+    movd                xm6, mxyd
+    add                  wq, t2
+    pshuflw              m6, m6, q0000
+    punpcklqdq           m6, m6
+%if ARCH_X86_32
+    mov                  t1, t2 ; save base reg for w4
+%endif
+    lea            stride3q, [strideq*3]
+    jmp                  wq
+.hv_w4:
+%if ARCH_X86_32
+    mova                 m4, [t1-prep_ssse3+bilin_h_shuf4]
+%else
+    mova                 m4, [bilin_h_shuf4]
+%endif
+    movq                 m0, [srcq+strideq*0] ; 0 _
+    punpcklqdq           m0, m0
+    pshufb               m0, m4
+    pmaddubsw            m0, m5
+.hv_w4_loop:
+    movq                 m1, [srcq+strideq*1]
+    movhps               m1, [srcq+strideq*2] ; 1 _ 2 _
+    movq                 m2, [srcq+stride3q ]
+    lea                srcq, [srcq+strideq*4]
+    movhps               m2, [srcq+strideq*0] ; 3 _ 4 _
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pmaddubsw            m1, m5           ; 1 + 2 +
+    shufpd               m3, m0, m1, 0x01 ; 0 + 1 +
+    pmaddubsw            m0, m2, m5       ; 3 + 4 +
+    shufpd               m2, m1, m0, 0x01 ; 2 + 3 +
+    psubw                m1, m3
+    pmulhrsw             m1, m6
+    paddw                m1, m3
+    psubw                m3, m0, m2
+    pmulhrsw             m3, m6
+    paddw                m3, m2
+    mova        [tmpq+16*0], m1
+    mova        [tmpq+16*1], m3
+    add                tmpq, 32
+    sub                  hd, 4
+    jg .hv_w4_loop
+    RET
+.hv_w8:
+    movu                 m0,     [srcq+strideq*0]
+    pshufb               m0, m4
+    pmaddubsw            m0, m5                   ; 0 +
+.hv_w8_loop:
+    movu                 m1,     [srcq+strideq*1] ; 1
+    movu                 m2,     [srcq+strideq*2] ; 2
+    pshufb               m1, m4
+    pshufb               m2, m4
+    pmaddubsw            m1, m5 ; 1 +
+    pmaddubsw            m2, m5 ; 2 +
+    psubw                m3, m1, m0  ; 1-0
+    pmulhrsw             m3, m6
+    paddw                m3, m0
+    psubw                m7, m2, m1  ; 2-1
+    pmulhrsw             m7, m6
+    paddw                m7, m1
+    mova        [tmpq+16*0], m3
+    mova        [tmpq+16*1], m7
+    movu                 m1,     [srcq+stride3q ] ; 3
+    lea                srcq,     [srcq+strideq*4]
+    movu                 m0,     [srcq+strideq*0] ; 4
+    pshufb               m1, m4
+    pshufb               m0, m4
+    pmaddubsw            m1, m5 ; 3 +
+    pmaddubsw            m0, m5 ; 4 +
+    psubw                m3, m1, m2  ; 3-2
+    pmulhrsw             m3, m6
+    paddw                m3, m2
+    psubw                m7, m0, m1  ; 4-3
+    pmulhrsw             m7, m6
+    paddw                m7, m1
+    mova        [tmpq+16*2], m3
+    mova        [tmpq+16*3], m7
+    add                tmpq, 16*4
+    sub                  hd, 4
+    jg .hv_w8_loop
+    RET
+.hv_w16:
+    lea                 t2d, [hq+(0<<16)]
+    mov                 t0d, 32
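+    ; t0 = output row stride in bytes (2*w); high word of t2 counts the
+    ; extra 16-pixel column passes (0 here, 1 for w32, 3 for w64, 7 for w128)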
+.hv_w16_start:
+%if ARCH_X86_64
+ %if WIN64
+    PUSH                 r7
+ %endif
+    mov                  r7, tmpq
+%endif
+    mov                  t1, srcq
+.hv_w16_loop_h:
+    movu                 m0,     [srcq+strideq*0+8*0] ; 0L
+    movu                 m1,     [srcq+strideq*0+8*1] ; 0U
+    pshufb               m0, m4
+    pshufb               m1, m4
+    pmaddubsw            m0, m5      ; 0L +
+    pmaddubsw            m1, m5      ; 0U +
+.hv_w16_loop_v:
+    movu                 m2,     [srcq+strideq*1+8*0] ; 1L
+    pshufb               m2, m4
+    pmaddubsw            m2, m5      ; 1L +
+    psubw                m3, m2, m0  ; 1L-0L
+    pmulhrsw             m3, m6
+    paddw                m3, m0
+    mova        [tmpq+16*0], m3
+    movu                 m3,     [srcq+strideq*1+8*1] ; 1U
+    lea                srcq,     [srcq+strideq*2]
+    pshufb               m3, m4
+    pmaddubsw            m3, m5      ; 1U +
+    psubw                m0, m3, m1  ; 1U-0U
+    pmulhrsw             m0, m6
+    paddw                m0, m1
+    mova        [tmpq+16*1], m0
+    add                tmpq, t0q
+    movu                 m0,     [srcq+strideq*0+8*0] ; 2L
+    pshufb               m0, m4
+    pmaddubsw            m0, m5      ; 2L +
+    psubw                m1, m0, m2  ; 2L-1L
+    pmulhrsw             m1, m6
+    paddw                m1, m2
+    mova        [tmpq+16*0], m1
+    movu                 m1,     [srcq+strideq*0+8*1] ; 2U
+    pshufb               m1, m4
+    pmaddubsw            m1, m5      ; 2U +
+    psubw                m2, m1, m3  ; 2U-1U
+    pmulhrsw             m2, m6
+    paddw                m2, m3
+    mova        [tmpq+16*1], m2
+    add                tmpq, t0q
+    sub                  hd, 2
+    jg .hv_w16_loop_v
+    movzx                hd, t2w
+    add                  t1, 16
+    mov                srcq, t1
+%if ARCH_X86_64
+    add                  r7, 2*16
+    mov                tmpq, r7
+%else
+    mov                tmpq, tmpmp
+    add                tmpq, 2*16
+    mov               tmpmp, tmpq
+%endif
+    sub                 t2d, 1<<16
+    jg .hv_w16_loop_h
+%if WIN64
+    POP                  r7
+%endif
+    RET
+.hv_w32:
+    lea                 t2d, [hq+(1<<16)]
+    mov                 t0d, 64
+    jmp .hv_w16_start
+.hv_w64:
+    lea                 t2d, [hq+(3<<16)]
+    mov                 t0d, 128
+    jmp .hv_w16_start
+.hv_w128:
+    lea                 t2d, [hq+(7<<16)]
+    mov                 t0d, 256
+    jmp .hv_w16_start
 
 %if WIN64
 DECLARE_REG_TMP 6, 4