; shithub: openh264
;
; ref: 02e824d1253cdf8800c51fb39a359a062cbb0f45
; dir: /codec/common/x86/mc_luma.asm/
;
; (web-viewer header kept as comments; it is not part of the assembly source)
;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  mc_luma.asm
;*
;*  Abstract
;*      sse2 motion compensation
;*
;*  History
;*      17/08/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"

;*******************************************************************************
; Local Data (Read Only)
;
; Under X86_32_PICASM the tables are emitted into .text instead of .rodata.
; NOTE(review): presumably so 32-bit PIC code can reach them through the pic()
; helper relative to the code position -- confirm against asm_inc.asm.
;*******************************************************************************
%ifdef X86_32_PICASM
SECTION .text align=32
%else
SECTION .rodata align=32
%endif

;*******************************************************************************
; Filter-coefficient, shuffle-mask and rounding constants.
; The _256 / _128 label suffixes match the table size in bits (32 / 16 bytes).
; The maddubsw_* / shufb_* names suggest pmaddubsw multipliers and pshufb
; masks; their users are not visible in the code reviewed here.
;*******************************************************************************

%ifdef HAVE_AVX2
ALIGN 32
; 16 words of -32768
dwm32768_256:
    times 16 dw -32768
; signed byte pattern (-2, 10, -40, -40, 10, -2, 0, 0) repeated 4 times
maddubsw_m2p10_m40m40_p10m2_p0p0_256:
    times 4 db -2, 10, -40, -40, 10, -2, 0, 0
; 16 words of -1024
dwm1024_256:
    times 16 dw -1024
; 8 dwords of 32768
dd32768_256:
    times 8 dd 32768
; signed byte pairs (1, -5) x 16
maddubsw_p1m5_256:
    times 16 db 1, -5
; signed byte pairs (-5, 1) x 16
maddubsw_m5p1_256:
    times 16 db -5, 1
; 32 bytes of 20
db20_256:
    times 32 db 20
; signed byte pairs (-5, 20) x 16
maddubsw_m5p20_256:
    times 16 db -5, 20
; signed byte pairs (20, -5) x 16
maddubsw_p20m5_256:
    times 16 db 20, -5
; 16 words of 16
h264_w0x10_256:
    times 16 dw 16
; 16 words of 32
dw32_256:
    times 16 dw 32
%endif ; HAVE_AVX2

ALIGN 16
; 16 byte indices: overlapping descending pairs (3,2), (4,3), ... (10,9)
shufb_32435465768798A9:
    db 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9
; 16 byte indices: ascending pairs (0,1),(1,2),(6,7),(7,8),(4,5),(5,6),(A,B),(B,C)
shufb_011267784556ABBC:
    db 0, 1, 1, 2, 6, 7, 7, 8, 4, 5, 5, 6, 0Ah, 0Bh, 0Bh, 0Ch
; signed byte pattern (1,-5,1,-5,-5,1,-5,1) repeated twice
maddubsw_p1m5_p1m5_m5p1_m5p1_128:
    times 2 db 1, -5, 1, -5, -5, 1, -5, 1
; signed byte pattern (-2, 10, -40, -40, 10, -2, 0, 0) repeated twice
maddubsw_m2p10_m40m40_p10m2_p0p0_128:
    times 2 db -2, 10, -40, -40, 10, -2, 0, 0
; 8 words of -1024
dwm1024_128:
    times 8 dw -1024
; 4 dwords of 32768
dd32768_128:
    times 4 dd 32768
; signed byte pairs (1, -5) x 8
maddubsw_p1m5_128:
    times 8 db 1, -5
; signed byte pairs (-5, 1) x 8
maddubsw_m5p1_128:
    times 8 db -5, 1
; 16 bytes of 20
db20_128:
    times 16 db 20
; signed byte pairs (-5, 20) x 8
maddubsw_m5p20_128:
    times 8 db -5, 20
; signed byte pairs (20, -5) x 8
maddubsw_p20m5_128:
    times 8 db 20, -5
; 8 words of 16: rounding term added before the ">> 5" of the 6-tap filters
; in this file (FILTER_HV_W8 / FILTER_HV_W4 and the McHorVer20* routines)
h264_w0x10_1:
    dw 16, 16, 16, 16, 16, 16, 16, 16
ALIGN 16
; 8 words of 32: rounding term added before the ">> 6" in FILTER_VER
h264_mc_hc_32:
    dw 32, 32, 32, 32, 32, 32, 32, 32


;*******************************************************************************
; Code
;*******************************************************************************

SECTION .text

%ifdef X86_32_PICASM

;-------------------------------------------------------------------------------
; MOVEIMM_DW16 reg
;   Sets every 16-bit lane of %1 (MMX or XMM register) to 16 using register-only
;   operations. The X86_32_PICASM code paths in this file use it in place of a
;   load from h264_w0x10_1.
;-------------------------------------------------------------------------------
%macro MOVEIMM_DW16 1
    pcmpeqw      %1,  %1                ; every lane = 0xFFFF
    psllw        %1,  15                ; keep only the top bit: 0x8000
    psrlw        %1,  11                ; logical shift down:    0x0010 = 16
%endmacro

%endif

;*******************************************************************************
; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
;                       int iSrcStride,
;                       uint8_t *pDst,
;                       int iDstStride,
;                       int iHeight)
;
; Horizontal 6-tap filter, 4 output pixels per row (MMX):
;   pDst[x] = sat_u8( ( s[x-2] - 5*s[x-1] + 20*s[x] + 20*s[x+1]
;                       - 5*s[x+2] + s[x+3] + 16 ) >> 5 )
; Register use: r0 = source cursor, r1 = source stride, r2 = destination
; cursor, r3 = destination stride, r4 = remaining rows.
; The loop tests the row counter after the body, so iHeight must be >= 1.
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq4_mmx
    %assign  push_num 0
    LOAD_5_PARA
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d

    WELS_Zero mm7                       ; zero source for byte->word widening
%ifdef X86_32_PICASM
    MOVEIMM_DW16 mm6                    ; mm6 = 4 x 16, built without a memory load
%else
    movq mm6, [h264_w0x10_1]            ; mm6 = 4 x 16 (rounding term)
%endif
    sub r0, 2                           ; leftmost tap sits two pixels left of x

.height_loop:
    ; outer taps: s[x-2] + s[x+3]
    movd      mm0, [r0]
    movd      mm1, [r0+5]
    punpcklbw mm0, mm7
    punpcklbw mm1, mm7
    paddw     mm0, mm1

    ; middle taps: s[x-1] + s[x+2]
    movd      mm2, [r0+1]
    movd      mm3, [r0+4]
    punpcklbw mm2, mm7
    punpcklbw mm3, mm7
    paddw     mm2, mm3

    ; inner taps: s[x] + s[x+1]
    movd      mm4, [r0+2]
    movd      mm5, [r0+3]
    punpcklbw mm4, mm7
    punpcklbw mm5, mm7
    paddw     mm4, mm5

    ; outer + 5 * (4*inner - middle), accumulated as t + 4*t
    psllw     mm4, 2
    psubw     mm4, mm2                  ; t = 4*inner - middle
    paddw     mm0, mm4                  ; + t
    psllw     mm4, 2
    paddw     mm0, mm4                  ; + 4*t

    paddw     mm0, mm6                  ; + 16
    psraw     mm0, 5
    packuswb  mm0, mm7                  ; saturate to unsigned bytes
    movd      [r2], mm0                 ; write 4 pixels

    lea       r0, [r0+r1]               ; next source row
    lea       r2, [r2+r3]               ; next destination row
    dec       r4
    jnz       .height_loop

    WELSEMMS
    LOAD_5_PARA_POP
    ret

;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************


; SSE_LOAD_8P dst, zero, mem
;   Loads 8 bytes from %3 into the low half of %1 and interleaves them with the
;   low bytes of %2. With %2 holding zero this widens the 8 bytes to 8 words.
%macro SSE_LOAD_8P 3
    movq %1, %3
    punpcklbw %1, %2
%endmacro

;-------------------------------------------------------------------------------
; FILTER_HV_W8 A, B, C, D, E, F, tmp1, tmp2, dst
;   6-tap filter over six registers of eight 16-bit samples each:
;     dst[0..7] = sat_u8( (A - 5*B + 20*C + 20*D - 5*E + F + 16) >> 5 )
;   %1..%6 = A..F; only %1 is modified among them.
;   %7, %8 = scratch; on exit %8 is zero (callers reuse it as the zero operand
;            of the next SSE_LOAD_8P) and %7 holds B + E.
;   %9     = 8-byte memory destination.
;-------------------------------------------------------------------------------
%macro FILTER_HV_W8 9
    paddw   %1, %6                      ; A + F
    movdqa  %8, %3
    movdqa  %7, %2
    paddw   %7, %5                      ; B + E
    paddw   %8, %4                      ; C + D
    psllw   %8, 2
    psubw   %8, %7                      ; t = 4*(C+D) - (B+E)
    paddw   %1, %8                      ; + t
    psllw   %8, 2
    paddw   %1, %8                      ; + 4*t  => A+F + 5*t
    paddw   %1, [pic(h264_w0x10_1)]     ; + 16 (rounding)
    psraw   %1, 5
    WELS_Zero %8
    packuswb %1, %8                     ; saturate to bytes, upper half zero
    movq    %9, %1                      ; store 8 pixels
%endmacro


;-------------------------------------------------------------------------------
; FILTER_HV_W4 A, B, C, D, E, F, tmp1, tmp2, dst
;   Same arithmetic as FILTER_HV_W8, but only the first four result bytes are
;   stored:
;     dst[0..3] = sat_u8( (A - 5*B + 20*C + 20*D - 5*E + F + 16) >> 5 )
;   %1..%6 = A..F; only %1 is modified among them.
;   %7, %8 = scratch; on exit %8 is zero and %7 holds B + E.
;   %9     = 4-byte memory destination.
;-------------------------------------------------------------------------------
%macro FILTER_HV_W4 9
    paddw   %1, %6                      ; A + F
    movdqa  %8, %3
    movdqa  %7, %2
    paddw   %7, %5                      ; B + E
    paddw   %8, %4                      ; C + D
    psllw   %8, 2
    psubw   %8, %7                      ; t = 4*(C+D) - (B+E)
    paddw   %1, %8                      ; + t
    psllw   %8, 2
    paddw   %1, %8                      ; + 4*t  => A+F + 5*t
    paddw   %1, [pic(h264_w0x10_1)]     ; + 16 (rounding)
    psraw   %1, 5
    WELS_Zero %8
    packuswb %1, %8                     ; saturate to bytes
    movd    %9, %1                      ; store 4 pixels
%endmacro


;*******************************************************************************
; Code
;*******************************************************************************

SECTION .text

;***********************************************************************
; void McHorVer22Width8HorFirst_sse2(const int16_t *pSrc,
;                       int16_t iSrcStride,
;                       uint8_t *pDst,
;                       int32_t iDstStride
;                       int32_t iHeight
;                       )
;
; NOTE(review): the prototype above is kept as found, but the code reads
; pSrc as bytes (movq + punpcklbw) and writes eight 16-bit values per row
; to pDst with an aligned 16-byte store -- confirm the C declaration.
;
; Horizontal pass of the 2-D half-pel filter. For each row it stores the
; unrounded, unshifted 6-tap sums
;   t[i] = p[i] - 5*p[i+1] + 20*p[i+2] + 20*p[i+3] - 5*p[i+4] + p[i+5]
; for i = 0..7, where p is the row start held in r0.
; r0 is first moved up by two rows; no horizontal offset is applied here.
; NOTE(review): presumably the caller passes a pointer already shifted
; left and supplies iHeight + 5 rows for the vertical pass -- confirm.
; pDst rows must be 16-byte aligned (movdqa store). iHeight must be >= 1.
;***********************************************************************
WELS_EXTERN McHorVer22Width8HorFirst_sse2
    %assign  push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d

    sub r0, r1                          ; start two rows above pSrc
    sub r0, r1
    pxor xmm7, xmm7                     ; zero for byte->word widening

.yloop_width_8:
    ; outer taps: p[i] + p[i+5]
    movq xmm0, [r0]
    movq xmm1, [r0+5]
    punpcklbw xmm0, xmm7
    punpcklbw xmm1, xmm7
    paddw xmm0, xmm1

    ; middle taps: p[i+1] + p[i+4]
    movq xmm2, [r0+1]
    movq xmm3, [r0+4]
    punpcklbw xmm2, xmm7
    punpcklbw xmm3, xmm7
    paddw xmm2, xmm3

    ; inner taps: p[i+2] + p[i+3]
    movq xmm4, [r0+2]
    movq xmm5, [r0+3]
    punpcklbw xmm4, xmm7
    punpcklbw xmm5, xmm7
    paddw xmm4, xmm5

    psllw xmm4, 2
    psubw xmm4, xmm2                    ; t = 4*inner - middle
    paddw xmm0, xmm4                    ; outer + t
    psllw xmm4, 2
    paddw xmm0, xmm4                    ; outer + 5*t
    movdqa [r2], xmm0                   ; 8 intermediate words

    add r0, r1
    add r2, r3
    dec r4
    jnz .yloop_width_8

    POP_XMM
    LOAD_5_PARA_POP
    ret

;*******************************************************************************
; void McHorVer20WidthEq8_sse2(  const uint8_t *pSrc,
;                       int iSrcStride,
;                                               uint8_t *pDst,
;                                               int iDstStride,
;                                               int iHeight,
;                      );
;
; Horizontal 6-tap filter, 8 output pixels per row:
;   pDst[x] = sat_u8( ( s[x-2] - 5*s[x-1] + 20*s[x] + 20*s[x+1]
;                       - 5*s[x+2] + s[x+3] + 16 ) >> 5 )
; xmm7 stays zero and xmm6 holds the rounding term for the whole loop.
; iHeight must be >= 1 (the counter is tested after the body).
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq8_sse2
    %assign  push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d

    pxor xmm7, xmm7
%ifdef X86_32_PICASM
    MOVEIMM_DW16 xmm6                   ; 8 x 16 without a memory load
%else
    movdqa xmm6, [h264_w0x10_1]         ; 8 x 16
%endif
    sub r0, 2                           ; pSrc -= 2

.y_loop:
    ; outer taps: s[x-2] + s[x+3]
    movq xmm0, [r0]
    movq xmm1, [r0+5]
    punpcklbw xmm0, xmm7
    punpcklbw xmm1, xmm7
    paddw xmm0, xmm1

    ; middle taps: s[x-1] + s[x+2]
    movq xmm2, [r0+1]
    movq xmm3, [r0+4]
    punpcklbw xmm2, xmm7
    punpcklbw xmm3, xmm7
    paddw xmm2, xmm3

    ; inner taps: s[x] + s[x+1]
    movq xmm4, [r0+2]
    movq xmm5, [r0+3]
    punpcklbw xmm4, xmm7
    punpcklbw xmm5, xmm7
    paddw xmm4, xmm5

    psllw xmm4, 2
    psubw xmm4, xmm2                    ; t = 4*inner - middle
    paddw xmm0, xmm4
    psllw xmm4, 2
    paddw xmm0, xmm4                    ; outer + 5*t
    paddw xmm0, xmm6                    ; + 16
    psraw xmm0, 5
    packuswb xmm0, xmm7
    movq [r2], xmm0                     ; write 8 pixels

    add r0, r1
    add r2, r3
    dec r4
    jnz near .y_loop

    POP_XMM
    LOAD_5_PARA_POP
    ret

;*******************************************************************************
; void McHorVer20WidthEq16_sse2(  const uint8_t *pSrc,
;                       int iSrcStride,
;                                               uint8_t *pDst,
;                                               int iDstStride,
;                                               int iHeight,
;                      );
;
; Horizontal 6-tap filter, 16 output pixels per row, computed as two
; independent 8-pixel halves:
;   pDst[x] = sat_u8( ( s[x-2] - 5*s[x-1] + 20*s[x] + 20*s[x+1]
;                       - 5*s[x+2] + s[x+3] + 16 ) >> 5 )
; xmm7 stays zero and xmm6 holds the rounding term for the whole loop.
; iHeight must be >= 1 (the counter is tested after the body).
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq16_sse2
    %assign  push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d

    pxor xmm7, xmm7
%ifdef X86_32_PICASM
    MOVEIMM_DW16 xmm6                   ; 8 x 16 without a memory load
%else
    movdqa xmm6, [h264_w0x10_1]         ; 8 x 16
%endif
    sub r0, 2                           ; pSrc -= 2

.y_loop:
    ;---- left half: output pixels 0..7 ----
    movq xmm0, [r0]
    movq xmm1, [r0+5]
    punpcklbw xmm0, xmm7
    punpcklbw xmm1, xmm7
    paddw xmm0, xmm1                    ; outer taps

    movq xmm2, [r0+1]
    movq xmm3, [r0+4]
    punpcklbw xmm2, xmm7
    punpcklbw xmm3, xmm7
    paddw xmm2, xmm3                    ; middle taps

    movq xmm4, [r0+2]
    movq xmm5, [r0+3]
    punpcklbw xmm4, xmm7
    punpcklbw xmm5, xmm7
    paddw xmm4, xmm5                    ; inner taps

    psllw xmm4, 2
    psubw xmm4, xmm2                    ; t = 4*inner - middle
    paddw xmm0, xmm4
    psllw xmm4, 2
    paddw xmm0, xmm4                    ; outer + 5*t
    paddw xmm0, xmm6
    psraw xmm0, 5
    packuswb xmm0, xmm7
    movq [r2], xmm0

    ;---- right half: output pixels 8..15 ----
    movq xmm0, [r0+8]
    movq xmm1, [r0+5+8]
    punpcklbw xmm0, xmm7
    punpcklbw xmm1, xmm7
    paddw xmm0, xmm1                    ; outer taps

    movq xmm2, [r0+1+8]
    movq xmm3, [r0+4+8]
    punpcklbw xmm2, xmm7
    punpcklbw xmm3, xmm7
    paddw xmm2, xmm3                    ; middle taps

    movq xmm4, [r0+2+8]
    movq xmm5, [r0+3+8]
    punpcklbw xmm4, xmm7
    punpcklbw xmm5, xmm7
    paddw xmm4, xmm5                    ; inner taps

    psllw xmm4, 2
    psubw xmm4, xmm2
    paddw xmm0, xmm4
    psllw xmm4, 2
    paddw xmm0, xmm4
    paddw xmm0, xmm6
    psraw xmm0, 5
    packuswb xmm0, xmm7
    movq [r2+8], xmm0

    add r0, r1
    add r2, r3
    dec r4
    jnz near .y_loop

    POP_XMM
    LOAD_5_PARA_POP
    ret


;*******************************************************************************
; void McHorVer02WidthEq8_sse2( const uint8_t *pSrc,
;                       int iSrcStride,
;                       uint8_t *pDst,
;                       int iDstStride,
;                       int iHeight )
;
; Vertical 6-tap filter, 8 pixels wide. With R[k] the source row k rows below
; pSrc, each output row y is
;   sat_u8( (R[y-2] - 5*R[y-1] + 20*R[y] + 20*R[y+1] - 5*R[y+2] + R[y+3] + 16) >> 5 )
; (see FILTER_HV_W8). Six widened rows are kept in xmm registers; the loop is
; unrolled eight times and rotates the register roles so that each step loads
; only one new row. r4 (iHeight) is decremented after every stored row and the
; routine leaves as soon as it reaches zero, so iHeight must be >= 1.
;*******************************************************************************
WELS_EXTERN McHorVer02WidthEq8_sse2
    %assign  push_num 0
    INIT_X86_32_PIC r5
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    ; first tap row is two rows above the output row
    sub r0, r1
    sub r0, r1

    WELS_Zero xmm7

    ; prime the window: rows 0..5 -> xmm0..xmm5 (as words)
    SSE_LOAD_8P xmm0, xmm7, [r0]
    SSE_LOAD_8P xmm1, xmm7, [r0+r1]
    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm7, [r0]
    SSE_LOAD_8P xmm3, xmm7, [r0+r1]
    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm7, [r0]
    SSE_LOAD_8P xmm5, xmm7, [r0+r1]

    ; Each FILTER_HV_W8 leaves its 8th operand zeroed; that register is the
    ; zero operand of the SSE_LOAD_8P that follows. r0 and r2 each advance by
    ; two rows on alternating steps; the odd row is reached via +r1 / +r3.
.start:
    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r4
    jz near .xx_exit

    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm6, xmm7, [r0]
    FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec r4
    jz near .xx_exit

    lea r2, [r2+2*r3]
    SSE_LOAD_8P xmm7, xmm0, [r0+r1]
    FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec r4
    jz near .xx_exit

    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm0, xmm1, [r0]
    FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec r4
    jz near .xx_exit

    lea r2, [r2+2*r3]
    SSE_LOAD_8P xmm1, xmm2, [r0+r1]
    FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec r4
    jz near .xx_exit

    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm3, [r0]
    FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec r4
    jz near .xx_exit

    lea r2, [r2+2*r3]
    SSE_LOAD_8P xmm3, xmm4, [r0+r1]
    FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec r4
    jz near .xx_exit

    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm5, [r0]
    FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec r4
    jz near .xx_exit

    ; after eight steps the register roles are back where .start expects them
    lea r2, [r2+2*r3]
    SSE_LOAD_8P xmm5, xmm6, [r0+r1]
    jmp near .start

.xx_exit:
    POP_XMM
    LOAD_5_PARA_POP
    DEINIT_X86_32_PIC
    ret

;***********************************************************************
; Code
;***********************************************************************

SECTION .text



;***********************************************************************
; void McHorVer02Height9Or17_sse2(  const uint8_t *pSrc,
;                       int32_t iSrcStride,
;                       uint8_t *pDst,
;                       int32_t iDstStride,
;                       int32_t iWidth,
;                       int32_t iHeight )
;
; Vertical 6-tap filter (FILTER_HV_W8) processed in 8-pixel-wide column
; strips: r4 = iWidth >> 3 strips, r5 = rows left in the current strip.
; Per strip the first output row is produced on its own, the row window
; is shifted down by one register, and the remaining rows go through the
; same 8-step rotating loop as McHorVer02WidthEq8_sse2.
; NOTE(review): when a strip finishes, r0/r2/r5 are reloaded from the
; ORIGINAL arguments and advanced by a fixed 8, so a third strip would
; repeat the second one; this looks correct only for iWidth >> 3 <= 2.
;***********************************************************************
WELS_EXTERN McHorVer02Height9Or17_sse2
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d

    ; non-32-bit builds keep the original pSrc / pDst / iHeight in r12-r14;
    ; 32-bit builds re-read them from arg1 / arg3 / arg6 instead
%ifndef X86_32
    push r12
    push r13
    push r14
    mov  r12, r0
    mov  r13, r2
    mov  r14, r5
%endif

    shr r4, 3
    sub r0, r1
    sub r0, r1

.xloop:
    WELS_Zero xmm7
    SSE_LOAD_8P xmm0, xmm7, [r0]
    SSE_LOAD_8P xmm1, xmm7, [r0+r1]
    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm7, [r0]
    SSE_LOAD_8P xmm3, xmm7, [r0+r1]
    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm7, [r0]
    SSE_LOAD_8P xmm5, xmm7, [r0+r1]

    ; first output row of the strip, then slide the window down one row
    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm6, xmm7, [r0]
    movdqa xmm0,xmm1
    movdqa xmm1,xmm2
    movdqa xmm2,xmm3
    movdqa xmm3,xmm4
    movdqa xmm4,xmm5
    movdqa xmm5,xmm6
    add r2, r3
    ; step r0 back one row so [r0+r1] is the row now held in xmm5
    sub r0, r1

    ; rotating 8-step loop; each FILTER_HV_W8 leaves its 8th operand zero for
    ; the following SSE_LOAD_8P
.start:
    FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm6, xmm7, [r0]
    FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    SSE_LOAD_8P xmm7, xmm0, [r0+r1]
    FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm0, xmm1, [r0]
    FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    SSE_LOAD_8P xmm1, xmm2, [r0+r1]
    FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm2, xmm3, [r0]
    FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    SSE_LOAD_8P xmm3, xmm4, [r0+r1]
    FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    SSE_LOAD_8P xmm4, xmm5, [r0]
    FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    SSE_LOAD_8P xmm5, xmm6, [r0+r1]
    jmp near .start

    ; strip finished: leave, or restart 8 pixels to the right of the original
    ; pSrc / pDst with the full row count
.x_loop_dec:
    dec r4
    jz  near .xx_exit
%ifdef X86_32
    mov r0, arg1
    mov r2, arg3
    mov r5, arg6
%else
    mov r0, r12
    mov r2, r13
    mov r5, r14
%endif
    sub r0, r1
    sub r0, r1
    add r0, 8
    add r2, 8
    jmp near .xloop

.xx_exit:
%ifndef X86_32
    pop r14
    pop r13
    pop r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret


;***********************************************************************
; void McHorVer02Height5_sse2(  const uint8_t *pSrc,
;                       int32_t iSrcStride,
;                       uint8_t *pDst,
;                       int32_t iDstStride,
;                       int32_t iWidth,
;                       int32_t iHeight )
;
; Vertical 6-tap filter (FILTER_HV_W4) processed in 4-pixel-wide column
; strips: r4 = iWidth >> 2 strips, r5 = rows left in the current strip.
; Same structure as McHorVer02Height9Or17_sse2, but only 4 bytes are
; stored per row. Each row load still reads 8 source bytes
; (SSE_LOAD_8P), i.e. 4 bytes beyond the strip being written.
; NOTE(review): strips restart from the ORIGINAL arguments plus a fixed
; 4, so this looks correct only for iWidth >> 2 <= 2.
;***********************************************************************
WELS_EXTERN McHorVer02Height5_sse2
%assign  push_num 0
INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION  r1, r1d
SIGN_EXTENSION  r3, r3d
SIGN_EXTENSION  r4, r4d
SIGN_EXTENSION  r5, r5d

; non-32-bit builds keep the original pSrc / pDst / iHeight in r12-r14
%ifndef X86_32
push r12
push r13
push r14
mov  r12, r0
mov  r13, r2
mov  r14, r5
%endif

shr r4, 2
sub r0, r1
sub r0, r1

.xloop:
WELS_Zero xmm7
SSE_LOAD_8P xmm0, xmm7, [r0]
SSE_LOAD_8P xmm1, xmm7, [r0+r1]
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm2, xmm7, [r0]
SSE_LOAD_8P xmm3, xmm7, [r0+r1]
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm4, xmm7, [r0]
SSE_LOAD_8P xmm5, xmm7, [r0+r1]

; first output row of the strip, then slide the window down one row
FILTER_HV_W4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm6, xmm7, [r0]
movdqa xmm0,xmm1
movdqa xmm1,xmm2
movdqa xmm2,xmm3
movdqa xmm3,xmm4
movdqa xmm4,xmm5
movdqa xmm5,xmm6
add r2, r3
; step r0 back one row so [r0+r1] is the row now held in xmm5
sub r0, r1

; rotating 8-step loop; each FILTER_HV_W4 leaves its 8th operand zero for the
; following SSE_LOAD_8P
.start:
FILTER_HV_W4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
jz near .x_loop_dec

lea r0, [r0+2*r1]
SSE_LOAD_8P xmm6, xmm7, [r0]
FILTER_HV_W4 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
dec r5
jz near .x_loop_dec

lea r2, [r2+2*r3]
SSE_LOAD_8P xmm7, xmm0, [r0+r1]
FILTER_HV_W4 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
dec r5
jz near .x_loop_dec

lea r0, [r0+2*r1]
SSE_LOAD_8P xmm0, xmm1, [r0]
FILTER_HV_W4 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
dec r5
jz near .x_loop_dec

lea r2, [r2+2*r3]
SSE_LOAD_8P xmm1, xmm2, [r0+r1]
FILTER_HV_W4 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
dec r5
jz near .x_loop_dec

lea r0, [r0+2*r1]
SSE_LOAD_8P xmm2, xmm3, [r0]
FILTER_HV_W4 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
dec r5
jz near .x_loop_dec

lea r2, [r2+2*r3]
SSE_LOAD_8P xmm3, xmm4, [r0+r1]
FILTER_HV_W4 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
dec r5
jz near .x_loop_dec

lea r0, [r0+2*r1]
SSE_LOAD_8P xmm4, xmm5, [r0]
FILTER_HV_W4 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
dec r5
jz near .x_loop_dec

lea r2, [r2+2*r3]
SSE_LOAD_8P xmm5, xmm6, [r0+r1]
jmp near .start

; strip finished: leave, or restart 4 pixels to the right of the original
; pSrc / pDst with the full row count
.x_loop_dec:
dec r4
jz  near .xx_exit
%ifdef X86_32
mov r0, arg1
mov r2, arg3
mov r5, arg6
%else
mov r0, r12
mov r2, r13
mov r5, r14
%endif
sub r0, r1
sub r0, r1
add r0, 4
add r2, 4
jmp near .xloop

.xx_exit:
%ifndef X86_32
pop r14
pop r13
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC
ret


;***********************************************************************
; void McHorVer20Width9Or17_sse2(       const uint8_t *pSrc,
;                       int32_t iSrcStride,
;                       uint8_t *pDst,
;                       int32_t iDstStride,
;                       int32_t iWidth,
;                       int32_t iHeight
;                      );
;
; Horizontal 6-tap filter with rounding (+16, >> 5) and saturation to
; bytes, for rows of 9 or 17 output pixels. iWidth == 9 takes the
; 9-pixel path; ANY other value takes the 17-pixel path.
; An odd width is covered by two overlapping stores: a 4-byte store at
; offset k and an 8-byte store at offset k+1 (k = 0 for width 9, k = 8
; after a full 8-pixel store for width 17). The second group reuses the
; already-widened taps of the first group shifted by one pixel, so only
; one extra load ([r0+6+k]) is needed.
; r5 (iHeight) is tested after the loop body, so it must be >= 1.
;***********************************************************************
WELS_EXTERN McHorVer20Width9Or17_sse2
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    sub r0, 2
    pxor xmm7, xmm7

    cmp r4, 9
    jne near .width_17

.yloop_width_9:
    movq xmm0, [r0]
    punpcklbw xmm0, xmm7
    movq xmm1, [r0+5]
    punpcklbw xmm1, xmm7
    movq xmm2, [r0+1]
    punpcklbw xmm2, xmm7
    movq xmm3, [r0+4]
    punpcklbw xmm3, xmm7
    movq xmm4, [r0+2]
    punpcklbw xmm4, xmm7
    movq xmm5, [r0+3]
    punpcklbw xmm5, xmm7

    ; sums go into xmm7/xmm6 so xmm1..xmm5 stay intact for the shifted
    ; group below; xmm7 is no longer zero from here until it is cleared again
    movdqa xmm7, xmm2
    paddw   xmm7, xmm3
    movdqa xmm6, xmm4
    paddw   xmm6, xmm5
    psllw xmm6, 2
    psubw xmm6, xmm7
    paddw xmm0, xmm1
    paddw xmm0, xmm6
    psllw xmm6, 2
    paddw xmm0, xmm6
    paddw xmm0, [pic(h264_w0x10_1)]
    psraw  xmm0, 5
    packuswb xmm0, xmm0
    ; output pixels 0..3
    movd [r2], xmm0

    pxor  xmm7, xmm7
    movq xmm0, [r0+6]
    punpcklbw xmm0, xmm7

    ; same filter one pixel to the right: taps are
    ; xmm2, xmm4, xmm5, xmm3, xmm1 and the newly loaded xmm0
    paddw xmm4, xmm1
    paddw xmm5, xmm3
    psllw xmm5, 2
    psubw xmm5, xmm4
    paddw xmm2, xmm0
    paddw xmm2, xmm5
    psllw xmm5, 2
    paddw xmm2, xmm5
    paddw xmm2, [pic(h264_w0x10_1)]
    psraw  xmm2, 5
    packuswb xmm2, xmm2
    ; output pixels 1..8 (bytes 1..3 are written a second time)
    movq [r2+1], xmm2

    add r0, r1
    add r2, r3
    dec r5
    jnz .yloop_width_9
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC_KEEPDEF
    ret


.width_17:
.yloop_width_17:
    ; output pixels 0..7
    movq xmm0, [r0]
    punpcklbw xmm0, xmm7
    movq xmm1, [r0+5]
    punpcklbw xmm1, xmm7
    movq xmm2, [r0+1]
    punpcklbw xmm2, xmm7
    movq xmm3, [r0+4]
    punpcklbw xmm3, xmm7
    movq xmm4, [r0+2]
    punpcklbw xmm4, xmm7
    movq xmm5, [r0+3]
    punpcklbw xmm5, xmm7

    paddw xmm2, xmm3
    paddw xmm4, xmm5
    psllw xmm4, 2
    psubw xmm4, xmm2
    paddw xmm0, xmm1
    paddw xmm0, xmm4
    psllw xmm4, 2
    paddw xmm0, xmm4
    paddw xmm0, [pic(h264_w0x10_1)]
    psraw  xmm0, 5
    packuswb xmm0, xmm0
    movq [r2], xmm0

    ; output pixels 8..16, handled like the width-9 case at offset 8
    movq xmm0, [r0+8]
    punpcklbw xmm0, xmm7
    movq xmm1, [r0+5+8]
    punpcklbw xmm1, xmm7
    movq xmm2, [r0+1+8]
    punpcklbw xmm2, xmm7
    movq xmm3, [r0+4+8]
    punpcklbw xmm3, xmm7
    movq xmm4, [r0+2+8]
    punpcklbw xmm4, xmm7
    movq xmm5, [r0+3+8]
    punpcklbw xmm5, xmm7

    ; xmm7 is used as scratch here and re-zeroed below
    movdqa xmm7, xmm2
    paddw   xmm7, xmm3
    movdqa xmm6, xmm4
    paddw   xmm6, xmm5
    psllw xmm6, 2
    psubw xmm6, xmm7
    paddw xmm0, xmm1
    paddw xmm0, xmm6
    psllw xmm6, 2
    paddw xmm0, xmm6
    paddw xmm0, [pic(h264_w0x10_1)]
    psraw  xmm0, 5
    packuswb xmm0, xmm0
    ; output pixels 8..11
    movd [r2+8], xmm0


    pxor  xmm7, xmm7
    movq xmm0, [r0+6+8]
    punpcklbw xmm0, xmm7

    paddw xmm4, xmm1
    paddw xmm5, xmm3
    psllw xmm5, 2
    psubw xmm5, xmm4
    paddw xmm2, xmm0
    paddw xmm2, xmm5
    psllw xmm5, 2
    paddw xmm2, xmm5
    paddw xmm2, [pic(h264_w0x10_1)]
    psraw  xmm2, 5
    packuswb xmm2, xmm2
    ; output pixels 9..16
    movq [r2+9], xmm2
    add r0, r1
    add r2, r3
    dec r5
    jnz .yloop_width_17
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret


;***********************************************************************
; void McHorVer20Width5_sse2(       const uint8_t *pSrc,
;                       int32_t iSrcStride,
;                       uint8_t *pDst,
;                       int32_t iDstStride,
;                       int32_t iWidth,
;                       int32_t iHeight
;                      );
;
; Horizontal 6-tap filter with rounding (+16, >> 5) and saturation to
; bytes, 5 output pixels per row, written as two overlapping 4-byte
; stores at offsets 0 and 1. iWidth (r4) is loaded and sign-extended
; but not otherwise used. r5 (iHeight) is tested after the loop body,
; so it must be >= 1.
;***********************************************************************
WELS_EXTERN McHorVer20Width5_sse2
%assign  push_num 0
INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION  r1, r1d
SIGN_EXTENSION  r3, r3d
SIGN_EXTENSION  r4, r4d
SIGN_EXTENSION  r5, r5d
sub r0, 2
pxor xmm7, xmm7

.yloop_width_5:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7

; sums go into xmm7/xmm6 so xmm1..xmm5 stay intact for the shifted group
; below; xmm7 is no longer zero from here until it is cleared again
movdqa xmm7, xmm2
paddw   xmm7, xmm3
movdqa xmm6, xmm4
paddw   xmm6, xmm5
psllw xmm6, 2
psubw xmm6, xmm7
paddw xmm0, xmm1
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
paddw xmm0, [pic(h264_w0x10_1)]
psraw  xmm0, 5
packuswb xmm0, xmm0
; output pixels 0..3
movd [r2], xmm0

pxor  xmm7, xmm7
movq xmm0, [r0+6]
punpcklbw xmm0, xmm7

; same filter one pixel to the right, reusing the widened taps
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
psubw xmm5, xmm4
paddw xmm2, xmm0
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
paddw xmm2, [pic(h264_w0x10_1)]
psraw  xmm2, 5
packuswb xmm2, xmm2
; output pixels 1..4 (bytes 1..3 are written a second time)
movd [r2+1], xmm2

add r0, r1
add r2, r3
dec r5
jnz .yloop_width_5
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC
ret


;***********************************************************************
;void McHorVer22HorFirst_sse2
;                           (const uint8_t *pSrc,
;                           int32_t iSrcStride,
;                           uint8_t * pTap,
;                           int32_t iTapStride,
;                           int32_t iWidth,int32_t iHeight);
;
; Horizontal pass of the 2-D half-pel filter for rows of 9 or 17
; samples. Each output is the raw 16-bit 6-tap sum
;   t[i] = p[i] - 5*p[i+1] + 20*p[i+2] + 20*p[i+3] - 5*p[i+4] + p[i+5]
; (no rounding, shift or saturation), stored as words to pTap; the
; store offsets below are therefore in bytes, two per sample.
; iWidth == 9 takes the 9-sample path; ANY other value takes the
; 17-sample path, whose first store is an aligned movdqa and so needs
; 16-byte-aligned pTap rows.
; r0 is moved up two rows before the loop; no horizontal offset is
; applied here.
; NOTE(review): presumably the caller passes a pointer already shifted
; left and an iHeight that includes the extra rows needed by the
; vertical pass -- confirm against the caller.
;***********************************************************************
WELS_EXTERN McHorVer22HorFirst_sse2
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    pxor xmm7, xmm7
    ; start two rows above pSrc
    sub r0, r1
    sub r0, r1

    cmp r4, 9
    jne near .width_17

.yloop_width_9:
    movq xmm0, [r0]
    punpcklbw xmm0, xmm7
    movq xmm1, [r0+5]
    punpcklbw xmm1, xmm7
    movq xmm2, [r0+1]
    punpcklbw xmm2, xmm7
    movq xmm3, [r0+4]
    punpcklbw xmm3, xmm7
    movq xmm4, [r0+2]
    punpcklbw xmm4, xmm7
    movq xmm5, [r0+3]
    punpcklbw xmm5, xmm7

    ; sums go into xmm7/xmm6 so xmm1..xmm5 stay intact for the shifted
    ; group below; xmm7 is re-zeroed afterwards
    movdqa xmm7, xmm2
    paddw   xmm7, xmm3
    movdqa xmm6, xmm4
    paddw   xmm6, xmm5
    psllw xmm6, 2
    psubw xmm6, xmm7
    paddw xmm0, xmm1
    paddw xmm0, xmm6
    psllw xmm6, 2
    paddw xmm0, xmm6
    ; samples 0..1 (two words)
    movd [r2], xmm0

    pxor  xmm7, xmm7
    movq xmm0, [r0+6]
    punpcklbw xmm0, xmm7

    ; same filter one sample to the right, reusing the widened taps
    paddw xmm4, xmm1
    paddw xmm5, xmm3
    psllw xmm5, 2
    psubw xmm5, xmm4
    paddw xmm2, xmm0
    paddw xmm2, xmm5
    psllw xmm5, 2
    paddw xmm2, xmm5
    ; samples 1..8: low four words, then high four words
    movq [r2+2], xmm2
    movhps [r2+2+8], xmm2

    add r0, r1
    add r2, r3
    dec r5
    jnz .yloop_width_9
    POP_XMM
    LOAD_6_PARA_POP
    ret


.width_17:
.yloop_width_17:
    ; samples 0..7
    movq xmm0, [r0]
    punpcklbw xmm0, xmm7
    movq xmm1, [r0+5]
    punpcklbw xmm1, xmm7
    movq xmm2, [r0+1]
    punpcklbw xmm2, xmm7
    movq xmm3, [r0+4]
    punpcklbw xmm3, xmm7
    movq xmm4, [r0+2]
    punpcklbw xmm4, xmm7
    movq xmm5, [r0+3]
    punpcklbw xmm5, xmm7

    paddw xmm2, xmm3
    paddw xmm4, xmm5
    psllw xmm4, 2
    psubw xmm4, xmm2
    paddw xmm0, xmm1
    paddw xmm0, xmm4
    psllw xmm4, 2
    paddw xmm0, xmm4
    movdqa [r2], xmm0

    ; samples 8..16, handled like the width-9 case at source offset 8
    movq xmm0, [r0+8]
    punpcklbw xmm0, xmm7
    movq xmm1, [r0+5+8]
    punpcklbw xmm1, xmm7
    movq xmm2, [r0+1+8]
    punpcklbw xmm2, xmm7
    movq xmm3, [r0+4+8]
    punpcklbw xmm3, xmm7
    movq xmm4, [r0+2+8]
    punpcklbw xmm4, xmm7
    movq xmm5, [r0+3+8]
    punpcklbw xmm5, xmm7

    ; xmm7 is used as scratch here and re-zeroed below
    movdqa xmm7, xmm2
    paddw   xmm7, xmm3
    movdqa xmm6, xmm4
    paddw   xmm6, xmm5
    psllw xmm6, 2
    psubw xmm6, xmm7
    paddw xmm0, xmm1
    paddw xmm0, xmm6
    psllw xmm6, 2
    paddw xmm0, xmm6
    ; samples 8..9 (byte offset 16)
    movd [r2+16], xmm0


    pxor  xmm7, xmm7
    movq xmm0, [r0+6+8]
    punpcklbw xmm0, xmm7

    paddw xmm4, xmm1
    paddw xmm5, xmm3
    psllw xmm5, 2
    psubw xmm5, xmm4
    paddw xmm2, xmm0
    paddw xmm2, xmm5
    psllw xmm5, 2
    paddw xmm2, xmm5
    ; samples 9..16 (byte offset 18): low four words, then high four words
    movq [r2+18], xmm2
    movhps [r2+18+8], xmm2

    add r0, r1
    add r2, r3
    dec r5
    jnz .yloop_width_17
    POP_XMM
    LOAD_6_PARA_POP
    ret


; FILTER_VER A, B, C, D, E, F, tmp1, tmp2, dst
;   Vertical 6-tap filter over six registers of eight 16-bit intermediates.
;   With S1 = A+F, S2 = B+E, S3 = C+D the macro computes, in word arithmetic,
;     t   = ( ((S1 - S2) >> 2) + S3 - S2 ) >> 2
;     out = sat_u8( (S3 + t + 32) >> 6 )
;   which, apart from the truncation in the two intermediate shifts, equals
;   (S1 - 5*S2 + 20*S3 + 512) >> 10.
;   NOTE(review): the staged shifts presumably keep the intermediates inside
;   16 bits -- confirm the expected input range.
;   %1 is overwritten; %2..%6 are only read. %7 ends as S2 and %8 holds the
;   packed result. 8 bytes are stored to %9.
%macro FILTER_VER 9
    paddw  %1, %6
    movdqa %7, %2
    movdqa %8, %3


    paddw %7, %5
    paddw %8, %4

    psubw  %1, %7
    psraw   %1, 2
    paddw  %1, %8
    psubw  %1, %7
    psraw   %1, 2
    paddw  %8, %1
    paddw  %8, [pic(h264_mc_hc_32)]
    psraw   %8, 6
    packuswb %8, %8
    movq %9, %8
%endmacro
;***********************************************************************
;void McHorVer22Width8VerLastAlign_sse2(
;                                           const uint8_t *pTap,
;                                           int32_t iTapStride,
;                                           uint8_t * pDst,
;                                           int32_t iDstStride,
;                                           int32_t iWidth,
;                                           int32_t iHeight);
;
; Vertical pass of the 2-D half-pel filter (FILTER_VER) over rows of
; 16-bit intermediates, in strips of 8 columns: r4 = iWidth >> 3
; strips, r5 = rows left in the current strip. One strip is 16 bytes of
; pTap and 8 bytes of pDst per row. All tap loads use movdqa, so every
; accessed pTap row address must be 16-byte aligned.
; Per strip the first output row is produced on its own, the six-row
; window is shifted down by one register, and the remaining rows go
; through an 8-step loop with rotating register roles.
; NOTE(review): strips restart from the ORIGINAL arguments plus a fixed
; 16 / 8, so this looks correct only for iWidth >> 3 <= 2.
;***********************************************************************

WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    ; non-32-bit builds keep the original pTap / pDst / iHeight in r12-r14
%ifndef X86_32
    push r12
    push r13
    push r14
    mov  r12, r0
    mov  r13, r2
    mov  r14, r5
%endif

    shr r4, 3

.width_loop:
    ; prime the window: tap rows 0..5 -> xmm0..xmm5
    movdqa xmm0, [r0]
    movdqa xmm1, [r0+r1]
    lea r0, [r0+2*r1]
    movdqa xmm2, [r0]
    movdqa xmm3, [r0+r1]
    lea r0, [r0+2*r1]
    movdqa xmm4, [r0]
    movdqa xmm5, [r0+r1]

    ; first output row of the strip, then slide the window down one row
    FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    lea r0, [r0+2*r1]
    movdqa xmm6, [r0]

    movdqa xmm0, xmm1
    movdqa xmm1, xmm2
    movdqa xmm2, xmm3
    movdqa xmm3, xmm4
    movdqa xmm4, xmm5
    movdqa xmm5, xmm6

    add r2, r3
    ; step r0 back one row so [r0+r1] is the row now held in xmm5
    sub r0, r1

.start:
    FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqa xmm6, [r0]
    FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqa xmm7, [r0+r1]
    FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqa xmm0, [r0]
    FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqa xmm1, [r0+r1]
    FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqa xmm2, [r0]
    FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqa xmm3, [r0+r1]
    FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqa xmm4, [r0]
    FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqa xmm5, [r0+r1]
    jmp near .start

    ; strip finished: leave, or restart 8 columns to the right
    ; (16 bytes of taps, 8 bytes of pixels) with the full row count
.x_loop_dec:
    dec r4
    jz near .exit
%ifdef X86_32
    mov r0, arg1
    mov r2, arg3
    mov r5, arg6
%else
    mov r0, r12
    mov r2, r13
    mov r5, r14
%endif
    add r0, 16
    add r2, 8
    jmp .width_loop

.exit:
%ifndef X86_32
    pop r14
    pop r13
    pop r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret

;***********************************************************************
;void McHorVer22Width8VerLastUnAlign_sse2(
;                                           const uint8_t *pTap,
;                                           int32_t iTapStride,
;                                           uint8_t * pDst,
;                                           int32_t iDstStride,
;                                           int32_t iWidth,
;                                           int32_t iHeight);
;
; Same routine as McHorVer22Width8VerLastAlign_sse2, except that every
; load from pTap uses movdqu, so the tap rows need no particular
; alignment. Vertical FILTER_VER pass over rows of 16-bit intermediates
; in strips of 8 columns: r4 = iWidth >> 3 strips, r5 = rows left in the
; current strip.
; NOTE(review): strips restart from the ORIGINAL arguments plus a fixed
; 16 / 8, so this looks correct only for iWidth >> 3 <= 2.
;***********************************************************************

WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    ; non-32-bit builds keep the original pTap / pDst / iHeight in r12-r14
%ifndef X86_32
    push r12
    push r13
    push r14
    mov  r12, r0
    mov  r13, r2
    mov  r14, r5
%endif
    shr r4, 3

.width_loop:
    ; prime the window: tap rows 0..5 -> xmm0..xmm5
    movdqu xmm0, [r0]
    movdqu xmm1, [r0+r1]
    lea r0, [r0+2*r1]
    movdqu xmm2, [r0]
    movdqu xmm3, [r0+r1]
    lea r0, [r0+2*r1]
    movdqu xmm4, [r0]
    movdqu xmm5, [r0+r1]

    ; first output row of the strip, then slide the window down one row
    FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    lea r0, [r0+2*r1]
    movdqu xmm6, [r0]

    movdqa xmm0, xmm1
    movdqa xmm1, xmm2
    movdqa xmm2, xmm3
    movdqa xmm3, xmm4
    movdqa xmm4, xmm5
    movdqa xmm5, xmm6

    add r2, r3
    ; step r0 back one row so [r0+r1] is the row now held in xmm5
    sub r0, r1

.start:
    FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm6, [r0]
    FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm7, [r0+r1]
    FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm0, [r0]
    FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm1, [r0+r1]
    FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm2, [r0]
    FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm3, [r0+r1]
    FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movdqu xmm4, [r0]
    FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movdqu xmm5, [r0+r1]
    jmp near .start

    ; strip finished: leave, or restart 8 columns to the right
    ; (16 bytes of taps, 8 bytes of pixels) with the full row count
.x_loop_dec:
    dec r4
    jz near .exit
%ifdef X86_32
    mov r0, arg1
    mov r2, arg3
    mov r5, arg6
%else
    mov r0, r12
    mov r2, r13
    mov r5, r14
%endif
    add r0, 16
    add r2, 8
    jmp .width_loop

.exit:
%ifndef X86_32
    pop r14
    pop r13
    pop r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret


;***********************************************************************
;void McHorVer22Width5HorFirst_sse2
;                           (const uint8_t *pSrc,
;                           int32_t iSrcStride,
;                           uint8_t * pTap,
;                           int32_t iTapStride,
;                           int32_t iWidth,int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer22Width5HorFirst_sse2
    ; Horizontal first pass for the 5-wide case: every source row is turned
    ; into a row of raw 16-bit 6-tap sums
    ;     s[k] - 5*s[k+1] + 20*s[k+2] + 20*s[k+3] - 5*s[k+4] + s[k+5]
    ; with no rounding and no shift.
    ; Register roles (argument order of the prototype above):
    ;   r0 = pSrc, r1 = iSrcStride, r2 = pTap, r3 = iTapStride,
    ;   r4 = iWidth (sign-extended but otherwise unused), r5 = iHeight
    ; Per row the code reads source bytes [r0, r0+14) and writes nine words,
    ; i.e. bytes [r2, r2+18). The loop runs r5 times and is bottom-tested, so
    ; iHeight must be non-zero.
    %assign  push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    pxor            xmm7, xmm7              ; zero register for byte -> word widening
    sub             r0, r1                  ; start two rows above pSrc; the caller
    sub             r0, r1                  ; presumably adds the extra rows to iHeight

.yloop_width_5:
    ; Widen the six shifted views of the row, paired outer-to-inner:
    ; (xmm0, xmm1) = offsets 0/5, (xmm2, xmm3) = 1/4, (xmm4, xmm5) = 2/3.
    movq            xmm0, [r0]
    punpcklbw       xmm0, xmm7
    movq            xmm1, [r0+5]
    punpcklbw       xmm1, xmm7
    movq            xmm2, [r0+1]
    punpcklbw       xmm2, xmm7
    movq            xmm3, [r0+4]
    punpcklbw       xmm3, xmm7
    movq            xmm4, [r0+2]
    punpcklbw       xmm4, xmm7
    movq            xmm5, [r0+3]
    punpcklbw       xmm5, xmm7

    ; Windows starting at offset 0. With t = 4*(s2+s3) - (s1+s4) the result is
    ; (s0+s5) + t + 4*t. xmm7 is borrowed as scratch here.
    movdqa          xmm7, xmm2
    paddw           xmm7, xmm3
    movdqa          xmm6, xmm4
    paddw           xmm6, xmm5
    psllw           xmm6, 2
    psubw           xmm6, xmm7
    paddw           xmm0, xmm1
    paddw           xmm0, xmm6
    psllw           xmm6, 2
    paddw           xmm0, xmm6
    movd            [r2], xmm0              ; keep words 0 and 1 only

    pxor            xmm7, xmm7              ; restore the zero register
    movq            xmm0, [r0+6]
    punpcklbw       xmm0, xmm7

    ; Windows starting at offset 1, reusing the already widened views:
    ; outer pair = 1/6, middle pair = 2/5, inner pair = 3/4.
    paddw           xmm4, xmm1
    paddw           xmm5, xmm3
    psllw           xmm5, 2
    psubw           xmm5, xmm4
    paddw           xmm2, xmm0
    paddw           xmm2, xmm5
    psllw           xmm5, 2
    paddw           xmm2, xmm5
    movq            [r2+2], xmm2            ; words 1..4 (word 1 is rewritten)
    movhps          [r2+2+8], xmm2          ; words 5..8

    add             r0, r1
    add             r2, r3
    dec             r5
    jnz             .yloop_width_5
    POP_XMM
    LOAD_6_PARA_POP
    ret


; FILTER_VER_4 row0, row1, row2, row3, row4, row5, tmpA, tmpB, dst
;   %1..%6 : six consecutive rows of 16-bit taps (a, b, c, d, e, f)
;   %7, %8 : scratch registers
;   %9     : 4-byte destination (written with movd)
; Evaluates the 6-tap kernel (a+f) - 5*(b+e) + 20*(c+d) in nested form so the
; intermediates stay within 16 bits (each ">> 2" truncates):
;     t   = (((a+f) - (b+e)) >> 2) + (c+d) - (b+e)
;     out = ((t >> 2) + (c+d) + h264_mc_hc_32) >> 6
; h264_mc_hc_32 is defined outside this chunk (presumably the rounding term,
; words of 32). The result is saturated to bytes and the low four bytes are
; stored; the upper lanes are computed but discarded.
; Clobbers %1, %7 and %8; %2..%6 are left untouched.
%macro FILTER_VER_4 9
    paddw           %1, %6                  ; a + f
    movdqa          %7, %2
    movdqa          %8, %3

    paddw           %7, %5                  ; b + e
    paddw           %8, %4                  ; c + d

    psubw           %1, %7
    psraw           %1, 2
    paddw           %1, %8
    psubw           %1, %7
    psraw           %1, 2
    paddw           %8, %1
    paddw           %8, [pic(h264_mc_hc_32)]
    psraw           %8, 6
    packuswb        %8, %8                  ; saturate words to unsigned bytes
    movd            %9, %8
%endmacro


;***********************************************************************
;void McHorVer22Width4VerLastAlign_sse2(
;                                           const uint8_t *pTap,
;                                           int32_t iTapStride,
;                                           uint8_t * pDst,
;                                           int32_t iDstStride,
;                                           int32_t iWidth,
;                                           int32_t iHeight);
;***********************************************************************

WELS_EXTERN McHorVer22Width4VerLastAlign_sse2
    ; Vertical pass over a buffer of 16-bit taps, 4 output pixels (8 bytes of
    ; taps) per column.
    ; Register roles, per the prototype above and the reloads in .x_loop_dec:
    ;   r0 = pTap (walks down the rows)   r1 = iTapStride (added as a byte offset)
    ;   r2 = pDst                         r3 = iDstStride
    ;   r4 = iWidth, turned into a column count (iWidth >> 2)
    ;   r5 = iHeight, counts remaining output rows
    ; The first two output rows are produced without an exit test, so iHeight
    ; must be at least 2; iWidth must be at least 4.
    ;
    ; Changes relative to the previous implementation:
    ;  * Rows are loaded with movq (8 bytes) instead of movdqa (16 bytes).
    ;    FILTER_VER_4 is lane-wise and stores only the low four results, so the
    ;    output is unchanged, but columns advance by 8 bytes and every second
    ;    column is therefore not 16-byte aligned: movdqa faulted as soon as
    ;    iWidth >= 8. The narrower load also stops reading 8 bytes beyond the
    ;    taps that are actually used. Aligned input is still accepted; it is
    ;    simply no longer required.
    ;  * The saved column base is now advanced after each column. Before, it
    ;    was reloaded from the original argument each time, so a third column
    ;    would have repeated the second one.
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
%ifndef X86_32
    ; 64-bit builds keep the column-start values in r12-r14; 32-bit builds
    ; use the argument slots on the stack instead (see .x_loop_dec).
    push r12
    push r13
    push r14
    mov  r12, r0
    mov  r13, r2
    mov  r14, r5
%endif

    shr r4, 2                   ; r4 = number of 4-pixel columns

.width_loop:
    ; Prime the pipeline with rows 0..5 of this column.
    movq xmm0, [r0]
    movq xmm1, [r0+r1]
    lea r0, [r0+2*r1]
    movq xmm2, [r0]
    movq xmm3, [r0+r1]
    lea r0, [r0+2*r1]
    movq xmm4, [r0]
    movq xmm5, [r0+r1]

    ; Output row 0 (xmm0 is consumed, xmm6/xmm7 are scratch).
    FILTER_VER_4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    lea r0, [r0+2*r1]
    movq xmm6, [r0]             ; row 6

    ; Slide the window down by one row so that xmm0..xmm5 = rows 1..6.
    movdqa xmm0, xmm1
    movdqa xmm1, xmm2
    movdqa xmm2, xmm3
    movdqa xmm3, xmm4
    movdqa xmm4, xmm5
    movdqa xmm5, xmm6

    add r2, r3
    sub r0, r1                  ; r0 back on row 5 so the loop below can step by two rows

.start:
    ; Eight-way unrolled loop: each step rotates which register is "oldest"
    ; and loads the next row into a register the previous step clobbered.
    ; r0 and r2 advance by two rows on alternating steps; the odd steps
    ; address [r0+r1] / [r2+r3].
    FILTER_VER_4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movq xmm6, [r0]
    FILTER_VER_4 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movq xmm7, [r0+r1]
    FILTER_VER_4 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movq xmm0, [r0]
    FILTER_VER_4 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movq xmm1, [r0+r1]
    FILTER_VER_4 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movq xmm2, [r0]
    FILTER_VER_4 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movq xmm3, [r0+r1]
    FILTER_VER_4 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movq xmm4, [r0]
    FILTER_VER_4 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movq xmm5, [r0+r1]          ; registers are back in the layout .start expects
    jmp near .start

.x_loop_dec:
    ; Column finished: advance the saved bases by one column (8 tap bytes,
    ; 4 destination bytes) and restart from the top row with the full row count.
    dec r4
    jz near .exit
%ifdef X86_32
    add dword arg1, 8           ; the argument slots double as the saved bases
    add dword arg3, 4
    mov r0, arg1
    mov r2, arg3
    mov r5, arg6
%else
    add r12, 8
    add r13, 4
    mov r0, r12
    mov r2, r13
    mov r5, r14
%endif
    jmp .width_loop

.exit:
    ; Unwind in reverse order of the prologue.
%ifndef X86_32
    pop r14
    pop r13
    pop r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret


;***********************************************************************
;void McHorVer22Width4VerLastUnAlign_sse2(
;                                           const uint8_t *pTap,
;                                           int32_t iTapStride,
;                                           uint8_t * pDst,
;                                           int32_t iDstStride,
;                                           int32_t iWidth,
;                                           int32_t iHeight);
;***********************************************************************

WELS_EXTERN McHorVer22Width4VerLastUnAlign_sse2
    ; Vertical pass over a buffer of 16-bit taps, 4 output pixels (8 bytes of
    ; taps) per column; pTap needs no particular alignment.
    ; Register roles, per the prototype above and the reloads in .x_loop_dec:
    ;   r0 = pTap (walks down the rows)   r1 = iTapStride (added as a byte offset)
    ;   r2 = pDst                         r3 = iDstStride
    ;   r4 = iWidth, turned into a column count (iWidth >> 2)
    ;   r5 = iHeight, counts remaining output rows
    ; The first two output rows are produced without an exit test, so iHeight
    ; must be at least 2; iWidth must be at least 4.
    ;
    ; Changes relative to the previous implementation:
    ;  * The saved column base is now advanced after each column. Before, it
    ;    was reloaded from the original argument each time, so a third column
    ;    would have repeated the second one.
    ;  * Rows are loaded with movq (8 bytes) instead of movdqu (16 bytes).
    ;    FILTER_VER_4 is lane-wise and stores only the low four results, so the
    ;    output is unchanged, and the code no longer reads 8 bytes beyond the
    ;    taps it actually uses.
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
%ifndef X86_32
    ; 64-bit builds keep the column-start values in r12-r14; 32-bit builds
    ; use the argument slots on the stack instead (see .x_loop_dec).
    push r12
    push r13
    push r14
    mov  r12, r0
    mov  r13, r2
    mov  r14, r5
%endif
    shr r4, 2                   ; r4 = number of 4-pixel columns

.width_loop:
    ; Prime the pipeline with rows 0..5 of this column.
    movq xmm0, [r0]
    movq xmm1, [r0+r1]
    lea r0, [r0+2*r1]
    movq xmm2, [r0]
    movq xmm3, [r0+r1]
    lea r0, [r0+2*r1]
    movq xmm4, [r0]
    movq xmm5, [r0+r1]

    ; Output row 0 (xmm0 is consumed, xmm6/xmm7 are scratch).
    FILTER_VER_4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    lea r0, [r0+2*r1]
    movq xmm6, [r0]             ; row 6

    ; Slide the window down by one row so that xmm0..xmm5 = rows 1..6.
    movdqa xmm0, xmm1
    movdqa xmm1, xmm2
    movdqa xmm2, xmm3
    movdqa xmm3, xmm4
    movdqa xmm4, xmm5
    movdqa xmm5, xmm6

    add r2, r3
    sub r0, r1                  ; r0 back on row 5 so the loop below can step by two rows

.start:
    ; Eight-way unrolled loop: each step rotates which register is "oldest"
    ; and loads the next row into a register the previous step clobbered.
    ; r0 and r2 advance by two rows on alternating steps; the odd steps
    ; address [r0+r1] / [r2+r3].
    FILTER_VER_4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movq xmm6, [r0]
    FILTER_VER_4 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movq xmm7, [r0+r1]
    FILTER_VER_4 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movq xmm0, [r0]
    FILTER_VER_4 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movq xmm1, [r0+r1]
    FILTER_VER_4 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movq xmm2, [r0]
    FILTER_VER_4 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movq xmm3, [r0+r1]
    FILTER_VER_4 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
    dec r5
    jz near .x_loop_dec

    lea r0, [r0+2*r1]
    movq xmm4, [r0]
    FILTER_VER_4 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
    dec r5
    jz near .x_loop_dec

    lea r2, [r2+2*r3]
    movq xmm5, [r0+r1]          ; registers are back in the layout .start expects
    jmp near .start

.x_loop_dec:
    ; Column finished: advance the saved bases by one column (8 tap bytes,
    ; 4 destination bytes) and restart from the top row with the full row count.
    dec r4
    jz near .exit
%ifdef X86_32
    add dword arg1, 8           ; the argument slots double as the saved bases
    add dword arg3, 4
    mov r0, arg1
    mov r2, arg3
    mov r5, arg6
%else
    add r12, 8
    add r13, 4
    mov r0, r12
    mov r2, r13
    mov r5, r14
%endif
    jmp .width_loop

.exit:
    ; Unwind in reverse order of the prologue.
%ifndef X86_32
    pop r14
    pop r13
    pop r12
%endif
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret


; SSSE3_FilterVertical_8px px_ab, px_cd, px_ef, coef_ab, coef_cd, coef_ef, tmp
;   %1      : byte-interleaved rows (a,b); replaced by the filtered words
;   %2, %3  : byte-interleaved rows (c,d) and (e,f); read only
;   %4..%6  : pmaddubsw multiplier operands for the three pairs (register or memory)
;   %7      : scratch register
; Sums the three pmaddubsw products with the constant h264_w0x10_1 (defined
; outside this chunk; presumably the rounding term for the shift) and shifts
; the words right arithmetically by 5. The result is left as words; callers
; pack it to bytes themselves.
%macro SSSE3_FilterVertical_8px 7
    pmaddubsw       %1, %4
    paddw           %1, [pic(h264_w0x10_1)]     ; seed the sum with the rounding term
    movdqa          %7, %2
    pmaddubsw       %7, %5
    paddw           %1, %7
    movdqa          %7, %3
    pmaddubsw       %7, %6
    paddw           %1, %7
    psraw           %1, 5
%endmacro

; SSSE3_FilterVertical2_8px px_a, px_f, px_bc, px_de, coef_bc, coef_de, tmp0, tmp1
;   %1      : row a as bytes; replaced by the filtered words
;   %2      : row f as bytes; read only
;   %3, %4  : byte-interleaved rows (b,c) and (d,e); read only
;   %5, %6  : pmaddubsw multiplier operands for the two pairs (register or memory)
;   %7, %8  : scratch registers
; Variant of SSSE3_FilterVertical_8px for the case where the outer rows a and
; f are not available as an interleaved pair: they are widened to words and
; added directly, the inner pairs go through pmaddubsw. The sum gets the
; constant h264_w0x10_1 (defined outside this chunk) and is shifted right by 5.
%macro SSSE3_FilterVertical2_8px 8
    movdqa          %8, %2
    pxor            %7, %7
    punpcklbw       %1, %7                      ; a -> words
    punpcklbw       %8, %7                      ; f -> words
    paddw           %1, %8
    paddw           %1, [pic(h264_w0x10_1)]     ; fold the rounding term in early
    movdqa          %7, %3
    pmaddubsw       %7, %5
    paddw           %1, %7
    movdqa          %7, %4
    pmaddubsw       %7, %6
    paddw           %1, %7
    psraw           %1, 5
%endmacro

; SSSE3_FilterHorizontalbw_8px pixels, shuf_inner, shuf_outer, coef_outer, tmp0, tmp1
;   %1 : 16 source bytes; replaced by eight unrounded 16-bit sums
;   %2 : pshufb mask shufb_32435465768798A9
;   %3 : pshufb mask shufb_011267784556ABBC
;   %4 : pmaddubsw multipliers maddubsw_p1m5_p1m5_m5p1_m5p1
;   %5, %6 : scratch registers
; The two shuffles arrange neighbouring source bytes into pairs. The %2 view is
; multiplied by db20_128, while the %3 view and a dword-swapped copy of the %2
; view are multiplied by %4. The three word vectors are then added. No rounding
; or shift is applied (the "bw" variants keep full 16-bit precision).
%macro SSSE3_FilterHorizontalbw_8px 6
    movdqa          %5, %1
    pshufb          %5, %3
    pshufb          %1, %2
    pshufd          %6, %1, 10110001b           ; swap the dwords within each qword
    pmaddubsw       %5, %4
    pmaddubsw       %6, %4
    pmaddubsw       %1, [pic(db20_128)]
    paddw           %1, %5
    paddw           %1, %6
%endmacro

; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
; Rounded form of SSSE3_FilterHorizontalbw_8px: runs the unrounded filter,
; adds the constant h264_w0x10_1 (defined outside this chunk; presumably the
; rounding term) and shifts the words right arithmetically by 5.
; %1 is replaced by the eight results as words; %5 and %6 are clobbered.
%macro SSSE3_FilterHorizontal_8px 6
    SSSE3_FilterHorizontalbw_8px %1, %2, %3, %4, %5, %6
    paddw           %1, [pic(h264_w0x10_1)]
    psraw           %1, 5
%endmacro

; SSSE3_FilterHorizontalbw_2x4px px0, px1, shuf_inner, shuf_outer, coef_outer, tmp0, tmp1
;   %1 : source bytes of the first row; replaced by the unrounded sums, the
;        first row in the low qword and the second row in the high qword
;   %2 : source bytes of the second row; overwritten (shuffled)
;   %3 : pshufb mask shufb_32435465768798A9
;   %4 : pshufb mask shufb_011267784556ABBC
;   %5 : pmaddubsw multipliers maddubsw_p1m5_p1m5_m5p1_m5p1
;   %6, %7 : scratch registers
; Same arithmetic as SSSE3_FilterHorizontalbw_8px, but the low halves of two
; rows are packed side by side so that one pass yields 4 pixels for each row.
%macro SSSE3_FilterHorizontalbw_2x4px 7
    movdqa          %6, %1
    movdqa          %7, %2
    ; %4 view of both rows, merged into %6
    pshufb          %6, %4
    pshufb          %7, %4
    punpcklqdq      %6, %7
    ; %3 view of both rows, merged into %1
    pshufb          %1, %3
    pshufb          %2, %3
    punpcklqdq      %1, %2
    pshufd          %7, %1, 10110001b           ; swap the dwords within each qword
    pmaddubsw       %6, %5
    pmaddubsw       %7, %5
    pmaddubsw       %1, [pic(db20_128)]
    paddw           %1, %6
    paddw           %1, %7
%endmacro

; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
; Rounded form of SSSE3_FilterHorizontalbw_2x4px: runs the unrounded two-row
; filter, adds the constant h264_w0x10_1 (defined outside this chunk;
; presumably the rounding term) and shifts the words right arithmetically by 5.
; %1 receives the results as words; %2, %6 and %7 are clobbered.
%macro SSSE3_FilterHorizontal_2x4px 7
    SSSE3_FilterHorizontalbw_2x4px %1, %2, %3, %4, %5, %6, %7
    paddw           %1, [pic(h264_w0x10_1)]
    psraw           %1, 5
%endmacro

; pixels=%1 -32768>>scale=%2 tmp=%3
; Horizontal filter for a couple of pixels packed in one register.
; Steps: byte pairs of %1 are multiplied and summed to words against the
; constant maddubsw_m2p10_m40m40_p10m2_p0p0_128 (defined outside this chunk);
; the word pairs are then multiplied by %2 and summed to dwords; finally each
; dword is added to its neighbour within the same qword.
; %1 is replaced by the dword results (both dwords of a qword end up holding
; the same sum); %3 is clobbered.
%macro SSSE3_FilterHorizontalbw_2px 3
    pmaddubsw       %1, [pic(maddubsw_m2p10_m40m40_p10m2_p0p0_128)]
    pmaddwd         %1, %2
    pshufd          %3, %1, 10110001b       ; swap the dwords within each qword
    paddd           %1, %3
%endmacro

; pixels=%1 tmp=%2
; Wrapper around SSSE3_FilterHorizontalbw_2px that passes dwm1024_128 as the
; scale operand and then adds dd32768_128 to every dword. Both constants are
; defined outside this chunk; the visible callers pack the result to bytes
; with packuswb afterwards.
; %1 is replaced by the dword results; %2 is clobbered.
%macro SSSE3_FilterHorizontal_2px 2
    SSSE3_FilterHorizontalbw_2px %1, [pic(dwm1024_128)], %2
    paddd           %1, [pic(dd32768_128)]
%endmacro

; px0=%1 px1=%2 px2=%3 px3=%4 px4=%5 px5=%6 tmp=%7
; Vertical 6-tap filter over six rows of 16-bit intermediate values
; (a..f = %1..%6). The kernel (a+f) - 5*(b+e) + 20*(c+d) is evaluated in
; nested form so the intermediates stay within 16 bits; each ">> 2" truncates,
; so the statement order must not be changed:
;     t   = ((((a+f) - (b+e)) >> 2) - (b+e) + (c+d)) >> 2
;     out = (t + (c+d) + h264_mc_hc_32) >> 6
; h264_mc_hc_32 is defined outside this chunk (presumably the rounding term).
; %1 is replaced by the result as words (not yet packed to bytes);
; %7 is clobbered; %2..%6 are left untouched.
%macro SSE2_FilterVerticalw_8px 7
    paddw           %1, %6                  ; a + f
    movdqa          %7, %2
    paddw           %7, %5                  ; b + e
    psubw           %1, %7
    psraw           %1, 2
    psubw           %1, %7
    movdqa          %7, %3
    paddw           %7, %4                  ; c + d
    paddw           %1, %7
    psraw           %1, 2
    paddw           %7, [pic(h264_mc_hc_32)]
    paddw           %1, %7
    psraw           %1, 6
%endmacro

;***********************************************************************
; void McHorVer02_ssse3(const uint8_t *pSrc,
;                       int32_t iSrcStride,
;                       uint8_t *pDst,
;                       int32_t iDstStride,
;                       int32_t iWidth,
;                       int32_t iHeight)
;***********************************************************************

WELS_EXTERN McHorVer02_ssse3
; Vertical 6-tap interpolation straight from 8-bit source to 8-bit destination.
; Two code paths:
;   * iWidth <= 4 : fully unrolled; the branches below produce 4, 5, 8 or 9
;                   output rows depending on iHeight.
;   * iWidth  > 4 : processed in 8-pixel columns (.xloop), each column in a
;                   row loop (.yloop) that is unrolled to 8 rows per pass.
; The source pointer is moved up two rows first, so rows pSrc-2*stride and
; below are read.
%define p_src         r0
%define i_srcstride   r1
%define p_dst         r2
%define i_dststride   r3
%ifdef X86_32_PICASM
; r4 is handed to INIT_X86_32_PIC_NOPRESERVE below in this configuration, so
; the width is accessed in its argument slot on the stack instead.
%define i_width       dword arg5
%else
%define i_width       r4
%endif
%define i_height      r5
%define i_srcstride3  r6
    %assign push_num 0
%ifdef X86_32
    push            r6
    %assign push_num 1
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    INIT_X86_32_PIC_NOPRESERVE r4
    sub             p_src, i_srcstride
    sub             p_src, i_srcstride
    lea             i_srcstride3, [3 * i_srcstride]
    ; Remember the push count at this point: the width-4 path is assembled
    ; first and the width-8/16 path must restart its bookkeeping from here.
    %assign push_num_begin push_num
    cmp             i_width, 4
    jg              .width8or16

    ; ---- width 4 ----
    ; Two output rows are computed at once: each register holds the byte-
    ; interleaved row pair for one output row in its low qword and the pair
    ; for the next output row in its high qword.
    ; xmm0 = (r0,r1 | r1,r2), xmm1 = (r2,r3 | r3,r4), xmm2 = (r4,r5 | r5,r6),
    ; where rN is the N-th source row counted from the adjusted p_src.
    movd            xmm0, [p_src]
    movd            xmm4, [p_src + i_srcstride]
    punpcklbw       xmm0, xmm4
    movd            xmm1, [p_src + 2 * i_srcstride]
    punpcklbw       xmm4, xmm1
    punpcklqdq      xmm0, xmm4
    movd            xmm4, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    punpcklbw       xmm1, xmm4
    movd            xmm2, [p_src]
    punpcklbw       xmm4, xmm2
    punpcklqdq      xmm1, xmm4
    movd            xmm4, [p_src + i_srcstride]
    lea             p_src, [p_src + 2 * i_srcstride]
    punpcklbw       xmm2, xmm4
    movd            xmm3, [p_src]
    punpcklbw       xmm4, xmm3
    punpcklqdq      xmm2, xmm4
    movdqa          xmm5, [pic(db20_128)]       ; middle-pair multipliers, kept in a register
    ; output rows 0 and 1
    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
    packuswb        xmm0, xmm0
    movd            [p_dst], xmm0
    psrlq           xmm0, 32
    movd            [p_dst + i_dststride], xmm0
    lea             p_dst, [p_dst + 2 * i_dststride]
    ; xmm3 = (r6,r7 | r7,r8); xmm0 keeps r8 for the next pair
    movd            xmm4, [p_src + i_srcstride]
    punpcklbw       xmm3, xmm4
    movd            xmm0, [p_src + 2 * i_srcstride]
    punpcklbw       xmm4, xmm0
    punpcklqdq      xmm3, xmm4
    ; output rows 2 and 3
    SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
    packuswb        xmm1, xmm1
    movd            [p_dst], xmm1
    psrlq           xmm1, 32
    movd            [p_dst + i_dststride], xmm1
    cmp             i_height, 5
    jl              .width4_height_le5_done     ; fewer than 5 rows requested: done after 4
    lea             p_dst, [p_dst + 2 * i_dststride]
    movd            xmm4, [p_src + i_srcstride3]
    punpcklbw       xmm0, xmm4
    ; The flags still come from "cmp i_height, 5": lea, movd and punpcklbw do
    ; not modify them.
    jg              .width4_height_ge8
    ; iHeight == 5: one more row (only the low half of the filter output is stored)
    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
    packuswb        xmm2, xmm2
    movd            [p_dst], xmm2
.width4_height_le5_done:
    DEINIT_X86_32_PIC_KEEPDEF
    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop             r6
%endif
    ret
.width4_height_ge8:
    ; iHeight > 5: complete the (r8,r9 | r9,r10) pair and emit rows 4..7.
    lea             p_src, [p_src + 4 * i_srcstride]
    movd            xmm1, [p_src]
    punpcklbw       xmm4, xmm1
    punpcklqdq      xmm0, xmm4
    SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
    packuswb        xmm2, xmm2
    movd            [p_dst], xmm2
    psrlq           xmm2, 32
    movd            [p_dst + i_dststride], xmm2
    lea             p_dst, [p_dst + 2 * i_dststride]
    movd            xmm4, [p_src + i_srcstride]
    punpcklbw       xmm1, xmm4
    movd            xmm2, [p_src + 2 * i_srcstride]
    punpcklbw       xmm4, xmm2
    punpcklqdq      xmm1, xmm4
    SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
    packuswb        xmm3, xmm3
    movd            [p_dst], xmm3
    psrlq           xmm3, 32
    movd            [p_dst + i_dststride], xmm3
    cmp             i_height, 9
    jl              .width4_height_ge8_done     ; fewer than 9 rows requested: done after 8
    ; ninth row (only the low half of the filter output is stored)
    lea             p_dst, [p_dst + 2 * i_dststride]
    movd            xmm4, [p_src + i_srcstride3]
    punpcklbw       xmm2, xmm4
    SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
    packuswb        xmm0, xmm0
    movd            [p_dst], xmm0
.width4_height_ge8_done:
    DEINIT_X86_32_PIC_KEEPDEF
    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop             r6
%endif
    ret

.width8or16:
    ; ---- width 8 or 16 ----
    ; The row counter is kept as iHeight - 1. A copy is pushed so it can be
    ; reloaded for the next column. From here on i_ycnt names the register and
    ; i_height names the stacked copy (r7 is presumably the stack-pointer alias
    ; from asm_inc.asm).
    %assign push_num push_num_begin
    sub             i_height, 1
    push            i_height
    %assign push_num push_num + 1
%xdefine i_ycnt i_height
%define i_height [r7]
.xloop:
    push            p_src
    push            p_dst
    %assign push_num push_num + 2
    ; Row loop invariant at .yloop: xmm1 = oldest row (bytes), xmm2/xmm4 = the
    ; next two interleaved row pairs, xmm3/xmm5 = second row of each of those
    ; pairs (bytes). If the counter is even (odd iHeight) one output row is
    ; emitted up front so that the unrolled loop can work in steps of two.
    test            i_ycnt, 1
    jnz             .yloop_begin_even
    movq            xmm0, [p_src]
    movq            xmm1, [p_src + i_srcstride]
    punpcklbw       xmm0, xmm1
    movq            xmm2, [p_src + 2 * i_srcstride]
    movq            xmm3, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    punpcklbw       xmm2, xmm3
    movq            xmm4, [p_src]
    movq            xmm5, [p_src + i_srcstride]
    lea             p_src, [p_src + 2 * i_srcstride]
    punpcklbw       xmm4, xmm5
    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm7
    packuswb        xmm0, xmm0
    movlps          [p_dst], xmm0
    add             p_dst, i_dststride
    jmp             .yloop
.yloop_begin_even:
    ; Odd counter (even iHeight): only load the first five rows.
    movq            xmm1, [p_src]
    movq            xmm2, [p_src + i_srcstride]
    movq            xmm3, [p_src + 2 * i_srcstride]
    add             p_src, i_srcstride3
    punpcklbw       xmm2, xmm3
    movq            xmm4, [p_src]
    movq            xmm5, [p_src + i_srcstride]
    lea             p_src, [p_src + 2 * i_srcstride]
    punpcklbw       xmm4, xmm5
.yloop:
    ; Each group below emits two output rows: the first via the "2" variant
    ; (outer rows as plain bytes, inner rows as pairs), the second via the
    ; pair-only variant after the new row has been interleaved with the next.
    ; The register roles rotate from group to group.
    movq            xmm6, [p_src]
    SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm0, xmm7
    movq            xmm7, [p_src + i_srcstride]
    punpcklbw       xmm6, xmm7
    SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm0
    packuswb        xmm1, xmm2
    movlps          [p_dst], xmm1
    movhps          [p_dst + i_dststride], xmm1
    lea             p_dst, [p_dst + 2 * i_dststride]
    movq            xmm0, [p_src + 2 * i_srcstride]
    SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm2, xmm1
    movq            xmm1, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    punpcklbw       xmm0, xmm1
    SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm2
    packuswb        xmm3, xmm4
    movlps          [p_dst], xmm3
    movhps          [p_dst + i_dststride], xmm3
    ; Four rows done in this pass; leave if no more than four were pending.
    cmp             i_ycnt, 4
    jle             .yloop_exit
    lea             p_dst, [p_dst + 2 * i_dststride]
    movq            xmm2, [p_src]
    SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm4, xmm3
    movq            xmm3, [p_src + i_srcstride]
    punpcklbw       xmm2, xmm3
    SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm4
    packuswb        xmm5, xmm6
    movlps          [p_dst], xmm5
    movhps          [p_dst + i_dststride], xmm5
    lea             p_dst, [p_dst + 2 * i_dststride]
    movq            xmm4, [p_src + 2 * i_srcstride]
    SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm6, xmm5
    movq            xmm5, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    punpcklbw       xmm4, xmm5
    SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm6
    packuswb        xmm7, xmm0
    movlps          [p_dst], xmm7
    movhps          [p_dst + i_dststride], xmm7
    lea             p_dst, [p_dst + 2 * i_dststride]
    ; Eight rows done; after this the registers match the .yloop invariant again.
    sub             i_ycnt, 8
    jg              .yloop
.yloop_exit:
    ; Restore the column start and step 8 pixels right if more width remains.
    pop             p_dst
    pop             p_src
    %assign push_num push_num - 2
    sub             i_width, 8
    jle             .width8or16_done
    add             p_src, 8
    add             p_dst, 8
    mov             i_ycnt, i_height            ; reload iHeight - 1 from the stack
    jmp             .xloop
.width8or16_done:
    pop             i_ycnt                      ; drop the stacked row counter
    %assign push_num push_num - 1
    DEINIT_X86_32_PIC
    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop             r6
%endif
    ret
%undef p_src
%undef i_srcstride
%undef i_srcstride3
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height
%undef i_ycnt


;*******************************************************************************
; void McHorVer20_ssse3(const uint8_t *pSrc,
;                       int iSrcStride,
;                       uint8_t *pDst,
;                       int iDstStride,
;                       int iWidth,
;                       int iHeight);
;*******************************************************************************

WELS_EXTERN McHorVer20_ssse3
; Horizontal 6-tap interpolation, 8-bit source to 8-bit destination.
; Dispatches on iWidth: == 8 -> .width8_yloop, > 8 -> .width16_yloop,
; anything else -> .width4_yloop. Every load starts 2 bytes left of p_src.
; All loops are bottom-tested, so at least one pass always runs; the width-4
; and width-8 loops produce two rows per pass.
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_dststride  r3
%define i_width      r4
%define i_height     r5
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 7
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    ; Loop-invariant shuffle masks and multipliers for the filter macros.
    movdqa          xmm4, [pic(shufb_32435465768798A9)]
    movdqa          xmm5, [pic(shufb_011267784556ABBC)]
    movdqa          xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    cmp             i_width, 8
    je              .width8_yloop
    jg              .width16_yloop
.width4_yloop:
    ; Two rows per pass, filtered together by the 2x4 macro.
    movdqu          xmm0, [p_src - 2]
    movdqu          xmm1, [p_src + i_srcstride - 2]
    lea             p_src, [p_src + 2 * i_srcstride]
    SSSE3_FilterHorizontal_2x4px xmm0, xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
    packuswb        xmm0, xmm0
    movd            [p_dst], xmm0               ; first row, 4 pixels
    psrlq           xmm0, 32
    movd            [p_dst + i_dststride], xmm0 ; second row, 4 pixels
    lea             p_dst, [p_dst + 2 * i_dststride]
    sub             i_height, 2
    jg              .width4_yloop
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC_KEEPDEF
    ret
.width8_yloop:
    ; Two rows per pass, each filtered separately and packed into one register.
    movdqu          xmm0, [p_src - 2]
    movdqu          xmm1, [p_src + i_srcstride - 2]
    lea             p_src, [p_src + 2 * i_srcstride]
    SSSE3_FilterHorizontal_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
    SSSE3_FilterHorizontal_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
    packuswb        xmm0, xmm1
    movlps          [p_dst], xmm0
    movhps          [p_dst + i_dststride], xmm0
    lea             p_dst, [p_dst + 2 * i_dststride]
    sub             i_height, 2
    jg              .width8_yloop
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC_KEEPDEF
    ret
.width16_yloop:
    ; One row per pass: the left and right 8-pixel halves are filtered from
    ; two overlapping 16-byte loads (offsets -2 and +6).
    movdqu          xmm0, [p_src - 2]
    movdqu          xmm1, [p_src + 6]
    add             p_src, i_srcstride
    SSSE3_FilterHorizontal_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
    SSSE3_FilterHorizontal_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
    packuswb        xmm0, xmm1
    MOVDQ           [p_dst], xmm0               ; MOVDQ: 16-byte move macro defined outside this chunk
    add             p_dst, i_dststride
    sub             i_height, 1
    jg              .width16_yloop
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height


;***********************************************************************
; void McHorVer20Width5Or9Or17_ssse3(const uint8_t *pSrc,
;                                    int32_t iSrcStride,
;                                    uint8_t *pDst,
;                                    int32_t iDstStride,
;                                    int32_t iWidth,
;                                    int32_t iHeight);
;***********************************************************************

WELS_EXTERN McHorVer20Width5Or9Or17_ssse3
; Horizontal 6-tap interpolation for the odd widths 5, 9 and 17.
; Dispatches on iWidth: == 9 -> .width9_yloop, > 9 -> .width17_yloop,
; anything else -> .width5_yloop. The extra (last) pixel of a row is written
; with an overlapping store, so the ORDER of the stores below matters.
; The width-9 and width-17 loops produce two rows per pass; all loops are
; bottom-tested, so at least one pass always runs.
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_dststride  r3
%define i_width      r4
%define i_height     r5
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    ; Loop-invariant shuffle masks and multipliers for the filter macros.
    movdqa          xmm5, [pic(shufb_32435465768798A9)]
    movdqa          xmm6, [pic(shufb_011267784556ABBC)]
    movdqa          xmm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    cmp             i_width, 9
    je              .width9_yloop
    jg              .width17_yloop
.width5_yloop:
    ; One row per pass. Eight pixels are filtered; bytes 0..3 go to p_dst and
    ; bytes 1..4 to p_dst+1, so exactly five destination bytes are touched.
    movdqu          xmm0, [p_src - 2]
    add             p_src, i_srcstride
    SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
    packuswb        xmm0, xmm0
    movdqa          xmm1, xmm0
    psrlq           xmm1, 8                     ; shift the packed pixels down by one byte
    movd            [p_dst], xmm0
    movd            [p_dst + 1], xmm1
    add             p_dst, i_dststride
    sub             i_height, 1
    jg              .width5_yloop
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC_KEEPDEF
    ret
.width9_yloop:
    ; Two rows per pass. xmm3 collects the high qwords of both rows and is run
    ; through the 2-pixel filter to supply the ninth pixel of each row.
    movdqu          xmm0, [p_src - 2]
    movdqu          xmm4, [p_src + i_srcstride - 2]
    lea             p_src, [p_src + 2 * i_srcstride]
    movdqa          xmm3, xmm0
    punpckhqdq      xmm3, xmm4
    SSSE3_FilterHorizontal_2px xmm3, xmm2
    SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
    packuswb        xmm3, xmm0                  ; low qword: tail pixels, high qword: row 0 pixels 0..7
    ; The 4-byte tail store covers +5..+8; the 8-byte store that follows
    ; rewrites +5..+7, leaving only the byte at +8 from the tail.
    movd            [p_dst + 5], xmm3
    movhps          [p_dst], xmm3
    add             p_dst, i_dststride
    SSSE3_FilterHorizontal_8px xmm4, xmm5, xmm6, xmm7, xmm1, xmm2
    packuswb        xmm4, xmm4
    psrldq          xmm3, 4                     ; bring the second row's tail bytes down
    movd            [p_dst + 5], xmm3
    movlps          [p_dst], xmm4
    add             p_dst, i_dststride
    sub             i_height, 2
    jg              .width9_yloop
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC_KEEPDEF
    ret
.width17_yloop:
    ; Two rows per pass. Each row is filtered as two 8-pixel halves (loads at
    ; -2 and +6); xmm4 collects the high qwords of the two right-hand loads
    ; for the 2-pixel filter that supplies the 17th pixel of each row.
    movdqu          xmm0, [p_src - 2]
    movdqu          xmm3, [p_src + 6]
    add             p_src, i_srcstride
    movdqa          xmm4, xmm3
    SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
    SSSE3_FilterHorizontal_8px xmm3, xmm5, xmm6, xmm7, xmm1, xmm2
    packuswb        xmm0, xmm3
    movdqu          xmm1, [p_src - 2]
    movdqu          xmm3, [p_src + 6]
    add             p_src, i_srcstride
    punpckhqdq      xmm4, xmm3
    SSSE3_FilterHorizontal_2px xmm4, xmm2
    packuswb        xmm4, xmm4
    ; Tail store first (+13..+16), then the 16-byte row store rewrites
    ; +13..+15, leaving only the byte at +16 from the tail.
    movd            [p_dst + 13], xmm4
    MOVDQ           [p_dst], xmm0               ; MOVDQ: 16-byte move macro defined outside this chunk
    add             p_dst, i_dststride
    psrldq          xmm4, 4                     ; bring the second row's tail bytes down
    movd            [p_dst + 13], xmm4
    SSSE3_FilterHorizontal_8px xmm1, xmm5, xmm6, xmm7, xmm0, xmm2
    SSSE3_FilterHorizontal_8px xmm3, xmm5, xmm6, xmm7, xmm0, xmm2
    packuswb        xmm1, xmm3
    MOVDQ           [p_dst], xmm1
    add             p_dst, i_dststride
    sub             i_height, 2
    jg              .width17_yloop
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height


;*******************************************************************************
; void McHorVer20Width4U8ToS16_ssse3(const uint8_t *pSrc,
;                                    int iSrcStride,
;                                    int16_t *pDst,
;                                    int iHeight);
;*******************************************************************************

WELS_EXTERN McHorVer20Width4U8ToS16_ssse3
; Horizontal 6-tap pass for 4-pixel-wide blocks that keeps the unrounded
; 16-bit sums. Output rows are packed back to back, 8 bytes (four words) each.
; Reading starts two rows above p_src and 2 bytes to its left.
; The loop emits two rows per pass with an aligned 16-byte store, so p_dst
; must be 16-byte aligned. One further row is always emitted after the loop;
; for an even iHeight that makes iHeight + 1 rows in total.
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_height     r3
    %assign  push_num 0
    INIT_X86_32_PIC r4
    LOAD_4_PARA
    PUSH_XMM 7
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    sub             p_src, i_srcstride
    sub             p_src, i_srcstride
    ; Loop-invariant shuffle masks and multipliers for the filter macros.
    movdqa          xmm4, [pic(shufb_32435465768798A9)]
    movdqa          xmm5, [pic(shufb_011267784556ABBC)]
    movdqa          xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    sub             i_height, 1                 ; the last row is handled after the loop
.yloop:
    movdqu          xmm0, [p_src - 2]
    movdqu          xmm1, [p_src + i_srcstride - 2]
    lea             p_src, [p_src + 2 * i_srcstride]
    SSSE3_FilterHorizontalbw_2x4px xmm0, xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
    movdqa          [p_dst], xmm0               ; two rows of four words
    add             p_dst, 16
    sub             i_height, 2
    jg              .yloop
    ; Height % 2 remainder.
    movdqu          xmm0, [p_src - 2]
    SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
    movlps          [p_dst], xmm0               ; only the first four words are kept
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_height


;***********************************************************************
; void McHorVer02Width4S16ToU8_ssse3(const int16_t *pSrc,
;                                    uint8_t *pDst,
;                                    int32_t iDstStride,
;                                    int32_t iHeight);
;***********************************************************************

WELS_EXTERN McHorVer02Width4S16ToU8_ssse3
; Vertical 6-tap pass for 4-pixel-wide blocks: 16-bit intermediate rows in,
; 8-bit pixels out. Source rows are packed back to back at a fixed 8-byte
; stride, so every 16-byte load picks up two consecutive rows and each filter
; invocation yields two output rows (low and high half of the register).
; Loads at even row indices use movdqa and loads at odd indices use movdqu,
; so p_src must be 16-byte aligned.
; Produces 4 output rows when iHeight <= 4, otherwise 8.
%define p_src        r0
%define p_dst        r1
%define i_dststride  r2
%define i_height     r3
%define i_srcstride  8
    %assign  push_num 0
    INIT_X86_32_PIC r4
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r2, r2d
    SIGN_EXTENSION  r3, r3d
    ; xmmN = source rows N and N+1
    movdqa          xmm0, [p_src +  0 * i_srcstride]
    movdqu          xmm1, [p_src +  1 * i_srcstride]
    movdqa          xmm2, [p_src +  2 * i_srcstride]
    movdqu          xmm3, [p_src +  3 * i_srcstride]
    movdqa          xmm4, [p_src +  4 * i_srcstride]
    movdqu          xmm5, [p_src +  5 * i_srcstride]
    movdqa          xmm6, [p_src +  6 * i_srcstride]
    ; output rows 0 and 1
    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7
    packuswb        xmm0, xmm0
    movd            [p_dst], xmm0
    psrlq           xmm0, 32
    movd            [p_dst + i_dststride], xmm0
    lea             p_dst, [p_dst + 2 * i_dststride]
    ; The registers consumed by the previous step are reused for the next rows.
    movdqu          xmm7, [p_src +  7 * i_srcstride]
    movdqa          xmm0, [p_src +  8 * i_srcstride]
    ; output rows 2 and 3
    SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm1
    packuswb        xmm2, xmm2
    movd            [p_dst], xmm2
    psrlq           xmm2, 32
    movd            [p_dst + i_dststride], xmm2
    cmp             i_height, 4
    jle             .done
    lea             p_dst, [p_dst + 2 * i_dststride]
    movdqu          xmm1, [p_src +  9 * i_srcstride]
    movdqa          xmm2, [p_src + 10 * i_srcstride]
    ; output rows 4 and 5
    SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm3
    packuswb        xmm4, xmm4
    movd            [p_dst], xmm4
    psrlq           xmm4, 32
    movd            [p_dst + i_dststride], xmm4
    lea             p_dst, [p_dst + 2 * i_dststride]
    movdqu          xmm3, [p_src + 11 * i_srcstride]
    ; output rows 6 and 7
    SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm5
    packuswb        xmm6, xmm6
    movd            [p_dst], xmm6
    psrlq           xmm6, 32
    movd            [p_dst + i_dststride], xmm6
.done:
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef p_src
%undef p_dst
%undef i_dststride
%undef i_height
%undef i_srcstride


;***********************************************************************
; void McHorVer20Width8U8ToS16_ssse3(const uint8_t *pSrc,
;                                    int32_t iSrcStride,
;                                    int16_t *pDst,
;                                    int32_t iDstStride,
;                                    int32_t iHeight);
;***********************************************************************

WELS_EXTERN McHorVer20Width8U8ToS16_ssse3
; Horizontal 6-tap pass for 8-pixel-wide blocks that keeps the unrounded
; 16-bit sums: each output row is 16 bytes (eight words), and rows are
; i_dststride bytes apart. Reading starts two rows above p_src and 2 bytes to
; its left. The loop emits two rows per pass; a single trailing row is added
; only when the count comes out odd, so exactly iHeight rows are written for
; iHeight >= 2.
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_dststride  r3
%define i_height     r4
    %assign  push_num 0
    INIT_X86_32_PIC r5
    LOAD_5_PARA
    PUSH_XMM 7
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    sub             p_src, i_srcstride
    sub             p_src, i_srcstride
    ; Loop-invariant shuffle masks and multipliers for the filter macro.
    movdqa          xmm4, [pic(shufb_32435465768798A9)]
    movdqa          xmm5, [pic(shufb_011267784556ABBC)]
    movdqa          xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    sub             i_height, 1                 ; bias so the flags after the loop tell odd from even
.yloop:
    movdqu          xmm0, [p_src - 2]
    movdqu          xmm1, [p_src + i_srcstride - 2]
    lea             p_src, [p_src + 2 * i_srcstride]
    SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
    MOVDQ           [p_dst], xmm0               ; MOVDQ: 16-byte move macro defined outside this chunk
    add             p_dst, i_dststride
    SSSE3_FilterHorizontalbw_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
    MOVDQ           [p_dst], xmm1
    add             p_dst, i_dststride
    sub             i_height, 2
    jg              .yloop
    ; Counter is 0 here when one row is still pending (odd iHeight) and
    ; negative when the loop already covered everything (even iHeight).
    jl              .done
    movdqu          xmm0, [p_src - 2]
    SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
    MOVDQ           [p_dst], xmm0
.done:
    POP_XMM
    LOAD_5_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_height


;***********************************************************************
; void McHorVer02Width5S16ToU8_ssse3(const int16_t *pSrc,
;                                    int32_t iTapStride,
;                                    uint8_t *pDst,
;                                    int32_t iDstStride,
;                                    int32_t iHeight);
;
; Vertical pass for a 5-pixel-wide block of 16-bit intermediates.
; Output row n is computed from source rows n..n+5 with
; SSE2_FilterVerticalw_8px (macro defined elsewhere; its result is left in
; its first argument and its last argument is scratch), saturated to
; unsigned bytes with packuswb and written as 5 bytes.
;
;  - iTapStride (i_srcstride) and iDstStride are byte strides: they are
;    added to the pointers unscaled.
;  - Source rows are loaded with movdqa, so each row address has to be
;    16-byte aligned.
;  - Exactly 5 rows are written when iHeight <= 5 and exactly 9 rows
;    otherwise; no other row counts are implemented.
;  - xmm0..xmm7 form a rotating window over the source rows (named s0, s1,
;    ... below; d0, d1, ... are destination rows). A register is reloaded
;    with a new source row once it has served as result/scratch.
;***********************************************************************

WELS_EXTERN McHorVer02Width5S16ToU8_ssse3
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_dststride  r3
%define i_height     r4
%define i_srcstride3 r5
    %assign  push_num 0
%ifdef X86_32
    ; r5 is used as i_srcstride3; keep the caller's value on 32-bit builds
    push            r5
    %assign  push_num 1
%endif
    INIT_X86_32_PIC r6
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    lea             i_srcstride3, [3 * i_srcstride]
    ; prime the window: xmm0..xmm5 = s0..s5, p_src advances to s4
    movdqa          xmm0, [p_src]
    movdqa          xmm1, [p_src + i_srcstride]
    movdqa          xmm2, [p_src + 2 * i_srcstride]
    movdqa          xmm3, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    movdqa          xmm4, [p_src]
    movdqa          xmm5, [p_src + i_srcstride]
    ; d0 from s0..s5
    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
    movdqa          xmm6, [p_src + 2 * i_srcstride]         ; xmm6 = s6
    packuswb        xmm0, xmm0
    ; 5-byte store done as two overlapping 4-byte stores: the copy shifted
    ; right by one byte supplies pixels 1..4, the unshifted one pixels 0..3
    movdqa          xmm7, xmm0
    psrlq           xmm7, 8
    movd            [p_dst + 1], xmm7
    movd            [p_dst], xmm0
    add             p_dst, i_dststride
    ; d1 from s1..s6
    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    movdqa          xmm7, [p_src + i_srcstride3]            ; xmm7 = s7
    lea             p_src, [p_src + 4 * i_srcstride]        ; p_src -> s8
    packuswb        xmm1, xmm1
    movdqa          xmm0, xmm1
    psrlq           xmm0, 8
    movd            [p_dst + 1], xmm0
    movd            [p_dst], xmm1
    add             p_dst, i_dststride
    ; d2 from s2..s7
    SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0
    movdqa          xmm0, [p_src]                           ; xmm0 = s8
    packuswb        xmm2, xmm2
    movdqa          xmm1, xmm2
    psrlq           xmm1, 8
    movd            [p_dst + 1], xmm1
    movd            [p_dst], xmm2
    add             p_dst, i_dststride
    ; d3 from s3..s8
    SSE2_FilterVerticalw_8px xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
    packuswb        xmm3, xmm3
    movdqa          xmm2, xmm3
    psrlq           xmm2, 8
    movd            [p_dst + 1], xmm2
    movd            [p_dst], xmm3
    add             p_dst, i_dststride
    movdqa          xmm1, [p_src + i_srcstride]             ; xmm1 = s9
    ; d4 from s4..s9
    SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2
    packuswb        xmm4, xmm4
    movdqa          xmm3, xmm4
    psrlq           xmm3, 8
    movd            [p_dst + 1], xmm3
    movd            [p_dst], xmm4
    ; five rows written: finished unless iHeight > 5
    cmp             i_height, 5
    jle             .done
    add             p_dst, i_dststride
    movdqa          xmm2, [p_src + 2 * i_srcstride]         ; xmm2 = s10
    ; d5 from s5..s10
    SSE2_FilterVerticalw_8px xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3
    movdqa          xmm3, [p_src + i_srcstride3]            ; xmm3 = s11
    lea             p_src, [p_src + 4 * i_srcstride]        ; p_src -> s12
    packuswb        xmm5, xmm5
    movdqa          xmm4, xmm5
    psrlq           xmm4, 8
    movd            [p_dst + 1], xmm4
    movd            [p_dst], xmm5
    add             p_dst, i_dststride
    ; d6 from s6..s11
    SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4
    movdqa          xmm4, [p_src]                           ; xmm4 = s12
    packuswb        xmm6, xmm6
    movdqa          xmm5, xmm6
    psrlq           xmm5, 8
    movd            [p_dst + 1], xmm5
    movd            [p_dst], xmm6
    add             p_dst, i_dststride
    ; d7 from s7..s12
    SSE2_FilterVerticalw_8px xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    packuswb        xmm7, xmm7
    movdqa          xmm6, xmm7
    psrlq           xmm6, 8
    movd            [p_dst + 1], xmm6
    movd            [p_dst], xmm7
    add             p_dst, i_dststride
    movdqa          xmm5, [p_src + i_srcstride]             ; xmm5 = s13
    ; d8 from s8..s13
    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
    packuswb        xmm0, xmm0
    movdqa          xmm7, xmm0
    psrlq           xmm7, 8
    movd            [p_dst + 1], xmm7
    movd            [p_dst], xmm0
.done:
    POP_XMM
    LOAD_5_PARA_POP
    DEINIT_X86_32_PIC
%ifdef X86_32
    pop             r5
%endif
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_height
%undef i_srcstride3


;***********************************************************************
; void McHorVer20Width9Or17U8ToS16_ssse3(const uint8_t *pSrc,
;                                        int32_t iSrcStride,
;                                        int16_t *pDst,
;                                        int32_t iDstStride,
;                                        int32_t iWidth,
;                                        int32_t iHeight);
;
; Horizontal filter pass producing 9 or 17 16-bit values per row from
; 8-bit source pixels.
;
;  - iWidth == 9 selects the 9-wide loop; every other value runs the
;    17-wide loop.
;  - The first row processed lies two rows above pSrc.
;  - Both strides are byte strides (added to the pointers unscaled).
;  - Rows are always processed in pairs and the loop ends once the counter
;    is <= 0, so an odd iHeight causes one additional row to be read and
;    written.
;  - The leading 8 (or 16) values of a row come from
;    SSSE3_FilterHorizontalbw_8px; the final value of two consecutive rows
;    is produced together by one SSSE3_FilterHorizontalbw_2px call (both
;    macros are defined elsewhere).
;***********************************************************************

WELS_EXTERN McHorVer20Width9Or17U8ToS16_ssse3
%define p_src       r0
%define i_srcstride r1
%define p_dst       r2
%define i_dststride r3
%define i_width     r4
%define i_height    r5
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    ; rewind the source pointer by two rows
    sub             p_src, i_srcstride
    sub             p_src, i_srcstride
    ; xmm4 = 0x8000 in every word, built without a memory load
    pcmpeqw         xmm4, xmm4
    psllw           xmm4, 15                                ; dw -32768
    ; xmm5..xmm7 = constants handed to SSSE3_FilterHorizontalbw_8px
    movdqa          xmm5, [pic(shufb_32435465768798A9)]
    movdqa          xmm6, [pic(shufb_011267784556ABBC)]
    movdqa          xmm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    cmp             i_width, 9
    jne             .width17_yloop

.width9_yloop:
    ; row A: 16 source bytes starting at p_src - 2; keep a raw copy in xmm3
    movdqu          xmm0, [p_src - 2]
    movdqa          xmm3, xmm0
    SSSE3_FilterHorizontalbw_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
    ; row B raw bytes
    movdqu          xmm2, [p_src + i_srcstride - 2]
    lea             p_src, [p_src + 2 * i_srcstride]
    ; xmm3 = [row A bytes 8..15 | row B bytes 8..15] (i.e. source offsets +6..+13)
    punpckhqdq      xmm3, xmm2
    SSSE3_FilterHorizontalbw_2px xmm3, xmm4, xmm1
    ; The 8-byte store at +10 ends at byte 17, so its last word lands at byte
    ; offset 16 = the 9th value of row A. Bytes 10..15 are rewritten by the
    ; 16-byte store at +0 that follows, so the order of these stores matters.
    movlps          [p_dst + 10], xmm3
    MOVDQ           [p_dst], xmm0
    add             p_dst, i_dststride
    ; same scheme for row B, using the high qword of xmm3
    movhps          [p_dst + 10], xmm3
    SSSE3_FilterHorizontalbw_8px xmm2, xmm5, xmm6, xmm7, xmm1, xmm0
    MOVDQ           [p_dst], xmm2
    add             p_dst, i_dststride
    sub             i_height, 2
    jg              .width9_yloop
    POP_XMM
    LOAD_6_PARA_POP
    ; NOTE(review): the _KEEPDEF variant is used on this first exit and the plain
    ; variant on the last one - presumably so pic() stays usable for the code
    ; assembled below; confirm against asm_inc.asm
    DEINIT_X86_32_PIC_KEEPDEF
    ret

.width17_yloop:
    ; row A: xmm0 = bytes for values 0..7, xmm3 = bytes for values 8..15
    movdqu          xmm0, [p_src - 2]
    movdqu          xmm3, [p_src + 6]
    add             p_src, i_srcstride
    SSSE3_FilterHorizontalbw_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
    MOVDQ           [p_dst], xmm0
    ; keep the raw right-hand bytes of row A before filtering them in place
    movdqa          xmm0, xmm3
    SSSE3_FilterHorizontalbw_8px xmm3, xmm5, xmm6, xmm7, xmm1, xmm2
    ; row B right-hand bytes (p_src already points at row B)
    movdqu          xmm2, [p_src + 6]
    ; xmm0 = [row A source offsets +14..+21 | row B source offsets +14..+21]
    punpckhqdq      xmm0, xmm2
    SSSE3_FilterHorizontalbw_2px xmm0, xmm4, xmm1
    movdqu          xmm1, [p_src - 2]
    add             p_src, i_srcstride
    ; 8-byte store at +26 ends at byte 33: its last word is the 17th value
    ; (byte offset 32); bytes 26..31 are rewritten by the store at +16 below
    movlps          [p_dst + 26], xmm0
    MOVDQ           [p_dst + 16], xmm3
    add             p_dst, i_dststride
    ; row B: same scheme with the high qword of xmm0
    movhps          [p_dst + 26], xmm0
    SSSE3_FilterHorizontalbw_8px xmm1, xmm5, xmm6, xmm7, xmm0, xmm3
    MOVDQ           [p_dst], xmm1
    SSSE3_FilterHorizontalbw_8px xmm2, xmm5, xmm6, xmm7, xmm0, xmm3
    MOVDQ           [p_dst + 16], xmm2
    add             p_dst, i_dststride
    sub             i_height, 2
    jg              .width17_yloop
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height


;***********************************************************************
; void McHorVer02WidthGe8S16ToU8_ssse3(const int16_t *pSrc,
;                                      int32_t iSrcStride,
;                                      uint8_t *pDst,
;                                      int32_t iDstStride,
;                                      int32_t iWidth,
;                                      int32_t iHeight);
;
; Vertical pass over 16-bit intermediates, 8 columns at a time. Output row n
; is computed from source rows n..n+5 with SSE2_FilterVerticalw_8px (macro
; defined elsewhere; result in its first argument, last argument is
; scratch) and saturated to unsigned bytes with packuswb.
;
;  - Both strides are byte strides (added to the pointers unscaled).
;  - The 8-column loads use movdqa and the single-column path uses
;    punpcklwd with a memory operand, so the addressed source rows must be
;    16-byte aligned.
;  - Odd iWidth: the last column is processed first by a dedicated path,
;    then iWidth is decremented and the remaining columns are processed in
;    groups of 8. That path stores 4 or 8 bytes ending at the last column,
;    i.e. it also writes bytes to the left of it, which the 8-column
;    groups rewrite afterwards.
;  - Row counts: the odd-column path implements iHeight == 5 and
;    iHeight == 8 * k + 1. The 8-column path writes 4 rows for
;    iHeight <= 4, 5 rows for iHeight == 5, and continues in steps of 8
;    rows with one extra row when iHeight == 8 * k + 1.
;    NOTE(review): any other iHeight is rounded up to the next of these
;    row counts, i.e. extra rows are read and written.
;  - Every 8-column group, including the iHeight == 5 case, ends in the
;    common .x_loop_dec epilogue, so all groups of a wider block are
;    processed for every supported height.
;***********************************************************************

WELS_EXTERN McHorVer02WidthGe8S16ToU8_ssse3
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_dststride  r3
; With X86_32_PICASM, r4 is handed to INIT_X86_32_PIC_NOPRESERVE below, so the
; width is accessed in its stack argument slot instead of in a register.
%ifdef X86_32_PICASM
%define i_width      dword arg5
%else
%define i_width      r4
%endif
%define i_height     r5
%define i_srcstride3 r6
    %assign  push_num 0
%ifdef X86_32
    ; r6 is used as i_srcstride3; keep the caller's value on 32-bit builds
    push            r6
    %assign  push_num 1
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    INIT_X86_32_PIC_NOPRESERVE r4
    ; The row counter is kept as iHeight - 1; a copy stays on the stack and is
    ; re-read through [r7] at the start of every column group.
    sub             i_height, 1
    push            i_height
    %assign push_num push_num + 1
    lea             i_srcstride3, [3 * i_srcstride]
    test            i_width, 1
    jz              .width_loop

    ; ---- odd width: filter the single last column first ----
    push            p_src
    push            p_dst
    %assign push_num push_num + 2
    ; p_src -> last column (2 bytes per sample), p_dst -> one past the last column
%ifdef X86_32_PICASM
    add             p_src, i_width
    add             p_src, i_width
    sub             p_src, 2
%else
    lea             p_src, [p_src + 2 * i_width - 2]
%endif
    add             p_dst, i_width
    ; Gather the column, one sample per word: xmm0 low qword = rows 0..3
    movd            xmm0, [p_src]
    punpcklwd       xmm0, [p_src + i_srcstride]
    movd            xmm1, [p_src + 2 * i_srcstride]
    add             p_src, i_srcstride3                     ; p_src -> row 3
    punpcklwd       xmm1, [p_src]
    punpckldq       xmm0, xmm1
    movd            xmm1, [p_src + i_srcstride]             ; row 4
    cmp             i_height, 4
    je              .filter5_unalign                        ; iHeight == 5
    ; xmm0 = rows 0..7 of the column, p_src -> row 7
    punpcklwd       xmm1, [p_src + 2 * i_srcstride]
    movd            xmm2, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    punpcklwd       xmm2, [p_src]
    punpckldq       xmm1, xmm2
    punpcklqdq      xmm0, xmm1
.height_loop_unalign:
    ; Entry state: xmm0 = rows n..n+7 (word i = row n+i), p_src -> row n+7.
    ; Each palignr shifts the previous window down by one word and appends the
    ; next row, giving xmm1 = rows n+1..n+8, ..., xmm5 = rows n+5..n+12.
    movd            xmm1, [p_src + i_srcstride]
    palignr         xmm1, xmm0, 2
    movd            xmm2, [p_src + 2 * i_srcstride]
    palignr         xmm2, xmm1, 2
    movd            xmm3, [p_src + i_srcstride3]
    palignr         xmm3, xmm2, 2
    lea             p_src, [p_src + 4 * i_srcstride]        ; p_src -> row n+11
    movd            xmm4, [p_src]
    palignr         xmm4, xmm3, 2
    movd            xmm5, [p_src + i_srcstride]
    palignr         xmm5, xmm4, 2
    ; word i of xmm0 = output for row n+i
    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7
    packuswb        xmm0, xmm0                              ; byte i = output row n+i
    ; Scatter the 8 bytes over 8 rows. A per-dword left shift moves the wanted
    ; byte into the top byte of dword 0 (rows n..n+3) and of dword 1 (rows
    ; n+4..n+7); a 4-byte store ending at the last column uses dword 0, an
    ; 8-byte store ending there uses dword 1 for that last byte.
    movdqa          xmm6, xmm0
    pslld           xmm6, 24
    movd            [p_dst - 4], xmm6                       ; row n
    movlps          [p_dst + 4 * i_dststride - 8], xmm6     ; row n+4
    add             p_dst, i_dststride
    movdqa          xmm6, xmm0
    pslld           xmm6, 16
    movd            [p_dst - 4], xmm6                       ; row n+1
    movlps          [p_dst + 4 * i_dststride - 8], xmm6     ; row n+5
    add             p_dst, i_dststride
    movdqa          xmm6, xmm0
    pslld           xmm6, 8
    movd            [p_dst - 4], xmm6                       ; row n+2
    movd            [p_dst + i_dststride - 4], xmm0         ; row n+3
    lea             p_dst, [p_dst + 4 * i_dststride]
    movlps          [p_dst - 8], xmm6                       ; row n+6
    movlps          [p_dst + i_dststride - 8], xmm0         ; row n+7
    lea             p_dst, [p_dst + 2 * i_dststride]        ; p_dst -> row n+8
    sub             i_height, 8
    jle             .height_loop_unalign_exit
    ; Rebuild the entry state for the next 8 rows: xmm0 = rows n+8..n+15,
    ; p_src -> row n+15.
    movd            xmm1, [p_src + 2 * i_srcstride]
    palignr         xmm1, xmm5, 2
    movd            xmm0, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    punpcklwd       xmm0, [p_src]
    palignr         xmm0, xmm1, 4
    jmp             .height_loop_unalign
.height_loop_unalign_exit:
    ; One trailing row (n+8). The 8-byte load ends on the row n+13 sample, and
    ; movddup copies it into word 7, where it lines up with rows n+8..n+12 in
    ; word 7 of xmm1..xmm5. Only byte 7 of the packed result is meaningful; it
    ; is the last byte of the 8-byte store below.
    movddup         xmm6, [p_src + 2 * i_srcstride - 6]
    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    packuswb        xmm1, xmm1
    movlps          [p_dst - 8], xmm1
    jmp             .unalign_done
.filter5_unalign:
    ; iHeight == 5: rows 0..3 are in the low qword of xmm0, row 4 in xmm1 and
    ; p_src -> row 3. Move rows 0..3 to words 4..7 and keep appending rows so
    ; that words 3..7 of xmm1..xmm6 hold the six taps of output rows 0..4.
    pslldq          xmm0, 8
    palignr         xmm1, xmm0, 2
    movd            xmm2, [p_src + 2 * i_srcstride]
    palignr         xmm2, xmm1, 2
    movd            xmm3, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    palignr         xmm3, xmm2, 2
    movd            xmm4, [p_src]
    palignr         xmm4, xmm3, 2
    movd            xmm5, [p_src + i_srcstride]
    palignr         xmm5, xmm4, 2
    movd            xmm6, [p_src + 2 * i_srcstride]
    palignr         xmm6, xmm5, 2
    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    packuswb        xmm1, xmm1                              ; bytes 3..7 = output rows 0..4
    ; Right shifts line the wanted byte up with the last byte of a 4-byte store
    movdqa          xmm0, xmm1
    psrlq           xmm1,  8
    movdqa          xmm2, xmm0
    psrlq           xmm2, 16
    movdqa          xmm3, xmm0
    psrlq           xmm3, 24
    movd            [p_dst - 4], xmm0                       ; row 0
    movd            [p_dst + i_dststride - 4], xmm1         ; row 1
    lea             p_dst, [p_dst + 2 * i_dststride]
    movd            [p_dst - 4], xmm2                       ; row 2
    movd            [p_dst + i_dststride - 4], xmm3         ; row 3
    movlps          [p_dst + 2 * i_dststride - 8], xmm0     ; row 4 (byte 7)
.unalign_done:
    pop             p_dst
    pop             p_src
    %assign push_num push_num - 2
    mov             i_height, [r7]                          ; reload iHeight - 1
    sub             i_width, 1                              ; remaining width

    ; ---- groups of 8 columns ----
.width_loop:
    push            p_src
    push            p_dst
    %assign push_num push_num + 2
    ; xmm0..xmm4 = rows 0..4, p_src -> row 4
    movdqa          xmm0, [p_src]
    movdqa          xmm1, [p_src + i_srcstride]
    movdqa          xmm2, [p_src + 2 * i_srcstride]
    movdqa          xmm3, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    movdqa          xmm4, [p_src]
.height_loop:
    ; Entry state: xmm0..xmm4 = rows n..n+4, p_src -> row n+4. The eight xmm
    ; registers rotate: a register is reloaded with a new row after it has
    ; served as result/scratch.
    movdqa          xmm5, [p_src + i_srcstride]
    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
    movdqa          xmm6, [p_src + 2 * i_srcstride]
    SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    movdqa          xmm7, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    ; rows n, n+1
    packuswb        xmm0, xmm1
    movlps          [p_dst], xmm0
    movhps          [p_dst + i_dststride], xmm0
    lea             p_dst, [p_dst + 2 * i_dststride]
    SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0
    movdqa          xmm0, [p_src]
    SSE2_FilterVerticalw_8px xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
    ; rows n+2, n+3
    packuswb        xmm2, xmm3
    movlps          [p_dst], xmm2
    movhps          [p_dst + i_dststride], xmm2
    cmp             i_height, 4
    jl              .x_loop_dec                             ; 4 rows were enough
    lea             p_dst, [p_dst + 2 * i_dststride]
    movdqa          xmm1, [p_src + i_srcstride]
    SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2
    ; Still testing the flags of "cmp i_height, 4": lea and movdqa leave them
    ; untouched, and the filter macro is relied upon to do the same.
    je              .store_xmm4_exit                        ; exactly 5 rows
    movdqa          xmm2, [p_src + 2 * i_srcstride]
    SSE2_FilterVerticalw_8px xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3
    movdqa          xmm3, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    ; rows n+4, n+5
    packuswb        xmm4, xmm5
    movlps          [p_dst], xmm4
    movhps          [p_dst + i_dststride], xmm4
    lea             p_dst, [p_dst + 2 * i_dststride]
    SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4
    movdqa          xmm4, [p_src]
    SSE2_FilterVerticalw_8px xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    ; rows n+6, n+7
    packuswb        xmm6, xmm7
    movlps          [p_dst], xmm6
    movhps          [p_dst + i_dststride], xmm6
    lea             p_dst, [p_dst + 2 * i_dststride]
    sub             i_height, 8
    jg              .height_loop                            ; xmm0..xmm4 = rows n+8..n+12 again
    jl              .x_loop_dec
    ; counter hit exactly zero: one trailing row (n+8)
    movdqa          xmm5, [p_src + i_srcstride]
    SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
    packuswb        xmm0, xmm0
    movlps          [p_dst], xmm0
.x_loop_dec:
    ; common end of a column group: restore the row-0 pointers and move on
    pop             p_dst
    pop             p_src
    %assign push_num push_num - 2
    sub             i_width, 8
    jle             .done
    mov             i_height, [r7]                          ; reload iHeight - 1
    add             p_src, 16                               ; 8 int16 columns
    add             p_dst, 8                                ; 8 uint8 columns
    jmp             .width_loop
.store_xmm4_exit:
    ; iHeight == 5: write the fifth row, then rejoin the common epilogue so
    ; that any remaining column groups are processed too (same as the
    ; 4-row exit above) instead of ending the whole call here.
    packuswb        xmm4, xmm4
    movlps          [p_dst], xmm4
    jmp             .x_loop_dec
.done:
    pop             i_height
    %assign push_num push_num - 1
    DEINIT_X86_32_PIC
    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop             r6
%endif
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height
%undef i_srcstride3


%ifdef HAVE_AVX2

; Horizontal filter core without rounding or shift ("bw": bytes in, words out).
;   %1     in: source pixels, out: 16-bit sums
;   %2     shufb_32435465768798A9 shuffle mask
;   %3     shufb_011267784556ABBC shuffle mask
;   %4     maddubsw_p1m5_p1m5_m5p1_m5p1 multiplier
;   %5,%6  scratch; on exit they hold the two partial products that were
;          multiplied by %4
; Three vpmaddubsw products are summed with wrapping 16-bit adds: the %2
; shuffle times db20_256, the %3 shuffle times %4, and the dword-pair-swapped
; %2 shuffle times %4.
%macro AVX2_FilterHorizontalbw_16px 6
    vpshufb         %5, %1, %3
    vpshufb         %1, %1, %2
    vpmaddubsw      %5, %5, %4
    vpshufd         %6, %1, 10110001b       ; swap the dwords of each pair
    vpmaddubsw      %6, %6, %4
    vpmaddubsw      %1, %1, [pic(db20_256)]
    vpaddw          %1, %1, %6
    vpaddw          %1, %1, %5
%endmacro

; Rounded form of AVX2_FilterHorizontalbw_16px: adds h264_w0x10_256 to the
; 16-bit sums and shifts them right arithmetically by 5. Result in %1.
; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
; (%4 is forwarded unchanged as the 4th argument of the bw macro, where it is
;  the vpmaddubsw multiplier; db20 is read from [pic(db20_256)] inside that
;  macro and is not a parameter.)
%macro AVX2_FilterHorizontal_16px 6
    AVX2_FilterHorizontalbw_16px %1, %2, %3, %4, %5, %6
    vpaddw          %1, %1, [pic(h264_w0x10_256)]
    vpsraw          %1, %1, 5
%endmacro

; Horizontal filter core for two pixel registers at once, without rounding
; or shift. The low qwords of the shuffled %1 and %2 are merged into one
; register before the multiplies.
;   %1     in: px0, out: 16-bit sums
;   %2     in: px1; overwritten with its %3 shuffle
;   %3     shufb_32435465768798A9 shuffle mask
;   %4     shufb_011267784556ABBC shuffle mask
;   %5     maddubsw_p1m5_p1m5_m5p1_m5p1 multiplier
;   %6,%7  scratch; on exit they hold the two partial products that were
;          multiplied by %5
%macro AVX2_FilterHorizontalbw_4x4px 7
    vpshufb         %6, %1, %4
    vpshufb         %1, %1, %3
    vpshufb         %7, %2, %4
    vpshufb         %2, %2, %3
    vpunpcklqdq     %6, %6, %7
    vpunpcklqdq     %1, %1, %2
    vpmaddubsw      %6, %6, %5
    vpshufd         %7, %1, 10110001b       ; swap the dwords of each pair
    vpmaddubsw      %7, %7, %5
    vpmaddubsw      %1, %1, [pic(db20_256)]
    vpaddw          %1, %1, %7
    vpaddw          %1, %1, %6
%endmacro

; Rounded form of AVX2_FilterHorizontalbw_4x4px: adds h264_w0x10_256 to the
; 16-bit sums and shifts them right arithmetically by 5. Result in %1.
; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
; (%5 is forwarded unchanged as the 5th argument of the bw macro, where it is
;  the vpmaddubsw multiplier; db20 is read from [pic(db20_256)] inside that
;  macro and is not a parameter.)
%macro AVX2_FilterHorizontal_4x4px 7
    AVX2_FilterHorizontalbw_4x4px %1, %2, %3, %4, %5, %6, %7
    vpaddw          %1, %1, [pic(h264_w0x10_256)]
    vpsraw          %1, %1, 5
%endmacro

; Horizontal filter core that accumulates in 32-bit lanes.
; pixels=%1 -32768>>scale=%2 tmp=%3
; Steps: byte pairs of %1 are multiplied and pair-summed against
; maddubsw_m2p10_m40m40_p10m2_p0p0_256 (16-bit results), those words are
; multiplied by %2 and pair-summed into dwords, and finally each dword is
; added to its neighbour, so both dwords of a pair end up holding the same
; total. %3 is left holding the dword-swapped copy used for that last add.
%macro AVX2_FilterHorizontalbw_4px 3
    vpmaddubsw      %1, %1, [pic(maddubsw_m2p10_m40m40_p10m2_p0p0_256)]
    vpmaddwd        %1, %1, %2
    ; 10110001b selects dwords 1,0,3,2: swap the two dwords of each pair
    vpshufd         %3, %1, 10110001b
    vpaddd          %1, %1, %3
%endmacro

; AVX2_FilterHorizontalbw_4px with the word multiplier fixed to dwm1024_256,
; followed by adding dd32768_256 to every 32-bit sum. Result in %1.
; NOTE(review): no shift is applied here; by the constant names this looks
; like a scale-and-bias step whose final result the caller extracts -
; confirm at the call sites.
; pixels=%1 tmp=%2
%macro AVX2_FilterHorizontal_4px 2
    AVX2_FilterHorizontalbw_4px %1, [pic(dwm1024_256)], %2
    vpaddd          %1, %1, [pic(dd32768_256)]
%endmacro

; Vertical filter on byte-interleaved row pairs, rounded and shifted.
;   %1        in: interleaved rows a/b, out: filtered 16-bit values
;   %2, %3    interleaved rows c/d and e/f (not modified)
;   %4,%5,%6  vpmaddubsw multipliers for the a/b, c/d and e/f pairs
;   %7        scratch; holds the e/f product on exit
; %1 = (ab*%4 + cd*%5 + ef*%6 + h264_w0x10_256) >> 5, with wrapping 16-bit
; adds and an arithmetic shift.
%macro AVX2_FilterVertical_16px 7
    vpmaddubsw      %1, %1, %4
    vpaddw          %1, %1, [pic(h264_w0x10_256)]
    vpmaddubsw      %7, %2, %5
    vpaddw          %1, %1, %7
    vpmaddubsw      %7, %3, %6
    vpaddw          %1, %1, %7
    vpsraw          %1, %1, 5
%endmacro

; Vertical filter variant that takes the two outermost rows as plain bytes
; and the four inner rows as byte-interleaved pairs.
;   %1      in: row a (bytes), out: filtered 16-bit values
;   %2      row f (bytes, not modified)
;   %3, %4  interleaved rows b/c and d/e (not modified)
;   %5, %6  vpmaddubsw multipliers for the b/c and d/e pairs
;   %7      scratch; holds the d/e product on exit
;   %8      scratch; holds row f widened to words on exit
; %1 = (a + f + bc*%5 + de*%6 + h264_w0x10_256) >> 5, where a and f are the
; low bytes of each 128-bit lane zero-extended to words; adds wrap at 16
; bits and the shift is arithmetic.
%macro AVX2_FilterVertical2_16px 8
    vpxor           %7, %7, %7              ; zero, used for the byte->word widening
    vpunpcklbw      %8, %2, %7
    vpunpcklbw      %1, %1, %7
    vpmaddubsw      %7, %3, %5
    vpaddw          %1, %1, %8
    vpaddw          %1, %1, %7
    vpmaddubsw      %7, %4, %6
    vpaddw          %1, %1, [pic(h264_w0x10_256)]
    vpaddw          %1, %1, %7
    vpsraw          %1, %1, 5
%endmacro

; Vertical filter on six rows of 16-bit values, evaluated in stages so the
; intermediate values are scaled down between the adds.
; With S1 = px0 + px5, S2 = px1 + px4, S3 = px2 + px3 the result is
;   ((((S1 - S2) >> 2) - S2 + S3) >> 2) + S3 + dw32_256) >> 6
; which, ignoring the truncation of the intermediate shifts, equals
;   (S1 - 5 * S2 + 20 * S3 + 16 * dw32_256) >> 10.
; All shifts are arithmetic. The result is left in %1; %2..%6 are not
; modified; %7 is clobbered (it ends up holding S3 + dw32_256).
; px0=%1 px1=%2 px2=%3 px3=%4 px4=%5 px5=%6 tmp=%7
%macro AVX2_FilterVerticalw_16px 7
    vpaddw          %1, %1, %6              ; S1
    vpaddw          %7, %2, %5              ; S2
    vpsubw          %1, %1, %7
    vpsraw          %1, %1, 2
    vpsubw          %1, %1, %7
    vpaddw          %7, %3, %4              ; S3
    vpaddw          %1, %1, %7
    vpsraw          %1, %1, 2
    vpaddw          %7, %7, [pic(dw32_256)] ; S3 + rounding term
    vpaddw          %1, %1, %7
    vpsraw          %1, %1, 6
%endmacro

;***********************************************************************
; void McHorVer02_avx2(const uint8_t *pSrc,
;                      int32_t iSrcStride,
;                      uint8_t *pDst,
;                      int32_t iDstStride,
;                      int32_t iWidth,
;                      int32_t iHeight)
;***********************************************************************

WELS_EXTERN McHorVer02_avx2
%define p_src         r0
%define i_srcstride   r1
%define p_dst         r2
%define i_dststride   r3
%ifdef X86_32_PICASM
%define i_width       dword arg5
%else
%define i_width       r4
%endif
%define i_height      r5
%define i_srcstride3  r6
    %assign push_num 0
%ifdef X86_32
    push            r6
    %assign push_num 1
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    INIT_X86_32_PIC_NOPRESERVE r4
    sub             p_src, i_srcstride
    sub             p_src, i_srcstride
    lea             i_srcstride3, [3 * i_srcstride]
    cmp             i_width, 8
    je              .width8
    jg              .width16
; .width4:
    vmovd           xmm0, [p_src]
    vpbroadcastd    xmm5, [p_src + i_srcstride]
    vpunpcklbw      xmm0, xmm0, xmm5
    vpbroadcastd    ymm1, [p_src + 2 * i_srcstride]
    vpunpcklbw      xmm5, xmm5, xmm1
    vpblendd        xmm0, xmm0, xmm5, 1100b
    vpbroadcastd    ymm5, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    vpunpcklbw      ymm1, ymm1, ymm5
    vpbroadcastd    ymm2, [p_src]
    vpunpcklbw      ymm5, ymm5, ymm2
    vpblendd        ymm1, ymm1, ymm5, 11001100b
    vpblendd        ymm0, ymm0, ymm1, 11110000b
    vpbroadcastd    ymm5, [p_src + i_srcstride]
    lea             p_src, [p_src + 2 * i_srcstride]
    vpunpcklbw      ymm2, ymm2, ymm5
    vpbroadcastd    ymm3, [p_src]
    vpunpcklbw      ymm5, ymm5, ymm3
    vpblendd        ymm2, ymm2, ymm5, 11001100b
    vpblendd        ymm1, ymm1, ymm2, 11110000b
    vpbroadcastd    ymm5, [p_src + i_srcstride]
    vpunpcklbw      ymm3, ymm3, ymm5
    vpbroadcastd    ymm4, [p_src + 2 * i_srcstride]
    vpunpcklbw      ymm5, ymm5, ymm4
    vpblendd        ymm3, ymm3, ymm5, 11001100b
    vpblendd        ymm2, ymm2, ymm3, 11110000b
    vbroadcasti128  ymm6, [pic(db20_128)]
    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [pic(maddubsw_p1m5_256)], ymm6, [pic(maddubsw_m5p1_256)], ymm5
    vpackuswb       ymm0, ymm0, ymm0
    vmovd           [p_dst], xmm0
    vpsrlq          xmm5, xmm0, 32
    vmovd           [p_dst + i_dststride], xmm5
    lea             p_dst, [p_dst + 2 * i_dststride]
    vextracti128    xmm0, ymm0, 1
    vmovd           [p_dst], xmm0
    vpsrlq          xmm5, xmm0, 32
    vmovd           [p_dst + i_dststride], xmm5
    cmp             i_height, 5
    jl              .width4_done
    lea             p_dst, [p_dst + 2 * i_dststride]
    vpbroadcastd    ymm5, [p_src + i_srcstride3]
    vpunpcklbw      ymm4, ymm4, ymm5
    jg              .width4_height_ge8
    AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [pic(maddubsw_p1m5_256)], xmm6, [pic(maddubsw_m5p1_256)], xmm5
    vpackuswb       xmm2, xmm2, xmm2
    vmovd           [p_dst], xmm2
    jmp             .width4_done
.width4_height_ge8:
    lea             p_src, [p_src + 4 * i_srcstride]
    vpbroadcastd    ymm1, [p_src]
    vpunpcklbw      ymm5, ymm5, ymm1
    vpblendd        ymm4, ymm4, ymm5, 11001100b
    vpblendd        ymm3, ymm3, ymm4, 11110000b
    vpbroadcastd    ymm5, [p_src + i_srcstride]
    vpunpcklbw      ymm1, ymm5
    vpbroadcastd    ymm0, [p_src + 2 * i_srcstride]
    vpunpcklbw      ymm5, ymm5, ymm0
    vpblendd        ymm1, ymm1, ymm5, 11001100b
    vpblendd        ymm4, ymm4, ymm1, 11110000b
    AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [pic(maddubsw_p1m5_256)], ymm6, [pic(maddubsw_m5p1_256)], ymm5
    vpackuswb       ymm2, ymm2, ymm2
    vmovd           [p_dst], xmm2
    vpsrlq          xmm5, xmm2, 32
    vmovd           [p_dst + i_dststride], xmm5
    lea             p_dst, [p_dst + 2 * i_dststride]
    vextracti128    xmm2, ymm2, 1
    vmovd           [p_dst], xmm2
    vpsrlq          xmm5, xmm2, 32
    vmovd           [p_dst + i_dststride], xmm5
    cmp             i_height, 9
    jl              .width4_done
    lea             p_dst, [p_dst + 2 * i_dststride]
    vmovd           xmm5, [p_src + i_srcstride3]
    vpunpcklbw      xmm0, xmm0, xmm5
    AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [pic(maddubsw_p1m5_256)], xmm6, [pic(maddubsw_m5p1_256)], xmm5
    vpackuswb       xmm4, xmm4, xmm4
    vmovd           [p_dst], xmm4
.width4_done:
    vzeroupper
    DEINIT_X86_32_PIC_KEEPDEF
    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop             r6
%endif
    ret

.width8:
    sub             i_height, 1
    vmovq           xmm0, [p_src]
    vmovq           xmm4, [p_src + i_srcstride]
    vpunpcklbw      xmm0, xmm0, xmm4
    vmovq           xmm1, [p_src + 2 * i_srcstride]
    vpunpcklbw      xmm4, xmm4, xmm1
    vinserti128     ymm0, ymm0, xmm4, 1
    vmovq           xmm4, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    vpunpcklbw      xmm1, xmm1, xmm4
    vmovq           xmm6, [p_src]
    vpunpcklbw      xmm4, xmm4, xmm6
    vinserti128     ymm1, ymm1, xmm4, 1
.width8_yloop:
    vmovq           xmm4, [p_src + i_srcstride]
    vpunpcklbw      xmm2, xmm6, xmm4
    vmovq           xmm3, [p_src + 2 * i_srcstride]
    vpunpcklbw      xmm4, xmm4, xmm3
    vinserti128     ymm2, ymm2, xmm4, 1
    vbroadcasti128  ymm5, [pic(db20_128)]
    AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
    vmovq           xmm4, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    vpunpcklbw      xmm3, xmm3, xmm4
    vmovq           xmm6, [p_src]
    vpunpcklbw      xmm4, xmm4, xmm6
    vinserti128     ymm3, ymm3, xmm4, 1
    AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
    vpackuswb       ymm0, ymm0, ymm1
    vmovlps         [p_dst], xmm0
    vextracti128    xmm1, ymm0, 1
    vmovlps         [p_dst + i_dststride], xmm1
    lea             p_dst, [p_dst + 2 * i_dststride]
    vmovhps         [p_dst], xmm0
    vmovhps         [p_dst + i_dststride], xmm1
    cmp             i_height, 4
    jl              .width8_done
    lea             p_dst, [p_dst + 2 * i_dststride]
    vmovq           xmm4, [p_src + i_srcstride]
    vpunpcklbw      xmm0, xmm6, xmm4
    jg              .width8_height_ge8
    AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_256)], xmm5, [pic(maddubsw_m5p1_256)], xmm4
    vpackuswb       xmm2, xmm2, xmm2
    vmovlps         [p_dst], xmm2
    jmp             .width8_done
.width8_height_ge8:
    vmovq           xmm1, [p_src + 2 * i_srcstride]
    vpunpcklbw      xmm4, xmm4, xmm1
    vinserti128     ymm0, ymm0, xmm4, 1
    AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
    vmovq           xmm4, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    vpunpcklbw      xmm1, xmm1, xmm4
    vmovq           xmm6, [p_src]
    vpunpcklbw      xmm4, xmm4, xmm6
    vinserti128     ymm1, ymm1, xmm4, 1
    AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
    vpackuswb       ymm2, ymm2, ymm3
    vmovlps         [p_dst], xmm2
    vextracti128    xmm3, ymm2, 1
    vmovlps         [p_dst + i_dststride], xmm3
    lea             p_dst, [p_dst + 2 * i_dststride]
    vmovhps         [p_dst], xmm2
    vmovhps         [p_dst + i_dststride], xmm3
    lea             p_dst, [p_dst + 2 * i_dststride]
    sub             i_height, 8
    jg              .width8_yloop
    jl              .width8_done
    vmovq           xmm4, [p_src + i_srcstride]
    vpunpcklbw      xmm2, xmm6, xmm4
    AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_256)], xmm5, [pic(maddubsw_m5p1_256)], xmm4
    vpackuswb       xmm0, xmm0, xmm0
    vmovlps         [p_dst], xmm0
.width8_done:
    vzeroupper
    DEINIT_X86_32_PIC_KEEPDEF
    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop             r6
%endif
    ret

.width16:
    sub             i_height, 1
    test            i_height, 1
    jnz             .width16_yloop_begin_even
    vmovq           xmm0, [p_src]
    vpbroadcastq    ymm1, [p_src + 8]
    vpblendd        ymm0, ymm0, ymm1, 11110000b
    vmovq           xmm1, [p_src + i_srcstride]
    vpbroadcastq    ymm2, [p_src + i_srcstride + 8]
    vpblendd        ymm1, ymm1, ymm2, 11110000b
    vpunpcklbw      ymm0, ymm0, ymm1
    vmovq           xmm2, [p_src + 2 * i_srcstride]
    vpbroadcastq    ymm3, [p_src + 2 * i_srcstride + 8]
    vpblendd        ymm2, ymm2, ymm3, 11110000b
    vmovq           xmm3, [p_src + i_srcstride3]
    vpbroadcastq    ymm4, [p_src + i_srcstride3 + 8]
    lea             p_src, [p_src + 4 * i_srcstride]
    vpblendd        ymm3, ymm3, ymm4, 11110000b
    vpunpcklbw      ymm2, ymm2, ymm3
    vmovq           xmm4, [p_src]
    vpbroadcastq    ymm5, [p_src + 8]
    vpblendd        ymm4, ymm4, ymm5, 11110000b
    vmovq           xmm5, [p_src + i_srcstride]
    vpbroadcastq    ymm6, [p_src + i_srcstride + 8]
    lea             p_src, [p_src + 2 * i_srcstride]
    vpblendd        ymm5, ymm5, ymm6, 11110000b
    vpunpcklbw      ymm4, ymm4, ymm5
    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm7
    vpackuswb       ymm0, ymm0, ymm0
    vpermq          ymm0, ymm0, 1000b
    vmovdqa         [p_dst], xmm0
    add             p_dst, i_dststride
    jmp             .width16_yloop
.width16_yloop_begin_even:
    vmovq           xmm1, [p_src]
    vpbroadcastq    ymm2, [p_src + 8]
    vpblendd        ymm1, ymm1, ymm2, 11110000b
    vmovq           xmm2, [p_src + i_srcstride]
    vpbroadcastq    ymm3, [p_src + i_srcstride + 8]
    vpblendd        ymm2, ymm2, ymm3, 11110000b
    vmovq           xmm3, [p_src + 2 * i_srcstride]
    vpbroadcastq    ymm4, [p_src + 2 * i_srcstride + 8]
    add             p_src, i_srcstride3
    vpblendd        ymm3, ymm3, ymm4, 11110000b
    vpunpcklbw      ymm2, ymm2, ymm3
    vmovq           xmm4, [p_src]
    vpbroadcastq    ymm5, [p_src + 8]
    vpblendd        ymm4, ymm4, ymm5, 11110000b
    vmovq           xmm5, [p_src + i_srcstride]
    vpbroadcastq    ymm6, [p_src + i_srcstride + 8]
    lea             p_src, [p_src + 2 * i_srcstride]
    vpblendd        ymm5, ymm5, ymm6, 11110000b
    vpunpcklbw      ymm4, ymm4, ymm5
.width16_yloop:
    vmovq           xmm6, [p_src]
    vpbroadcastq    ymm7, [p_src + 8]
    vpblendd        ymm6, ymm6, ymm7, 11110000b
    AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm0, ymm7
    vmovq           xmm7, [p_src + i_srcstride]
    vpbroadcastq    ymm0, [p_src + i_srcstride + 8]
    vpblendd        ymm7, ymm7, ymm0, 11110000b
    vpunpcklbw      ymm6, ymm6, ymm7
    AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm0
    vpackuswb       ymm1, ymm1, ymm2
    vpermq          ymm1, ymm1, 11011000b
    vmovdqa         [p_dst], xmm1
    vextracti128    [p_dst + i_dststride], ymm1, 1
    lea             p_dst, [p_dst + 2 * i_dststride]
    vmovq           xmm0, [p_src + 2 * i_srcstride]
    vpbroadcastq    ymm1, [p_src + 2 * i_srcstride + 8]
    vpblendd        ymm0, ymm0, ymm1, 11110000b
    AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm2, ymm1
    vmovq           xmm1, [p_src + i_srcstride3]
    vpbroadcastq    ymm2, [p_src + i_srcstride3 + 8]
    lea             p_src, [p_src + 4 * i_srcstride]
    vpblendd        ymm1, ymm1, ymm2, 11110000b
    vpunpcklbw      ymm0, ymm0, ymm1
    AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm2
    vpackuswb       ymm3, ymm3, ymm4
    vpermq          ymm3, ymm3, 11011000b
    vmovdqa         [p_dst], xmm3
    vextracti128    [p_dst + i_dststride], ymm3, 1
    lea             p_dst, [p_dst + 2 * i_dststride]
    vmovq           xmm2, [p_src]
    vpbroadcastq    ymm3, [p_src + 8]
    vpblendd        ymm2, ymm2, ymm3, 11110000b
    AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm4, ymm3
    vmovq           xmm3, [p_src + i_srcstride]
    vpbroadcastq    ymm4, [p_src + i_srcstride + 8]
    vpblendd        ymm3, ymm3, ymm4, 11110000b
    vpunpcklbw      ymm2, ymm2, ymm3
    AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm4
    vpackuswb       ymm5, ymm5, ymm6
    vpermq          ymm5, ymm5, 11011000b
    vmovdqa         [p_dst], xmm5
    vextracti128    [p_dst + i_dststride], ymm5, 1
    lea             p_dst, [p_dst + 2 * i_dststride]
    vmovq           xmm4, [p_src + 2 * i_srcstride]
    vpbroadcastq    ymm5, [p_src + 2 * i_srcstride + 8]
    vpblendd        ymm4, ymm4, ymm5, 11110000b
    AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm6, ymm5
    vmovq           xmm5, [p_src + i_srcstride3]
    vpbroadcastq    ymm6, [p_src + i_srcstride3 + 8]
    lea             p_src, [p_src + 4 * i_srcstride]
    vpblendd        ymm5, ymm5, ymm6, 11110000b
    vpunpcklbw      ymm4, ymm4, ymm5
    AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm6
    vpackuswb       ymm7, ymm7, ymm0
    vpermq          ymm7, ymm7, 11011000b
    vmovdqa         [p_dst], xmm7
    vextracti128    [p_dst + i_dststride], ymm7, 1
    lea             p_dst, [p_dst + 2 * i_dststride]
    sub             i_height, 8
    jg              .width16_yloop
    vzeroupper
    DEINIT_X86_32_PIC
    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop             r6
%endif
    ret
%undef p_src
%undef i_srcstride
%undef i_srcstride3
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height
%undef i_ycnt


;*******************************************************************************
; void McHorVer20_avx2(const uint8_t *pSrc,
;                      int iSrcStride,
;                      uint8_t *pDst,
;                      int iDstStride,
;                      int iWidth,
;                      int iHeight);
;
; Horizontal filtering of 8-bit pixels from pSrc into pDst. The arithmetic is
; done by the AVX2_FilterHorizontal_* macros, which are defined outside this
; function (presumably the H.264 6-tap half-pel filter -- confirm against the
; macro definitions). Every row is read starting at pSrc - 2.
;
; Dispatch on iWidth:
;   iWidth <  8 : 4 bytes stored per row, 4 rows per loop iteration
;   iWidth == 8 : 8 bytes stored per row, 4 rows per loop iteration
;   iWidth >  8 : 16 bytes stored per row, 2 rows per loop iteration
; Each loop body runs at least once and repeats while the remaining height is
; still positive after subtracting the rows just processed.
; The 16-wide path stores the first row of each pair with vmovdqa, so [pDst]
; has to be 16-byte aligned there on every iteration.
;*******************************************************************************

WELS_EXTERN McHorVer20_avx2
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_dststride  r3
%define i_width      r4
%define i_height     r5
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 7
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    ; Replicate the two shuffle masks and the multiplier table into both
    ; 128-bit lanes. ymm4-ymm6 are passed unchanged to every filter macro.
    vbroadcasti128  ymm4, [pic(shufb_32435465768798A9)]
    vbroadcasti128  ymm5, [pic(shufb_011267784556ABBC)]
    vbroadcasti128  ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    ; Width dispatch: equal to 8 -> .width8, above 8 -> .width16_yloop,
    ; anything smaller falls through to the 4-wide loop.
    cmp             i_width, 8
    je              .width8
    jg              .width16_yloop
; The width is not needed after the dispatch, so its register is given a
; second name and reused to hold 3 * i_srcstride (4- and 8-wide paths only).
%xdefine i_srcstride3 i_width
%undef i_width
    lea             i_srcstride3, [3 * i_srcstride]
.width4_yloop:
    ; Gather four rows: ymm0 = row 0 | row 2, ymm1 = row 1 | row 3
    ; (low lane | high lane), 16 source bytes each starting at column -2.
    vmovdqu         xmm0, [p_src - 2]
    vmovdqu         xmm1, [p_src + i_srcstride - 2]
    vinserti128     ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1
    vinserti128     ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
    lea             p_src, [p_src + 4 * i_srcstride]
    AVX2_FilterHorizontal_4x4px ymm0, ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
    ; Saturate the 16-bit results to bytes; each row then occupies one dword.
    vpackuswb       ymm0, ymm0, ymm0
    ; Low lane: dword 0 -> first row, dword 1 (shifted down) -> second row.
    vmovd           [p_dst], xmm0
    vpsrlq          xmm1, xmm0, 32
    vmovd           [p_dst + i_dststride], xmm1
    lea             p_dst, [p_dst + 2 * i_dststride]
    ; High lane: same pattern for the third and fourth rows.
    vextracti128    xmm0, ymm0, 1
    vmovd           [p_dst], xmm0
    vpsrlq          xmm1, xmm0, 32
    vmovd           [p_dst + i_dststride], xmm1
    lea             p_dst, [p_dst + 2 * i_dststride]
    sub             i_height, 4
    jg              .width4_yloop
    vzeroupper
    POP_XMM
    LOAD_6_PARA_POP
    ; NOTE(review): the _KEEPDEF variant presumably leaves pic() defined for
    ; the code paths that follow -- confirm in asm_inc.asm.
    DEINIT_X86_32_PIC_KEEPDEF
    ret
.width8:
    ; This entry skips the lea above, so 3 * i_srcstride is computed here.
    lea             i_srcstride3, [3 * i_srcstride]
.width8_yloop:
    ; Same four-row gather as the 4-wide path:
    ; ymm0 = row 0 | row 2, ymm1 = row 1 | row 3.
    vmovdqu         xmm0, [p_src - 2]
    vmovdqu         xmm1, [p_src + i_srcstride - 2]
    vinserti128     ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1
    vinserti128     ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
    lea             p_src, [p_src + 4 * i_srcstride]
    AVX2_FilterHorizontal_16px ymm0, ymm4, ymm5, ymm6, ymm2, ymm3
    AVX2_FilterHorizontal_16px ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
    ; Per lane the pack puts ymm0's bytes in the low qword and ymm1's bytes
    ; in the high qword: low lane = row 0, row 1; high lane = row 2, row 3.
    vpackuswb       ymm0, ymm0, ymm1
    vmovlps         [p_dst], xmm0
    vmovhps         [p_dst + i_dststride], xmm0
    lea             p_dst, [p_dst + 2 * i_dststride]
    vextracti128    xmm0, ymm0, 1
    vmovlps         [p_dst], xmm0
    vmovhps         [p_dst + i_dststride], xmm0
    lea             p_dst, [p_dst + 2 * i_dststride]
    sub             i_height, 4
    jg              .width8_yloop
    vzeroupper
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC_KEEPDEF
    ret
%undef i_srcstride3
.width16_yloop:
    ; Two rows per iteration. ymm0 holds the bytes starting at column -2 and
    ; ymm1 the bytes starting at column +6, with row 0 in the low lane and
    ; row 1 in the high lane of each register.
    vmovdqu         xmm0, [p_src - 2]
    vmovdqu         xmm1, [p_src + 6]
    vinserti128     ymm0, ymm0, [p_src + i_srcstride - 2], 1
    vinserti128     ymm1, ymm1, [p_src + i_srcstride + 6], 1
    lea             p_src, [p_src + 2 * i_srcstride]
    AVX2_FilterHorizontal_16px ymm0, ymm4, ymm5, ymm6, ymm2, ymm3
    AVX2_FilterHorizontal_16px ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
    ; After the pack each lane is one complete 16-byte output row.
    vpackuswb       ymm0, ymm0, ymm1
    ; Aligned store for the first row; the lane extract has no alignment
    ; requirement.
    vmovdqa         [p_dst], xmm0
    vextracti128    [p_dst + i_dststride], ymm0, 1
    lea             p_dst, [p_dst + 2 * i_dststride]
    sub             i_height, 2
    jg              .width16_yloop
    vzeroupper
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height


;***********************************************************************
; void McHorVer20Width5Or9Or17_avx2(const uint8_t *pSrc,
;                                   int32_t iSrcStride,
;                                   uint8_t *pDst,
;                                   int32_t iDstStride,
;                                   int32_t iWidth,
;                                   int32_t iHeight);
;
; Horizontal filtering for the odd widths 5, 9 and 17. The filter itself
; lives in the AVX2_FilterHorizontal_* macros, defined outside this
; function (presumably the H.264 6-tap half-pel filter -- confirm there).
;
; Dispatch on iWidth: == 9 -> .width9, > 9 -> .width17, otherwise the
; 5-wide loop. Rows per iteration: 2 (width 5), 4 (width 9), 4 (width 17).
;
; Every row is written with two overlapping stores: a 4-byte store that
; covers the last column is issued first, then the wider store at [pDst]
; overwrites the overlap. The order of those two stores must be kept.
; The 17-wide path uses vmovdqa for the first and third row of every
; group, so [pDst] has to be 16-byte aligned at those stores.
;***********************************************************************

WELS_EXTERN McHorVer20Width5Or9Or17_avx2
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_dststride  r3
%define i_width      r4
%define i_height     r5
    %assign  push_num 0
    INIT_X86_32_PIC r6
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    ; Filter constants replicated to both lanes; ymm5-ymm7 are only read
    ; from here on.
    vbroadcasti128  ymm5, [pic(shufb_32435465768798A9)]
    vbroadcasti128  ymm6, [pic(shufb_011267784556ABBC)]
    vbroadcasti128  ymm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    cmp             i_width, 9
    je              .width9
    jg              .width17
.width5_yloop:
    ; ymm0 = row 0 | row 1 (low | high lane), bytes from column -2.
    vmovdqu         xmm0, [p_src - 2]
    vinserti128     ymm0, ymm0, [p_src + i_srcstride - 2], 1
    lea             p_src, [p_src + 2 * i_srcstride]
    AVX2_FilterHorizontal_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
    vpackuswb       ymm0, ymm0, ymm0
    ; Five bytes per row: bytes 1..4 go to [p_dst + 1], then bytes 0..3 go
    ; to [p_dst] (the two stores overlap in bytes 1..3).
    vpsrlq          xmm1, xmm0, 8
    vmovd           [p_dst + 1], xmm1
    vmovd           [p_dst], xmm0
    add             p_dst, i_dststride
    ; Second row comes from the high lane.
    vextracti128    xmm0, ymm0, 1
    vpsrlq          xmm1, xmm0, 8
    vmovd           [p_dst + 1], xmm1
    vmovd           [p_dst], xmm0
    add             p_dst, i_dststride
    sub             i_height, 2
    jg              .width5_yloop
    vzeroupper
    POP_XMM
    LOAD_6_PARA_POP
    ; NOTE(review): the _KEEPDEF variant presumably leaves pic() defined for
    ; the code paths that follow -- confirm in asm_inc.asm.
    DEINIT_X86_32_PIC_KEEPDEF
    ret
.width9:
; The width register is free after the dispatch; reuse it for
; 3 * i_srcstride (also used by the .width17 path below).
%xdefine i_srcstride3 i_width
%undef i_width
    lea             i_srcstride3, [3 * i_srcstride]
.width9_yloop:
    ; ymm0 = row 0 | row 2, ymm4 = row 1 | row 3 (low | high lane).
    vmovdqu         xmm0, [p_src - 2]
    vmovdqu         xmm4, [p_src + i_srcstride - 2]
    vinserti128     ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1
    vinserti128     ymm4, ymm4, [p_src + i_srcstride3 - 2], 1
    lea             p_src, [p_src + 4 * i_srcstride]
    ; Collect the upper 8 source bytes of all four rows in ymm3
    ; (rows 0,1 in the low lane, rows 2,3 in the high lane); they feed the
    ; extra right-hand columns.
    vpunpckhqdq     ymm3, ymm0, ymm4
    AVX2_FilterHorizontal_4px ymm3, ymm2
    AVX2_FilterHorizontal_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
    ; Per lane: low qword = packed right-hand results (4 bytes per row),
    ; high qword = packed columns 0..7 of the row held in ymm0.
    vpackuswb       ymm3, ymm3, ymm0
    ; Row 0: 4 bytes at +5 first, then 8 bytes at +0 overwrite bytes 5..7,
    ; leaving only the byte at +8 from the first store.
    vmovd           [p_dst + 5], xmm3
    vmovhps         [p_dst], xmm3
    add             p_dst, i_dststride
    AVX2_FilterHorizontal_16px ymm4, ymm5, ymm6, ymm7, ymm1, ymm2
    vpackuswb       ymm4, ymm4, ymm4
    ; Row 1: its right-hand bytes are the second dword of xmm3.
    vpsrlq          xmm2, xmm3, 32
    vmovd           [p_dst + 5], xmm2
    vmovlps         [p_dst], xmm4
    add             p_dst, i_dststride
    ; Rows 2 and 3 repeat the pattern with the high lanes.
    vextracti128    xmm3, ymm3, 1
    vextracti128    xmm4, ymm4, 1
    vmovd           [p_dst + 5], xmm3
    vmovhps         [p_dst], xmm3
    add             p_dst, i_dststride
    vpsrlq          xmm2, xmm3, 32
    vmovd           [p_dst + 5], xmm2
    vmovlps         [p_dst], xmm4
    add             p_dst, i_dststride
    sub             i_height, 4
    jg              .width9_yloop
    vzeroupper
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC_KEEPDEF
    ret
.width17:
    ; This entry bypasses the lea at .width9, so compute 3 * i_srcstride here.
    lea             i_srcstride3, [3 * i_srcstride]
.width17_yloop:
    ; Rows 0 and 1: ymm0 = bytes from column -2, ymm3 = bytes from column +6,
    ; row 0 in the low lane and row 1 in the high lane.
    vmovdqu         xmm0, [p_src - 2]
    vmovdqu         xmm3, [p_src + 6]
    vinserti128     ymm0, ymm0, [p_src + i_srcstride - 2], 1
    vinserti128     ymm3, ymm3, [p_src + i_srcstride + 6], 1
    ; Keep an unfiltered copy of the right-hand bytes for the extra column.
    vmovdqa         ymm4, ymm3
    AVX2_FilterHorizontal_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
    AVX2_FilterHorizontal_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2
    ; ymm0 = 16 output bytes of row 0 | 16 output bytes of row 1.
    vpackuswb       ymm0, ymm0, ymm3
    ; Rows 2 and 3, same layout in ymm1 / ymm3.
    vmovdqu         xmm1, [p_src + 2 * i_srcstride - 2]
    vmovdqu         xmm3, [p_src + 2 * i_srcstride + 6]
    vinserti128     ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
    vinserti128     ymm3, ymm3, [p_src + i_srcstride3 + 6], 1
    lea             p_src, [p_src + 4 * i_srcstride]
    ; Upper 8 source bytes of all four rows: low lane = rows 0,2;
    ; high lane = rows 1,3.
    vpunpckhqdq     ymm4, ymm4, ymm3
    AVX2_FilterHorizontal_4px ymm4, ymm2
    vpackuswb       ymm4, ymm4, ymm4
    ; Row 0: 4 bytes at +13 first, then the aligned 16-byte store at +0
    ; overwrites bytes 13..15, leaving only the byte at +16.
    vmovd           [p_dst + 13], xmm4
    vmovdqa         [p_dst], xmm0
    add             p_dst, i_dststride
    ; Row 1: right-hand bytes sit in the high lane of ymm4.
    vextracti128    xmm2, ymm4, 1
    vmovd           [p_dst + 13], xmm2
    vextracti128    [p_dst], ymm0, 1
    add             p_dst, i_dststride
    ; Row 2: right-hand bytes are the second dword of the low lane. They
    ; are stored before the macros below reuse ymm4 as a temporary.
    vpsrlq          xmm4, xmm4, 32
    vmovd           [p_dst + 13], xmm4
    AVX2_FilterHorizontal_16px ymm1, ymm5, ymm6, ymm7, ymm0, ymm4
    AVX2_FilterHorizontal_16px ymm3, ymm5, ymm6, ymm7, ymm0, ymm4
    vpackuswb       ymm1, ymm1, ymm3
    vmovdqa         [p_dst], xmm1
    add             p_dst, i_dststride
    ; Row 3: second dword of the high lane saved in xmm2.
    vpsrlq          xmm2, xmm2, 32
    vmovd           [p_dst + 13], xmm2
    vextracti128    [p_dst], ymm1, 1
    add             p_dst, i_dststride
    sub             i_height, 4
    jg              .width17_yloop
    vzeroupper
    POP_XMM
    LOAD_6_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef i_srcstride3
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height


;*******************************************************************************
; void McHorVer20Width4U8ToS16_avx2(const uint8_t *pSrc,
;                                   int iSrcStride,
;                                   int16_t *pDst,
;                                   int iHeight);
;
; Runs the horizontal filter macros (AVX2_FilterHorizontalbw_*, defined
; elsewhere in this file) over rows of 8-bit pixels and writes 16-bit results.
; Reading starts two rows above pSrc. Output rows are packed back to back,
; 8 bytes (4 x int16) apiece.
; Rows are produced four at a time (at least once), then one final row is
; always emitted, i.e. 4k + 1 rows are written in total.
; The four-row store is a full-width vmovdqa, so pDst must be 32-byte aligned.
;*******************************************************************************

WELS_EXTERN McHorVer20Width4U8ToS16_avx2
%define p_pix        r0
%define i_pixstride  r1
%define p_res        r2
%define i_rows       r3
%define i_pixstride3 r4
%define i_resstride  8
    %assign  push_num 0
%ifdef X86_32
    push            r4
    %assign  push_num 1
%endif
    INIT_X86_32_PIC r5
    LOAD_4_PARA
    PUSH_XMM 7
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    ; Constant tables first: they do not depend on any argument.
    vbroadcasti128  ymm4, [pic(shufb_32435465768798A9)]
    vbroadcasti128  ymm5, [pic(shufb_011267784556ABBC)]
    vbroadcasti128  ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    lea             i_pixstride3, [3 * i_pixstride]
    ; Rewind the source pointer by two rows.
    sub             p_pix, i_pixstride
    sub             p_pix, i_pixstride
    ; Bias the counter so the four-row loop stops with one row left over.
    sub             i_rows, 3
.quad_rows:
    ; ymm0 = row 0 | row 2, ymm1 = row 1 | row 3 (low | high lane).
    vmovdqu         xmm0, [p_pix - 2]
    vinserti128     ymm0, ymm0, [p_pix + 2 * i_pixstride - 2], 1
    vmovdqu         xmm1, [p_pix + i_pixstride - 2]
    vinserti128     ymm1, ymm1, [p_pix + i_pixstride3 - 2], 1
    lea             p_pix, [p_pix + 4 * i_pixstride]
    AVX2_FilterHorizontalbw_4x4px ymm0, ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
    ; One 32-byte store covers all four 8-byte output rows.
    vmovdqa         [p_res], ymm0
    lea             p_res, [p_res + 4 * i_resstride]
    sub             i_rows, 4
    jg              .quad_rows
    ; One row is always left after the four-row loop; filter and store it.
    vmovdqu         xmm0, [p_pix - 2]
    AVX2_FilterHorizontalbw_16px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
    vmovlps         [p_res], xmm0
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
%ifdef X86_32
    pop             r4
%endif
    ret
%undef p_pix
%undef i_pixstride
%undef p_res
%undef i_rows
%undef i_pixstride3
%undef i_resstride


;***********************************************************************
; void McHorVer02Width4S16ToU8_avx2(const int16_t *pSrc,
;                                   uint8_t *pDst,
;                                   int32_t iDstStride,
;                                   int32_t iHeight);
;
; Vertical pass over 16-bit intermediates, 4 columns wide. Source rows are
; packed back to back at 8 bytes (4 x int16) each, so one 32-byte load
; spans four consecutive rows. The arithmetic is done by
; AVX2_FilterVerticalw_16px (defined outside this function; presumably
; the 6-tap filter with final rounding -- confirm there).
;
; Four output rows are always written; four more are written only when
; iHeight > 4. Each output row is 4 bytes.
;***********************************************************************

WELS_EXTERN McHorVer02Width4S16ToU8_avx2
%define p_src        r0
%define p_dst        r1
%define i_dststride  r2
%define i_height     r3
%define i_dststride3 r4
%define i_srcstride  8
    %assign  push_num 0
%ifdef X86_32
    push            r4
    %assign  push_num 1
%endif
    INIT_X86_32_PIC r5
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r2, r2d
    SIGN_EXTENSION  r3, r3d
    lea             i_dststride3, [3 * i_dststride]
    ; ymmN starts at source row N, so the six filter inputs for output rows
    ; 0..3 are the windows beginning at rows 0..5. ymm6 (row 6) is loaded
    ; early for the optional second group.
    vmovdqu         ymm0, [p_src +  0 * i_srcstride]
    vmovdqu         ymm1, [p_src +  1 * i_srcstride]
    vmovdqu         ymm2, [p_src +  2 * i_srcstride]
    vmovdqu         ymm3, [p_src +  3 * i_srcstride]
    vmovdqu         ymm4, [p_src +  4 * i_srcstride]
    vmovdqu         ymm5, [p_src +  5 * i_srcstride]
    vmovdqu         ymm6, [p_src +  6 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm7
    ; Saturate to bytes: one dword per output row, rows 0,1 in the low lane
    ; and rows 2,3 in the high lane.
    vpackuswb       ymm0, ymm0, ymm0
    vmovd           [p_dst], xmm0
    vpsrlq          xmm7, xmm0, 32
    vmovd           [p_dst + i_dststride], xmm7
    vextracti128    xmm0, ymm0, 1
    vmovd           [p_dst + 2 * i_dststride], xmm0
    vpsrlq          xmm7, xmm0, 32
    vmovd           [p_dst + i_dststride3], xmm7
    ; Only continue with a second group of four rows when iHeight > 4.
    cmp             i_height, 4
    jle             .done
    lea             p_dst, [p_dst + 4 * i_dststride]
    ; Windows starting at rows 4..9 feed output rows 4..7; ymm4-ymm6 are
    ; still intact from the loads above.
    vmovdqu         ymm7, [p_src +  7 * i_srcstride]
    vmovdqu         ymm0, [p_src +  8 * i_srcstride]
    vmovdqu         ymm1, [p_src +  9 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm3
    vpackuswb       ymm4, ymm4, ymm4
    vmovd           [p_dst], xmm4
    vpsrlq          xmm3, xmm4, 32
    vmovd           [p_dst + i_dststride], xmm3
    vextracti128    xmm4, ymm4, 1
    vmovd           [p_dst + 2 * i_dststride], xmm4
    vpsrlq          xmm3, xmm4, 32
    vmovd           [p_dst + i_dststride3], xmm3
.done:
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
%ifdef X86_32
    pop             r4
%endif
    ret
%undef p_src
%undef p_dst
%undef i_dststride
%undef i_height
%undef i_srcstride
%undef i_dststride3


;*******************************************************************************
; void McHorVer20Width8U8ToS16_avx2(const uint8_t *pSrc,
;                                   int iSrcStride,
;                                   int16_t *pDst,
;                                   int iHeight);
;
; Runs the horizontal filter macro (AVX2_FilterHorizontalbw_16px, defined
; elsewhere in this file) over rows of 8-bit pixels and writes 16-bit results,
; 8 per row. Reading starts two rows above pSrc. Output rows are packed back
; to back at 16 bytes each.
; Rows are handled in pairs (at least one pair); one extra row follows when
; the row count is odd. That single-row store is a vmovdqa, so the
; destination must be 16-byte aligned at that point.
;*******************************************************************************

WELS_EXTERN McHorVer20Width8U8ToS16_avx2
%define p_pix        r0
%define i_pixstride  r1
%define p_res        r2
%define i_rows       r3
%define i_resstride  16
    %assign  push_num 0
    INIT_X86_32_PIC r4
    LOAD_4_PARA
    PUSH_XMM 6
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    ; Constant tables, replicated into both lanes.
    vbroadcasti128  ymm3, [pic(shufb_32435465768798A9)]
    vbroadcasti128  ymm4, [pic(shufb_011267784556ABBC)]
    vbroadcasti128  ymm5, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    ; Rewind the source pointer by two rows.
    sub             p_pix, i_pixstride
    sub             p_pix, i_pixstride
    ; Bias by one so the pair loop leaves 0 (one row pending) or -1 (none).
    dec             i_rows
.row_pair:
    ; ymm0 = current row | next row (low | high lane).
    vmovdqu         xmm0, [p_pix - 2]
    vinserti128     ymm0, ymm0, [p_pix + i_pixstride - 2], 1
    lea             p_pix, [p_pix + 2 * i_pixstride]
    AVX2_FilterHorizontalbw_16px ymm0, ymm3, ymm4, ymm5, ymm1, ymm2
    ; Two 16-byte output rows in one unaligned 32-byte store.
    vmovdqu         [p_res], ymm0
    lea             p_res, [p_res + 2 * i_resstride]
    sub             i_rows, 2
    jg              .row_pair
    jl              .finish
    ; Counter hit exactly zero: one last row remains.
    vmovdqu         xmm0, [p_pix - 2]
    AVX2_FilterHorizontalbw_16px xmm0, xmm3, xmm4, xmm5, xmm1, xmm2
    vmovdqa         [p_res], xmm0
.finish:
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef p_pix
%undef i_pixstride
%undef p_res
%undef i_rows
%undef i_resstride


;***********************************************************************
; void McHorVer02Width5S16ToU8_avx2(const int16_t *pSrc,
;                                   uint8_t *pDst,
;                                   int32_t iDstStride,
;                                   int32_t iHeight);
;
; Vertical pass over 16-bit intermediates, writing 5 bytes per output row.
; Source rows are packed back to back at 16 bytes (8 x int16) each, so one
; 32-byte load spans two consecutive rows. The arithmetic is done by
; AVX2_FilterVerticalw_16px (defined outside this function; presumably
; the 6-tap filter with final rounding -- confirm there).
;
; Fully unrolled: five output rows are always written; four more (nine in
; total) are written only when iHeight > 5.
; Each row is written as two overlapping 4-byte stores: bytes 1..4 at
; [pDst + 1] first, then bytes 0..3 at [pDst].
;***********************************************************************

WELS_EXTERN McHorVer02Width5S16ToU8_avx2
%define p_src        r0
%define p_dst        r1
%define i_dststride  r2
%define i_height     r3
%define i_srcstride  16
    %assign  push_num 0
    INIT_X86_32_PIC r4
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r2, r2d
    SIGN_EXTENSION  r3, r3d
    ; Even row pairs come straight from memory:
    ; ymm0 = rows 0,1  ymm2 = rows 2,3  ymm4 = rows 4,5  ymm6 = rows 6,7.
    vmovdqu         ymm0, [p_src +  0 * i_srcstride]
    vmovdqu         ymm2, [p_src +  2 * i_srcstride]
    vmovdqu         ymm4, [p_src +  4 * i_srcstride]
    vmovdqu         ymm6, [p_src +  6 * i_srcstride]
    ; Odd pairs are spliced from neighbours (high lane of the first source,
    ; low lane of the second): ymm1 = rows 1,2  ymm3 = rows 3,4  ymm5 = rows 5,6.
    vperm2i128      ymm1, ymm0, ymm2, 00100001b
    vperm2i128      ymm3, ymm2, ymm4, 00100001b
    vperm2i128      ymm5, ymm4, ymm6, 00100001b
    ; Output rows 0 (low lane) and 1 (high lane).
    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm7
    vpackuswb       ymm0, ymm0, ymm0
    vpsrlq          xmm7, xmm0, 8
    vmovd           [p_dst + 1], xmm7
    vmovd           [p_dst], xmm0
    add             p_dst, i_dststride
    vextracti128    xmm0, ymm0, 1
    vpsrlq          xmm7, xmm0, 8
    vmovd           [p_dst + 1], xmm7
    vmovd           [p_dst], xmm0
    add             p_dst, i_dststride
    ; ymm7 = rows 7,8 and ymm0 = rows 8,9; output rows 2 and 3.
    vmovdqu         ymm7, [p_src +  7 * i_srcstride]
    vmovdqu         ymm0, [p_src +  8 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm1
    vpackuswb       ymm2, ymm2, ymm2
    vpsrlq          xmm1, xmm2, 8
    vmovd           [p_dst + 1], xmm1
    vmovd           [p_dst], xmm2
    add             p_dst, i_dststride
    vextracti128    xmm2, ymm2, 1
    vpsrlq          xmm1, xmm2, 8
    vmovd           [p_dst + 1], xmm1
    vmovd           [p_dst], xmm2
    add             p_dst, i_dststride
    ; ymm1 = rows 9,10 and ymm2 = rows 10,11; output rows 4 and 5.
    vmovdqu         ymm1, [p_src +  9 * i_srcstride]
    vmovdqu         ymm2, [p_src + 10 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm3
    vpackuswb       ymm4, ymm4, ymm4
    vpsrlq          xmm3, xmm4, 8
    vmovd           [p_dst + 1], xmm3
    vmovd           [p_dst], xmm4
    ; Row 4 is the last one unless iHeight > 5.
    cmp             i_height, 5
    jle             .done
    add             p_dst, i_dststride
    ; Output row 5 (high lane of the previous result).
    vextracti128    xmm4, ymm4, 1
    vpsrlq          xmm3, xmm4, 8
    vmovd           [p_dst + 1], xmm3
    vmovd           [p_dst], xmm4
    add             p_dst, i_dststride
    ; ymm3 = rows 11,12; xmm4 = row 12 alone (used by the final single-row
    ; filter below). Output rows 6 and 7.
    vmovdqu         ymm3, [p_src + 11 * i_srcstride]
    vmovdqu         xmm4, [p_src + 12 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm5
    vpackuswb       ymm6, ymm6, ymm6
    vpsrlq          xmm5, xmm6, 8
    vmovd           [p_dst + 1], xmm5
    vmovd           [p_dst], xmm6
    add             p_dst, i_dststride
    vextracti128    xmm6, ymm6, 1
    vpsrlq          xmm5, xmm6, 8
    vmovd           [p_dst + 1], xmm5
    vmovd           [p_dst], xmm6
    add             p_dst, i_dststride
    ; Last output row (8): 128-bit filter over rows 8..13, taken from the
    ; low lanes of ymm0-ymm3 plus xmm4 and xmm5.
    vmovdqu         xmm5, [p_src + 13 * i_srcstride]
    AVX2_FilterVerticalw_16px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7
    vpackuswb       xmm0, xmm0, xmm0
    vpsrlq          xmm7, xmm0, 8
    vmovd           [p_dst + 1], xmm7
    vmovd           [p_dst], xmm0
.done:
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef p_src
%undef p_dst
%undef i_dststride
%undef i_height
%undef i_srcstride


;***********************************************************************
; void McHorVer02Width8S16ToU8_avx2(const int16_t *pSrc,
;                                   uint8_t *pDst,
;                                   int32_t iDstStride,
;                                   int32_t iHeight);
;
; Vertical pass over 16-bit intermediates, writing 8 bytes per output row.
; Source rows are packed back to back at 16 bytes (8 x int16) each, so one
; 32-byte load spans two consecutive rows. The arithmetic is done by
; AVX2_FilterVerticalw_16px (defined outside this function; presumably
; the 6-tap filter with final rounding -- confirm there).
;
; Each loop pass writes four rows, stops if iHeight <= 4, otherwise writes
; four more, subtracts 8 from iHeight and repeats while it stays positive.
; Even-row loads use vmovdqa, so pSrc must be 32-byte aligned; odd-row
; loads use vmovdqu.
; Note that rows 8,9 are loaded before the iHeight <= 4 check, so they are
; read even when only four rows are produced.
;***********************************************************************

WELS_EXTERN McHorVer02Width8S16ToU8_avx2
%define p_src        r0
%define p_dst        r1
%define i_dststride  r2
%define i_height     r3
%define i_dststride3 r4
%define i_srcstride  16
    %assign  push_num 0
%ifdef X86_32
    push            r4
    %assign  push_num 1
%endif
    INIT_X86_32_PIC r5
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r2, r2d
    SIGN_EXTENSION  r3, r3d
    lea             i_dststride3, [3 * i_dststride]
    ; Prime the register window: ymm0 = rows 0,1  ymm2 = rows 2,3
    ; ymm4 = rows 4,5; ymm1 = rows 1,2 and ymm3 = rows 3,4 are spliced from
    ; the neighbouring pairs.
    vmovdqa         ymm0, [p_src +  0 * i_srcstride]
    vmovdqa         ymm2, [p_src +  2 * i_srcstride]
    vmovdqa         ymm4, [p_src +  4 * i_srcstride]
    vperm2i128      ymm1, ymm0, ymm2, 00100001b
    vperm2i128      ymm3, ymm2, ymm4, 00100001b
.yloop:
    ; Loop invariant on entry: ymm0..ymm4 hold the row pairs starting at
    ; rows 0..4 relative to p_src.
    vmovdqa         ymm6, [p_src +  6 * i_srcstride]
    vperm2i128      ymm5, ymm4, ymm6, 00100001b
    ; Output rows 0,1 in ymm0, then rows 2,3 in ymm2.
    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm7
    vmovdqu         ymm7, [p_src +  7 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm1
    ; After the pack: low lane = row 0 | row 2, high lane = row 1 | row 3
    ; (low qword | high qword).
    vpackuswb       ymm1, ymm0, ymm2
    ; Rows 8,9 are fetched before the height check below.
    vmovdqa         ymm0, [p_src +  8 * i_srcstride]
    vextracti128    xmm2, ymm1, 1
    vmovlps         [p_dst], xmm1
    vmovlps         [p_dst + i_dststride], xmm2
    vmovhps         [p_dst + 2 * i_dststride], xmm1
    vmovhps         [p_dst + i_dststride3], xmm2
    cmp             i_height, 4
    jle             .done
    lea             p_dst, [p_dst + 4 * i_dststride]
    ; Second half of the pass: output rows 4,5 in ymm4 and rows 6,7 in ymm6.
    vmovdqu         ymm1, [p_src +  9 * i_srcstride]
    vmovdqa         ymm2, [p_src + 10 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm3
    vmovdqu         ymm3, [p_src + 11 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm5
    vpackuswb       ymm5, ymm4, ymm6
    ; Reload ymm4 and advance p_src by eight rows; ymm0..ymm4 now again hold
    ; the pairs starting at rows 0..4 relative to the new p_src.
    vmovdqa         ymm4, [p_src + 12 * i_srcstride]
    add             p_src, 8 * i_srcstride
    vextracti128    xmm6, ymm5, 1
    vmovlps         [p_dst], xmm5
    vmovlps         [p_dst + i_dststride], xmm6
    vmovhps         [p_dst + 2 * i_dststride], xmm5
    vmovhps         [p_dst + i_dststride3], xmm6
    lea             p_dst, [p_dst + 4 * i_dststride]
    sub             i_height, 8
    jg              .yloop
.done:
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
%ifdef X86_32
    pop             r4
%endif
    ret
%undef p_src
%undef p_dst
%undef i_dststride
%undef i_height
%undef i_dststride3
%undef i_srcstride


;*******************************************************************************
; void McHorVer20Width16U8ToS16_avx2(const uint8_t *pSrc,
;                                    int32_t iSrcStride,
;                                    int16_t *pDst,
;                                    int32_t iHeight);
;
; Runs the horizontal filter macro (AVX2_FilterHorizontalbw_16px, defined
; elsewhere in this file) over rows of 8-bit pixels and writes 16-bit results,
; 16 per row. Reading starts two rows above pSrc. Output rows are packed back
; to back at 32 bytes each.
; Rows are handled in pairs (at least one pair); one extra row follows when
; the row count is odd. All stores are full-width vmovdqa, so pDst must be
; 32-byte aligned.
;*******************************************************************************

WELS_EXTERN McHorVer20Width16U8ToS16_avx2
%define p_pix        r0
%define i_pixstride  r1
%define p_res        r2
%define i_rows       r3
%define i_resstride  32
    %assign  push_num 0
    INIT_X86_32_PIC r4
    LOAD_4_PARA
    PUSH_XMM 7
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    ; Constant tables, replicated into both lanes.
    vbroadcasti128  ymm4, [pic(shufb_32435465768798A9)]
    vbroadcasti128  ymm5, [pic(shufb_011267784556ABBC)]
    vbroadcasti128  ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    ; Rewind the source pointer by two rows.
    sub             p_pix, i_pixstride
    sub             p_pix, i_pixstride
    ; Bias by one so the pair loop leaves 0 (one row pending) or -1 (none).
    dec             i_rows
.row_pair:
    ; Each register holds one row: bytes from column -2 in the low lane and
    ; bytes from column +6 in the high lane.
    vmovdqu         xmm0, [p_pix - 2]
    vmovdqu         xmm1, [p_pix + i_pixstride - 2]
    vinserti128     ymm0, ymm0, [p_pix + 6], 1
    vinserti128     ymm1, ymm1, [p_pix + i_pixstride + 6], 1
    lea             p_pix, [p_pix + 2 * i_pixstride]
    AVX2_FilterHorizontalbw_16px ymm0, ymm4, ymm5, ymm6, ymm2, ymm3
    AVX2_FilterHorizontalbw_16px ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
    vmovdqa         [p_res], ymm0
    vmovdqa         [p_res + i_resstride], ymm1
    lea             p_res, [p_res + 2 * i_resstride]
    sub             i_rows, 2
    jg              .row_pair
    jl              .finish
    ; Counter hit exactly zero: one last row remains.
    vmovdqu         xmm0, [p_pix - 2]
    vinserti128     ymm0, ymm0, [p_pix + 6], 1
    AVX2_FilterHorizontalbw_16px ymm0, ymm4, ymm5, ymm6, ymm1, ymm2
    vmovdqa         [p_res], ymm0
.finish:
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef p_pix
%undef i_pixstride
%undef p_res
%undef i_rows
%undef i_resstride


;***********************************************************************
; void McHorVer02Width9S16ToU8_avx2(const int16_t *pSrc,
;                                   uint8_t *pDst,
;                                   int32_t iDstStride,
;                                   int32_t iHeight);
;
; Vertical pass over 16-bit intermediates, writing 9 bytes per output row.
; Source rows are 32 bytes (16 x int16) apart, so every ymm load is exactly
; one row; all loads are vmovdqa, hence pSrc must be 32-byte aligned.
; The arithmetic is done by AVX2_FilterVerticalw_16px (defined outside
; this function; presumably the 6-tap filter with final rounding --
; confirm there).
;
; Row scheduling: iHeight is first reduced by one. Each loop pass writes
; five rows, stops if the counter is <= 4, otherwise writes three more,
; subtracts 8 and repeats while the counter stays positive. When the loop
; falls through, one additional row is written.
;
; Each row is written with two overlapping 8-byte stores: first at
; [pDst + 1] with column 8 shifted into the top byte, then columns 0..7
; at [pDst]. The order of the two stores must be kept.
;***********************************************************************

WELS_EXTERN McHorVer02Width9S16ToU8_avx2
%define p_src        r0
%define p_dst        r1
%define i_dststride  r2
%define i_height     r3
%define i_srcstride  32
    %assign  push_num 0
    INIT_X86_32_PIC r4
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r2, r2d
    SIGN_EXTENSION  r3, r3d
    ; Prime the window with source rows 0..4 in ymm0..ymm4.
    vmovdqa         ymm0, [p_src + 0 * i_srcstride]
    vmovdqa         ymm1, [p_src + 1 * i_srcstride]
    vmovdqa         ymm2, [p_src + 2 * i_srcstride]
    vmovdqa         ymm3, [p_src + 3 * i_srcstride]
    vmovdqa         ymm4, [p_src + 4 * i_srcstride]
    sub             i_height, 1
.height_loop:
    ; Loop invariant on entry: ymm0..ymm4 = rows 0..4 relative to p_src.
    ; The eight ymm registers rotate through the source rows; each filter
    ; result overwrites its oldest input, which is no longer needed.
    vmovdqa         ymm5, [p_src + 5 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6
    vmovdqa         ymm6, [p_src + 6 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
    vmovdqa         ymm7, [p_src + 7 * i_srcstride]
    ; Pack output rows 0 and 1: low lane = columns 0..7 of row 0 | row 1,
    ; high lane = columns 8..15 of row 0 | row 1.
    vpackuswb       ymm0, ymm0, ymm1
    vextracti128    xmm1, ymm0, 1
    ; Move column 8 (byte 0 of each qword) to byte 7, so that a store at
    ; [p_dst + 1] places it at [p_dst + 8].
    vpsllq          xmm1, xmm1, 56
    vmovlps         [p_dst + 1], xmm1
    vmovlps         [p_dst], xmm0
    add             p_dst, i_dststride
    vmovhps         [p_dst + 1], xmm1
    vmovhps         [p_dst], xmm0
    add             p_dst, i_dststride
    ; Output rows 2 and 3.
    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm0
    vmovdqa         ymm0, [p_src + 8 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm3, ymm4, ymm5, ymm6, ymm7, ymm0, ymm1
    vpackuswb       ymm2, ymm2, ymm3
    vextracti128    xmm3, ymm2, 1
    vpsllq          xmm3, xmm3, 56
    vmovlps         [p_dst + 1], xmm3
    vmovlps         [p_dst], xmm2
    add             p_dst, i_dststride
    vmovhps         [p_dst + 1], xmm3
    vmovhps         [p_dst], xmm2
    add             p_dst, i_dststride
    ; Output rows 4 and 5; row 11 is loaded ahead of the height check.
    vmovdqa         ymm1, [p_src + 9 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm2
    vmovdqa         ymm2, [p_src + 10 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm5, ymm6, ymm7, ymm0, ymm1, ymm2, ymm3
    vmovdqa         ymm3, [p_src + 11 * i_srcstride]
    vpackuswb       ymm4, ymm4, ymm5
    vextracti128    xmm5, ymm4, 1
    vpsllq          xmm5, xmm5, 56
    vmovlps         [p_dst + 1], xmm5
    vmovlps         [p_dst], xmm4
    ; Row 4 is the last row of this pass when the counter is <= 4.
    cmp             i_height, 4
    jle             .done
    add             p_dst, i_dststride
    vmovhps         [p_dst + 1], xmm5
    vmovhps         [p_dst], xmm4
    add             p_dst, i_dststride
    ; Output rows 6 and 7. p_src advances by eight rows in between, after
    ; which ymm0..ymm4 again hold rows 0..4 relative to the new p_src.
    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm4
    vmovdqa         ymm4, [p_src + 12 * i_srcstride]
    add             p_src, 8 * i_srcstride
    AVX2_FilterVerticalw_16px ymm7, ymm0, ymm1, ymm2, ymm3, ymm4, ymm5
    vpackuswb       ymm6, ymm6, ymm7
    vextracti128    xmm7, ymm6, 1
    vpsllq          xmm7, xmm7, 56
    vmovlps         [p_dst + 1], xmm7
    vmovlps         [p_dst], xmm6
    add             p_dst, i_dststride
    vmovhps         [p_dst + 1], xmm7
    vmovhps         [p_dst], xmm6
    add             p_dst, i_dststride
    sub             i_height, 8
    jg              .height_loop
    ; Loop fell through: write one more row from the current window.
    vmovdqa         ymm5, [p_src + 5 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6
    vpackuswb       ymm0, ymm0, ymm0
    vextracti128    xmm1, ymm0, 1
    vpsllq          xmm1, xmm1, 56
    vmovlps         [p_dst + 1], xmm1
    vmovlps         [p_dst], xmm0
.done:
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_height


;*******************************************************************************
; void McHorVer20Width17U8ToS16_avx2(const uint8_t *pSrc,
;                                    int32_t iSrcStride,
;                                    int16_t *pDst,
;                                    int32_t iHeight);
;
; Runs the horizontal filter macros (AVX2_FilterHorizontalbw_*, defined
; outside this function) over rows of 8-bit pixels and writes 17 16-bit
; results per row. Reading starts two rows above pSrc. Output rows are
; 64 bytes apart.
;
; Rows are produced four at a time (at least once); the code after the loop
; always writes two more rows, i.e. 4k + 2 rows in total.
;
; Columns 13..16 of each row are written first as an 8-byte store at byte
; offset 26, then the store covering columns 8..15 (or 0..15 in the tail)
; overwrites the overlap, leaving only column 16 from the narrow store. The
; store order must be kept. The loop uses 16-byte vmovdqa stores and the tail
; uses 32-byte vmovdqa stores, so pDst must be 32-byte aligned.
;*******************************************************************************

WELS_EXTERN McHorVer20Width17U8ToS16_avx2
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_height     r3
%define i_srcstride3 r4
%define i_dststride  64
    %assign  push_num 0
%ifdef X86_32
    push            r4
    %assign  push_num 1
%endif
    INIT_X86_32_PIC r5
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    ; Rewind the source pointer by two rows.
    sub             p_src, i_srcstride
    sub             p_src, i_srcstride
    lea             i_srcstride3, [3 * i_srcstride]
    ; Filter constants replicated to both lanes.
    vbroadcasti128  ymm5, [pic(shufb_32435465768798A9)]
    vbroadcasti128  ymm6, [pic(shufb_011267784556ABBC)]
    vbroadcasti128  ymm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
    ; Bias the counter so the four-row loop exits with rows left for the tail.
    sub             i_height, 3
.yloop:
    ; Rows 0 and 1: ymm0 = bytes from column -2, ymm3 = bytes from column +6,
    ; row 0 in the low lane and row 1 in the high lane.
    vmovdqu         xmm0, [p_src - 2]
    vmovdqu         xmm3, [p_src + 6]
    vinserti128     ymm0, ymm0, [p_src + i_srcstride - 2], 1
    vinserti128     ymm3, ymm3, [p_src + i_srcstride + 6], 1
    ; Unfiltered copy of the right-hand bytes, needed for column 16.
    vmovdqa         ymm4, ymm3
    AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
    ; Columns 0..7 of rows 0 and 1.
    vmovdqa         [p_dst], xmm0
    vextracti128    [p_dst + i_dststride], ymm0, 1
    AVX2_FilterHorizontalbw_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2
    ; Rows 2 and 3: ymm1 = bytes from column -2, ymm0 = bytes from column +6.
    vmovdqu         xmm1, [p_src + 2 * i_srcstride - 2]
    vmovdqu         xmm0, [p_src + 2 * i_srcstride + 6]
    vinserti128     ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
    vinserti128     ymm0, ymm0, [p_src + i_srcstride3 + 6], 1
    lea             p_src, [p_src + 4 * i_srcstride]
    ; Upper 8 source bytes of all four rows: low lane = rows 0,2;
    ; high lane = rows 1,3 (low qword, high qword).
    vpunpckhqdq     ymm4, ymm4, ymm0
    AVX2_FilterHorizontalbw_4px ymm4, [pic(dwm32768_256)], ymm2
    ; Row 0: narrow store at +26 first, then columns 8..15 at +16.
    vmovlps         [p_dst + 26], xmm4
    vmovdqa         [p_dst + 16], xmm3
    ; Row 1: its right-hand results are in the high lane of ymm4.
    vextracti128    xmm2, ymm4, 1
    vmovlps         [p_dst + i_dststride + 26], xmm2
    vextracti128    [p_dst + i_dststride + 16], ymm3, 1
    ; Row 2: narrow store is issued now, before ymm4 is reused as a macro
    ; temporary; the wider store at +16 follows after the filtering below.
    vmovhps         [p_dst + 2 * i_dststride + 26], xmm4
    AVX2_FilterHorizontalbw_16px ymm1, ymm5, ymm6, ymm7, ymm3, ymm4
    vmovdqa         [p_dst + 2 * i_dststride], xmm1
    AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm3, ymm4
    vmovdqa         [p_dst + 2 * i_dststride + 16], xmm0
    ; Row 3: columns 0..7, then the narrow store, then columns 8..15.
    vextracti128    [p_dst + 3 * i_dststride], ymm1, 1
    vmovhps         [p_dst + 3 * i_dststride + 26], xmm2
    vextracti128    [p_dst + 3 * i_dststride + 16], ymm0, 1
    add             p_dst, 4 * i_dststride
    sub             i_height, 4
    jg              .yloop
    ; Handle remaining 2 lines after 4x unrolled loop.
    ; Here each register holds one row: bytes from column -2 in the low lane
    ; and bytes from column +6 in the high lane (ymm0 = row 0, ymm3 = row 1).
    vmovdqu         xmm0, [p_src - 2]
    vinserti128     ymm0, ymm0, [p_src + 6], 1
    vmovdqu         xmm3, [p_src + i_srcstride - 2]
    vinserti128     ymm3, ymm3, [p_src + i_srcstride + 6], 1
    ; Only the high lane of the 4px result is used below (taken from the
    ; column +6 loads): low qword = row 0, high qword = row 1.
    vpunpckhqdq     ymm4, ymm0, ymm3
    AVX2_FilterHorizontalbw_4px ymm4, [pic(dwm32768_256)], ymm2
    AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
    AVX2_FilterHorizontalbw_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2
    vextracti128    xmm4, ymm4, 1
    ; Narrow store first, then the full 32-byte row overwrites the overlap.
    vmovlps         [p_dst + 26], xmm4
    vmovdqa         [p_dst], ymm0
    vmovhps         [p_dst + i_dststride + 26], xmm4
    vmovdqa         [p_dst + i_dststride], ymm3
    vzeroupper
    POP_XMM
    LOAD_4_PARA_POP
    DEINIT_X86_32_PIC
%ifdef X86_32
    pop             r4
%endif
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_height
%undef i_srcstride3


;***********************************************************************
; void McHorVer02Width16Or17S16ToU8_avx2(const int16_t *pSrc,
;                                        int32_t iSrcStride,
;                                        uint8_t *pDst,
;                                        int32_t iDstStride,
;                                        int32_t iWidth,
;                                        int32_t iHeight);
;***********************************************************************

WELS_EXTERN McHorVer02Width16Or17S16ToU8_avx2
; Filters int16 rows vertically and packs the results to uint8 with unsigned
; saturation (vpackuswb). The filter arithmetic is in the
; AVX2_FilterVerticalw_16px macro, which is defined outside this function; it
; is always passed six consecutive source rows and the result is taken from
; its first operand.
;
; Register aliases follow the prototype above. iSrcStride and iDstStride are
; used as byte offsets; iWidth is scaled by 2 when indexing the int16 source.
;
; Two passes:
;   1. Only when iWidth is odd: the single column at index iWidth - 1 is
;      computed first, 8 rows per pass plus one final row, using narrow stores.
;   2. Columns 0..15 are computed 8 rows per pass with 32-byte loads and
;      16-byte stores, plus one extra row when iHeight - 1 is a positive
;      multiple of 8.
; Pass 1 must run before pass 2: its stores also write bytes to the left of
; the last column, and pass 2 rewrites those.
;
; Alignment: pass 2 uses vmovdqa, so every source row must be 32-byte aligned
; and every destination row 16-byte aligned.
%define p_src        r0
%define i_srcstride  r1
%define p_dst        r2
%define i_dststride  r3
; Under X86_32_PICASM the width is read from its stack slot instead of r4.
; NOTE(review): presumably because INIT_X86_32_PIC_NOPRESERVE below takes over
; r4; confirm in asm_inc.asm.
%ifdef X86_32_PICASM
%define i_width      dword arg5
%else
%define i_width      r4
%endif
%define i_height     r5
%define i_srcstride3 r6
    %assign  push_num 0
%ifdef X86_32
    ; r6 serves as i_srcstride3; it is saved here and restored before ret.
    push            r6
    %assign  push_num 1
%endif
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION  r1, r1d
    SIGN_EXTENSION  r3, r3d
    SIGN_EXTENSION  r4, r4d
    SIGN_EXTENSION  r5, r5d
    INIT_X86_32_PIC_NOPRESERVE r4
    ; Both passes count with iHeight - 1 and step by 8.
    sub             i_height, 1
    lea             i_srcstride3, [3 * i_srcstride]
    ; Even width: skip straight to the 16-wide pass.
    test            i_width, 1
    jz              .align_begin
    ; Odd width: save the values the 16-wide pass will start over from.
    push            i_height
    push            p_src
    push            p_dst
    ; Keep push_num in step with the three pushes.
    ; NOTE(review): arg5 presumably resolves relative to push_num; confirm in asm_inc.asm.
    %assign push_num push_num + 3
    ; p_src = address of int16 element iWidth - 1 in the first row.
%ifdef X86_32_PICASM
    add             p_src, i_width
    add             p_src, i_width
    sub             p_src, 2
%else
    lea             p_src, [p_src + 2 * i_width - 2]
%endif
    ; p_dst = one past the last output byte of the first row, so [p_dst - 1]
    ; is the last column.
    add             p_dst, i_width
    ; Gather the column sample of rows 0..7 into words 0..7 of xmm0.
    ; p_src ends up pointing at row 7.
    vmovd           xmm0, [p_src]
    vpunpcklwd      xmm0, xmm0, [p_src + i_srcstride]
    vmovd           xmm1, [p_src + 2 * i_srcstride]
    add             p_src, i_srcstride3
    vpunpcklwd      xmm1, xmm1, [p_src]
    vpunpckldq      xmm0, xmm0, xmm1
    vmovd           xmm1, [p_src + i_srcstride]
    vpunpcklwd      xmm1, xmm1, [p_src + 2 * i_srcstride]
    vmovd           xmm2, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    vpunpcklwd      xmm2, xmm2, [p_src]
    vpunpckldq      xmm1, xmm1, xmm2
    vpunpcklqdq     xmm0, xmm0, xmm1
.height_loop_unalign:
    ; On entry xmm0 = rows 0..7 of this group and p_src points at row 7.
    ; Each vpalignr by 2 bytes drops the oldest row and appends the newly
    ; loaded one, so xmmN = rows N..N+7 for N = 1..5.
    vmovd           xmm1, [p_src + i_srcstride]
    vpalignr        xmm1, xmm1, xmm0, 2
    vmovd           xmm2, [p_src + 2 * i_srcstride]
    vpalignr        xmm2, xmm2, xmm1, 2
    vmovd           xmm3, [p_src + i_srcstride3]
    vpalignr        xmm3, xmm3, xmm2, 2
    lea             p_src, [p_src + 4 * i_srcstride]
    vmovd           xmm4, [p_src]
    vpalignr        xmm4, xmm4, xmm3, 2
    vmovd           xmm5, [p_src + i_srcstride]
    vpalignr        xmm5, xmm5, xmm4, 2
    ; Word n of the result belongs to output row n.
    AVX2_FilterVerticalw_16px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7
    ; After packing, byte n of xmm0 is the uint8 result for row n (n = 0..7).
    vpackuswb       xmm0, xmm0, xmm0
    ; Every store below ends at [p_dst - 1]. The dword shifts move the wanted
    ; byte to the top of the 4 or 8 bytes being stored; the bytes underneath
    ; land in the columns to the left (with iWidth = 17 that is at most
    ; columns 9..15, which the 16-wide pass rewrites afterwards).
    ; Shift by 24: byte 3 = row 0, byte 7 = row 4.
    vpslld          xmm6, xmm0, 24
    vmovd           [p_dst - 4], xmm6
    vmovlps         [p_dst + 4 * i_dststride - 8], xmm6
    add             p_dst, i_dststride
    ; Shift by 16: byte 3 = row 1, byte 7 = row 5.
    vpslld          xmm6, xmm0, 16
    vmovd           [p_dst - 4], xmm6
    vmovlps         [p_dst + 4 * i_dststride - 8], xmm6
    add             p_dst, i_dststride
    ; Shift by 8: byte 3 = row 2, byte 7 = row 6.
    ; Unshifted xmm0: byte 3 = row 3, byte 7 = row 7.
    vpslld          xmm6, xmm0, 8
    vmovd           [p_dst - 4], xmm6
    vmovd           [p_dst + i_dststride - 4], xmm0
    ; p_dst now moves from row 2 to row 6 for the last two stores.
    lea             p_dst, [p_dst + 4 * i_dststride]
    vmovlps         [p_dst - 8], xmm6
    vmovlps         [p_dst + i_dststride - 8], xmm0
    ; p_dst = row 8, i.e. row 0 of the next group.
    lea             p_dst, [p_dst + 2 * i_dststride]
    sub             i_height, 8
    jle             .height_loop_unalign_exit
    ; Refill for the next group: xmm5 held rows 5..12, so after appending
    ; row 13 (xmm1 = rows 6..13) and then rows 14 and 15, xmm0 = rows 8..15,
    ; which are rows 0..7 of the next group. p_src again points at its row 7.
    vmovd           xmm1, [p_src + 2 * i_srcstride]
    vpalignr        xmm1, xmm1, xmm5, 2
    vmovd           xmm0, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    vpunpcklwd      xmm0, xmm0, [p_src]
    vpalignr        xmm0, xmm0, xmm1, 4
    jmp             .height_loop_unalign
.height_loop_unalign_exit:
    ; One final row. The load is offset by -6 so that the sample at
    ; [p_src + 2 * i_srcstride] ends up in words 3 and 7 of xmm6. Only word 7
    ; of the filter result is needed: it becomes byte 7 of the 8-byte store,
    ; which lands at [p_dst - 1].
    vpbroadcastq    xmm6, [p_src + 2 * i_srcstride - 6]
    AVX2_FilterVerticalw_16px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
    vpackuswb       xmm1, xmm1, xmm1
    vmovlps         [p_dst - 8], xmm1
    ; Restore the original pointers and row counter for the 16-wide pass.
    pop             p_dst
    pop             p_src
    pop             i_height
    %assign push_num push_num - 3
.align_begin:
    ; Prime ymm0..ymm4 with rows 0..4 (16 int16 each); p_src ends at row 4.
    vmovdqa         ymm0, [p_src]
    vmovdqa         ymm1, [p_src + i_srcstride]
    vmovdqa         ymm2, [p_src + 2 * i_srcstride]
    vmovdqa         ymm3, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    vmovdqa         ymm4, [p_src]
.height_loop:
    ; On entry ymm0..ymm4 hold rows n..n+4 and p_src points at row n+4.
    ; Eight output rows are produced per pass. The row registers rotate, and
    ; the last macro operand is always a register that is dead at that point
    ; (NOTE(review): presumably the macro's scratch register).
    ; Output rows n and n+1.
    vmovdqa         ymm5, [p_src + i_srcstride]
    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6
    vmovdqa         ymm6, [p_src + 2 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
    vmovdqa         ymm7, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    ; vpackuswb works per 128-bit lane, leaving the two rows interleaved by
    ; qword. vpermq 11011000b swaps the middle qwords so that the low lane is
    ; the first row and the high lane the second.
    vpackuswb       ymm0, ymm0, ymm1
    vpermq          ymm0, ymm0, 11011000b
    vmovdqa         [p_dst], xmm0
    vextracti128    [p_dst + i_dststride], ymm0, 1
    lea             p_dst, [p_dst + 2 * i_dststride]
    ; Output rows n+2 and n+3. ymm0 is reloaded with source row n+8.
    AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm0
    vmovdqa         ymm0, [p_src]
    AVX2_FilterVerticalw_16px ymm3, ymm4, ymm5, ymm6, ymm7, ymm0, ymm1
    vpackuswb       ymm2, ymm2, ymm3
    vpermq          ymm2, ymm2, 11011000b
    vmovdqa         [p_dst], xmm2
    vextracti128    [p_dst + i_dststride], ymm2, 1
    lea             p_dst, [p_dst + 2 * i_dststride]
    ; Output rows n+4 and n+5. ymm1..ymm3 are reloaded with rows n+9..n+11.
    vmovdqa         ymm1, [p_src + i_srcstride]
    AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm2
    vmovdqa         ymm2, [p_src + 2 * i_srcstride]
    AVX2_FilterVerticalw_16px ymm5, ymm6, ymm7, ymm0, ymm1, ymm2, ymm3
    vmovdqa         ymm3, [p_src + i_srcstride3]
    lea             p_src, [p_src + 4 * i_srcstride]
    vpackuswb       ymm4, ymm4, ymm5
    vpermq          ymm4, ymm4, 11011000b
    vmovdqa        [p_dst], xmm4
    vextracti128   [p_dst + i_dststride], ymm4, 1
    lea             p_dst, [p_dst + 2 * i_dststride]
    ; Output rows n+6 and n+7. ymm4 is reloaded with row n+12, which restores
    ; the loop-entry state for n+8.
    AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm4
    vmovdqa         ymm4, [p_src]
    AVX2_FilterVerticalw_16px ymm7, ymm0, ymm1, ymm2, ymm3, ymm4, ymm5
    vpackuswb       ymm6, ymm6, ymm7
    vpermq          ymm6, ymm6, 11011000b
    vmovdqa         [p_dst], xmm6
    vextracti128    [p_dst + i_dststride], ymm6, 1
    lea             p_dst, [p_dst + 2 * i_dststride]
    ; Counter above zero: another 8 rows. Below zero: finished.
    ; Exactly zero: fall through and emit one last row.
    sub             i_height, 8
    jg              .height_loop
    jl              .done
    ; Last single row: packed with itself, so after the qword permute the low
    ; 128 bits hold all 16 output bytes.
    vmovdqa         ymm5, [p_src + i_srcstride]
    AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6
    vpackuswb       ymm0, ymm0, ymm0
    vpermq          ymm0, ymm0, 11011000b
    vmovdqa         [p_dst], xmm0
.done:
    ; Clear the upper ymm halves before returning to the caller.
    vzeroupper
    ; Undo the prologue in reverse order.
    DEINIT_X86_32_PIC
    POP_XMM
    LOAD_6_PARA_POP
%ifdef X86_32
    pop             r6
%endif
    ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height
%undef i_srcstride3

%endif ; HAVE_AVX2