ref: 0ffc5ea47d09778a50edafed29a3a664c2f4f3f3
dir: /codec/common/x86/mc_luma.asm/
;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;* mc_luma.asm
;*
;* Abstract
;*      SSE2/SSSE3 luma motion compensation (6-tap half-pel interpolation)
;*
;* History
;* 17/08/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
;*******************************************************************************
; Local Data (Read Only)
;*******************************************************************************
%ifdef X86_32_PICASM
SECTION .text align=32
%else
SECTION .rodata align=32
%endif
;*******************************************************************************
; Filter coefficients, shuffle masks and rounding constants
;*******************************************************************************
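; Naming: maddubsw_* entries are signed-byte coefficient pairs for pmaddubsw,
; shufb_* are pshufb byte-select masks, and the remaining entries are filter
; weights and rounding terms; the *_256 copies are 32-byte versions for the
; AVX2 paths.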
%ifdef HAVE_AVX2
ALIGN 32
dwm32768_256:
times 16 dw -32768
maddubsw_m2p10_m40m40_p10m2_p0p0_256:
times 4 db -2, 10, -40, -40, 10, -2, 0, 0
dwm1024_256:
times 16 dw -1024
dd32768_256:
times 8 dd 32768
maddubsw_p1m5_256:
times 16 db 1, -5
maddubsw_m5p1_256:
times 16 db -5, 1
db20_256:
times 32 db 20
maddubsw_m5p20_256:
times 16 db -5, 20
maddubsw_p20m5_256:
times 16 db 20, -5
h264_w0x10_256:
times 16 dw 16
dw32_256:
times 16 dw 32
%endif ; HAVE_AVX2
ALIGN 16
shufb_32435465768798A9:
db 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9
shufb_011267784556ABBC:
db 0, 1, 1, 2, 6, 7, 7, 8, 4, 5, 5, 6, 0Ah, 0Bh, 0Bh, 0Ch
maddubsw_p1m5_p1m5_m5p1_m5p1_128:
times 2 db 1, -5, 1, -5, -5, 1, -5, 1
maddubsw_m2p10_m40m40_p10m2_p0p0_128:
times 2 db -2, 10, -40, -40, 10, -2, 0, 0
dwm1024_128:
times 8 dw -1024
dd32768_128:
times 4 dd 32768
maddubsw_p1m5_128:
times 8 db 1, -5
maddubsw_m5p1_128:
times 8 db -5, 1
db20_128:
times 16 db 20
maddubsw_m5p20_128:
times 8 db -5, 20
maddubsw_p20m5_128:
times 8 db 20, -5
h264_w0x10_1:
dw 16, 16, 16, 16, 16, 16, 16, 16
ALIGN 16
h264_mc_hc_32:
dw 32, 32, 32, 32, 32, 32, 32, 32
;*******************************************************************************
; Code
;*******************************************************************************
SECTION .text
%ifdef X86_32_PICASM
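; MOVEIMM_DW16: materialize the rounding constant 16 in every word lane
; (pcmpeqw -> 0xFFFF, psrlw 15 -> 1, psllw 4 -> 16); used by PIC builds in
; routines that do not set up a pic() base pointer, so no rodata load is needed.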
%macro MOVEIMM_DW16 1
pcmpeqw %1, %1
psrlw %1, 15
psllw %1, 4
%endmacro
%endif
;*******************************************************************************
; void McHorVer20WidthEq4_mmx( const uint8_t *pSrc,
; int iSrcStride,
; uint8_t *pDst,
; int iDstStride,
; int iHeight)
;*******************************************************************************
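; Note: with pSrc rebased to p[-2], each iteration below applies the H.264
; 6-tap half-pel filter [1, -5, 20, 20, -5, 1]:
;   t   = (p[-2]+p[3]) - 5*(p[-1]+p[2]) + 20*(p[0]+p[1])
;   dst = clip255((t + 16) >> 5)
; The -5/20 weights come from two psllw-by-2 stages: with b = p[-1]+p[2] and
; c = p[0]+p[1], t = (p[-2]+p[3]) + (4c - b) + 4*(4c - b). The WidthEq8/Eq16
; SSE2 loops further down use the same arithmetic on 8 pixels at a time.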
WELS_EXTERN McHorVer20WidthEq4_mmx
%assign push_num 0
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
sub r0, 2
WELS_Zero mm7
%ifdef X86_32_PICASM
MOVEIMM_DW16 mm6
%else
movq mm6, [h264_w0x10_1]
%endif
.height_loop:
movd mm0, [r0]
punpcklbw mm0, mm7
movd mm1, [r0+5]
punpcklbw mm1, mm7
movd mm2, [r0+1]
punpcklbw mm2, mm7
movd mm3, [r0+4]
punpcklbw mm3, mm7
movd mm4, [r0+2]
punpcklbw mm4, mm7
movd mm5, [r0+3]
punpcklbw mm5, mm7
paddw mm2, mm3
paddw mm4, mm5
psllw mm4, 2
psubw mm4, mm2
paddw mm0, mm1
paddw mm0, mm4
psllw mm4, 2
paddw mm0, mm4
paddw mm0, mm6
psraw mm0, 5
packuswb mm0, mm7
movd [r2], mm0
add r0, r1
add r2, r3
dec r4
jnz .height_loop
WELSEMMS
LOAD_5_PARA_POP
ret
;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************
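; SSE_LOAD_8P dst, zero, mem: load 8 source bytes and zero-extend them to
; 8 words (the %2 register must already be zero).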
%macro SSE_LOAD_8P 3
movq %1, %3
punpcklbw %1, %2
%endmacro
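; FILTER_HV_W8/W4 a, b, c, d, e, f, t0, t1, dst: vertical 6-tap filter over six
; rows of zero-extended words: ((a+f) - 5*(b+e) + 20*(c+d) + 16) >> 5, packed
; to bytes and stored as 8 pixels (W8, movq) or 4 pixels (W4, movd).
; Clobbers %1, %7 and %8.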
%macro FILTER_HV_W8 9
paddw %1, %6
paddw %1, [pic(h264_w0x10_1)]
movdqa %8, %3
movdqa %7, %2
paddw %8, %4
paddw %7, %5
psllw %8, 2
psubw %8, %7
paddw %1, %8
psllw %8, 2
paddw %1, %8
psraw %1, 5
WELS_Zero %8
packuswb %1, %8
movq %9, %1
%endmacro
%macro FILTER_HV_W4 9
paddw %1, %6
paddw %1, [pic(h264_w0x10_1)]
movdqa %8, %3
movdqa %7, %2
paddw %8, %4
paddw %7, %5
psllw %8, 2
psubw %8, %7
paddw %1, %8
psllw %8, 2
paddw %1, %8
psraw %1, 5
WELS_Zero %8
packuswb %1, %8
movd %9, %1
%endmacro
;*******************************************************************************
; Code
;*******************************************************************************
SECTION .text
;***********************************************************************
; void McHorVer22Width8HorFirst_sse2(const uint8_t *pSrc,
;                                    int32_t iSrcStride,
;                                    int16_t *pDst,
;                                    int32_t iDstStride,
;                                    int32_t iHeight
;                                    )
;***********************************************************************
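; Horizontal pass of the centre (2,2) case: each iteration stores the unrounded
; 16-bit 6-tap sums for one 8-pixel row, so the vertical pass (FILTER_VER
; below) can filter them at full precision before its final rounding.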
WELS_EXTERN McHorVer22Width8HorFirst_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
pxor xmm7, xmm7
sub r0, r1              ; back up 2 rows: the vertical 6-tap pass needs 5 extra filtered rows (2 above, 3 below)
sub r0, r1
.yloop_width_8:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
psubw xmm4, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
movdqa [r2], xmm0
add r0, r1
add r2, r3
dec r4
jnz .yloop_width_8
POP_XMM
LOAD_5_PARA_POP
ret
;*******************************************************************************
; void McHorVer20WidthEq8_sse2( const uint8_t *pSrc,
; int iSrcStride,
; uint8_t *pDst,
; int iDstStride,
; int iHeight
; );
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq8_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
lea r0, [r0-2] ;pSrc -= 2;
pxor xmm7, xmm7
%ifdef X86_32_PICASM
MOVEIMM_DW16 xmm6
%else
movdqa xmm6, [h264_w0x10_1]
%endif
.y_loop:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
psubw xmm4, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
paddw xmm0, xmm6
psraw xmm0, 5
packuswb xmm0, xmm7
movq [r2], xmm0
lea r2, [r2+r3]
lea r0, [r0+r1]
dec r4
jnz near .y_loop
POP_XMM
LOAD_5_PARA_POP
ret
;*******************************************************************************
; void McHorVer20WidthEq16_sse2( const uint8_t *pSrc,
; int iSrcStride,
; uint8_t *pDst,
; int iDstStride,
; int iHeight
; );
;*******************************************************************************
WELS_EXTERN McHorVer20WidthEq16_sse2
%assign push_num 0
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
lea r0, [r0-2] ;pSrc -= 2;
pxor xmm7, xmm7
%ifdef X86_32_PICASM
MOVEIMM_DW16 xmm6
%else
movdqa xmm6, [h264_w0x10_1]
%endif
.y_loop:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
psubw xmm4, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
paddw xmm0, xmm6
psraw xmm0, 5
packuswb xmm0, xmm7
movq [r2], xmm0
movq xmm0, [r0+8]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5+8]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1+8]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4+8]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2+8]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3+8]
punpcklbw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
psubw xmm4, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
paddw xmm0, xmm6
psraw xmm0, 5
packuswb xmm0, xmm7
movq [r2+8], xmm0
lea r2, [r2+r3]
lea r0, [r0+r1]
dec r4
jnz near .y_loop
POP_XMM
LOAD_5_PARA_POP
ret
;*******************************************************************************
; void McHorVer02WidthEq8_sse2( const uint8_t *pSrc,
; int iSrcStride,
; uint8_t *pDst,
; int iDstStride,
; int iHeight )
;*******************************************************************************
WELS_EXTERN McHorVer02WidthEq8_sse2
%assign push_num 0
INIT_X86_32_PIC r5
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
sub r0, r1
sub r0, r1
WELS_Zero xmm7
SSE_LOAD_8P xmm0, xmm7, [r0]
SSE_LOAD_8P xmm1, xmm7, [r0+r1]
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm2, xmm7, [r0]
SSE_LOAD_8P xmm3, xmm7, [r0+r1]
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm4, xmm7, [r0]
SSE_LOAD_8P xmm5, xmm7, [r0+r1]
.start:
FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r4
jz near .xx_exit
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm6, xmm7, [r0]
FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
dec r4
jz near .xx_exit
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm7, xmm0, [r0+r1]
FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
dec r4
jz near .xx_exit
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm0, xmm1, [r0]
FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
dec r4
jz near .xx_exit
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm1, xmm2, [r0+r1]
FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
dec r4
jz near .xx_exit
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm2, xmm3, [r0]
FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
dec r4
jz near .xx_exit
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm3, xmm4, [r0+r1]
FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
dec r4
jz near .xx_exit
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm4, xmm5, [r0]
FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
dec r4
jz near .xx_exit
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm5, xmm6, [r0+r1]
jmp near .start
.xx_exit:
POP_XMM
LOAD_5_PARA_POP
DEINIT_X86_32_PIC
ret
;***********************************************************************
; Code
;***********************************************************************
SECTION .text
;***********************************************************************
; void McHorVer02Height9Or17_sse2( const uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight )
;***********************************************************************
WELS_EXTERN McHorVer02Height9Or17_sse2
%assign push_num 0
INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
%ifndef X86_32
push r12
push r13
push r14
mov r12, r0
mov r13, r2
mov r14, r5
%endif
shr r4, 3
sub r0, r1
sub r0, r1
.xloop:
WELS_Zero xmm7
SSE_LOAD_8P xmm0, xmm7, [r0]
SSE_LOAD_8P xmm1, xmm7, [r0+r1]
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm2, xmm7, [r0]
SSE_LOAD_8P xmm3, xmm7, [r0+r1]
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm4, xmm7, [r0]
SSE_LOAD_8P xmm5, xmm7, [r0+r1]
FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm6, xmm7, [r0]
movdqa xmm0,xmm1
movdqa xmm1,xmm2
movdqa xmm2,xmm3
movdqa xmm3,xmm4
movdqa xmm4,xmm5
movdqa xmm5,xmm6
add r2, r3
sub r0, r1
.start:
FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm6, xmm7, [r0]
FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm7, xmm0, [r0+r1]
FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm0, xmm1, [r0]
FILTER_HV_W8 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm1, xmm2, [r0+r1]
FILTER_HV_W8 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm2, xmm3, [r0]
FILTER_HV_W8 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm3, xmm4, [r0+r1]
FILTER_HV_W8 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm4, xmm5, [r0]
FILTER_HV_W8 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm5, xmm6, [r0+r1]
jmp near .start
.x_loop_dec:
dec r4
jz near .xx_exit
%ifdef X86_32
mov r0, arg1
mov r2, arg3
mov r5, arg6
%else
mov r0, r12
mov r2, r13
mov r5, r14
%endif
sub r0, r1
sub r0, r1
add r0, 8
add r2, 8
jmp near .xloop
.xx_exit:
%ifndef X86_32
pop r14
pop r13
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC
ret
;***********************************************************************
; void McHorVer02Height5_sse2( const uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight )
;***********************************************************************
WELS_EXTERN McHorVer02Height5_sse2
%assign push_num 0
INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
%ifndef X86_32
push r12
push r13
push r14
mov r12, r0
mov r13, r2
mov r14, r5
%endif
shr r4, 2
sub r0, r1
sub r0, r1
.xloop:
WELS_Zero xmm7
SSE_LOAD_8P xmm0, xmm7, [r0]
SSE_LOAD_8P xmm1, xmm7, [r0+r1]
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm2, xmm7, [r0]
SSE_LOAD_8P xmm3, xmm7, [r0+r1]
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm4, xmm7, [r0]
SSE_LOAD_8P xmm5, xmm7, [r0+r1]
FILTER_HV_W4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm6, xmm7, [r0]
movdqa xmm0,xmm1
movdqa xmm1,xmm2
movdqa xmm2,xmm3
movdqa xmm3,xmm4
movdqa xmm4,xmm5
movdqa xmm5,xmm6
add r2, r3
sub r0, r1
.start:
FILTER_HV_W4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm6, xmm7, [r0]
FILTER_HV_W4 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm7, xmm0, [r0+r1]
FILTER_HV_W4 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm0, xmm1, [r0]
FILTER_HV_W4 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm1, xmm2, [r0+r1]
FILTER_HV_W4 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm2, xmm3, [r0]
FILTER_HV_W4 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm3, xmm4, [r0+r1]
FILTER_HV_W4 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
SSE_LOAD_8P xmm4, xmm5, [r0]
FILTER_HV_W4 xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
SSE_LOAD_8P xmm5, xmm6, [r0+r1]
jmp near .start
.x_loop_dec:
dec r4
jz near .xx_exit
%ifdef X86_32
mov r0, arg1
mov r2, arg3
mov r5, arg6
%else
mov r0, r12
mov r2, r13
mov r5, r14
%endif
sub r0, r1
sub r0, r1
add r0, 4
add r2, 4
jmp near .xloop
.xx_exit:
%ifndef X86_32
pop r14
pop r13
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC
ret
;***********************************************************************
; void McHorVer20Width9Or17_sse2( const uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight
; );
;***********************************************************************
WELS_EXTERN McHorVer20Width9Or17_sse2
%assign push_num 0
INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
sub r0, 2
pxor xmm7, xmm7
cmp r4, 9
jne near .width_17
.yloop_width_9:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
paddw xmm6, xmm5
psllw xmm6, 2
psubw xmm6, xmm7
paddw xmm0, xmm1
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
paddw xmm0, [pic(h264_w0x10_1)]
psraw xmm0, 5
packuswb xmm0, xmm0
movd [r2], xmm0
pxor xmm7, xmm7
movq xmm0, [r0+6]
punpcklbw xmm0, xmm7
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
psubw xmm5, xmm4
paddw xmm2, xmm0
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
paddw xmm2, [pic(h264_w0x10_1)]
psraw xmm2, 5
packuswb xmm2, xmm2
movq [r2+1], xmm2
add r0, r1
add r2, r3
dec r5
jnz .yloop_width_9
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC_KEEPDEF
ret
.width_17:
.yloop_width_17:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
psubw xmm4, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
paddw xmm0, [pic(h264_w0x10_1)]
psraw xmm0, 5
packuswb xmm0, xmm0
movq [r2], xmm0
movq xmm0, [r0+8]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5+8]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1+8]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4+8]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2+8]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3+8]
punpcklbw xmm5, xmm7
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
paddw xmm6, xmm5
psllw xmm6, 2
psubw xmm6, xmm7
paddw xmm0, xmm1
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
paddw xmm0, [pic(h264_w0x10_1)]
psraw xmm0, 5
packuswb xmm0, xmm0
movd [r2+8], xmm0
pxor xmm7, xmm7
movq xmm0, [r0+6+8]
punpcklbw xmm0, xmm7
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
psubw xmm5, xmm4
paddw xmm2, xmm0
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
paddw xmm2, [pic(h264_w0x10_1)]
psraw xmm2, 5
packuswb xmm2, xmm2
movq [r2+9], xmm2
add r0, r1
add r2, r3
dec r5
jnz .yloop_width_17
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC
ret
;***********************************************************************
; void McHorVer20Width5_sse2( const uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight
; );
;***********************************************************************
WELS_EXTERN McHorVer20Width5_sse2
%assign push_num 0
INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
sub r0, 2
pxor xmm7, xmm7
.yloop_width_5:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
paddw xmm6, xmm5
psllw xmm6, 2
psubw xmm6, xmm7
paddw xmm0, xmm1
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
paddw xmm0, [pic(h264_w0x10_1)]
psraw xmm0, 5
packuswb xmm0, xmm0
movd [r2], xmm0
pxor xmm7, xmm7
movq xmm0, [r0+6]
punpcklbw xmm0, xmm7
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
psubw xmm5, xmm4
paddw xmm2, xmm0
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
paddw xmm2, [pic(h264_w0x10_1)]
psraw xmm2, 5
packuswb xmm2, xmm2
movd [r2+1], xmm2
add r0, r1
add r2, r3
dec r5
jnz .yloop_width_5
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC
ret
;***********************************************************************
;void McHorVer22HorFirst_sse2
; (const uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t * pTap,
; int32_t iTapStride,
; int32_t iWidth, int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer22HorFirst_sse2
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
pxor xmm7, xmm7
sub r0, r1              ; back up 2 rows: the vertical 6-tap pass needs 5 extra filtered rows (2 above, 3 below)
sub r0, r1
cmp r4, 9
jne near .width_17
.yloop_width_9:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
paddw xmm6, xmm5
psllw xmm6, 2
psubw xmm6, xmm7
paddw xmm0, xmm1
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
movd [r2], xmm0
pxor xmm7, xmm7
movq xmm0, [r0+6]
punpcklbw xmm0, xmm7
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
psubw xmm5, xmm4
paddw xmm2, xmm0
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
movq [r2+2], xmm2
movhps [r2+2+8], xmm2
add r0, r1
add r2, r3
dec r5
jnz .yloop_width_9
POP_XMM
LOAD_6_PARA_POP
ret
.width_17:
.yloop_width_17:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
paddw xmm2, xmm3
paddw xmm4, xmm5
psllw xmm4, 2
psubw xmm4, xmm2
paddw xmm0, xmm1
paddw xmm0, xmm4
psllw xmm4, 2
paddw xmm0, xmm4
movdqa [r2], xmm0
movq xmm0, [r0+8]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5+8]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1+8]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4+8]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2+8]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3+8]
punpcklbw xmm5, xmm7
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
paddw xmm6, xmm5
psllw xmm6, 2
psubw xmm6, xmm7
paddw xmm0, xmm1
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
movd [r2+16], xmm0
pxor xmm7, xmm7
movq xmm0, [r0+6+8]
punpcklbw xmm0, xmm7
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
psubw xmm5, xmm4
paddw xmm2, xmm0
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
movq [r2+18], xmm2
movhps [r2+18+8], xmm2
add r0, r1
add r2, r3
dec r5
jnz .yloop_width_17
POP_XMM
LOAD_6_PARA_POP
ret
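; FILTER_VER a, b, c, d, e, f, t0, t1, dst: vertical 6-tap pass over 16-bit
; horizontal sums. With A = a+f, B = b+e, C = c+d it uses staged shifts,
;   ((((A - B) >> 2) - B + C) >> 2) + C  ~=  (A - 5*B + 20*C) / 16,
; to stay within 16-bit range, then rounds with (+32) >> 6 (roughly
; (A - 5*B + 20*C + 512) >> 10 overall), packs and stores 8 bytes.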
%macro FILTER_VER 9
paddw %1, %6
movdqa %7, %2
movdqa %8, %3
paddw %7, %5
paddw %8, %4
psubw %1, %7
psraw %1, 2
paddw %1, %8
psubw %1, %7
psraw %1, 2
paddw %8, %1
paddw %8, [pic(h264_mc_hc_32)]
psraw %8, 6
packuswb %8, %8
movq %9, %8
%endmacro
;***********************************************************************
;void McHorVer22Width8VerLastAlign_sse2(
; const uint8_t *pTap,
; int32_t iTapStride,
; uint8_t * pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer22Width8VerLastAlign_sse2
%assign push_num 0
INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
%ifndef X86_32
push r12
push r13
push r14
mov r12, r0
mov r13, r2
mov r14, r5
%endif
shr r4, 3
.width_loop:
movdqa xmm0, [r0]
movdqa xmm1, [r0+r1]
lea r0, [r0+2*r1]
movdqa xmm2, [r0]
movdqa xmm3, [r0+r1]
lea r0, [r0+2*r1]
movdqa xmm4, [r0]
movdqa xmm5, [r0+r1]
FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
lea r0, [r0+2*r1]
movdqa xmm6, [r0]
movdqa xmm0, xmm1
movdqa xmm1, xmm2
movdqa xmm2, xmm3
movdqa xmm3, xmm4
movdqa xmm4, xmm5
movdqa xmm5, xmm6
add r2, r3
sub r0, r1
.start:
FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqa xmm6, [r0]
FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqa xmm7, [r0+r1]
FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqa xmm0, [r0]
FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqa xmm1, [r0+r1]
FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqa xmm2, [r0]
FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqa xmm3, [r0+r1]
FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqa xmm4, [r0]
FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqa xmm5, [r0+r1]
jmp near .start
.x_loop_dec:
dec r4
jz near .exit
%ifdef X86_32
mov r0, arg1
mov r2, arg3
mov r5, arg6
%else
mov r0, r12
mov r2, r13
mov r5, r14
%endif
add r0, 16
add r2, 8
jmp .width_loop
.exit:
%ifndef X86_32
pop r14
pop r13
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC
ret
;***********************************************************************
;void McHorVer22Width8VerLastUnAlign_sse2(
; const uint8_t *pTap,
; int32_t iTapStride,
; uint8_t * pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer22Width8VerLastUnAlign_sse2
%assign push_num 0
INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
%ifndef X86_32
push r12
push r13
push r14
mov r12, r0
mov r13, r2
mov r14, r5
%endif
shr r4, 3
.width_loop:
movdqu xmm0, [r0]
movdqu xmm1, [r0+r1]
lea r0, [r0+2*r1]
movdqu xmm2, [r0]
movdqu xmm3, [r0+r1]
lea r0, [r0+2*r1]
movdqu xmm4, [r0]
movdqu xmm5, [r0+r1]
FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
lea r0, [r0+2*r1]
movdqu xmm6, [r0]
movdqa xmm0, xmm1
movdqa xmm1, xmm2
movdqa xmm2, xmm3
movdqa xmm3, xmm4
movdqa xmm4, xmm5
movdqa xmm5, xmm6
add r2, r3
sub r0, r1
.start:
FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqu xmm6, [r0]
FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqu xmm7, [r0+r1]
FILTER_VER xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqu xmm0, [r0]
FILTER_VER xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqu xmm1, [r0+r1]
FILTER_VER xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqu xmm2, [r0]
FILTER_VER xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqu xmm3, [r0+r1]
FILTER_VER xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqu xmm4, [r0]
FILTER_VER xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqu xmm5, [r0+r1]
jmp near .start
.x_loop_dec:
dec r4
jz near .exit
%ifdef X86_32
mov r0, arg1
mov r2, arg3
mov r5, arg6
%else
mov r0, r12
mov r2, r13
mov r5, r14
%endif
add r0, 16
add r2, 8
jmp .width_loop
.exit:
%ifndef X86_32
pop r14
pop r13
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC
ret
;***********************************************************************
;void McHorVer22Width5HorFirst_sse2
; (const uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t * pTap,
; int32_t iTapStride,
; int32_t iWidth, int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer22Width5HorFirst_sse2
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
pxor xmm7, xmm7
sub r0, r1              ; back up 2 rows: the vertical 6-tap pass needs 5 extra filtered rows (2 above, 3 below)
sub r0, r1
.yloop_width_5:
movq xmm0, [r0]
punpcklbw xmm0, xmm7
movq xmm1, [r0+5]
punpcklbw xmm1, xmm7
movq xmm2, [r0+1]
punpcklbw xmm2, xmm7
movq xmm3, [r0+4]
punpcklbw xmm3, xmm7
movq xmm4, [r0+2]
punpcklbw xmm4, xmm7
movq xmm5, [r0+3]
punpcklbw xmm5, xmm7
movdqa xmm7, xmm2
paddw xmm7, xmm3
movdqa xmm6, xmm4
paddw xmm6, xmm5
psllw xmm6, 2
psubw xmm6, xmm7
paddw xmm0, xmm1
paddw xmm0, xmm6
psllw xmm6, 2
paddw xmm0, xmm6
movd [r2], xmm0
pxor xmm7, xmm7
movq xmm0, [r0+6]
punpcklbw xmm0, xmm7
paddw xmm4, xmm1
paddw xmm5, xmm3
psllw xmm5, 2
psubw xmm5, xmm4
paddw xmm2, xmm0
paddw xmm2, xmm5
psllw xmm5, 2
paddw xmm2, xmm5
movq [r2+2], xmm2
movhps [r2+2+8], xmm2
add r0, r1
add r2, r3
dec r5
jnz .yloop_width_5
POP_XMM
LOAD_6_PARA_POP
ret
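; FILTER_VER_4: same staged-shift vertical pass as FILTER_VER, but only the
; low 4 output bytes are stored (movd).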
%macro FILTER_VER_4 9
paddw %1, %6
movdqa %7, %2
movdqa %8, %3
paddw %7, %5
paddw %8, %4
psubw %1, %7
psraw %1, 2
paddw %1, %8
psubw %1, %7
psraw %1, 2
paddw %8, %1
paddw %8, [pic(h264_mc_hc_32)]
psraw %8, 6
packuswb %8, %8
movd %9, %8
%endmacro
;***********************************************************************
;void McHorVer22Width4VerLastAlign_sse2(
; const uint8_t *pTap,
; int32_t iTapStride,
; uint8_t * pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer22Width4VerLastAlign_sse2
%assign push_num 0
INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
%ifndef X86_32
push r12
push r13
push r14
mov r12, r0
mov r13, r2
mov r14, r5
%endif
shr r4, 2
.width_loop:
movdqa xmm0, [r0]
movdqa xmm1, [r0+r1]
lea r0, [r0+2*r1]
movdqa xmm2, [r0]
movdqa xmm3, [r0+r1]
lea r0, [r0+2*r1]
movdqa xmm4, [r0]
movdqa xmm5, [r0+r1]
FILTER_VER_4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
lea r0, [r0+2*r1]
movdqa xmm6, [r0]
movdqa xmm0, xmm1
movdqa xmm1, xmm2
movdqa xmm2, xmm3
movdqa xmm3, xmm4
movdqa xmm4, xmm5
movdqa xmm5, xmm6
add r2, r3
sub r0, r1
.start:
FILTER_VER_4 xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqa xmm6, [r0]
FILTER_VER_4 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqa xmm7, [r0+r1]
FILTER_VER_4 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqa xmm0, [r0]
FILTER_VER_4 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqa xmm1, [r0+r1]
FILTER_VER_4 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqa xmm2, [r0]
FILTER_VER_4 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqa xmm3, [r0+r1]
FILTER_VER_4 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqa xmm4, [r0]
FILTER_VER_4 xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqa xmm5, [r0+r1]
jmp near .start
.x_loop_dec:
dec r4
jz near .exit
%ifdef X86_32
mov r0, arg1
mov r2, arg3
mov r5, arg6
%else
mov r0, r12
mov r2, r13
mov r5, r14
%endif
add r0, 8
add r2, 4
jmp .width_loop
.exit:
%ifndef X86_32
pop r14
pop r13
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC
ret
;***********************************************************************
;void McHorVer22Width4VerLastUnAlign_sse2(
; const uint8_t *pTap,
; int32_t iTapStride,
; uint8_t * pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer22Width4VerLastUnAlign_sse2
%assign push_num 0
INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
%ifndef X86_32
push r12
push r13
push r14
mov r12, r0
mov r13, r2
mov r14, r5
%endif
shr r4, 2
.width_loop:
movdqu xmm0, [r0]
movdqu xmm1, [r0+r1]
lea r0, [r0+2*r1]
movdqu xmm2, [r0]
movdqu xmm3, [r0+r1]
lea r0, [r0+2*r1]
movdqu xmm4, [r0]
movdqu xmm5, [r0+r1]
FILTER_VER_4 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
lea r0, [r0+2*r1]
movdqu xmm6, [r0]
movdqa xmm0, xmm1
movdqa xmm1, xmm2
movdqa xmm2, xmm3
movdqa xmm3, xmm4
movdqa xmm4, xmm5
movdqa xmm5, xmm6
add r2, r3
sub r0, r1
.start:
FILTER_VER_4 xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqu xmm6, [r0]
FILTER_VER_4 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqu xmm7, [r0+r1]
FILTER_VER_4 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqu xmm0, [r0]
FILTER_VER_4 xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqu xmm1, [r0+r1]
FILTER_VER_4 xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqu xmm2, [r0]
FILTER_VER_4 xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqu xmm3, [r0+r1]
FILTER_VER_4 xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[r2]
dec r5
jz near .x_loop_dec
lea r0, [r0+2*r1]
movdqu xmm4, [r0]
FILTER_VER_4 xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [r2+r3]
dec r5
jz near .x_loop_dec
lea r2, [r2+2*r3]
movdqu xmm5, [r0+r1]
jmp near .start
.x_loop_dec:
dec r4
jz near .exit
%ifdef X86_32
mov r0, arg1
mov r2, arg3
mov r5, arg6
%else
mov r0, r12
mov r2, r13
mov r5, r14
%endif
add r0, 8
add r2, 4
jmp .width_loop
.exit:
%ifndef X86_32
pop r14
pop r13
pop r12
%endif
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC
ret
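; SSSE3_FilterVertical_8px: 8 vertical half-pel outputs from three registers
; that each hold two source rows interleaved byte-wise; pmaddubsw with the
; pair weights (1,-5), (20,20), (-5,1) forms the 6-tap sum, then (+16) >> 5.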
; px_ab=%1 px_cd=%2 px_ef=%3 maddubsw_ab=%4 maddubsw_cd=%5 maddubsw_ef=%6 tmp=%7
%macro SSSE3_FilterVertical_8px 7
pmaddubsw %1, %4
movdqa %7, %2
pmaddubsw %7, %5
paddw %1, %7
movdqa %7, %3
pmaddubsw %7, %6
paddw %1, %7
paddw %1, [pic(h264_w0x10_1)]
psraw %1, 5
%endmacro
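; SSSE3_FilterVertical2_8px: variant for when rows a and f are held separately;
; they are zero-extended and added directly, while the bc/de row pairs go
; through pmaddubsw with (-5,20) and (20,-5), then (+16) >> 5.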
; px_a=%1 px_f=%2 px_bc=%3 px_de=%4 maddubsw_bc=%5 maddubsw_de=%6 tmp=%7,%8
%macro SSSE3_FilterVertical2_8px 8
movdqa %8, %2
pxor %7, %7
punpcklbw %1, %7
punpcklbw %8, %7
paddw %1, %8
movdqa %7, %3
pmaddubsw %7, %5
paddw %1, %7
movdqa %7, %4
pmaddubsw %7, %6
paddw %1, %7
paddw %1, [pic(h264_w0x10_1)]
psraw %1, 5
%endmacro
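; SSSE3_FilterHorizontalbw_8px: 8 horizontal 6-tap sums as 16-bit words (no
; rounding/shift). pshufb splits the 16 source bytes (starting at p[-2]) into
; centre pairs and outer pairs so three pmaddubsw ops with weights (20,20) and
; (1,-5)/(-5,1) cover all six taps; the non-bw wrapper adds 16 and shifts by 5.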
; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
%macro SSSE3_FilterHorizontalbw_8px 6
movdqa %5, %1
pshufb %1, %2
pshufb %5, %3
pshufd %6, %1, 10110001b
pmaddubsw %1, [pic(db20_128)]
pmaddubsw %5, %4
pmaddubsw %6, %4
paddw %1, %5
paddw %1, %6
%endmacro
; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
%macro SSSE3_FilterHorizontal_8px 6
SSSE3_FilterHorizontalbw_8px %1, %2, %3, %4, %5, %6
paddw %1, [pic(h264_w0x10_1)]
psraw %1, 5
%endmacro
; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
%macro SSSE3_FilterHorizontalbw_2x4px 7
movdqa %6, %1
movdqa %7, %2
pshufb %1, %3
pshufb %2, %3
punpcklqdq %1, %2
pshufb %6, %4
pshufb %7, %4
punpcklqdq %6, %7
pshufd %7, %1, 10110001b
pmaddubsw %1, [pic(db20_128)]
pmaddubsw %6, %5
pmaddubsw %7, %5
paddw %1, %6
paddw %1, %7
%endmacro
; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
%macro SSSE3_FilterHorizontal_2x4px 7
SSSE3_FilterHorizontalbw_2x4px %1, %2, %3, %4, %5, %6, %7
paddw %1, [pic(h264_w0x10_1)]
psraw %1, 5
%endmacro
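; SSSE3_FilterHorizontalbw_2px: edge helper for the 9/17-wide paths; it applies
; a -2x copy of the 6-tap kernel via pmaddubsw and a pmaddwd/pshufd/paddd
; horizontal add so that, after scaling by %2, the two extra column results
; land in the upper half of each dword.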
; pixels=%1 -32768>>scale=%2 tmp=%3
%macro SSSE3_FilterHorizontalbw_2px 3
pmaddubsw %1, [pic(maddubsw_m2p10_m40m40_p10m2_p0p0_128)]
pmaddwd %1, %2
pshufd %3, %1, 10110001b
paddd %1, %3
%endmacro
; pixels=%1 tmp=%2
%macro SSSE3_FilterHorizontal_2px 2
SSSE3_FilterHorizontalbw_2px %1, [pic(dwm1024_128)], %2
paddd %1, [pic(dd32768_128)]
%endmacro
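; SSE2_FilterVerticalw_8px: vertical 6-tap pass over 16-bit horizontal sums for
; the centre (2,2) samples; same staged-shift scheme as FILTER_VER
; (~ (A - 5*B + 20*C + 512) >> 10), but the result is left as words for the
; caller to pack.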
; px0=%1 px1=%2 px2=%3 px3=%4 px4=%5 px5=%6 tmp=%7
%macro SSE2_FilterVerticalw_8px 7
paddw %1, %6
movdqa %7, %2
paddw %7, %5
psubw %1, %7
psraw %1, 2
psubw %1, %7
movdqa %7, %3
paddw %7, %4
paddw %1, %7
psraw %1, 2
paddw %7, [pic(h264_mc_hc_32)]
paddw %1, %7
psraw %1, 6
%endmacro
;***********************************************************************
; void McHorVer02_ssse3(const uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight)
;***********************************************************************
WELS_EXTERN McHorVer02_ssse3
%define p_src r0
%define i_srcstride r1
%define p_dst r2
%define i_dststride r3
%ifdef X86_32_PICASM
%define i_width dword arg5
%else
%define i_width r4
%endif
%define i_height r5
%define i_srcstride3 r6
%assign push_num 0
%ifdef X86_32
push r6
%assign push_num 1
%endif
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
INIT_X86_32_PIC_NOPRESERVE r4
sub p_src, i_srcstride
sub p_src, i_srcstride
lea i_srcstride3, [3 * i_srcstride]
%assign push_num_begin push_num
cmp i_width, 4
jg .width8or16
movd xmm0, [p_src]
movd xmm4, [p_src + i_srcstride]
punpcklbw xmm0, xmm4
movd xmm1, [p_src + 2 * i_srcstride]
punpcklbw xmm4, xmm1
punpcklqdq xmm0, xmm4
movd xmm4, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
punpcklbw xmm1, xmm4
movd xmm2, [p_src]
punpcklbw xmm4, xmm2
punpcklqdq xmm1, xmm4
movd xmm4, [p_src + i_srcstride]
lea p_src, [p_src + 2 * i_srcstride]
punpcklbw xmm2, xmm4
movd xmm3, [p_src]
punpcklbw xmm4, xmm3
punpcklqdq xmm2, xmm4
movdqa xmm5, [pic(db20_128)]
SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm0, xmm0
movd [p_dst], xmm0
psrlq xmm0, 32
movd [p_dst + i_dststride], xmm0
lea p_dst, [p_dst + 2 * i_dststride]
movd xmm4, [p_src + i_srcstride]
punpcklbw xmm3, xmm4
movd xmm0, [p_src + 2 * i_srcstride]
punpcklbw xmm4, xmm0
punpcklqdq xmm3, xmm4
SSSE3_FilterVertical_8px xmm1, xmm2, xmm3, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm1, xmm1
movd [p_dst], xmm1
psrlq xmm1, 32
movd [p_dst + i_dststride], xmm1
cmp i_height, 5
jl .width4_height_le5_done
lea p_dst, [p_dst + 2 * i_dststride]
movd xmm4, [p_src + i_srcstride3]
punpcklbw xmm0, xmm4
jg .width4_height_ge8
SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm2, xmm2
movd [p_dst], xmm2
.width4_height_le5_done:
DEINIT_X86_32_PIC_KEEPDEF
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
pop r6
%endif
ret
.width4_height_ge8:
lea p_src, [p_src + 4 * i_srcstride]
movd xmm1, [p_src]
punpcklbw xmm4, xmm1
punpcklqdq xmm0, xmm4
SSSE3_FilterVertical_8px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm2, xmm2
movd [p_dst], xmm2
psrlq xmm2, 32
movd [p_dst + i_dststride], xmm2
lea p_dst, [p_dst + 2 * i_dststride]
movd xmm4, [p_src + i_srcstride]
punpcklbw xmm1, xmm4
movd xmm2, [p_src + 2 * i_srcstride]
punpcklbw xmm4, xmm2
punpcklqdq xmm1, xmm4
SSSE3_FilterVertical_8px xmm3, xmm0, xmm1, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm3, xmm3
movd [p_dst], xmm3
psrlq xmm3, 32
movd [p_dst + i_dststride], xmm3
cmp i_height, 9
jl .width4_height_ge8_done
lea p_dst, [p_dst + 2 * i_dststride]
movd xmm4, [p_src + i_srcstride3]
punpcklbw xmm2, xmm4
SSSE3_FilterVertical_8px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_128)], xmm5, [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm0, xmm0
movd [p_dst], xmm0
.width4_height_ge8_done:
DEINIT_X86_32_PIC_KEEPDEF
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
pop r6
%endif
ret
.width8or16:
%assign push_num push_num_begin
sub i_height, 1
push i_height
%assign push_num push_num + 1
%xdefine i_ycnt i_height
%define i_height [r7]
.xloop:
push p_src
push p_dst
%assign push_num push_num + 2
test i_ycnt, 1
jnz .yloop_begin_even
movq xmm0, [p_src]
movq xmm1, [p_src + i_srcstride]
punpcklbw xmm0, xmm1
movq xmm2, [p_src + 2 * i_srcstride]
movq xmm3, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
punpcklbw xmm2, xmm3
movq xmm4, [p_src]
movq xmm5, [p_src + i_srcstride]
lea p_src, [p_src + 2 * i_srcstride]
punpcklbw xmm4, xmm5
SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm7
packuswb xmm0, xmm0
movlps [p_dst], xmm0
add p_dst, i_dststride
jmp .yloop
.yloop_begin_even:
movq xmm1, [p_src]
movq xmm2, [p_src + i_srcstride]
movq xmm3, [p_src + 2 * i_srcstride]
add p_src, i_srcstride3
punpcklbw xmm2, xmm3
movq xmm4, [p_src]
movq xmm5, [p_src + i_srcstride]
lea p_src, [p_src + 2 * i_srcstride]
punpcklbw xmm4, xmm5
.yloop:
movq xmm6, [p_src]
SSSE3_FilterVertical2_8px xmm1, xmm6, xmm2, xmm4, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm0, xmm7
movq xmm7, [p_src + i_srcstride]
punpcklbw xmm6, xmm7
SSSE3_FilterVertical_8px xmm2, xmm4, xmm6, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm0
packuswb xmm1, xmm2
movlps [p_dst], xmm1
movhps [p_dst + i_dststride], xmm1
lea p_dst, [p_dst + 2 * i_dststride]
movq xmm0, [p_src + 2 * i_srcstride]
SSSE3_FilterVertical2_8px xmm3, xmm0, xmm4, xmm6, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm2, xmm1
movq xmm1, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
punpcklbw xmm0, xmm1
SSSE3_FilterVertical_8px xmm4, xmm6, xmm0, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm2
packuswb xmm3, xmm4
movlps [p_dst], xmm3
movhps [p_dst + i_dststride], xmm3
cmp i_ycnt, 4
jle .yloop_exit
lea p_dst, [p_dst + 2 * i_dststride]
movq xmm2, [p_src]
SSSE3_FilterVertical2_8px xmm5, xmm2, xmm6, xmm0, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm4, xmm3
movq xmm3, [p_src + i_srcstride]
punpcklbw xmm2, xmm3
SSSE3_FilterVertical_8px xmm6, xmm0, xmm2, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm4
packuswb xmm5, xmm6
movlps [p_dst], xmm5
movhps [p_dst + i_dststride], xmm5
lea p_dst, [p_dst + 2 * i_dststride]
movq xmm4, [p_src + 2 * i_srcstride]
SSSE3_FilterVertical2_8px xmm7, xmm4, xmm0, xmm2, [pic(maddubsw_m5p20_128)], [pic(maddubsw_p20m5_128)], xmm6, xmm5
movq xmm5, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
punpcklbw xmm4, xmm5
SSSE3_FilterVertical_8px xmm0, xmm2, xmm4, [pic(maddubsw_p1m5_128)], [pic(db20_128)], [pic(maddubsw_m5p1_128)], xmm6
packuswb xmm7, xmm0
movlps [p_dst], xmm7
movhps [p_dst + i_dststride], xmm7
lea p_dst, [p_dst + 2 * i_dststride]
sub i_ycnt, 8
jg .yloop
.yloop_exit:
pop p_dst
pop p_src
%assign push_num push_num - 2
sub i_width, 8
jle .width8or16_done
add p_src, 8
add p_dst, 8
mov i_ycnt, i_height
jmp .xloop
.width8or16_done:
pop i_ycnt
%assign push_num push_num - 1
DEINIT_X86_32_PIC
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
pop r6
%endif
ret
%undef p_src
%undef i_srcstride
%undef i_srcstride3
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height
%undef i_ycnt
;*******************************************************************************
; void McHorVer20_ssse3(const uint8_t *pSrc,
; int iSrcStride,
; uint8_t *pDst,
; int iDstStride,
; int iWidth,
; int iHeight);
;*******************************************************************************
WELS_EXTERN McHorVer20_ssse3
%define p_src r0
%define i_srcstride r1
%define p_dst r2
%define i_dststride r3
%define i_width r4
%define i_height r5
%assign push_num 0
INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
movdqa xmm4, [pic(shufb_32435465768798A9)]
movdqa xmm5, [pic(shufb_011267784556ABBC)]
movdqa xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
cmp i_width, 8
je .width8_yloop
jg .width16_yloop
.width4_yloop:
movdqu xmm0, [p_src - 2]
movdqu xmm1, [p_src + i_srcstride - 2]
lea p_src, [p_src + 2 * i_srcstride]
SSSE3_FilterHorizontal_2x4px xmm0, xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
packuswb xmm0, xmm0
movd [p_dst], xmm0
psrlq xmm0, 32
movd [p_dst + i_dststride], xmm0
lea p_dst, [p_dst + 2 * i_dststride]
sub i_height, 2
jg .width4_yloop
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC_KEEPDEF
ret
.width8_yloop:
movdqu xmm0, [p_src - 2]
movdqu xmm1, [p_src + i_srcstride - 2]
lea p_src, [p_src + 2 * i_srcstride]
SSSE3_FilterHorizontal_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
SSSE3_FilterHorizontal_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
packuswb xmm0, xmm1
movlps [p_dst], xmm0
movhps [p_dst + i_dststride], xmm0
lea p_dst, [p_dst + 2 * i_dststride]
sub i_height, 2
jg .width8_yloop
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC_KEEPDEF
ret
.width16_yloop:
movdqu xmm0, [p_src - 2]
movdqu xmm1, [p_src + 6]
add p_src, i_srcstride
SSSE3_FilterHorizontal_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
SSSE3_FilterHorizontal_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
packuswb xmm0, xmm1
MOVDQ [p_dst], xmm0
add p_dst, i_dststride
sub i_height, 1
jg .width16_yloop
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height
;***********************************************************************
; void McHorVer20Width5Or9Or17_ssse3(const uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer20Width5Or9Or17_ssse3
%define p_src r0
%define i_srcstride r1
%define p_dst r2
%define i_dststride r3
%define i_width r4
%define i_height r5
%assign push_num 0
INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
movdqa xmm5, [pic(shufb_32435465768798A9)]
movdqa xmm6, [pic(shufb_011267784556ABBC)]
movdqa xmm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
cmp i_width, 9
je .width9_yloop
jg .width17_yloop
.width5_yloop:
movdqu xmm0, [p_src - 2]
add p_src, i_srcstride
SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
packuswb xmm0, xmm0
movdqa xmm1, xmm0
psrlq xmm1, 8
movd [p_dst], xmm0
movd [p_dst + 1], xmm1
add p_dst, i_dststride
sub i_height, 1
jg .width5_yloop
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC_KEEPDEF
ret
.width9_yloop:
movdqu xmm0, [p_src - 2]
movdqu xmm4, [p_src + i_srcstride - 2]
lea p_src, [p_src + 2 * i_srcstride]
movdqa xmm3, xmm0
punpckhqdq xmm3, xmm4
SSSE3_FilterHorizontal_2px xmm3, xmm2
SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
packuswb xmm3, xmm0
movd [p_dst + 5], xmm3
movhps [p_dst], xmm3
add p_dst, i_dststride
SSSE3_FilterHorizontal_8px xmm4, xmm5, xmm6, xmm7, xmm1, xmm2
packuswb xmm4, xmm4
psrldq xmm3, 4
movd [p_dst + 5], xmm3
movlps [p_dst], xmm4
add p_dst, i_dststride
sub i_height, 2
jg .width9_yloop
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC_KEEPDEF
ret
.width17_yloop:
movdqu xmm0, [p_src - 2]
movdqu xmm3, [p_src + 6]
add p_src, i_srcstride
movdqa xmm4, xmm3
SSSE3_FilterHorizontal_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
SSSE3_FilterHorizontal_8px xmm3, xmm5, xmm6, xmm7, xmm1, xmm2
packuswb xmm0, xmm3
movdqu xmm1, [p_src - 2]
movdqu xmm3, [p_src + 6]
add p_src, i_srcstride
punpckhqdq xmm4, xmm3
SSSE3_FilterHorizontal_2px xmm4, xmm2
packuswb xmm4, xmm4
movd [p_dst + 13], xmm4
MOVDQ [p_dst], xmm0
add p_dst, i_dststride
psrldq xmm4, 4
movd [p_dst + 13], xmm4
SSSE3_FilterHorizontal_8px xmm1, xmm5, xmm6, xmm7, xmm0, xmm2
SSSE3_FilterHorizontal_8px xmm3, xmm5, xmm6, xmm7, xmm0, xmm2
packuswb xmm1, xmm3
MOVDQ [p_dst], xmm1
add p_dst, i_dststride
sub i_height, 2
jg .width17_yloop
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height
;*******************************************************************************
; void McHorVer20Width4U8ToS16_ssse3(const uint8_t *pSrc,
; int iSrcStride,
; int16_t *pDst,
; int iHeight);
;*******************************************************************************
WELS_EXTERN McHorVer20Width4U8ToS16_ssse3
%define p_src r0
%define i_srcstride r1
%define p_dst r2
%define i_height r3
%assign push_num 0
INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
sub p_src, i_srcstride
sub p_src, i_srcstride
movdqa xmm4, [pic(shufb_32435465768798A9)]
movdqa xmm5, [pic(shufb_011267784556ABBC)]
movdqa xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
sub i_height, 1
.yloop:
movdqu xmm0, [p_src - 2]
movdqu xmm1, [p_src + i_srcstride - 2]
lea p_src, [p_src + 2 * i_srcstride]
SSSE3_FilterHorizontalbw_2x4px xmm0, xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
movdqa [p_dst], xmm0
add p_dst, 16
sub i_height, 2
jg .yloop
; Height % 2 remainder.
movdqu xmm0, [p_src - 2]
SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
movlps [p_dst], xmm0
POP_XMM
LOAD_4_PARA_POP
DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_height
;***********************************************************************
; void McHorVer02Width4S16ToU8_ssse3(const int16_t *pSrc,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer02Width4S16ToU8_ssse3
%define p_src r0
%define p_dst r1
%define i_dststride r2
%define i_height r3
%define i_srcstride 8
%assign push_num 0
INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r3, r3d
movdqa xmm0, [p_src + 0 * i_srcstride]
movdqu xmm1, [p_src + 1 * i_srcstride]
movdqa xmm2, [p_src + 2 * i_srcstride]
movdqu xmm3, [p_src + 3 * i_srcstride]
movdqa xmm4, [p_src + 4 * i_srcstride]
movdqu xmm5, [p_src + 5 * i_srcstride]
movdqa xmm6, [p_src + 6 * i_srcstride]
SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7
packuswb xmm0, xmm0
movd [p_dst], xmm0
psrlq xmm0, 32
movd [p_dst + i_dststride], xmm0
lea p_dst, [p_dst + 2 * i_dststride]
movdqu xmm7, [p_src + 7 * i_srcstride]
movdqa xmm0, [p_src + 8 * i_srcstride]
SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm1
packuswb xmm2, xmm2
movd [p_dst], xmm2
psrlq xmm2, 32
movd [p_dst + i_dststride], xmm2
cmp i_height, 4
jle .done
lea p_dst, [p_dst + 2 * i_dststride]
movdqu xmm1, [p_src + 9 * i_srcstride]
movdqa xmm2, [p_src + 10 * i_srcstride]
SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm3
packuswb xmm4, xmm4
movd [p_dst], xmm4
psrlq xmm4, 32
movd [p_dst + i_dststride], xmm4
lea p_dst, [p_dst + 2 * i_dststride]
movdqu xmm3, [p_src + 11 * i_srcstride]
SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm5
packuswb xmm6, xmm6
movd [p_dst], xmm6
psrlq xmm6, 32
movd [p_dst + i_dststride], xmm6
.done:
POP_XMM
LOAD_4_PARA_POP
DEINIT_X86_32_PIC
ret
%undef p_src
%undef p_dst
%undef i_dststride
%undef i_height
%undef i_srcstride
;***********************************************************************
; void McHorVer20Width8U8ToS16_ssse3(const uint8_t *pSrc,
; int32_t iSrcStride,
; int16_t *pDst,
; int32_t iDstStride,
; int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer20Width8U8ToS16_ssse3
%define p_src r0
%define i_srcstride r1
%define p_dst r2
%define i_dststride r3
%define i_height r4
%assign push_num 0
INIT_X86_32_PIC r5
LOAD_5_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
sub p_src, i_srcstride
sub p_src, i_srcstride
movdqa xmm4, [pic(shufb_32435465768798A9)]
movdqa xmm5, [pic(shufb_011267784556ABBC)]
movdqa xmm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
sub i_height, 1
.yloop:
movdqu xmm0, [p_src - 2]
movdqu xmm1, [p_src + i_srcstride - 2]
lea p_src, [p_src + 2 * i_srcstride]
SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
MOVDQ [p_dst], xmm0
add p_dst, i_dststride
SSSE3_FilterHorizontalbw_8px xmm1, xmm4, xmm5, xmm6, xmm2, xmm3
MOVDQ [p_dst], xmm1
add p_dst, i_dststride
sub i_height, 2
jg .yloop
jl .done
movdqu xmm0, [p_src - 2]
SSSE3_FilterHorizontalbw_8px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
MOVDQ [p_dst], xmm0
.done:
POP_XMM
LOAD_5_PARA_POP
DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_height
;***********************************************************************
; void McHorVer02Width5S16ToU8_ssse3(const int16_t *pSrc,
; int32_t iTapStride,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer02Width5S16ToU8_ssse3
%define p_src r0
%define i_srcstride r1
%define p_dst r2
%define i_dststride r3
%define i_height r4
%define i_srcstride3 r5
%assign push_num 0
%ifdef X86_32
push r5
%assign push_num 1
%endif
INIT_X86_32_PIC r6
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
lea i_srcstride3, [3 * i_srcstride]
movdqa xmm0, [p_src]
movdqa xmm1, [p_src + i_srcstride]
movdqa xmm2, [p_src + 2 * i_srcstride]
movdqa xmm3, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
movdqa xmm4, [p_src]
movdqa xmm5, [p_src + i_srcstride]
SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
movdqa xmm6, [p_src + 2 * i_srcstride]
packuswb xmm0, xmm0
movdqa xmm7, xmm0
psrlq xmm7, 8
movd [p_dst + 1], xmm7
movd [p_dst], xmm0
add p_dst, i_dststride
SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
movdqa xmm7, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
packuswb xmm1, xmm1
movdqa xmm0, xmm1
psrlq xmm0, 8
movd [p_dst + 1], xmm0
movd [p_dst], xmm1
add p_dst, i_dststride
SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0
movdqa xmm0, [p_src]
packuswb xmm2, xmm2
movdqa xmm1, xmm2
psrlq xmm1, 8
movd [p_dst + 1], xmm1
movd [p_dst], xmm2
add p_dst, i_dststride
SSE2_FilterVerticalw_8px xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
packuswb xmm3, xmm3
movdqa xmm2, xmm3
psrlq xmm2, 8
movd [p_dst + 1], xmm2
movd [p_dst], xmm3
add p_dst, i_dststride
movdqa xmm1, [p_src + i_srcstride]
SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2
packuswb xmm4, xmm4
movdqa xmm3, xmm4
psrlq xmm3, 8
movd [p_dst + 1], xmm3
movd [p_dst], xmm4
cmp i_height, 5
jle .done
add p_dst, i_dststride
movdqa xmm2, [p_src + 2 * i_srcstride]
SSE2_FilterVerticalw_8px xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3
movdqa xmm3, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
packuswb xmm5, xmm5
movdqa xmm4, xmm5
psrlq xmm4, 8
movd [p_dst + 1], xmm4
movd [p_dst], xmm5
add p_dst, i_dststride
SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4
movdqa xmm4, [p_src]
packuswb xmm6, xmm6
movdqa xmm5, xmm6
psrlq xmm5, 8
movd [p_dst + 1], xmm5
movd [p_dst], xmm6
add p_dst, i_dststride
SSE2_FilterVerticalw_8px xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
packuswb xmm7, xmm7
movdqa xmm6, xmm7
psrlq xmm6, 8
movd [p_dst + 1], xmm6
movd [p_dst], xmm7
add p_dst, i_dststride
movdqa xmm5, [p_src + i_srcstride]
SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
packuswb xmm0, xmm0
movdqa xmm7, xmm0
psrlq xmm7, 8
movd [p_dst + 1], xmm7
movd [p_dst], xmm0
.done:
POP_XMM
LOAD_5_PARA_POP
DEINIT_X86_32_PIC
%ifdef X86_32
pop r5
%endif
ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_height
%undef i_srcstride3
;***********************************************************************
; void McHorVer20Width9Or17U8ToS16_ssse3(const uint8_t *pSrc,
; int32_t iSrcStride,
; int16_t *pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer20Width9Or17U8ToS16_ssse3
%define p_src r0
%define i_srcstride r1
%define p_dst r2
%define i_dststride r3
%define i_width r4
%define i_height r5
%assign push_num 0
INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
sub p_src, i_srcstride
sub p_src, i_srcstride
pcmpeqw xmm4, xmm4
psllw xmm4, 15 ; dw -32768
movdqa xmm5, [pic(shufb_32435465768798A9)]
movdqa xmm6, [pic(shufb_011267784556ABBC)]
movdqa xmm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
cmp i_width, 9
jne .width17_yloop
.width9_yloop:
movdqu xmm0, [p_src - 2]
movdqa xmm3, xmm0
SSSE3_FilterHorizontalbw_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
movdqu xmm2, [p_src + i_srcstride - 2]
lea p_src, [p_src + 2 * i_srcstride]
punpckhqdq xmm3, xmm2
SSSE3_FilterHorizontalbw_2px xmm3, xmm4, xmm1
movlps [p_dst + 10], xmm3
MOVDQ [p_dst], xmm0
add p_dst, i_dststride
movhps [p_dst + 10], xmm3
SSSE3_FilterHorizontalbw_8px xmm2, xmm5, xmm6, xmm7, xmm1, xmm0
MOVDQ [p_dst], xmm2
add p_dst, i_dststride
sub i_height, 2
jg .width9_yloop
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC_KEEPDEF
ret
.width17_yloop:
movdqu xmm0, [p_src - 2]
movdqu xmm3, [p_src + 6]
add p_src, i_srcstride
SSSE3_FilterHorizontalbw_8px xmm0, xmm5, xmm6, xmm7, xmm1, xmm2
MOVDQ [p_dst], xmm0
movdqa xmm0, xmm3
SSSE3_FilterHorizontalbw_8px xmm3, xmm5, xmm6, xmm7, xmm1, xmm2
movdqu xmm2, [p_src + 6]
punpckhqdq xmm0, xmm2
SSSE3_FilterHorizontalbw_2px xmm0, xmm4, xmm1
movdqu xmm1, [p_src - 2]
add p_src, i_srcstride
movlps [p_dst + 26], xmm0
MOVDQ [p_dst + 16], xmm3
add p_dst, i_dststride
movhps [p_dst + 26], xmm0
SSSE3_FilterHorizontalbw_8px xmm1, xmm5, xmm6, xmm7, xmm0, xmm3
MOVDQ [p_dst], xmm1
SSSE3_FilterHorizontalbw_8px xmm2, xmm5, xmm6, xmm7, xmm0, xmm3
MOVDQ [p_dst + 16], xmm2
add p_dst, i_dststride
sub i_height, 2
jg .width17_yloop
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height
;***********************************************************************
; void McHorVer02WidthGe8S16ToU8_ssse3(const int16_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight);
;***********************************************************************
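; Vertical 6-tap filter on 16-bit intermediates, writing 8-bit pixels, for
; widths >= 8. An odd width (9 or 17) first runs its last column through the
; unaligned path below; the main loop then processes 8 columns at a time.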
WELS_EXTERN McHorVer02WidthGe8S16ToU8_ssse3
%define p_src r0
%define i_srcstride r1
%define p_dst r2
%define i_dststride r3
%ifdef X86_32_PICASM
%define i_width dword arg5
%else
%define i_width r4
%endif
%define i_height r5
%define i_srcstride3 r6
%assign push_num 0
%ifdef X86_32
push r6
%assign push_num 1
%endif
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
INIT_X86_32_PIC_NOPRESERVE r4
sub i_height, 1
push i_height
%assign push_num push_num + 1
lea i_srcstride3, [3 * i_srcstride]
test i_width, 1
jz .width_loop
push p_src
push p_dst
%assign push_num push_num + 2
%ifdef X86_32_PICASM
add p_src, i_width
add p_src, i_width
sub p_src, 2
%else
lea p_src, [p_src + 2 * i_width - 2]
%endif
add p_dst, i_width
movd xmm0, [p_src]
punpcklwd xmm0, [p_src + i_srcstride]
movd xmm1, [p_src + 2 * i_srcstride]
add p_src, i_srcstride3
punpcklwd xmm1, [p_src]
punpckldq xmm0, xmm1
movd xmm1, [p_src + i_srcstride]
cmp i_height, 4
je .filter5_unalign
punpcklwd xmm1, [p_src + 2 * i_srcstride]
movd xmm2, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
punpcklwd xmm2, [p_src]
punpckldq xmm1, xmm2
punpcklqdq xmm0, xmm1
.height_loop_unalign:
movd xmm1, [p_src + i_srcstride]
palignr xmm1, xmm0, 2
movd xmm2, [p_src + 2 * i_srcstride]
palignr xmm2, xmm1, 2
movd xmm3, [p_src + i_srcstride3]
palignr xmm3, xmm2, 2
lea p_src, [p_src + 4 * i_srcstride]
movd xmm4, [p_src]
palignr xmm4, xmm3, 2
movd xmm5, [p_src + i_srcstride]
palignr xmm5, xmm4, 2
SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7
packuswb xmm0, xmm0
movdqa xmm6, xmm0
pslld xmm6, 24
movd [p_dst - 4], xmm6
movlps [p_dst + 4 * i_dststride - 8], xmm6
add p_dst, i_dststride
movdqa xmm6, xmm0
pslld xmm6, 16
movd [p_dst - 4], xmm6
movlps [p_dst + 4 * i_dststride - 8], xmm6
add p_dst, i_dststride
movdqa xmm6, xmm0
pslld xmm6, 8
movd [p_dst - 4], xmm6
movd [p_dst + i_dststride - 4], xmm0
lea p_dst, [p_dst + 4 * i_dststride]
movlps [p_dst - 8], xmm6
movlps [p_dst + i_dststride - 8], xmm0
lea p_dst, [p_dst + 2 * i_dststride]
sub i_height, 8
jle .height_loop_unalign_exit
movd xmm1, [p_src + 2 * i_srcstride]
palignr xmm1, xmm5, 2
movd xmm0, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
punpcklwd xmm0, [p_src]
palignr xmm0, xmm1, 4
jmp .height_loop_unalign
.height_loop_unalign_exit:
movddup xmm6, [p_src + 2 * i_srcstride - 6]
SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
packuswb xmm1, xmm1
movlps [p_dst - 8], xmm1
jmp .unalign_done
.filter5_unalign:
pslldq xmm0, 8
palignr xmm1, xmm0, 2
movd xmm2, [p_src + 2 * i_srcstride]
palignr xmm2, xmm1, 2
movd xmm3, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
palignr xmm3, xmm2, 2
movd xmm4, [p_src]
palignr xmm4, xmm3, 2
movd xmm5, [p_src + i_srcstride]
palignr xmm5, xmm4, 2
movd xmm6, [p_src + 2 * i_srcstride]
palignr xmm6, xmm5, 2
SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
packuswb xmm1, xmm1
movdqa xmm0, xmm1
psrlq xmm1, 8
movdqa xmm2, xmm0
psrlq xmm2, 16
movdqa xmm3, xmm0
psrlq xmm3, 24
movd [p_dst - 4], xmm0
movd [p_dst + i_dststride - 4], xmm1
lea p_dst, [p_dst + 2 * i_dststride]
movd [p_dst - 4], xmm2
movd [p_dst + i_dststride - 4], xmm3
movlps [p_dst + 2 * i_dststride - 8], xmm0
.unalign_done:
pop p_dst
pop p_src
%assign push_num push_num - 2
mov i_height, [r7]
sub i_width, 1
.width_loop:
push p_src
push p_dst
%assign push_num push_num + 2
movdqa xmm0, [p_src]
movdqa xmm1, [p_src + i_srcstride]
movdqa xmm2, [p_src + 2 * i_srcstride]
movdqa xmm3, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
movdqa xmm4, [p_src]
.height_loop:
movdqa xmm5, [p_src + i_srcstride]
SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
movdqa xmm6, [p_src + 2 * i_srcstride]
SSE2_FilterVerticalw_8px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
movdqa xmm7, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
packuswb xmm0, xmm1
movlps [p_dst], xmm0
movhps [p_dst + i_dststride], xmm0
lea p_dst, [p_dst + 2 * i_dststride]
SSE2_FilterVerticalw_8px xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0
movdqa xmm0, [p_src]
SSE2_FilterVerticalw_8px xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1
packuswb xmm2, xmm3
movlps [p_dst], xmm2
movhps [p_dst + i_dststride], xmm2
cmp i_height, 4
jl .x_loop_dec
lea p_dst, [p_dst + 2 * i_dststride]
movdqa xmm1, [p_src + i_srcstride]
SSE2_FilterVerticalw_8px xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2
je .store_xmm4_exit
movdqa xmm2, [p_src + 2 * i_srcstride]
SSE2_FilterVerticalw_8px xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3
movdqa xmm3, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
packuswb xmm4, xmm5
movlps [p_dst], xmm4
movhps [p_dst + i_dststride], xmm4
lea p_dst, [p_dst + 2 * i_dststride]
SSE2_FilterVerticalw_8px xmm6, xmm7, xmm0, xmm1, xmm2, xmm3, xmm4
movdqa xmm4, [p_src]
SSE2_FilterVerticalw_8px xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
packuswb xmm6, xmm7
movlps [p_dst], xmm6
movhps [p_dst + i_dststride], xmm6
lea p_dst, [p_dst + 2 * i_dststride]
sub i_height, 8
jg .height_loop
jl .x_loop_dec
movdqa xmm5, [p_src + i_srcstride]
SSE2_FilterVerticalw_8px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
packuswb xmm0, xmm0
movlps [p_dst], xmm0
.x_loop_dec:
pop p_dst
pop p_src
%assign push_num push_num - 2
sub i_width, 8
jle .done
mov i_height, [r7]
add p_src, 16
add p_dst, 8
jmp .width_loop
.store_xmm4_exit:
packuswb xmm4, xmm4
movlps [p_dst], xmm4
pop p_dst
pop p_src
.done:
pop i_height
%assign push_num push_num - 1
DEINIT_X86_32_PIC
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
pop r6
%endif
ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height
%undef i_srcstride3
%ifdef HAVE_AVX2
; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
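; Produces 8 unrounded 16-bit 6-tap sums (p0 - 5*p1 + 20*p2 + 20*p3 - 5*p4 + p5)
; per 128-bit lane: the shuffles gather the tap pairs and vpmaddubsw applies
; the (20,20) centre weights and the (1,-5)/(-5,1) outer weights.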
%macro AVX2_FilterHorizontalbw_16px 6
vpshufb %5, %1, %3
vpshufb %1, %1, %2
vpshufd %6, %1, 10110001b
vpmaddubsw %1, %1, [pic(db20_256)]
vpmaddubsw %5, %5, %4
vpmaddubsw %6, %6, %4
vpaddw %1, %1, %5
vpaddw %1, %1, %6
%endmacro
; pixels=%1 shufb_32435465768798A9=%2 shufb_011267784556ABBC=%3 maddubsw_p1m5_p1m5_m5p1_m5p1=%4 tmp=%5,%6
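; As above, then rounds (+16) and shifts (>> 5) to pixel range (still as words).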
%macro AVX2_FilterHorizontal_16px 6
AVX2_FilterHorizontalbw_16px %1, %2, %3, %4, %5, %6
vpaddw %1, %1, [pic(h264_w0x10_256)]
vpsraw %1, %1, 5
%endmacro
; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
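; Width-4 variant: packs the 4 leading outputs of two source rows into each
; 128-bit lane (4 rows per ymm) and produces unrounded 16-bit 6-tap sums.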
%macro AVX2_FilterHorizontalbw_4x4px 7
vpshufb %6, %1, %4
vpshufb %7, %2, %4
vpshufb %1, %1, %3
vpshufb %2, %2, %3
vpunpcklqdq %1, %1, %2
vpunpcklqdq %6, %6, %7
vpshufd %7, %1, 10110001b
vpmaddubsw %1, %1, [pic(db20_256)]
vpmaddubsw %6, %6, %5
vpmaddubsw %7, %7, %5
vpaddw %1, %1, %6
vpaddw %1, %1, %7
%endmacro
; px0=%1 px1=%2 shufb_32435465768798A9=%3 shufb_011267784556ABBC=%4 maddubsw_p1m5_p1m5_m5p1_m5p1=%5 tmp=%6,%7
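; Rounded (+16, >> 5) version of the 4x4 variant above.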
%macro AVX2_FilterHorizontal_4x4px 7
AVX2_FilterHorizontalbw_4x4px %1, %2, %3, %4, %5, %6, %7
vpaddw %1, %1, [pic(h264_w0x10_256)]
vpsraw %1, %1, 5
%endmacro
; pixels=%1 -32768>>scale=%2 tmp=%3
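; 6-tap filter for a trailing odd column: the maddubsw constant is -2x the tap
; weights, and the vpmaddwd by %2 (a negative power of two) rescales so that,
; after the pshufd/paddd, both dwords of each qword hold the complete 6-tap
; sum multiplied by 2*|%2|.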
%macro AVX2_FilterHorizontalbw_4px 3
vpmaddubsw %1, %1, [pic(maddubsw_m2p10_m40m40_p10m2_p0p0_256)]
vpmaddwd %1, %1, %2
vpshufd %3, %1, 10110001b
vpaddd %1, %1, %3
%endmacro
; pixels=%1 tmp=%2
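; Adds 32768 (= 16 << 11) so that, with the -1024 scale, the high word of each
; result dword holds the rounded, shifted value (sum + 16) >> 5, ready for
; packuswb.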
%macro AVX2_FilterHorizontal_4px 2
AVX2_FilterHorizontalbw_4px %1, [pic(dwm1024_256)], %2
vpaddd %1, %1, [pic(dd32768_256)]
%endmacro
; px_ab=%1 px_cd=%2 px_ef=%3 maddubsw_ab=%4 maddubsw_cd=%5 maddubsw_ef=%6 tmp=%7
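; Vertical 6-tap filter on byte-interleaved row pairs: %1/%2/%3 hold rows
; (a,b), (c,d), (e,f) interleaved, %4/%5/%6 supply the (1,-5), (20,20), (-5,1)
; weights; the result is rounded (+16) and shifted (>> 5).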
%macro AVX2_FilterVertical_16px 7
vpmaddubsw %1, %1, %4
vpmaddubsw %7, %2, %5
vpaddw %1, %1, %7
vpmaddubsw %7, %3, %6
vpaddw %1, %1, %7
vpaddw %1, %1, [pic(h264_w0x10_256)]
vpsraw %1, %1, 5
%endmacro
; px_a=%1 px_f=%2 px_bc=%3 px_de=%4 maddubsw_bc=%5 maddubsw_de=%6 tmp=%7,%8
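; Variant with rows a and f supplied un-interleaved: both are zero-extended
; and added directly, while the interleaved (b,c) and (d,e) pairs use the
; (-5,20) and (20,-5) weights; rounded (+16) and shifted (>> 5) as above.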
%macro AVX2_FilterVertical2_16px 8
vpxor %7, %7, %7
vpunpcklbw %1, %1, %7
vpunpcklbw %8, %2, %7
vpaddw %1, %1, %8
vpmaddubsw %7, %3, %5
vpaddw %1, %1, %7
vpmaddubsw %7, %4, %6
vpaddw %1, %1, %7
vpaddw %1, %1, [pic(h264_w0x10_256)]
vpsraw %1, %1, 5
%endmacro
; px0=%1 px1=%2 px2=%3 px3=%4 px4=%5 px5=%6 tmp=%7
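; Vertical 6-tap filter on 16-bit intermediates (second stage of the 2D
; half-pel filter). With x = p0+p5, y = p1+p4, z = p2+p3 it computes
; ((((x - y) >> 2) - y + z) >> 2) + z + 32, all >> 6, i.e. approximately
; (x - 5*y + 20*z + 512) >> 10, keeping every intermediate within 16 bits.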
%macro AVX2_FilterVerticalw_16px 7
vpaddw %1, %1, %6
vpaddw %7, %2, %5
vpsubw %1, %1, %7
vpsraw %1, %1, 2
vpsubw %1, %1, %7
vpaddw %7, %3, %4
vpaddw %1, %1, %7
vpsraw %1, %1, 2
vpaddw %7, %7, [pic(dw32_256)]
vpaddw %1, %1, %7
vpsraw %1, %1, 6
%endmacro
;***********************************************************************
; void McHorVer02_avx2(const uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight);
;***********************************************************************
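; Vertical-only 6-tap filter ('02' = vertical half-sample position) applied
; directly to 8-bit source pixels, with +16 rounding and >> 5; separate paths
; for widths 4, 8 and 16.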
WELS_EXTERN McHorVer02_avx2
%define p_src r0
%define i_srcstride r1
%define p_dst r2
%define i_dststride r3
%ifdef X86_32_PICASM
%define i_width dword arg5
%else
%define i_width r4
%endif
%define i_height r5
%define i_srcstride3 r6
%assign push_num 0
%ifdef X86_32
push r6
%assign push_num 1
%endif
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
INIT_X86_32_PIC_NOPRESERVE r4
sub p_src, i_srcstride
sub p_src, i_srcstride
lea i_srcstride3, [3 * i_srcstride]
cmp i_width, 8
je .width8
jg .width16
; .width4:
vmovd xmm0, [p_src]
vpbroadcastd xmm5, [p_src + i_srcstride]
vpunpcklbw xmm0, xmm0, xmm5
vpbroadcastd ymm1, [p_src + 2 * i_srcstride]
vpunpcklbw xmm5, xmm5, xmm1
vpblendd xmm0, xmm0, xmm5, 1100b
vpbroadcastd ymm5, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
vpunpcklbw ymm1, ymm1, ymm5
vpbroadcastd ymm2, [p_src]
vpunpcklbw ymm5, ymm5, ymm2
vpblendd ymm1, ymm1, ymm5, 11001100b
vpblendd ymm0, ymm0, ymm1, 11110000b
vpbroadcastd ymm5, [p_src + i_srcstride]
lea p_src, [p_src + 2 * i_srcstride]
vpunpcklbw ymm2, ymm2, ymm5
vpbroadcastd ymm3, [p_src]
vpunpcklbw ymm5, ymm5, ymm3
vpblendd ymm2, ymm2, ymm5, 11001100b
vpblendd ymm1, ymm1, ymm2, 11110000b
vpbroadcastd ymm5, [p_src + i_srcstride]
vpunpcklbw ymm3, ymm3, ymm5
vpbroadcastd ymm4, [p_src + 2 * i_srcstride]
vpunpcklbw ymm5, ymm5, ymm4
vpblendd ymm3, ymm3, ymm5, 11001100b
vpblendd ymm2, ymm2, ymm3, 11110000b
vbroadcasti128 ymm6, [pic(db20_128)]
AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [pic(maddubsw_p1m5_256)], ymm6, [pic(maddubsw_m5p1_256)], ymm5
vpackuswb ymm0, ymm0, ymm0
vmovd [p_dst], xmm0
vpsrlq xmm5, xmm0, 32
vmovd [p_dst + i_dststride], xmm5
lea p_dst, [p_dst + 2 * i_dststride]
vextracti128 xmm0, ymm0, 1
vmovd [p_dst], xmm0
vpsrlq xmm5, xmm0, 32
vmovd [p_dst + i_dststride], xmm5
cmp i_height, 5
jl .width4_done
lea p_dst, [p_dst + 2 * i_dststride]
vpbroadcastd ymm5, [p_src + i_srcstride3]
vpunpcklbw ymm4, ymm4, ymm5
jg .width4_height_ge8
AVX2_FilterVertical_16px xmm2, xmm3, xmm4, [pic(maddubsw_p1m5_256)], xmm6, [pic(maddubsw_m5p1_256)], xmm5
vpackuswb xmm2, xmm2, xmm2
vmovd [p_dst], xmm2
jmp .width4_done
.width4_height_ge8:
lea p_src, [p_src + 4 * i_srcstride]
vpbroadcastd ymm1, [p_src]
vpunpcklbw ymm5, ymm5, ymm1
vpblendd ymm4, ymm4, ymm5, 11001100b
vpblendd ymm3, ymm3, ymm4, 11110000b
vpbroadcastd ymm5, [p_src + i_srcstride]
vpunpcklbw ymm1, ymm1, ymm5
vpbroadcastd ymm0, [p_src + 2 * i_srcstride]
vpunpcklbw ymm5, ymm5, ymm0
vpblendd ymm1, ymm1, ymm5, 11001100b
vpblendd ymm4, ymm4, ymm1, 11110000b
AVX2_FilterVertical_16px ymm2, ymm3, ymm4, [pic(maddubsw_p1m5_256)], ymm6, [pic(maddubsw_m5p1_256)], ymm5
vpackuswb ymm2, ymm2, ymm2
vmovd [p_dst], xmm2
vpsrlq xmm5, xmm2, 32
vmovd [p_dst + i_dststride], xmm5
lea p_dst, [p_dst + 2 * i_dststride]
vextracti128 xmm2, ymm2, 1
vmovd [p_dst], xmm2
vpsrlq xmm5, xmm2, 32
vmovd [p_dst + i_dststride], xmm5
cmp i_height, 9
jl .width4_done
lea p_dst, [p_dst + 2 * i_dststride]
vmovd xmm5, [p_src + i_srcstride3]
vpunpcklbw xmm0, xmm0, xmm5
AVX2_FilterVertical_16px xmm4, xmm1, xmm0, [pic(maddubsw_p1m5_256)], xmm6, [pic(maddubsw_m5p1_256)], xmm5
vpackuswb xmm4, xmm4, xmm4
vmovd [p_dst], xmm4
.width4_done:
vzeroupper
DEINIT_X86_32_PIC_KEEPDEF
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
pop r6
%endif
ret
.width8:
sub i_height, 1
vmovq xmm0, [p_src]
vmovq xmm4, [p_src + i_srcstride]
vpunpcklbw xmm0, xmm0, xmm4
vmovq xmm1, [p_src + 2 * i_srcstride]
vpunpcklbw xmm4, xmm4, xmm1
vinserti128 ymm0, ymm0, xmm4, 1
vmovq xmm4, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
vpunpcklbw xmm1, xmm1, xmm4
vmovq xmm6, [p_src]
vpunpcklbw xmm4, xmm4, xmm6
vinserti128 ymm1, ymm1, xmm4, 1
.width8_yloop:
vmovq xmm4, [p_src + i_srcstride]
vpunpcklbw xmm2, xmm6, xmm4
vmovq xmm3, [p_src + 2 * i_srcstride]
vpunpcklbw xmm4, xmm4, xmm3
vinserti128 ymm2, ymm2, xmm4, 1
vbroadcasti128 ymm5, [pic(db20_128)]
AVX2_FilterVertical_16px ymm0, ymm1, ymm2, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
vmovq xmm4, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
vpunpcklbw xmm3, xmm3, xmm4
vmovq xmm6, [p_src]
vpunpcklbw xmm4, xmm4, xmm6
vinserti128 ymm3, ymm3, xmm4, 1
AVX2_FilterVertical_16px ymm1, ymm2, ymm3, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
vpackuswb ymm0, ymm0, ymm1
vmovlps [p_dst], xmm0
vextracti128 xmm1, ymm0, 1
vmovlps [p_dst + i_dststride], xmm1
lea p_dst, [p_dst + 2 * i_dststride]
vmovhps [p_dst], xmm0
vmovhps [p_dst + i_dststride], xmm1
cmp i_height, 4
jl .width8_done
lea p_dst, [p_dst + 2 * i_dststride]
vmovq xmm4, [p_src + i_srcstride]
vpunpcklbw xmm0, xmm6, xmm4
jg .width8_height_ge8
AVX2_FilterVertical_16px xmm2, xmm3, xmm0, [pic(maddubsw_p1m5_256)], xmm5, [pic(maddubsw_m5p1_256)], xmm4
vpackuswb xmm2, xmm2, xmm2
vmovlps [p_dst], xmm2
jmp .width8_done
.width8_height_ge8:
vmovq xmm1, [p_src + 2 * i_srcstride]
vpunpcklbw xmm4, xmm4, xmm1
vinserti128 ymm0, ymm0, xmm4, 1
AVX2_FilterVertical_16px ymm2, ymm3, ymm0, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
vmovq xmm4, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
vpunpcklbw xmm1, xmm1, xmm4
vmovq xmm6, [p_src]
vpunpcklbw xmm4, xmm4, xmm6
vinserti128 ymm1, ymm1, xmm4, 1
AVX2_FilterVertical_16px ymm3, ymm0, ymm1, [pic(maddubsw_p1m5_256)], ymm5, [pic(maddubsw_m5p1_256)], ymm4
vpackuswb ymm2, ymm2, ymm3
vmovlps [p_dst], xmm2
vextracti128 xmm3, ymm2, 1
vmovlps [p_dst + i_dststride], xmm3
lea p_dst, [p_dst + 2 * i_dststride]
vmovhps [p_dst], xmm2
vmovhps [p_dst + i_dststride], xmm3
lea p_dst, [p_dst + 2 * i_dststride]
sub i_height, 8
jg .width8_yloop
jl .width8_done
vmovq xmm4, [p_src + i_srcstride]
vpunpcklbw xmm2, xmm6, xmm4
AVX2_FilterVertical_16px xmm0, xmm1, xmm2, [pic(maddubsw_p1m5_256)], xmm5, [pic(maddubsw_m5p1_256)], xmm4
vpackuswb xmm0, xmm0, xmm0
vmovlps [p_dst], xmm0
.width8_done:
vzeroupper
DEINIT_X86_32_PIC_KEEPDEF
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
pop r6
%endif
ret
.width16:
sub i_height, 1
test i_height, 1
jnz .width16_yloop_begin_even
vmovq xmm0, [p_src]
vpbroadcastq ymm1, [p_src + 8]
vpblendd ymm0, ymm0, ymm1, 11110000b
vmovq xmm1, [p_src + i_srcstride]
vpbroadcastq ymm2, [p_src + i_srcstride + 8]
vpblendd ymm1, ymm1, ymm2, 11110000b
vpunpcklbw ymm0, ymm0, ymm1
vmovq xmm2, [p_src + 2 * i_srcstride]
vpbroadcastq ymm3, [p_src + 2 * i_srcstride + 8]
vpblendd ymm2, ymm2, ymm3, 11110000b
vmovq xmm3, [p_src + i_srcstride3]
vpbroadcastq ymm4, [p_src + i_srcstride3 + 8]
lea p_src, [p_src + 4 * i_srcstride]
vpblendd ymm3, ymm3, ymm4, 11110000b
vpunpcklbw ymm2, ymm2, ymm3
vmovq xmm4, [p_src]
vpbroadcastq ymm5, [p_src + 8]
vpblendd ymm4, ymm4, ymm5, 11110000b
vmovq xmm5, [p_src + i_srcstride]
vpbroadcastq ymm6, [p_src + i_srcstride + 8]
lea p_src, [p_src + 2 * i_srcstride]
vpblendd ymm5, ymm5, ymm6, 11110000b
vpunpcklbw ymm4, ymm4, ymm5
AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm7
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 1000b
vmovdqa [p_dst], xmm0
add p_dst, i_dststride
jmp .width16_yloop
.width16_yloop_begin_even:
vmovq xmm1, [p_src]
vpbroadcastq ymm2, [p_src + 8]
vpblendd ymm1, ymm1, ymm2, 11110000b
vmovq xmm2, [p_src + i_srcstride]
vpbroadcastq ymm3, [p_src + i_srcstride + 8]
vpblendd ymm2, ymm2, ymm3, 11110000b
vmovq xmm3, [p_src + 2 * i_srcstride]
vpbroadcastq ymm4, [p_src + 2 * i_srcstride + 8]
add p_src, i_srcstride3
vpblendd ymm3, ymm3, ymm4, 11110000b
vpunpcklbw ymm2, ymm2, ymm3
vmovq xmm4, [p_src]
vpbroadcastq ymm5, [p_src + 8]
vpblendd ymm4, ymm4, ymm5, 11110000b
vmovq xmm5, [p_src + i_srcstride]
vpbroadcastq ymm6, [p_src + i_srcstride + 8]
lea p_src, [p_src + 2 * i_srcstride]
vpblendd ymm5, ymm5, ymm6, 11110000b
vpunpcklbw ymm4, ymm4, ymm5
.width16_yloop:
vmovq xmm6, [p_src]
vpbroadcastq ymm7, [p_src + 8]
vpblendd ymm6, ymm6, ymm7, 11110000b
AVX2_FilterVertical2_16px ymm1, ymm6, ymm2, ymm4, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm0, ymm7
vmovq xmm7, [p_src + i_srcstride]
vpbroadcastq ymm0, [p_src + i_srcstride + 8]
vpblendd ymm7, ymm7, ymm0, 11110000b
vpunpcklbw ymm6, ymm6, ymm7
AVX2_FilterVertical_16px ymm2, ymm4, ymm6, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm0
vpackuswb ymm1, ymm1, ymm2
vpermq ymm1, ymm1, 11011000b
vmovdqa [p_dst], xmm1
vextracti128 [p_dst + i_dststride], ymm1, 1
lea p_dst, [p_dst + 2 * i_dststride]
vmovq xmm0, [p_src + 2 * i_srcstride]
vpbroadcastq ymm1, [p_src + 2 * i_srcstride + 8]
vpblendd ymm0, ymm0, ymm1, 11110000b
AVX2_FilterVertical2_16px ymm3, ymm0, ymm4, ymm6, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm2, ymm1
vmovq xmm1, [p_src + i_srcstride3]
vpbroadcastq ymm2, [p_src + i_srcstride3 + 8]
lea p_src, [p_src + 4 * i_srcstride]
vpblendd ymm1, ymm1, ymm2, 11110000b
vpunpcklbw ymm0, ymm0, ymm1
AVX2_FilterVertical_16px ymm4, ymm6, ymm0, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm2
vpackuswb ymm3, ymm3, ymm4
vpermq ymm3, ymm3, 11011000b
vmovdqa [p_dst], xmm3
vextracti128 [p_dst + i_dststride], ymm3, 1
lea p_dst, [p_dst + 2 * i_dststride]
vmovq xmm2, [p_src]
vpbroadcastq ymm3, [p_src + 8]
vpblendd ymm2, ymm2, ymm3, 11110000b
AVX2_FilterVertical2_16px ymm5, ymm2, ymm6, ymm0, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm4, ymm3
vmovq xmm3, [p_src + i_srcstride]
vpbroadcastq ymm4, [p_src + i_srcstride + 8]
vpblendd ymm3, ymm3, ymm4, 11110000b
vpunpcklbw ymm2, ymm2, ymm3
AVX2_FilterVertical_16px ymm6, ymm0, ymm2, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm4
vpackuswb ymm5, ymm5, ymm6
vpermq ymm5, ymm5, 11011000b
vmovdqa [p_dst], xmm5
vextracti128 [p_dst + i_dststride], ymm5, 1
lea p_dst, [p_dst + 2 * i_dststride]
vmovq xmm4, [p_src + 2 * i_srcstride]
vpbroadcastq ymm5, [p_src + 2 * i_srcstride + 8]
vpblendd ymm4, ymm4, ymm5, 11110000b
AVX2_FilterVertical2_16px ymm7, ymm4, ymm0, ymm2, [pic(maddubsw_m5p20_256)], [pic(maddubsw_p20m5_256)], ymm6, ymm5
vmovq xmm5, [p_src + i_srcstride3]
vpbroadcastq ymm6, [p_src + i_srcstride3 + 8]
lea p_src, [p_src + 4 * i_srcstride]
vpblendd ymm5, ymm5, ymm6, 11110000b
vpunpcklbw ymm4, ymm4, ymm5
AVX2_FilterVertical_16px ymm0, ymm2, ymm4, [pic(maddubsw_p1m5_256)], [pic(db20_256)], [pic(maddubsw_m5p1_256)], ymm6
vpackuswb ymm7, ymm7, ymm0
vpermq ymm7, ymm7, 11011000b
vmovdqa [p_dst], xmm7
vextracti128 [p_dst + i_dststride], ymm7, 1
lea p_dst, [p_dst + 2 * i_dststride]
sub i_height, 8
jg .width16_yloop
vzeroupper
DEINIT_X86_32_PIC
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
pop r6
%endif
ret
%undef p_src
%undef i_srcstride
%undef i_srcstride3
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height
%undef i_ycnt
;*******************************************************************************
; void McHorVer20_avx2(const uint8_t *pSrc,
; int iSrcStride,
; uint8_t *pDst,
; int iDstStride,
; int iWidth,
; int iHeight);
;*******************************************************************************
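; Horizontal-only 6-tap filter ('20' = horizontal half-sample position) on
; 8-bit source pixels, with +16 rounding and >> 5; separate paths for widths
; 4, 8 and 16.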
WELS_EXTERN McHorVer20_avx2
%define p_src r0
%define i_srcstride r1
%define p_dst r2
%define i_dststride r3
%define i_width r4
%define i_height r5
%assign push_num 0
INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
vbroadcasti128 ymm4, [pic(shufb_32435465768798A9)]
vbroadcasti128 ymm5, [pic(shufb_011267784556ABBC)]
vbroadcasti128 ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
cmp i_width, 8
je .width8
jg .width16_yloop
%xdefine i_srcstride3 i_width
%undef i_width
lea i_srcstride3, [3 * i_srcstride]
.width4_yloop:
vmovdqu xmm0, [p_src - 2]
vmovdqu xmm1, [p_src + i_srcstride - 2]
vinserti128 ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1
vinserti128 ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
lea p_src, [p_src + 4 * i_srcstride]
AVX2_FilterHorizontal_4x4px ymm0, ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
vpackuswb ymm0, ymm0, ymm0
vmovd [p_dst], xmm0
vpsrlq xmm1, xmm0, 32
vmovd [p_dst + i_dststride], xmm1
lea p_dst, [p_dst + 2 * i_dststride]
vextracti128 xmm0, ymm0, 1
vmovd [p_dst], xmm0
vpsrlq xmm1, xmm0, 32
vmovd [p_dst + i_dststride], xmm1
lea p_dst, [p_dst + 2 * i_dststride]
sub i_height, 4
jg .width4_yloop
vzeroupper
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC_KEEPDEF
ret
.width8:
lea i_srcstride3, [3 * i_srcstride]
.width8_yloop:
vmovdqu xmm0, [p_src - 2]
vmovdqu xmm1, [p_src + i_srcstride - 2]
vinserti128 ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1
vinserti128 ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
lea p_src, [p_src + 4 * i_srcstride]
AVX2_FilterHorizontal_16px ymm0, ymm4, ymm5, ymm6, ymm2, ymm3
AVX2_FilterHorizontal_16px ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
vpackuswb ymm0, ymm0, ymm1
vmovlps [p_dst], xmm0
vmovhps [p_dst + i_dststride], xmm0
lea p_dst, [p_dst + 2 * i_dststride]
vextracti128 xmm0, ymm0, 1
vmovlps [p_dst], xmm0
vmovhps [p_dst + i_dststride], xmm0
lea p_dst, [p_dst + 2 * i_dststride]
sub i_height, 4
jg .width8_yloop
vzeroupper
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC_KEEPDEF
ret
%undef i_srcstride3
.width16_yloop:
vmovdqu xmm0, [p_src - 2]
vmovdqu xmm1, [p_src + 6]
vinserti128 ymm0, ymm0, [p_src + i_srcstride - 2], 1
vinserti128 ymm1, ymm1, [p_src + i_srcstride + 6], 1
lea p_src, [p_src + 2 * i_srcstride]
AVX2_FilterHorizontal_16px ymm0, ymm4, ymm5, ymm6, ymm2, ymm3
AVX2_FilterHorizontal_16px ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
vpackuswb ymm0, ymm0, ymm1
vmovdqa [p_dst], xmm0
vextracti128 [p_dst + i_dststride], ymm0, 1
lea p_dst, [p_dst + 2 * i_dststride]
sub i_height, 2
jg .width16_yloop
vzeroupper
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height
;***********************************************************************
; void McHorVer20Width5Or9Or17_avx2(const uint8_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight);
;***********************************************************************
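; Horizontal filter for the odd widths 5, 9 and 17. Width 5 stores the packed
; row plus a byte-shifted copy at p_dst + 1; widths 9 and 17 filter the extra
; column with AVX2_FilterHorizontal_4px and store it before the overlapping
; full-width store, so only the final byte survives.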
WELS_EXTERN McHorVer20Width5Or9Or17_avx2
%define p_src r0
%define i_srcstride r1
%define p_dst r2
%define i_dststride r3
%define i_width r4
%define i_height r5
%assign push_num 0
INIT_X86_32_PIC r6
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
vbroadcasti128 ymm5, [pic(shufb_32435465768798A9)]
vbroadcasti128 ymm6, [pic(shufb_011267784556ABBC)]
vbroadcasti128 ymm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
cmp i_width, 9
je .width9
jg .width17
.width5_yloop:
vmovdqu xmm0, [p_src - 2]
vinserti128 ymm0, ymm0, [p_src + i_srcstride - 2], 1
lea p_src, [p_src + 2 * i_srcstride]
AVX2_FilterHorizontal_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
vpackuswb ymm0, ymm0, ymm0
vpsrlq xmm1, xmm0, 8
vmovd [p_dst + 1], xmm1
vmovd [p_dst], xmm0
add p_dst, i_dststride
vextracti128 xmm0, ymm0, 1
vpsrlq xmm1, xmm0, 8
vmovd [p_dst + 1], xmm1
vmovd [p_dst], xmm0
add p_dst, i_dststride
sub i_height, 2
jg .width5_yloop
vzeroupper
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC_KEEPDEF
ret
.width9:
%xdefine i_srcstride3 i_width
%undef i_width
lea i_srcstride3, [3 * i_srcstride]
.width9_yloop:
vmovdqu xmm0, [p_src - 2]
vmovdqu xmm4, [p_src + i_srcstride - 2]
vinserti128 ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1
vinserti128 ymm4, ymm4, [p_src + i_srcstride3 - 2], 1
lea p_src, [p_src + 4 * i_srcstride]
vpunpckhqdq ymm3, ymm0, ymm4
AVX2_FilterHorizontal_4px ymm3, ymm2
AVX2_FilterHorizontal_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
vpackuswb ymm3, ymm3, ymm0
vmovd [p_dst + 5], xmm3
vmovhps [p_dst], xmm3
add p_dst, i_dststride
AVX2_FilterHorizontal_16px ymm4, ymm5, ymm6, ymm7, ymm1, ymm2
vpackuswb ymm4, ymm4, ymm4
vpsrlq xmm2, xmm3, 32
vmovd [p_dst + 5], xmm2
vmovlps [p_dst], xmm4
add p_dst, i_dststride
vextracti128 xmm3, ymm3, 1
vextracti128 xmm4, ymm4, 1
vmovd [p_dst + 5], xmm3
vmovhps [p_dst], xmm3
add p_dst, i_dststride
vpsrlq xmm2, xmm3, 32
vmovd [p_dst + 5], xmm2
vmovlps [p_dst], xmm4
add p_dst, i_dststride
sub i_height, 4
jg .width9_yloop
vzeroupper
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC_KEEPDEF
ret
.width17:
lea i_srcstride3, [3 * i_srcstride]
.width17_yloop:
vmovdqu xmm0, [p_src - 2]
vmovdqu xmm3, [p_src + 6]
vinserti128 ymm0, ymm0, [p_src + i_srcstride - 2], 1
vinserti128 ymm3, ymm3, [p_src + i_srcstride + 6], 1
vmovdqa ymm4, ymm3
AVX2_FilterHorizontal_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
AVX2_FilterHorizontal_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2
vpackuswb ymm0, ymm0, ymm3
vmovdqu xmm1, [p_src + 2 * i_srcstride - 2]
vmovdqu xmm3, [p_src + 2 * i_srcstride + 6]
vinserti128 ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
vinserti128 ymm3, ymm3, [p_src + i_srcstride3 + 6], 1
lea p_src, [p_src + 4 * i_srcstride]
vpunpckhqdq ymm4, ymm4, ymm3
AVX2_FilterHorizontal_4px ymm4, ymm2
vpackuswb ymm4, ymm4, ymm4
vmovd [p_dst + 13], xmm4
vmovdqa [p_dst], xmm0
add p_dst, i_dststride
vextracti128 xmm2, ymm4, 1
vmovd [p_dst + 13], xmm2
vextracti128 [p_dst], ymm0, 1
add p_dst, i_dststride
vpsrlq xmm4, xmm4, 32
vmovd [p_dst + 13], xmm4
AVX2_FilterHorizontal_16px ymm1, ymm5, ymm6, ymm7, ymm0, ymm4
AVX2_FilterHorizontal_16px ymm3, ymm5, ymm6, ymm7, ymm0, ymm4
vpackuswb ymm1, ymm1, ymm3
vmovdqa [p_dst], xmm1
add p_dst, i_dststride
vpsrlq xmm2, xmm2, 32
vmovd [p_dst + 13], xmm2
vextracti128 [p_dst], ymm1, 1
add p_dst, i_dststride
sub i_height, 4
jg .width17_yloop
vzeroupper
POP_XMM
LOAD_6_PARA_POP
DEINIT_X86_32_PIC
ret
%undef i_srcstride3
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height
;*******************************************************************************
; void McHorVer20Width4U8ToS16_avx2(const uint8_t *pSrc,
; int iSrcStride,
; int16_t *pDst,
; int iHeight);
;*******************************************************************************
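; Horizontal filter to unrounded 16-bit intermediates for 4-wide blocks; the
; destination stride is fixed at 8 bytes (4 words), so four rows fit in one
; 32-byte store.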
WELS_EXTERN McHorVer20Width4U8ToS16_avx2
%define p_src r0
%define i_srcstride r1
%define p_dst r2
%define i_height r3
%define i_srcstride3 r4
%define i_dststride 8
%assign push_num 0
%ifdef X86_32
push r4
%assign push_num 1
%endif
INIT_X86_32_PIC r5
LOAD_4_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
sub p_src, i_srcstride
sub p_src, i_srcstride
lea i_srcstride3, [3 * i_srcstride]
vbroadcasti128 ymm4, [pic(shufb_32435465768798A9)]
vbroadcasti128 ymm5, [pic(shufb_011267784556ABBC)]
vbroadcasti128 ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
sub i_height, 3
.yloop:
vmovdqu xmm0, [p_src - 2]
vmovdqu xmm1, [p_src + i_srcstride - 2]
vinserti128 ymm0, ymm0, [p_src + 2 * i_srcstride - 2], 1
vinserti128 ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
lea p_src, [p_src + 4 * i_srcstride]
AVX2_FilterHorizontalbw_4x4px ymm0, ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
vmovdqa [p_dst], ymm0
add p_dst, 4 * i_dststride
sub i_height, 4
jg .yloop
; Filter the one remaining row after the 4-row unrolled loop.
vmovdqu xmm0, [p_src - 2]
AVX2_FilterHorizontalbw_16px xmm0, xmm4, xmm5, xmm6, xmm2, xmm3
vmovlps [p_dst], xmm0
vzeroupper
POP_XMM
LOAD_4_PARA_POP
DEINIT_X86_32_PIC
%ifdef X86_32
pop r4
%endif
ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_height
%undef i_srcstride3
%undef i_dststride
;***********************************************************************
; void McHorVer02Width4S16ToU8_avx2(const int16_t *pSrc,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iHeight);
;***********************************************************************
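; Vertical second-stage filter (16-bit intermediates to 8-bit pixels) for
; 4-wide blocks with a fixed 8-byte source stride; handles up to 8 output rows.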
WELS_EXTERN McHorVer02Width4S16ToU8_avx2
%define p_src r0
%define p_dst r1
%define i_dststride r2
%define i_height r3
%define i_dststride3 r4
%define i_srcstride 8
%assign push_num 0
%ifdef X86_32
push r4
%assign push_num 1
%endif
INIT_X86_32_PIC r5
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r3, r3d
lea i_dststride3, [3 * i_dststride]
vmovdqu ymm0, [p_src + 0 * i_srcstride]
vmovdqu ymm1, [p_src + 1 * i_srcstride]
vmovdqu ymm2, [p_src + 2 * i_srcstride]
vmovdqu ymm3, [p_src + 3 * i_srcstride]
vmovdqu ymm4, [p_src + 4 * i_srcstride]
vmovdqu ymm5, [p_src + 5 * i_srcstride]
vmovdqu ymm6, [p_src + 6 * i_srcstride]
AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm7
vpackuswb ymm0, ymm0, ymm0
vmovd [p_dst], xmm0
vpsrlq xmm7, xmm0, 32
vmovd [p_dst + i_dststride], xmm7
vextracti128 xmm0, ymm0, 1
vmovd [p_dst + 2 * i_dststride], xmm0
vpsrlq xmm7, xmm0, 32
vmovd [p_dst + i_dststride3], xmm7
cmp i_height, 4
jle .done
lea p_dst, [p_dst + 4 * i_dststride]
vmovdqu ymm7, [p_src + 7 * i_srcstride]
vmovdqu ymm0, [p_src + 8 * i_srcstride]
vmovdqu ymm1, [p_src + 9 * i_srcstride]
AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm3
vpackuswb ymm4, ymm4, ymm4
vmovd [p_dst], xmm4
vpsrlq xmm3, xmm4, 32
vmovd [p_dst + i_dststride], xmm3
vextracti128 xmm4, ymm4, 1
vmovd [p_dst + 2 * i_dststride], xmm4
vpsrlq xmm3, xmm4, 32
vmovd [p_dst + i_dststride3], xmm3
.done:
vzeroupper
POP_XMM
LOAD_4_PARA_POP
DEINIT_X86_32_PIC
%ifdef X86_32
pop r4
%endif
ret
%undef p_src
%undef p_dst
%undef i_dststride
%undef i_height
%undef i_srcstride
%undef i_dststride3
;*******************************************************************************
; void McHorVer20Width8U8ToS16_avx2(const uint8_t *pSrc,
; int iSrcStride,
; int16_t *pDst,
; int iHeight);
;*******************************************************************************
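; Horizontal filter to unrounded 16-bit intermediates for 8-wide blocks; fixed
; 16-byte destination stride, two rows per iteration plus an optional last row.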
WELS_EXTERN McHorVer20Width8U8ToS16_avx2
%define p_src r0
%define i_srcstride r1
%define p_dst r2
%define i_height r3
%define i_dststride 16
%assign push_num 0
INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 6
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
sub p_src, i_srcstride
sub p_src, i_srcstride
vbroadcasti128 ymm3, [pic(shufb_32435465768798A9)]
vbroadcasti128 ymm4, [pic(shufb_011267784556ABBC)]
vbroadcasti128 ymm5, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
sub i_height, 1
.yloop:
vmovdqu xmm0, [p_src - 2]
vinserti128 ymm0, ymm0, [p_src + i_srcstride - 2], 1
lea p_src, [p_src + 2 * i_srcstride]
AVX2_FilterHorizontalbw_16px ymm0, ymm3, ymm4, ymm5, ymm1, ymm2
vmovdqu [p_dst], ymm0
add p_dst, 2 * i_dststride
sub i_height, 2
jg .yloop
jl .done
vmovdqu xmm0, [p_src - 2]
AVX2_FilterHorizontalbw_16px xmm0, xmm3, xmm4, xmm5, xmm1, xmm2
vmovdqa [p_dst], xmm0
.done:
vzeroupper
POP_XMM
LOAD_4_PARA_POP
DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_height
%undef i_dststride
;***********************************************************************
; void McHorVer02Width5S16ToU8_avx2(const int16_t *pSrc,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iHeight);
;***********************************************************************
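; Vertical second-stage filter for 5-wide blocks (fixed 16-byte source
; stride). Each row stores a right-shifted copy at p_dst + 1 and then the
; unshifted 4 bytes at p_dst, which together cover pixels 0..4.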
WELS_EXTERN McHorVer02Width5S16ToU8_avx2
%define p_src r0
%define p_dst r1
%define i_dststride r2
%define i_height r3
%define i_srcstride 16
%assign push_num 0
INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r3, r3d
vmovdqu ymm0, [p_src + 0 * i_srcstride]
vmovdqu ymm2, [p_src + 2 * i_srcstride]
vmovdqu ymm4, [p_src + 4 * i_srcstride]
vmovdqu ymm6, [p_src + 6 * i_srcstride]
vperm2i128 ymm1, ymm0, ymm2, 00100001b
vperm2i128 ymm3, ymm2, ymm4, 00100001b
vperm2i128 ymm5, ymm4, ymm6, 00100001b
AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm7
vpackuswb ymm0, ymm0, ymm0
vpsrlq xmm7, xmm0, 8
vmovd [p_dst + 1], xmm7
vmovd [p_dst], xmm0
add p_dst, i_dststride
vextracti128 xmm0, ymm0, 1
vpsrlq xmm7, xmm0, 8
vmovd [p_dst + 1], xmm7
vmovd [p_dst], xmm0
add p_dst, i_dststride
vmovdqu ymm7, [p_src + 7 * i_srcstride]
vmovdqu ymm0, [p_src + 8 * i_srcstride]
AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm1
vpackuswb ymm2, ymm2, ymm2
vpsrlq xmm1, xmm2, 8
vmovd [p_dst + 1], xmm1
vmovd [p_dst], xmm2
add p_dst, i_dststride
vextracti128 xmm2, ymm2, 1
vpsrlq xmm1, xmm2, 8
vmovd [p_dst + 1], xmm1
vmovd [p_dst], xmm2
add p_dst, i_dststride
vmovdqu ymm1, [p_src + 9 * i_srcstride]
vmovdqu ymm2, [p_src + 10 * i_srcstride]
AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm3
vpackuswb ymm4, ymm4, ymm4
vpsrlq xmm3, xmm4, 8
vmovd [p_dst + 1], xmm3
vmovd [p_dst], xmm4
cmp i_height, 5
jle .done
add p_dst, i_dststride
vextracti128 xmm4, ymm4, 1
vpsrlq xmm3, xmm4, 8
vmovd [p_dst + 1], xmm3
vmovd [p_dst], xmm4
add p_dst, i_dststride
vmovdqu ymm3, [p_src + 11 * i_srcstride]
vmovdqu xmm4, [p_src + 12 * i_srcstride]
AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm5
vpackuswb ymm6, ymm6, ymm6
vpsrlq xmm5, xmm6, 8
vmovd [p_dst + 1], xmm5
vmovd [p_dst], xmm6
add p_dst, i_dststride
vextracti128 xmm6, ymm6, 1
vpsrlq xmm5, xmm6, 8
vmovd [p_dst + 1], xmm5
vmovd [p_dst], xmm6
add p_dst, i_dststride
vmovdqu xmm5, [p_src + 13 * i_srcstride]
AVX2_FilterVerticalw_16px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7
vpackuswb xmm0, xmm0, xmm0
vpsrlq xmm7, xmm0, 8
vmovd [p_dst + 1], xmm7
vmovd [p_dst], xmm0
.done:
vzeroupper
POP_XMM
LOAD_4_PARA_POP
DEINIT_X86_32_PIC
ret
%undef p_src
%undef p_dst
%undef i_dststride
%undef i_height
%undef i_srcstride
;***********************************************************************
; void McHorVer02Width8S16ToU8_avx2(const int16_t *pSrc,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iHeight);
;***********************************************************************
WELS_EXTERN McHorVer02Width8S16ToU8_avx2
%define p_src r0
%define p_dst r1
%define i_dststride r2
%define i_height r3
%define i_dststride3 r4
%define i_srcstride 16
%assign push_num 0
%ifdef X86_32
push r4
%assign push_num 1
%endif
INIT_X86_32_PIC r5
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r3, r3d
lea i_dststride3, [3 * i_dststride]
vmovdqa ymm0, [p_src + 0 * i_srcstride]
vmovdqa ymm2, [p_src + 2 * i_srcstride]
vmovdqa ymm4, [p_src + 4 * i_srcstride]
vperm2i128 ymm1, ymm0, ymm2, 00100001b
vperm2i128 ymm3, ymm2, ymm4, 00100001b
.yloop:
vmovdqa ymm6, [p_src + 6 * i_srcstride]
vperm2i128 ymm5, ymm4, ymm6, 00100001b
AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm7
vmovdqu ymm7, [p_src + 7 * i_srcstride]
AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm1
vpackuswb ymm1, ymm0, ymm2
vmovdqa ymm0, [p_src + 8 * i_srcstride]
vextracti128 xmm2, ymm1, 1
vmovlps [p_dst], xmm1
vmovlps [p_dst + i_dststride], xmm2
vmovhps [p_dst + 2 * i_dststride], xmm1
vmovhps [p_dst + i_dststride3], xmm2
cmp i_height, 4
jle .done
lea p_dst, [p_dst + 4 * i_dststride]
vmovdqu ymm1, [p_src + 9 * i_srcstride]
vmovdqa ymm2, [p_src + 10 * i_srcstride]
AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm3
vmovdqu ymm3, [p_src + 11 * i_srcstride]
AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm5
vpackuswb ymm5, ymm4, ymm6
vmovdqa ymm4, [p_src + 12 * i_srcstride]
add p_src, 8 * i_srcstride
vextracti128 xmm6, ymm5, 1
vmovlps [p_dst], xmm5
vmovlps [p_dst + i_dststride], xmm6
vmovhps [p_dst + 2 * i_dststride], xmm5
vmovhps [p_dst + i_dststride3], xmm6
lea p_dst, [p_dst + 4 * i_dststride]
sub i_height, 8
jg .yloop
.done:
vzeroupper
POP_XMM
LOAD_4_PARA_POP
DEINIT_X86_32_PIC
%ifdef X86_32
pop r4
%endif
ret
%undef p_src
%undef p_dst
%undef i_dststride
%undef i_height
%undef i_dststride3
%undef i_srcstride
;*******************************************************************************
; void McHorVer20Width16U8ToS16_avx2(const uint8_t *pSrc,
; int32_t iSrcStride,
; int16_t *pDst,
; int32_t iHeight);
;*******************************************************************************
WELS_EXTERN McHorVer20Width16U8ToS16_avx2
%define p_src r0
%define i_srcstride r1
%define p_dst r2
%define i_height r3
%define i_dststride 32
%assign push_num 0
INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 7
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
sub p_src, i_srcstride
sub p_src, i_srcstride
vbroadcasti128 ymm4, [pic(shufb_32435465768798A9)]
vbroadcasti128 ymm5, [pic(shufb_011267784556ABBC)]
vbroadcasti128 ymm6, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
sub i_height, 1
.yloop:
vmovdqu xmm0, [p_src - 2]
vinserti128 ymm0, ymm0, [p_src + 6], 1
vmovdqu xmm1, [p_src + i_srcstride - 2]
vinserti128 ymm1, ymm1, [p_src + i_srcstride + 6], 1
lea p_src, [p_src + 2 * i_srcstride]
AVX2_FilterHorizontalbw_16px ymm0, ymm4, ymm5, ymm6, ymm2, ymm3
vmovdqa [p_dst], ymm0
AVX2_FilterHorizontalbw_16px ymm1, ymm4, ymm5, ymm6, ymm2, ymm3
vmovdqa [p_dst + i_dststride], ymm1
add p_dst, 2 * i_dststride
sub i_height, 2
jg .yloop
jl .done
vmovdqu xmm0, [p_src - 2]
vinserti128 ymm0, ymm0, [p_src + 6], 1
AVX2_FilterHorizontalbw_16px ymm0, ymm4, ymm5, ymm6, ymm1, ymm2
vmovdqa [p_dst], ymm0
.done:
vzeroupper
POP_XMM
LOAD_4_PARA_POP
DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_height
%undef i_dststride
;***********************************************************************
; void McHorVer02Width9S16ToU8_avx2(const int16_t *pSrc,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iHeight);
;***********************************************************************
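; Vertical second-stage filter for 9-wide blocks (fixed 32-byte source
; stride). The 9th column lives in the upper lane: vpsllq by 56 moves it to
; the top byte so the store at p_dst + 1 lands it at column 8, ahead of the
; overlapping 8-byte store at p_dst.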
WELS_EXTERN McHorVer02Width9S16ToU8_avx2
%define p_src r0
%define p_dst r1
%define i_dststride r2
%define i_height r3
%define i_srcstride 32
%assign push_num 0
INIT_X86_32_PIC r4
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r2, r2d
SIGN_EXTENSION r3, r3d
vmovdqa ymm0, [p_src + 0 * i_srcstride]
vmovdqa ymm1, [p_src + 1 * i_srcstride]
vmovdqa ymm2, [p_src + 2 * i_srcstride]
vmovdqa ymm3, [p_src + 3 * i_srcstride]
vmovdqa ymm4, [p_src + 4 * i_srcstride]
sub i_height, 1
.height_loop:
vmovdqa ymm5, [p_src + 5 * i_srcstride]
AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6
vmovdqa ymm6, [p_src + 6 * i_srcstride]
AVX2_FilterVerticalw_16px ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
vmovdqa ymm7, [p_src + 7 * i_srcstride]
vpackuswb ymm0, ymm0, ymm1
vextracti128 xmm1, ymm0, 1
vpsllq xmm1, xmm1, 56
vmovlps [p_dst + 1], xmm1
vmovlps [p_dst], xmm0
add p_dst, i_dststride
vmovhps [p_dst + 1], xmm1
vmovhps [p_dst], xmm0
add p_dst, i_dststride
AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm0
vmovdqa ymm0, [p_src + 8 * i_srcstride]
AVX2_FilterVerticalw_16px ymm3, ymm4, ymm5, ymm6, ymm7, ymm0, ymm1
vpackuswb ymm2, ymm2, ymm3
vextracti128 xmm3, ymm2, 1
vpsllq xmm3, xmm3, 56
vmovlps [p_dst + 1], xmm3
vmovlps [p_dst], xmm2
add p_dst, i_dststride
vmovhps [p_dst + 1], xmm3
vmovhps [p_dst], xmm2
add p_dst, i_dststride
vmovdqa ymm1, [p_src + 9 * i_srcstride]
AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm2
vmovdqa ymm2, [p_src + 10 * i_srcstride]
AVX2_FilterVerticalw_16px ymm5, ymm6, ymm7, ymm0, ymm1, ymm2, ymm3
vmovdqa ymm3, [p_src + 11 * i_srcstride]
vpackuswb ymm4, ymm4, ymm5
vextracti128 xmm5, ymm4, 1
vpsllq xmm5, xmm5, 56
vmovlps [p_dst + 1], xmm5
vmovlps [p_dst], xmm4
cmp i_height, 4
jle .done
add p_dst, i_dststride
vmovhps [p_dst + 1], xmm5
vmovhps [p_dst], xmm4
add p_dst, i_dststride
AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm4
vmovdqa ymm4, [p_src + 12 * i_srcstride]
add p_src, 8 * i_srcstride
AVX2_FilterVerticalw_16px ymm7, ymm0, ymm1, ymm2, ymm3, ymm4, ymm5
vpackuswb ymm6, ymm6, ymm7
vextracti128 xmm7, ymm6, 1
vpsllq xmm7, xmm7, 56
vmovlps [p_dst + 1], xmm7
vmovlps [p_dst], xmm6
add p_dst, i_dststride
vmovhps [p_dst + 1], xmm7
vmovhps [p_dst], xmm6
add p_dst, i_dststride
sub i_height, 8
jg .height_loop
vmovdqa ymm5, [p_src + 5 * i_srcstride]
AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6
vpackuswb ymm0, ymm0, ymm0
vextracti128 xmm1, ymm0, 1
vpsllq xmm1, xmm1, 56
vmovlps [p_dst + 1], xmm1
vmovlps [p_dst], xmm0
.done:
vzeroupper
POP_XMM
LOAD_4_PARA_POP
DEINIT_X86_32_PIC
ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_height
;*******************************************************************************
; void McHorVer20Width17U8ToS16_avx2(const uint8_t *pSrc,
; int32_t iSrcStride,
; int16_t *pDst,
; int32_t iHeight);
;*******************************************************************************
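; Horizontal filter to 16-bit intermediates for 17-wide blocks (fixed 64-byte
; destination stride). The 17th column uses AVX2_FilterHorizontalbw_4px with
; the -32768 scale, which leaves the sum in the high word of each dword, so
; the overlapping 16-byte stores keep only that final element.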
WELS_EXTERN McHorVer20Width17U8ToS16_avx2
%define p_src r0
%define i_srcstride r1
%define p_dst r2
%define i_height r3
%define i_srcstride3 r4
%define i_dststride 64
%assign push_num 0
%ifdef X86_32
push r4
%assign push_num 1
%endif
INIT_X86_32_PIC r5
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
sub p_src, i_srcstride
sub p_src, i_srcstride
lea i_srcstride3, [3 * i_srcstride]
vbroadcasti128 ymm5, [pic(shufb_32435465768798A9)]
vbroadcasti128 ymm6, [pic(shufb_011267784556ABBC)]
vbroadcasti128 ymm7, [pic(maddubsw_p1m5_p1m5_m5p1_m5p1_128)]
sub i_height, 3
.yloop:
vmovdqu xmm0, [p_src - 2]
vmovdqu xmm3, [p_src + 6]
vinserti128 ymm0, ymm0, [p_src + i_srcstride - 2], 1
vinserti128 ymm3, ymm3, [p_src + i_srcstride + 6], 1
vmovdqa ymm4, ymm3
AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
vmovdqa [p_dst], xmm0
vextracti128 [p_dst + i_dststride], ymm0, 1
AVX2_FilterHorizontalbw_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2
vmovdqu xmm1, [p_src + 2 * i_srcstride - 2]
vmovdqu xmm0, [p_src + 2 * i_srcstride + 6]
vinserti128 ymm1, ymm1, [p_src + i_srcstride3 - 2], 1
vinserti128 ymm0, ymm0, [p_src + i_srcstride3 + 6], 1
lea p_src, [p_src + 4 * i_srcstride]
vpunpckhqdq ymm4, ymm4, ymm0
AVX2_FilterHorizontalbw_4px ymm4, [pic(dwm32768_256)], ymm2
vmovlps [p_dst + 26], xmm4
vmovdqa [p_dst + 16], xmm3
vextracti128 xmm2, ymm4, 1
vmovlps [p_dst + i_dststride + 26], xmm2
vextracti128 [p_dst + i_dststride + 16], ymm3, 1
vmovhps [p_dst + 2 * i_dststride + 26], xmm4
AVX2_FilterHorizontalbw_16px ymm1, ymm5, ymm6, ymm7, ymm3, ymm4
vmovdqa [p_dst + 2 * i_dststride], xmm1
AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm3, ymm4
vmovdqa [p_dst + 2 * i_dststride + 16], xmm0
vextracti128 [p_dst + 3 * i_dststride], ymm1, 1
vmovhps [p_dst + 3 * i_dststride + 26], xmm2
vextracti128 [p_dst + 3 * i_dststride + 16], ymm0, 1
add p_dst, 4 * i_dststride
sub i_height, 4
jg .yloop
; Filter the remaining 2 rows after the 4-row unrolled loop.
vmovdqu xmm0, [p_src - 2]
vinserti128 ymm0, ymm0, [p_src + 6], 1
vmovdqu xmm3, [p_src + i_srcstride - 2]
vinserti128 ymm3, ymm3, [p_src + i_srcstride + 6], 1
vpunpckhqdq ymm4, ymm0, ymm3
AVX2_FilterHorizontalbw_4px ymm4, [pic(dwm32768_256)], ymm2
AVX2_FilterHorizontalbw_16px ymm0, ymm5, ymm6, ymm7, ymm1, ymm2
AVX2_FilterHorizontalbw_16px ymm3, ymm5, ymm6, ymm7, ymm1, ymm2
vextracti128 xmm4, ymm4, 1
vmovlps [p_dst + 26], xmm4
vmovdqa [p_dst], ymm0
vmovhps [p_dst + i_dststride + 26], xmm4
vmovdqa [p_dst + i_dststride], ymm3
vzeroupper
POP_XMM
LOAD_4_PARA_POP
DEINIT_X86_32_PIC
%ifdef X86_32
pop r4
%endif
ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_height
%undef i_srcstride3
;***********************************************************************
; void McHorVer02Width16Or17S16ToU8_avx2(const int16_t *pSrc,
; int32_t iSrcStride,
; uint8_t *pDst,
; int32_t iDstStride,
; int32_t iWidth,
; int32_t iHeight);
;***********************************************************************
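; Vertical second-stage filter for 16- and 17-wide blocks. Width 17 first
; runs its last column through the unaligned path; the aligned loop below
; then handles 16 columns per row.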
WELS_EXTERN McHorVer02Width16Or17S16ToU8_avx2
%define p_src r0
%define i_srcstride r1
%define p_dst r2
%define i_dststride r3
%ifdef X86_32_PICASM
%define i_width dword arg5
%else
%define i_width r4
%endif
%define i_height r5
%define i_srcstride3 r6
%assign push_num 0
%ifdef X86_32
push r6
%assign push_num 1
%endif
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
SIGN_EXTENSION r5, r5d
INIT_X86_32_PIC_NOPRESERVE r4
sub i_height, 1
lea i_srcstride3, [3 * i_srcstride]
test i_width, 1
jz .align_begin
push i_height
push p_src
push p_dst
%assign push_num push_num + 3
%ifdef X86_32_PICASM
add p_src, i_width
add p_src, i_width
sub p_src, 2
%else
lea p_src, [p_src + 2 * i_width - 2]
%endif
add p_dst, i_width
vmovd xmm0, [p_src]
vpunpcklwd xmm0, xmm0, [p_src + i_srcstride]
vmovd xmm1, [p_src + 2 * i_srcstride]
add p_src, i_srcstride3
vpunpcklwd xmm1, xmm1, [p_src]
vpunpckldq xmm0, xmm0, xmm1
vmovd xmm1, [p_src + i_srcstride]
vpunpcklwd xmm1, xmm1, [p_src + 2 * i_srcstride]
vmovd xmm2, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
vpunpcklwd xmm2, xmm2, [p_src]
vpunpckldq xmm1, xmm1, xmm2
vpunpcklqdq xmm0, xmm0, xmm1
.height_loop_unalign:
vmovd xmm1, [p_src + i_srcstride]
vpalignr xmm1, xmm1, xmm0, 2
vmovd xmm2, [p_src + 2 * i_srcstride]
vpalignr xmm2, xmm2, xmm1, 2
vmovd xmm3, [p_src + i_srcstride3]
vpalignr xmm3, xmm3, xmm2, 2
lea p_src, [p_src + 4 * i_srcstride]
vmovd xmm4, [p_src]
vpalignr xmm4, xmm4, xmm3, 2
vmovd xmm5, [p_src + i_srcstride]
vpalignr xmm5, xmm5, xmm4, 2
AVX2_FilterVerticalw_16px xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7
vpackuswb xmm0, xmm0, xmm0
vpslld xmm6, xmm0, 24
vmovd [p_dst - 4], xmm6
vmovlps [p_dst + 4 * i_dststride - 8], xmm6
add p_dst, i_dststride
vpslld xmm6, xmm0, 16
vmovd [p_dst - 4], xmm6
vmovlps [p_dst + 4 * i_dststride - 8], xmm6
add p_dst, i_dststride
vpslld xmm6, xmm0, 8
vmovd [p_dst - 4], xmm6
vmovd [p_dst + i_dststride - 4], xmm0
lea p_dst, [p_dst + 4 * i_dststride]
vmovlps [p_dst - 8], xmm6
vmovlps [p_dst + i_dststride - 8], xmm0
lea p_dst, [p_dst + 2 * i_dststride]
sub i_height, 8
jle .height_loop_unalign_exit
vmovd xmm1, [p_src + 2 * i_srcstride]
vpalignr xmm1, xmm1, xmm5, 2
vmovd xmm0, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
vpunpcklwd xmm0, xmm0, [p_src]
vpalignr xmm0, xmm0, xmm1, 4
jmp .height_loop_unalign
.height_loop_unalign_exit:
vpbroadcastq xmm6, [p_src + 2 * i_srcstride - 6]
AVX2_FilterVerticalw_16px xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
vpackuswb xmm1, xmm1, xmm1
vmovlps [p_dst - 8], xmm1
pop p_dst
pop p_src
pop i_height
%assign push_num push_num - 3
.align_begin:
vmovdqa ymm0, [p_src]
vmovdqa ymm1, [p_src + i_srcstride]
vmovdqa ymm2, [p_src + 2 * i_srcstride]
vmovdqa ymm3, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
vmovdqa ymm4, [p_src]
.height_loop:
vmovdqa ymm5, [p_src + i_srcstride]
AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6
vmovdqa ymm6, [p_src + 2 * i_srcstride]
AVX2_FilterVerticalw_16px ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7
vmovdqa ymm7, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
vpackuswb ymm0, ymm0, ymm1
vpermq ymm0, ymm0, 11011000b
vmovdqa [p_dst], xmm0
vextracti128 [p_dst + i_dststride], ymm0, 1
lea p_dst, [p_dst + 2 * i_dststride]
AVX2_FilterVerticalw_16px ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm0
vmovdqa ymm0, [p_src]
AVX2_FilterVerticalw_16px ymm3, ymm4, ymm5, ymm6, ymm7, ymm0, ymm1
vpackuswb ymm2, ymm2, ymm3
vpermq ymm2, ymm2, 11011000b
vmovdqa [p_dst], xmm2
vextracti128 [p_dst + i_dststride], ymm2, 1
lea p_dst, [p_dst + 2 * i_dststride]
vmovdqa ymm1, [p_src + i_srcstride]
AVX2_FilterVerticalw_16px ymm4, ymm5, ymm6, ymm7, ymm0, ymm1, ymm2
vmovdqa ymm2, [p_src + 2 * i_srcstride]
AVX2_FilterVerticalw_16px ymm5, ymm6, ymm7, ymm0, ymm1, ymm2, ymm3
vmovdqa ymm3, [p_src + i_srcstride3]
lea p_src, [p_src + 4 * i_srcstride]
vpackuswb ymm4, ymm4, ymm5
vpermq ymm4, ymm4, 11011000b
vmovdqa [p_dst], xmm4
vextracti128 [p_dst + i_dststride], ymm4, 1
lea p_dst, [p_dst + 2 * i_dststride]
AVX2_FilterVerticalw_16px ymm6, ymm7, ymm0, ymm1, ymm2, ymm3, ymm4
vmovdqa ymm4, [p_src]
AVX2_FilterVerticalw_16px ymm7, ymm0, ymm1, ymm2, ymm3, ymm4, ymm5
vpackuswb ymm6, ymm6, ymm7
vpermq ymm6, ymm6, 11011000b
vmovdqa [p_dst], xmm6
vextracti128 [p_dst + i_dststride], ymm6, 1
lea p_dst, [p_dst + 2 * i_dststride]
sub i_height, 8
jg .height_loop
jl .done
vmovdqa ymm5, [p_src + i_srcstride]
AVX2_FilterVerticalw_16px ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6
vpackuswb ymm0, ymm0, ymm0
vpermq ymm0, ymm0, 11011000b
vmovdqa [p_dst], xmm0
.done:
vzeroupper
DEINIT_X86_32_PIC
POP_XMM
LOAD_6_PARA_POP
%ifdef X86_32
pop r6
%endif
ret
%undef p_src
%undef i_srcstride
%undef p_dst
%undef i_dststride
%undef i_width
%undef i_height
%undef i_srcstride3
%endif ; HAVE_AVX2