ref: 9e4ab64c738171f4329a4c1a8c12890d783c5bf5
dir: /codec/common/mc_chroma.asm/
;*!
;* \copy
;*     Copyright (c)  2004-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  mc_chroma.asm
;*
;*  Abstract
;*      mmx motion compensation for chroma
;*
;*  History
;*      10/13/2004 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
;***********************************************************************
; Local Data (Read Only)
;***********************************************************************
SECTION .rodata align=16
;***********************************************************************
; Various memory constants (trigonometric values or rounding values)
;***********************************************************************
ALIGN 16
h264_d0x20_sse2:
	dw 32,32,32,32,32,32,32,32
ALIGN 16
h264_d0x20_mmx:
	dw 32,32,32,32
;=============================================================================
; Code
;=============================================================================
SECTION .text
ALIGN 16
;*******************************************************************************
; void McChromaWidthEq4_mmx( const uint8_t *src,
;							int32_t iSrcStride,
;							uint8_t *pDst,
;							int32_t iDstStride,
;							const uint8_t *pABCD,
;							int32_t iHeigh );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq4_mmx
McChromaWidthEq4_mmx:
	;push esi
	;push edi
	;push ebx
	%assign  push_num 0
	LOAD_6_PARA
%ifndef X86_32
	movsx	r1, r1d
	movsx	r3, r3d
	movsx	r5, r5d
%endif
	;mov eax, [esp +12 + 20]
	movd mm3, [r4];	[eax]
	WELS_Zero mm7
	punpcklbw mm3, mm3
	movq      mm4, mm3
	punpcklwd mm3, mm3
	punpckhwd mm4, mm4
	movq	  mm5, mm3
	punpcklbw mm3, mm7
	punpckhbw mm5, mm7
	movq	  mm6, mm4
	punpcklbw mm4, mm7
	punpckhbw mm6, mm7
	;mov esi, [esp +12+ 4]
	;mov eax, [esp + 12 + 8]
	;mov edi, [esp + 12 + 12]
	;mov edx, [esp + 12 + 16]
    ;mov ecx, [esp + 12 + 24]
	lea r4, [r0 + r1] ;lea ebx, [esi + eax]
	movd mm0, [r0]
	movd mm1, [r0+1]
	punpcklbw mm0, mm7
	punpcklbw mm1, mm7
.xloop:
	pmullw mm0, mm3
	pmullw mm1, mm5
	paddw  mm0, mm1
	movd  mm1, [r4]
	punpcklbw mm1, mm7
	movq mm2, mm1
	pmullw mm1, mm4
	paddw mm0, mm1
	movd mm1, [r4+1]
	punpcklbw mm1, mm7
	movq mm7, mm1
	pmullw mm1,mm6
	paddw mm0, mm1
	movq mm1,mm7
	paddw mm0, [h264_d0x20_mmx]
	psrlw mm0, 6
	WELS_Zero mm7
	packuswb mm0, mm7
	movd [r2], mm0
	movq mm0, mm2
	lea r2, [r2 + r3]
	lea r4, [r4 + r1]
	dec r5
	jnz near .xloop
	WELSEMMS
	LOAD_6_PARA_POP
	;pop ebx
	;pop edi
	;pop esi
	ret
ALIGN 16
;*******************************************************************************
; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
;						int32_t iSrcStride,
;						uint8_t *pDst,
;						int32_t iDstStride,
;						const uint8_t *pABCD,
;						int32_t iheigh );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq8_sse2
McChromaWidthEq8_sse2:
	;push esi
	;push edi
	;push ebx
	%assign  push_num 0
	LOAD_6_PARA
%ifndef X86_32
	movsx	r1, r1d
	movsx	r3, r3d
	movsx	r5, r5d
%endif
	;mov eax, [esp +12 + 20]
	movd xmm3, [r4]
	WELS_Zero xmm7
	punpcklbw  xmm3, xmm3
	punpcklwd  xmm3, xmm3
	movdqa	   xmm4, xmm3
	punpckldq  xmm3, xmm3
	punpckhdq  xmm4, xmm4
	movdqa     xmm5, xmm3
	movdqa	   xmm6, xmm4
	punpcklbw  xmm3, xmm7
	punpckhbw  xmm5, xmm7
	punpcklbw  xmm4, xmm7
	punpckhbw  xmm6, xmm7
	;mov esi, [esp +12+ 4]
	;mov eax, [esp + 12 + 8]
	;mov edi, [esp + 12 + 12]
	;mov edx, [esp + 12 + 16]
    ;mov ecx, [esp + 12 + 24]
	lea r4, [r0 + r1] ;lea ebx, [esi + eax]
	movq xmm0, [r0]
	movq xmm1, [r0+1]
	punpcklbw xmm0, xmm7
	punpcklbw xmm1, xmm7
.xloop:
	pmullw xmm0, xmm3
	pmullw xmm1, xmm5
	paddw  xmm0, xmm1
	movq  xmm1, [r4]
	punpcklbw xmm1, xmm7
	movdqa xmm2, xmm1
	pmullw xmm1, xmm4
	paddw xmm0, xmm1
	movq xmm1, [r4+1]
	punpcklbw xmm1, xmm7
	movdqa xmm7, xmm1
	pmullw xmm1, xmm6
	paddw xmm0, xmm1
	movdqa xmm1,xmm7
	paddw xmm0, [h264_d0x20_sse2]
	psrlw xmm0, 6
	WELS_Zero xmm7
	packuswb xmm0, xmm7
	movq [r2], xmm0
	movdqa xmm0, xmm2
	lea r2, [r2 + r3]
	lea r4, [r4 + r1]
	dec r5
	jnz near .xloop
	LOAD_6_PARA_POP
	;pop ebx
	;pop edi
	;pop esi
	ret
ALIGN 16
;***********************************************************************
; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
;						 int32_t iSrcStride,
;                        uint8_t *pDst,
;                        int32_t iDstStride,
;                        const uint8_t *pABCD,
;					     int32_t iHeigh);
;***********************************************************************
WELS_EXTERN McChromaWidthEq8_ssse3
McChromaWidthEq8_ssse3:
	;push ebx
	;push esi
	;push edi
	%assign  push_num 0
	LOAD_6_PARA
%ifndef X86_32
	movsx	r1, r1d
	movsx	r3, r3d
	movsx	r5, r5d
%endif
	;mov eax, [esp + 12 + 20]
    pxor      xmm7, xmm7
    movd   xmm5, [r4]
    punpcklwd xmm5, xmm5
    punpckldq xmm5, xmm5
    movdqa    xmm6, xmm5
    punpcklqdq xmm5, xmm5
    punpckhqdq xmm6, xmm6
	;mov eax, [esp + 12 + 4]
	;mov edx, [esp + 12 + 8]
	;mov esi, [esp + 12 + 12]
	;mov edi, [esp + 12 + 16]
    ;mov ecx, [esp + 12 + 24]
    sub r2, r3 ;sub esi, edi
    sub r2, r3
	movdqa xmm7, [h264_d0x20_sse2]
	movdqu xmm0, [r0]
	movdqa xmm1, xmm0
	psrldq xmm1, 1
	punpcklbw xmm0, xmm1
.hloop_chroma:
	lea	r2, [r2+2*r3]
	movdqu xmm2, [r0+r1]
	movdqa xmm3, xmm2
	psrldq xmm3, 1
	punpcklbw xmm2, xmm3
	movdqa      xmm4, xmm2
    pmaddubsw  xmm0, xmm5
    pmaddubsw  xmm2, xmm6
    paddw      xmm0, xmm2
    paddw      xmm0, xmm7
	psrlw      xmm0, 6
    packuswb   xmm0, xmm0
    movq       [r2],xmm0
    lea r0, [r0+2*r1]
    movdqu xmm2, [r0]
    movdqa xmm3, xmm2
    psrldq xmm3, 1
    punpcklbw xmm2, xmm3
    movdqa      xmm0, xmm2
    pmaddubsw  xmm4, xmm5
    pmaddubsw  xmm2, xmm6
    paddw      xmm4, xmm2
    paddw      xmm4, xmm7
	psrlw      xmm4, 6
    packuswb   xmm4, xmm4
    movq       [r2+r3],xmm4
	sub r5, 2
	jnz .hloop_chroma
	LOAD_6_PARA_POP
	;pop edi
	;pop esi
	;pop ebx
	ret