shithub: openh264

ref: b2066e1dc2c93b00400acfed1ef9ea041a0cf832
dir: /codec/decoder/core/asm/deblock.asm/

View raw version
;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  deblock.asm
;*
;*  Abstract
;*      edge loop
;*
;*  History
;*      08/07/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"
BITS 32

;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************

%ifdef FORMAT_COFF
SECTION .rodata pData
%else
SECTION .rodata align=16
%endif

SECTION .text

;********************************************************************************
;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                             int32_t iAlpha, int32_t iBeta)
;********************************************************************************
WELS_EXTERN   DeblockChromaEq4V_sse2

ALIGN  16
DeblockChromaEq4V_sse2:
  push        ebp
  mov         ebp,esp
  and         esp,0FFFFFFF0h
  sub         esp,68h
  mov         edx,[ebp+10h]      ;  iStride
  mov         eax,[ebp+8]        ;  pPixCb
  mov         ecx,[ebp+0Ch]      ;  pPixCr
  movq        xmm4,[ecx]
  movq        xmm5,[edx+ecx]
  push        esi
  push        edi
  lea         esi,[edx+edx]
  mov         edi,eax
  sub         edi,esi
  movq        xmm1,[edi]
  mov         edi,ecx
  sub         edi,esi
  movq        xmm2,[edi]
  punpcklqdq  xmm1,xmm2
  mov         esi,eax
  sub         esi,edx
  movq        xmm2,[esi]
  mov         edi,ecx
  sub         edi,edx
  movq        xmm3,[edi]
  punpcklqdq  xmm2,xmm3
  movq        xmm3,[eax]
  punpcklqdq  xmm3,xmm4
  movq        xmm4,[edx+eax]
  mov       edx, [ebp + 14h]
  punpcklqdq  xmm4,xmm5
  movd        xmm5,edx
  mov       edx, [ebp + 18h]
  pxor        xmm0,xmm0
  movdqa      xmm6,xmm5
  punpcklwd   xmm6,xmm5
  pshufd      xmm5,xmm6,0
  movd        xmm6,edx
  movdqa      xmm7,xmm6
  punpcklwd   xmm7,xmm6
  pshufd      xmm6,xmm7,0
  movdqa      xmm7,xmm1
  punpckhbw   xmm1,xmm0
  punpcklbw   xmm7,xmm0
  movdqa      [esp+40h],xmm1
  movdqa      [esp+60h],xmm7
  movdqa      xmm7,xmm2
  punpcklbw   xmm7,xmm0
  movdqa      [esp+10h],xmm7
  movdqa      xmm7,xmm3
  punpcklbw   xmm7,xmm0
  punpckhbw   xmm3,xmm0
  movdqa      [esp+50h],xmm7
  movdqa      xmm7,xmm4
  punpckhbw   xmm4,xmm0
  punpckhbw   xmm2,xmm0
  punpcklbw   xmm7,xmm0
  movdqa      [esp+30h],xmm3
  movdqa      xmm3,[esp+10h]
  movdqa      xmm1,xmm3
  psubw       xmm1,[esp+50h]
  pabsw       xmm1,xmm1
  movdqa      [esp+20h],xmm4
  movdqa      xmm0,xmm5
  pcmpgtw     xmm0,xmm1
  movdqa      xmm1,[esp+60h]
  psubw       xmm1,xmm3
  pabsw       xmm1,xmm1
  movdqa      xmm4,xmm6
  pcmpgtw     xmm4,xmm1
  pand        xmm0,xmm4
  movdqa      xmm1,xmm7
  psubw       xmm1,[esp+50h]
  pabsw       xmm1,xmm1
  movdqa      xmm4,xmm6
  pcmpgtw     xmm4,xmm1
  movdqa      xmm1,xmm2
  psubw       xmm1,[esp+30h]
  pabsw       xmm1,xmm1
  pcmpgtw     xmm5,xmm1
  movdqa      xmm1,[esp+40h]
  pand        xmm0,xmm4
  psubw       xmm1,xmm2
  pabsw       xmm1,xmm1
  movdqa      xmm4,xmm6
  pcmpgtw     xmm4,xmm1
  movdqa      xmm1,[esp+20h]
  psubw       xmm1,[esp+30h]
  pand        xmm5,xmm4
  pabsw       xmm1,xmm1
  pcmpgtw     xmm6,xmm1
  pand        xmm5,xmm6
  mov         edx,2
  movsx       edx,dx
  movd        xmm1,edx
  movdqa      xmm4,xmm1
  punpcklwd   xmm4,xmm1
  pshufd      xmm1,xmm4,0
  movdqa      xmm4,[esp+60h]
  movdqa      xmm6,xmm4
  paddw       xmm6,xmm4
  paddw       xmm6,xmm3
  paddw       xmm6,xmm7
  movdqa      [esp+10h],xmm1
  paddw       xmm6,[esp+10h]
  psraw       xmm6,2
  movdqa      xmm4,xmm0
  pandn       xmm4,xmm3
  movdqa      xmm3,[esp+40h]
  movdqa      xmm1,xmm0
  pand        xmm1,xmm6
  por         xmm1,xmm4
  movdqa      xmm6,xmm3
  paddw       xmm6,xmm3
  movdqa      xmm3,[esp+10h]
  paddw       xmm6,xmm2
  paddw       xmm6,[esp+20h]
  paddw       xmm6,xmm3
  psraw       xmm6,2
  movdqa      xmm4,xmm5
  pand        xmm4,xmm6
  movdqa      xmm6,xmm5
  pandn       xmm6,xmm2
  por         xmm4,xmm6
  packuswb    xmm1,xmm4
  movdqa      xmm4,[esp+50h]
  movdqa      xmm6,xmm7
  paddw       xmm6,xmm7
  paddw       xmm6,xmm4
  paddw       xmm6,[esp+60h]
  paddw       xmm6,xmm3
  psraw       xmm6,2
  movdqa      xmm2,xmm0
  pand        xmm2,xmm6
  pandn       xmm0,xmm4
  por         xmm2,xmm0
  movdqa      xmm0,[esp+20h]
  movdqa      xmm6,xmm0
  paddw       xmm6,xmm0
  movdqa      xmm0,[esp+30h]
  paddw       xmm6,xmm0
  paddw       xmm6,[esp+40h]
  movdqa      xmm4,xmm5
  paddw       xmm6,xmm3
  movq        [esi],xmm1
  psraw       xmm6,2
  pand        xmm4,xmm6
  pandn       xmm5,xmm0
  por         xmm4,xmm5
  packuswb    xmm2,xmm4
  movq        [eax],xmm2
  psrldq      xmm1,8
  movq        [edi],xmm1
  pop         edi
  psrldq      xmm2,8
  movq        [ecx],xmm2
  pop         esi
  mov         esp,ebp
  pop         ebp
  ret

;******************************************************************************
; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************

WELS_EXTERN  DeblockChromaLt4V_sse2

DeblockChromaLt4V_sse2:
  push        ebp
  mov         ebp,esp
  and         esp,0FFFFFFF0h
  sub         esp,0E4h
  push        ebx
  push        esi
  mov         esi, [ebp+1Ch]      ;  pTC
  movsx       ebx, byte [esi+2]
  push        edi
  movsx       di,byte [esi+3]
  mov         word [esp+0Ch],bx
  movsx       bx,byte  [esi+1]
  movsx       esi,byte  [esi]
  mov         word  [esp+0Eh],si
  movzx       esi,di
  movd        xmm1,esi
  movzx       esi,di
  movd        xmm2,esi
  mov         si,word  [esp+0Ch]
  mov         edx, [ebp + 10h]
  mov         eax, [ebp + 08h]
  movzx       edi,si
  movzx       esi,si
  mov         ecx, [ebp + 0Ch]
  movd        xmm4,esi
  movzx       esi,bx
  movd        xmm5,esi
  movd        xmm3,edi
  movzx       esi,bx
  movd        xmm6,esi
  mov         si,word [esp+0Eh]
  movzx       edi,si
  movzx       esi,si
  punpcklwd   xmm6,xmm2
  pxor        xmm0,xmm0
  movdqa      [esp+40h],xmm0
  movd        xmm7,edi
  movd        xmm0,esi
  lea         esi,[edx+edx]
  mov         edi,eax
  sub         edi,esi
  punpcklwd   xmm5,xmm1
  movdqa      xmm1,[esp+40h]
  punpcklwd   xmm0,xmm4
  movq        xmm4,[edx+ecx]
  punpcklwd   xmm7,xmm3
  movq        xmm3,[eax]
  punpcklwd   xmm0,xmm6
  movq        xmm6,[edi]
  punpcklwd   xmm7,xmm5
  punpcklwd   xmm0,xmm7
  mov         edi,ecx
  sub         edi,esi
  movdqa      xmm2,xmm1
  psubw       xmm2,xmm0
  movdqa      [esp+60h],xmm2
  movq        xmm2, [edi]
  punpcklqdq  xmm6,xmm2
  mov         esi,eax
  sub         esi,edx
  movq        xmm7,[esi]
  mov         edi,ecx
  sub         edi,edx
  movq        xmm2,[edi]
  punpcklqdq  xmm7,xmm2
  movq        xmm2,[ecx]
  punpcklqdq  xmm3,xmm2
  movq        xmm2,[edx+eax]
  movsx       edx,word [ebp + 14h]
  punpcklqdq  xmm2,xmm4
  movdqa      [esp+0E0h],xmm2
  movd        xmm2,edx
  movsx       edx,word [ebp + 18h]
  movdqa      xmm4,xmm2
  punpcklwd   xmm4,xmm2
  movd        xmm2,edx
  movdqa      xmm5,xmm2
  punpcklwd   xmm5,xmm2
  pshufd      xmm2,xmm5,0
  movdqa      [esp+50h],xmm2
  movdqa      xmm2,xmm6
  punpcklbw   xmm2,xmm1
  movdqa      [esp+0D0h],xmm3
  pshufd      xmm4,xmm4,0
  movdqa      [esp+30h],xmm2
  punpckhbw   xmm6,xmm1
  movdqa      [esp+80h],xmm6
  movdqa      xmm6,[esp+0D0h]
  punpckhbw   xmm6,xmm1
  movdqa      [esp+70h],xmm6
  movdqa      xmm6, [esp+0E0h]
  punpckhbw   xmm6,xmm1
  movdqa     [esp+90h],xmm6
  movdqa      xmm5, [esp+0E0h]
  movdqa      xmm2,xmm7
  punpckhbw   xmm7,xmm1
  punpcklbw   xmm5,xmm1
  movdqa       [esp+0A0h],xmm7
  punpcklbw   xmm3,xmm1
  mov         edx,4
  punpcklbw   xmm2,xmm1
  movsx       edx,dx
  movd        xmm6,edx
  movdqa      xmm7,xmm6
  punpcklwd   xmm7,xmm6
  pshufd      xmm6,xmm7,0
  movdqa      xmm7,[esp+30h]
  movdqa      [esp+20h],xmm6
  psubw       xmm7,xmm5
  movdqa      xmm6,xmm0
  pcmpgtw     xmm6,xmm1
  movdqa      xmm1,[esp+60h]
  movdqa      [esp+40h],xmm6
  movdqa      xmm6,xmm3
  psubw       xmm6,xmm2
  psllw       xmm6,2
  paddw       xmm6,xmm7
  paddw       xmm6, [esp+20h]
  movdqa      xmm7, [esp+50h]
  psraw       xmm6,3
  pmaxsw      xmm1,xmm6
  movdqa      [esp+10h],xmm0
  movdqa      xmm6, [esp+10h]
  pminsw      xmm6,xmm1
  movdqa      [esp+10h],xmm6
  movdqa      xmm1,xmm2
  psubw       xmm1,xmm3
  pabsw       xmm1,xmm1
  movdqa      xmm6,xmm4
  pcmpgtw     xmm6,xmm1
  movdqa      xmm1, [esp+30h]
  psubw       xmm1,xmm2
  pabsw       xmm1,xmm1
  pcmpgtw     xmm7,xmm1
  movdqa      xmm1,[esp+50h]
  pand        xmm6,xmm7
  movdqa      xmm7,[esp+50h]
  psubw       xmm5,xmm3
  pabsw       xmm5,xmm5
  pcmpgtw     xmm1,xmm5
  movdqa      xmm5,[esp+80h]
  psubw       xmm5,[esp+90h]
  pand        xmm6,xmm1
  pand        xmm6,[esp+40h]
  movdqa      xmm1,[esp+10h]
  pand        xmm1,xmm6
  movdqa      xmm6,[esp+70h]
  movdqa      [esp+30h],xmm1
  movdqa      xmm1,[esp+0A0h]
  psubw       xmm6,xmm1
  psllw       xmm6,2
  paddw       xmm6,xmm5
  paddw       xmm6,[esp+20h]
  movdqa      xmm5,[esp+60h]
  psraw       xmm6,3
  pmaxsw      xmm5,xmm6
  pminsw      xmm0,xmm5
  movdqa      xmm5,[esp+70h]
  movdqa      xmm6,xmm1
  psubw       xmm6,xmm5
  pabsw       xmm6,xmm6
  pcmpgtw     xmm4,xmm6
  movdqa      xmm6,[esp+80h]
  psubw       xmm6,xmm1
  pabsw       xmm6,xmm6
  pcmpgtw     xmm7,xmm6
  movdqa      xmm6,[esp+90h]
  pand        xmm4,xmm7
  movdqa      xmm7,[esp+50h]
  psubw       xmm6,xmm5
  pabsw       xmm6,xmm6
  pcmpgtw     xmm7,xmm6
  pand        xmm4,xmm7
  pand        xmm4,[esp+40h]
  pand        xmm0,xmm4
  movdqa      xmm4,[esp+30h]
  paddw       xmm2,xmm4
  paddw       xmm1,xmm0
  packuswb    xmm2,xmm1
  movq        [esi],xmm2
  psubw       xmm3,xmm4
  psubw       xmm5,xmm0
  packuswb    xmm3,xmm5
  movq        [eax],xmm3
  psrldq      xmm2,8
  movq        [edi],xmm2
  pop         edi
  pop         esi
  psrldq      xmm3,8
  movq        [ecx],xmm3
  pop         ebx
  mov         esp,ebp
  pop         ebp
  ret

;***************************************************************************
;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;          int32_t iAlpha, int32_t iBeta)
;***************************************************************************

WELS_EXTERN     DeblockChromaEq4H_sse2

ALIGN  16

DeblockChromaEq4H_sse2:
  push        ebp
  mov         ebp,esp
  and         esp,0FFFFFFF0h
  sub         esp,0C8h
  mov         ecx,dword [ebp+8]
  mov         edx,dword [ebp+0Ch]
  mov         eax,dword [ebp+10h]
  sub         ecx,2
  sub         edx,2
  push        esi
  lea         esi,[eax+eax*2]
  mov         dword [esp+18h],ecx
  mov         dword [esp+4],edx
  lea         ecx,[ecx+eax*4]
  lea         edx,[edx+eax*4]
  lea         eax,[esp+7Ch]
  push        edi
  mov         dword [esp+14h],esi
  mov         dword [esp+18h],ecx
  mov         dword [esp+0Ch],edx
  mov         dword [esp+10h],eax
  mov         esi,dword [esp+1Ch]
  mov         ecx,dword [ebp+10h]
  mov         edx,dword [esp+14h]
  movd        xmm0,dword [esi]
  movd        xmm1,dword [esi+ecx]
  movd        xmm2,dword [esi+ecx*2]
  movd        xmm3,dword [esi+edx]
  mov         esi,dword  [esp+8]
  movd        xmm4,dword [esi]
  movd        xmm5,dword [esi+ecx]
  movd        xmm6,dword [esi+ecx*2]
  movd        xmm7,dword [esi+edx]
  punpckldq   xmm0,xmm4
  punpckldq   xmm1,xmm5
  punpckldq   xmm2,xmm6
  punpckldq   xmm3,xmm7
  mov         esi,dword [esp+18h]
  mov         edi,dword [esp+0Ch]
  movd        xmm4,dword [esi]
  movd        xmm5,dword [edi]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm0,xmm4
  movd        xmm4,dword [esi+ecx]
  movd        xmm5,dword [edi+ecx]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm1,xmm4
  movd        xmm4,dword [esi+ecx*2]
  movd        xmm5,dword [edi+ecx*2]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm2,xmm4
  movd        xmm4,dword [esi+edx]
  movd        xmm5,dword [edi+edx]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm3,xmm4
  movdqa      xmm6,xmm0
  punpcklbw   xmm0,xmm1
  punpckhbw   xmm6,xmm1
  movdqa      xmm7,xmm2
  punpcklbw   xmm2,xmm3
  punpckhbw   xmm7,xmm3
  movdqa      xmm4,xmm0
  movdqa      xmm5,xmm6
  punpcklwd   xmm0,xmm2
  punpckhwd   xmm4,xmm2
  punpcklwd   xmm6,xmm7
  punpckhwd   xmm5,xmm7
  movdqa      xmm1,xmm0
  movdqa      xmm2,xmm4
  punpckldq   xmm0,xmm6
  punpckhdq   xmm1,xmm6
  punpckldq   xmm4,xmm5
  punpckhdq   xmm2,xmm5
  movdqa      xmm5,xmm0
  movdqa      xmm6,xmm1
  punpcklqdq  xmm0,xmm4
  punpckhqdq  xmm5,xmm4
  punpcklqdq  xmm1,xmm2
  punpckhqdq  xmm6,xmm2
  mov         edi,dword [esp+10h]
  movdqa      [edi],xmm0
  movdqa      [edi+10h],xmm5
  movdqa      [edi+20h],xmm1
  movdqa      [edi+30h],xmm6
  movsx       ecx,word [ebp+14h]
  movsx       edx,word [ebp+18h]
  movdqa      xmm6,[esp+80h]
  movdqa      xmm4,[esp+90h]
  movdqa      xmm5,[esp+0A0h]
  movdqa      xmm7,[esp+0B0h]
  pxor        xmm0,xmm0
  movd        xmm1,ecx
  movdqa      xmm2,xmm1
  punpcklwd   xmm2,xmm1
  pshufd      xmm1,xmm2,0
  movd        xmm2,edx
  movdqa      xmm3,xmm2
  punpcklwd   xmm3,xmm2
  pshufd      xmm2,xmm3,0
  movdqa      xmm3,xmm6
  punpckhbw   xmm6,xmm0
  movdqa      [esp+60h],xmm6
  movdqa      xmm6,[esp+90h]
  punpckhbw   xmm6,xmm0
  movdqa      [esp+30h],xmm6
  movdqa      xmm6,[esp+0A0h]
  punpckhbw   xmm6,xmm0
  movdqa      [esp+40h],xmm6
  movdqa      xmm6,[esp+0B0h]
  punpckhbw   xmm6,xmm0
  movdqa      [esp+70h],xmm6
  punpcklbw   xmm7,xmm0
  punpcklbw   xmm4,xmm0
  punpcklbw   xmm5,xmm0
  punpcklbw   xmm3,xmm0
  movdqa      [esp+50h],xmm7
  movdqa      xmm6,xmm4
  psubw       xmm6,xmm5
  pabsw       xmm6,xmm6
  movdqa      xmm0,xmm1
  pcmpgtw     xmm0,xmm6
  movdqa      xmm6,xmm3
  psubw       xmm6,xmm4
  pabsw       xmm6,xmm6
  movdqa      xmm7,xmm2
  pcmpgtw     xmm7,xmm6
  movdqa      xmm6,[esp+50h]
  psubw       xmm6,xmm5
  pabsw       xmm6,xmm6
  pand        xmm0,xmm7
  movdqa      xmm7,xmm2
  pcmpgtw     xmm7,xmm6
  movdqa      xmm6,[esp+30h]
  psubw       xmm6,[esp+40h]
  pabsw       xmm6,xmm6
  pcmpgtw     xmm1,xmm6
  movdqa      xmm6,[esp+60h]
  psubw       xmm6,[esp+30h]
  pabsw       xmm6,xmm6
  pand        xmm0,xmm7
  movdqa      xmm7,xmm2
  pcmpgtw     xmm7,xmm6
  movdqa      xmm6,[esp+70h]
  psubw       xmm6,[esp+40h]
  pabsw       xmm6,xmm6
  pand        xmm1,xmm7
  pcmpgtw     xmm2,xmm6
  pand        xmm1,xmm2
  mov         eax,2
  movsx       ecx,ax
  movd        xmm2,ecx
  movdqa      xmm6,xmm2
  punpcklwd   xmm6,xmm2
  pshufd      xmm2,xmm6,0
  movdqa      [esp+20h],xmm2
  movdqa      xmm2,xmm3
  paddw       xmm2,xmm3
  paddw       xmm2,xmm4
  paddw       xmm2,[esp+50h]
  paddw       xmm2,[esp+20h]
  psraw       xmm2,2
  movdqa      xmm6,xmm0
  pand        xmm6,xmm2
  movdqa      xmm2,xmm0
  pandn       xmm2,xmm4
  por         xmm6,xmm2
  movdqa      xmm2,[esp+60h]
  movdqa      xmm7,xmm2
  paddw       xmm7,xmm2
  paddw       xmm7,[esp+30h]
  paddw       xmm7,[esp+70h]
  paddw       xmm7,[esp+20h]
  movdqa      xmm4,xmm1
  movdqa      xmm2,xmm1
  pandn       xmm2,[esp+30h]
  psraw       xmm7,2
  pand        xmm4,xmm7
  por         xmm4,xmm2
  movdqa      xmm2,[esp+50h]
  packuswb    xmm6,xmm4
  movdqa      [esp+90h],xmm6
  movdqa      xmm6,xmm2
  paddw       xmm6,xmm2
  movdqa      xmm2,[esp+20h]
  paddw       xmm6,xmm5
  paddw       xmm6,xmm3
  movdqa      xmm4,xmm0
  pandn       xmm0,xmm5
  paddw       xmm6,xmm2
  psraw       xmm6,2
  pand        xmm4,xmm6
  por         xmm4,xmm0
  movdqa      xmm0,[esp+70h]
  movdqa      xmm5,xmm0
  paddw       xmm5,xmm0
  movdqa      xmm0,[esp+40h]
  paddw       xmm5,xmm0
  paddw       xmm5,[esp+60h]
  movdqa      xmm3,xmm1
  paddw       xmm5,xmm2
  psraw       xmm5,2
  pand        xmm3,xmm5
  pandn       xmm1,xmm0
  por         xmm3,xmm1
  packuswb    xmm4,xmm3
  movdqa      [esp+0A0h],xmm4
  mov         esi,dword [esp+10h]
  movdqa      xmm0,[esi]
  movdqa      xmm1,[esi+10h]
  movdqa      xmm2,[esi+20h]
  movdqa      xmm3,[esi+30h]
  movdqa      xmm6,xmm0
  punpcklbw   xmm0,xmm1
  punpckhbw   xmm6,xmm1
  movdqa      xmm7,xmm2
  punpcklbw   xmm2,xmm3
  punpckhbw   xmm7,xmm3
  movdqa      xmm4,xmm0
  movdqa      xmm5,xmm6
  punpcklwd   xmm0,xmm2
  punpckhwd   xmm4,xmm2
  punpcklwd   xmm6,xmm7
  punpckhwd   xmm5,xmm7
  movdqa      xmm1,xmm0
  movdqa      xmm2,xmm4
  punpckldq   xmm0,xmm6
  punpckhdq   xmm1,xmm6
  punpckldq   xmm4,xmm5
  punpckhdq   xmm2,xmm5
  movdqa      xmm5,xmm0
  movdqa      xmm6,xmm1
  punpcklqdq  xmm0,xmm4
  punpckhqdq  xmm5,xmm4
  punpcklqdq  xmm1,xmm2
  punpckhqdq  xmm6,xmm2
  mov         esi,dword [esp+1Ch]
  mov         ecx,dword [ebp+10h]
  mov         edx,dword [esp+14h]
  mov         edi,dword [esp+8]
  movd        dword [esi],xmm0
  movd        dword [esi+ecx],xmm5
  movd        dword [esi+ecx*2],xmm1
  movd        dword [esi+edx],xmm6
  psrldq      xmm0,4
  psrldq      xmm5,4
  psrldq      xmm1,4
  psrldq      xmm6,4
  mov         esi,dword [esp+18h]
  movd        dword [edi],xmm0
  movd        dword [edi+ecx],xmm5
  movd        dword [edi+ecx*2],xmm1
  movd        dword [edi+edx],xmm6
  psrldq      xmm0,4
  psrldq      xmm5,4
  psrldq      xmm1,4
  psrldq      xmm6,4
  movd        dword [esi],xmm0
  movd        dword [esi+ecx],xmm5
  movd        dword [esi+ecx*2],xmm1
  movd        dword [esi+edx],xmm6
  psrldq      xmm0,4
  psrldq      xmm5,4
  psrldq      xmm1,4
  psrldq      xmm6,4
  mov         edi,dword [esp+0Ch]
  movd        dword [edi],xmm0
  movd        dword [edi+ecx],xmm5
  movd        dword [edi+ecx*2],xmm1
  movd        dword [edi+edx],xmm6
  pop         edi
  pop         esi
  mov         esp,ebp
  pop         ebp
  ret

;*******************************************************************************
;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************

WELS_EXTERN  DeblockChromaLt4H_sse2

ALIGN  16

DeblockChromaLt4H_sse2:
  push        ebp
  mov         ebp,esp
  and         esp,0FFFFFFF0h
  sub         esp,108h
  mov         ecx,dword [ebp+8]
  mov         edx,dword [ebp+0Ch]
  mov         eax,dword [ebp+10h]
  sub         ecx,2
  sub         edx,2
  push        esi
  lea         esi,[eax+eax*2]
  mov         dword [esp+10h],ecx
  mov         dword [esp+4],edx
  lea         ecx,[ecx+eax*4]
  lea         edx,[edx+eax*4]
  lea         eax,[esp+6Ch]
  push        edi
  mov         dword [esp+0Ch],esi
  mov         dword [esp+18h],ecx
  mov         dword [esp+10h],edx
  mov         dword [esp+1Ch],eax
  mov         esi,dword [esp+14h]
  mov         ecx,dword [ebp+10h]
  mov         edx,dword [esp+0Ch]
  movd        xmm0,dword [esi]
  movd        xmm1,dword [esi+ecx]
  movd        xmm2,dword [esi+ecx*2]
  movd        xmm3,dword [esi+edx]
  mov         esi,dword [esp+8]
  movd        xmm4,dword [esi]
  movd        xmm5,dword [esi+ecx]
  movd        xmm6,dword [esi+ecx*2]
  movd        xmm7,dword [esi+edx]
  punpckldq   xmm0,xmm4
  punpckldq   xmm1,xmm5
  punpckldq   xmm2,xmm6
  punpckldq   xmm3,xmm7
  mov         esi,dword [esp+18h]
  mov         edi,dword [esp+10h]
  movd        xmm4,dword [esi]
  movd        xmm5,dword [edi]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm0,xmm4
  movd        xmm4,dword [esi+ecx]
  movd        xmm5,dword [edi+ecx]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm1,xmm4
  movd        xmm4,dword [esi+ecx*2]
  movd        xmm5,dword [edi+ecx*2]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm2,xmm4
  movd        xmm4,dword [esi+edx]
  movd        xmm5,dword [edi+edx]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm3,xmm4
  movdqa      xmm6,xmm0
  punpcklbw   xmm0,xmm1
  punpckhbw   xmm6,xmm1
  movdqa      xmm7,xmm2
  punpcklbw   xmm2,xmm3
  punpckhbw   xmm7,xmm3
  movdqa      xmm4,xmm0
  movdqa      xmm5,xmm6
  punpcklwd   xmm0,xmm2
  punpckhwd   xmm4,xmm2
  punpcklwd   xmm6,xmm7
  punpckhwd   xmm5,xmm7
  movdqa      xmm1,xmm0
  movdqa      xmm2,xmm4
  punpckldq   xmm0,xmm6
  punpckhdq   xmm1,xmm6
  punpckldq   xmm4,xmm5
  punpckhdq   xmm2,xmm5
  movdqa      xmm5,xmm0
  movdqa      xmm6,xmm1
  punpcklqdq  xmm0,xmm4
  punpckhqdq  xmm5,xmm4
  punpcklqdq  xmm1,xmm2
  punpckhqdq  xmm6,xmm2
  mov         edi,dword [esp+1Ch]
  movdqa      [edi],xmm0
  movdqa      [edi+10h],xmm5
  movdqa      [edi+20h],xmm1
  movdqa      [edi+30h],xmm6
  mov         eax,dword [ebp+1Ch]
  movsx       cx,byte [eax+3]
  movsx       dx,byte [eax+2]
  movsx       si,byte [eax+1]
  movsx       ax,byte [eax]
  movzx       edi,cx
  movzx       ecx,cx
  movd        xmm2,ecx
  movzx       ecx,dx
  movzx       edx,dx
  movd        xmm3,ecx
  movd        xmm4,edx
  movzx       ecx,si
  movzx       edx,si
  movd        xmm5,ecx
  pxor        xmm0,xmm0
  movd        xmm6,edx
  movzx       ecx,ax
  movdqa      [esp+60h],xmm0
  movzx       edx,ax
  movsx       eax,word [ebp+14h]
  punpcklwd   xmm6,xmm2
  movd        xmm1,edi
  movd        xmm7,ecx
  movsx       ecx,word [ebp+18h]
  movd        xmm0,edx
  punpcklwd   xmm7,xmm3
  punpcklwd   xmm5,xmm1
  movdqa      xmm1,[esp+60h]
  punpcklwd   xmm7,xmm5
  movdqa      xmm5,[esp+0A0h]
  punpcklwd   xmm0,xmm4
  punpcklwd   xmm0,xmm6
  movdqa      xmm6, [esp+70h]
  punpcklwd   xmm0,xmm7
  movdqa      xmm7,[esp+80h]
  movdqa      xmm2,xmm1
  psubw       xmm2,xmm0
  movdqa      [esp+0D0h],xmm2
  movd        xmm2,eax
  movdqa      xmm3,xmm2
  punpcklwd   xmm3,xmm2
  pshufd      xmm4,xmm3,0
  movd        xmm2,ecx
  movdqa      xmm3,xmm2
  punpcklwd   xmm3,xmm2
  pshufd      xmm2,xmm3,0
  movdqa      xmm3, [esp+90h]
  movdqa      [esp+50h],xmm2
  movdqa      xmm2,xmm6
  punpcklbw   xmm2,xmm1
  punpckhbw   xmm6,xmm1
  movdqa      [esp+40h],xmm2
  movdqa      [esp+0B0h],xmm6
  movdqa      xmm6,[esp+90h]
  movdqa      xmm2,xmm7
  punpckhbw   xmm7,xmm1
  punpckhbw   xmm6,xmm1
  punpcklbw   xmm2,xmm1
  punpcklbw   xmm3,xmm1
  punpcklbw   xmm5,xmm1
  movdqa      [esp+0F0h],xmm7
  movdqa      [esp+0C0h],xmm6
  movdqa      xmm6, [esp+0A0h]
  punpckhbw   xmm6,xmm1
  movdqa      [esp+0E0h],xmm6
  mov         edx,4
  movsx       eax,dx
  movd        xmm6,eax
  movdqa      xmm7,xmm6
  punpcklwd   xmm7,xmm6
  pshufd      xmm6,xmm7,0
  movdqa      [esp+30h],xmm6
  movdqa      xmm7, [esp+40h]
  psubw       xmm7,xmm5
  movdqa      xmm6,xmm0
  pcmpgtw     xmm6,xmm1
  movdqa      [esp+60h],xmm6
  movdqa      xmm1, [esp+0D0h]
  movdqa      xmm6,xmm3
  psubw       xmm6,xmm2
  psllw       xmm6,2
  paddw       xmm6,xmm7
  paddw       xmm6,[esp+30h]
  psraw       xmm6,3
  pmaxsw      xmm1,xmm6
  movdqa      xmm7,[esp+50h]
  movdqa      [esp+20h],xmm0
  movdqa      xmm6, [esp+20h]
  pminsw      xmm6,xmm1
  movdqa      [esp+20h],xmm6
  movdqa      xmm6,xmm4
  movdqa      xmm1,xmm2
  psubw       xmm1,xmm3
  pabsw       xmm1,xmm1
  pcmpgtw     xmm6,xmm1
  movdqa      xmm1, [esp+40h]
  psubw       xmm1,xmm2
  pabsw       xmm1,xmm1
  pcmpgtw     xmm7,xmm1
  movdqa      xmm1, [esp+50h]
  pand        xmm6,xmm7
  movdqa      xmm7, [esp+50h]
  psubw       xmm5,xmm3
  pabsw       xmm5,xmm5
  pcmpgtw     xmm1,xmm5
  movdqa      xmm5, [esp+0B0h]
  psubw       xmm5,[esp+0E0h]
  pand        xmm6,xmm1
  pand        xmm6, [esp+60h]
  movdqa      xmm1, [esp+20h]
  pand        xmm1,xmm6
  movdqa      xmm6, [esp+0C0h]
  movdqa      [esp+40h],xmm1
  movdqa      xmm1, [esp+0F0h]
  psubw       xmm6,xmm1
  psllw       xmm6,2
  paddw       xmm6,xmm5
  paddw       xmm6, [esp+30h]
  movdqa      xmm5, [esp+0D0h]
  psraw       xmm6,3
  pmaxsw      xmm5,xmm6
  pminsw      xmm0,xmm5
  movdqa      xmm5,[esp+0C0h]
  movdqa      xmm6,xmm1
  psubw       xmm6,xmm5
  pabsw       xmm6,xmm6
  pcmpgtw     xmm4,xmm6
  movdqa      xmm6,[esp+0B0h]
  psubw       xmm6,xmm1
  pabsw       xmm6,xmm6
  pcmpgtw     xmm7,xmm6
  movdqa      xmm6, [esp+0E0h]
  pand        xmm4,xmm7
  movdqa      xmm7, [esp+50h]
  psubw       xmm6,xmm5
  pabsw       xmm6,xmm6
  pcmpgtw     xmm7,xmm6
  pand        xmm4,xmm7
  pand        xmm4,[esp+60h]
  pand        xmm0,xmm4
  movdqa      xmm4, [esp+40h]
  paddw       xmm2,xmm4
  paddw       xmm1,xmm0
  psubw       xmm3,xmm4
  psubw       xmm5,xmm0
  packuswb    xmm2,xmm1
  packuswb    xmm3,xmm5
  movdqa      [esp+80h],xmm2
  movdqa      [esp+90h],xmm3
  mov         esi,dword [esp+1Ch]
  movdqa      xmm0, [esi]
  movdqa      xmm1, [esi+10h]
  movdqa      xmm2, [esi+20h]
  movdqa      xmm3, [esi+30h]
  movdqa      xmm6,xmm0
  punpcklbw   xmm0,xmm1
  punpckhbw   xmm6,xmm1
  movdqa      xmm7,xmm2
  punpcklbw   xmm2,xmm3
  punpckhbw   xmm7,xmm3
  movdqa      xmm4,xmm0
  movdqa      xmm5,xmm6
  punpcklwd   xmm0,xmm2
  punpckhwd   xmm4,xmm2
  punpcklwd   xmm6,xmm7
  punpckhwd   xmm5,xmm7
  movdqa      xmm1,xmm0
  movdqa      xmm2,xmm4
  punpckldq   xmm0,xmm6
  punpckhdq   xmm1,xmm6
  punpckldq   xmm4,xmm5
  punpckhdq   xmm2,xmm5
  movdqa      xmm5,xmm0
  movdqa      xmm6,xmm1
  punpcklqdq  xmm0,xmm4
  punpckhqdq  xmm5,xmm4
  punpcklqdq  xmm1,xmm2
  punpckhqdq  xmm6,xmm2
  mov         esi,dword [esp+14h]
  mov         ecx,dword [ebp+10h]
  mov         edx,dword [esp+0Ch]
  mov         edi,dword [esp+8]
  movd        dword [esi],xmm0
  movd        dword [esi+ecx],xmm5
  movd        dword [esi+ecx*2],xmm1
  movd        dword [esi+edx],xmm6
  psrldq      xmm0,4
  psrldq      xmm5,4
  psrldq      xmm1,4
  psrldq      xmm6,4
  mov         esi,dword [esp+18h]
  movd        dword [edi],xmm0
  movd        dword [edi+ecx],xmm5
  movd        dword [edi+ecx*2],xmm1
  movd        dword [edi+edx],xmm6
  psrldq      xmm0,4
  psrldq      xmm5,4
  psrldq      xmm1,4
  psrldq      xmm6,4
  movd        dword [esi],xmm0
  movd        dword [esi+ecx],xmm5
  movd        dword [esi+ecx*2],xmm1
  movd        dword [esi+edx],xmm6
  psrldq      xmm0,4
  psrldq      xmm5,4
  psrldq      xmm1,4
  psrldq      xmm6,4
  mov         edi,dword [esp+10h]
  movd        dword [edi],xmm0
  movd        dword [edi+ecx],xmm5
  movd        dword [edi+ecx*2],xmm1
  movd        dword [edi+edx],xmm6
  pop         edi
  pop         esi
  mov         esp,ebp
  pop         ebp
  ret



;*******************************************************************************
;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
;                                 int32_t iBeta, int8_t * pTC)
;*******************************************************************************


WELS_EXTERN  DeblockLumaLt4V_sse2

ALIGN  16

DeblockLumaLt4V_sse2:
    push	ebp
	mov	ebp, esp
	and	esp, -16				; fffffff0H
	sub	esp, 420				; 000001a4H
	mov	eax, dword [ebp+8]
	mov	ecx, dword [ebp+12]

	pxor	xmm0, xmm0
	push	ebx
	mov	edx, dword [ebp+24]
	movdqa	[esp+424-384], xmm0
	push	esi

	lea	esi, [ecx+ecx*2]
	push	edi
	mov	edi, eax
	sub	edi, esi
	movdqa	xmm0, [edi]

	lea	esi, [ecx+ecx]
	movdqa	[esp+432-208], xmm0
	mov	edi, eax
	sub	edi, esi
	movdqa	xmm0, [edi]
	movdqa	[esp+448-208], xmm0

	mov	ebx, eax
	sub	ebx, ecx
	movdqa	xmm0, [ebx]
	movdqa	[esp+464-208], xmm0

	movdqa	xmm0, [eax]

	add	ecx, eax
	movdqa	[esp+480-208], xmm0
	movdqa	xmm0, [ecx]
	mov	dword [esp+432-404], ecx

	movsx	ecx, word [ebp+16]
	movdqa	[esp+496-208], xmm0
	movdqa	xmm0, [esi+eax]

	movsx	si, byte [edx]
	movdqa	[esp+512-208], xmm0
	movd	xmm0, ecx
	movsx	ecx, word [ebp+20]
	movdqa	xmm1, xmm0
	punpcklwd xmm1, xmm0
	pshufd	xmm0, xmm1, 0
	movdqa	[esp+432-112], xmm0
	movd	xmm0, ecx
	movsx	cx, byte [edx+1]
	movdqa	xmm1, xmm0
	punpcklwd xmm1, xmm0
	mov	dword [esp+432-408], ebx
	movzx	ebx, cx
	pshufd	xmm0, xmm1, 0
	movd	xmm1, ebx
	movzx	ebx, cx
	movd	xmm2, ebx
	movzx	ebx, cx
	movzx	ecx, cx
	movd	xmm4, ecx
	movzx	ecx, si
	movd	xmm5, ecx
	movzx	ecx, si
	movd	xmm6, ecx
	movzx	ecx, si
	movd	xmm7, ecx
	movzx	ecx, si
	movdqa	[esp+432-336], xmm0
	movd	xmm0, ecx

	movsx	cx, byte [edx+3]
	movsx	dx, byte [edx+2]
	movd	xmm3, ebx
	punpcklwd xmm0, xmm4
	movzx	esi, cx
	punpcklwd xmm6, xmm2
	punpcklwd xmm5, xmm1
	punpcklwd xmm0, xmm6
	punpcklwd xmm7, xmm3
	punpcklwd xmm7, xmm5
	punpcklwd xmm0, xmm7
	movdqa	[esp+432-400], xmm0
	movd	xmm0, esi
	movzx	esi, cx
	movd	xmm2, esi
	movzx	esi, cx
	movzx	ecx, cx
	movd	xmm4, ecx
	movzx	ecx, dx
	movd	xmm3, esi
	movd	xmm5, ecx
	punpcklwd xmm5, xmm0

	movdqa	xmm0, [esp+432-384]
	movzx	ecx, dx
	movd	xmm6, ecx
	movzx	ecx, dx
	movzx	edx, dx
	punpcklwd xmm6, xmm2
	movd	xmm7, ecx
	movd	xmm1, edx

	movdqa	xmm2, [esp+448-208]
	punpcklbw xmm2, xmm0

	mov	ecx, 4
	movsx	edx, cx
	punpcklwd xmm7, xmm3
	punpcklwd xmm7, xmm5
	movdqa	xmm5, [esp+496-208]
	movdqa	xmm3, [esp+464-208]
	punpcklbw xmm5, xmm0
	movdqa	[esp+432-240], xmm5
	movdqa	xmm5, [esp+512-208]
	punpcklbw xmm5, xmm0
	movdqa	[esp+432-352], xmm5
	punpcklwd xmm1, xmm4
	movdqa	xmm4, [esp+432-208]
	punpcklwd xmm1, xmm6
	movdqa	xmm6, [esp+480-208]
	punpcklwd xmm1, xmm7
	punpcklbw xmm6, xmm0
	punpcklbw xmm3, xmm0
	punpcklbw xmm4, xmm0
	movdqa	xmm7, xmm3
	psubw	xmm7, xmm4
	pabsw	xmm7, xmm7
	movdqa	[esp+432-272], xmm4
	movdqa	xmm4, [esp+432-336]
	movdqa	xmm5, xmm4
	pcmpgtw	xmm5, xmm7
	movdqa	[esp+432-288], xmm5
	movdqa	xmm7, xmm6
	psubw	xmm7, [esp+432-352]
	pabsw	xmm7, xmm7
	movdqa	xmm5, xmm4
	pcmpgtw	xmm5, xmm7
	movdqa	[esp+432-256], xmm5
	movdqa	xmm5, xmm3
	pavgw	xmm5, xmm6
	movdqa	[esp+432-304], xmm5
	movdqa	xmm5, [esp+432-400]
	psubw	xmm5, [esp+432-288]
	psubw	xmm5, [esp+432-256]
	movdqa	[esp+432-224], xmm5
	movdqa	xmm5, xmm6
	psubw	xmm5, xmm3
	movdqa	[esp+432-32], xmm6
	psubw	xmm6, [esp+432-240]
	movdqa	xmm7, xmm5
	movdqa	[esp+432-384], xmm5
	movdqa	xmm5, [esp+432-112]
	pabsw	xmm7, xmm7
	pcmpgtw	xmm5, xmm7
	pabsw	xmm6, xmm6
	movdqa	xmm7, xmm4
	pcmpgtw	xmm7, xmm6

	pand	xmm5, xmm7
	movdqa	xmm6, xmm3
	psubw	xmm6, xmm2
	pabsw	xmm6, xmm6
	movdqa	xmm7, xmm4
	pcmpgtw	xmm7, xmm6
	movdqa	xmm6, [esp+432-400]
	pand	xmm5, xmm7
	movdqa	xmm7, xmm6
	pcmpeqw	xmm6, xmm0
	pcmpgtw	xmm7, xmm0
	por	xmm7, xmm6
	pand	xmm5, xmm7
	movdqa	[esp+432-320], xmm5
	movd	xmm5, edx
	movdqa	xmm6, xmm5
	punpcklwd xmm6, xmm5
	pshufd	xmm5, xmm6, 0
	movdqa	[esp+432-336], xmm5
	movdqa	xmm5, [esp+432-224]
	movdqa	[esp+432-368], xmm5
	movdqa	xmm6, xmm0
	psubw	xmm6, xmm5
	movdqa	xmm5, [esp+432-384]
	psllw	xmm5, 2
	movdqa	xmm7, xmm2
	psubw	xmm7, [esp+432-240]
	paddw	xmm7, xmm5
	paddw	xmm7, [esp+432-336]
	movdqa	xmm5, [esp+432-368]
	psraw	xmm7, 3
	pmaxsw	xmm6, xmm7
	pminsw	xmm5, xmm6

	pand	xmm5, [esp+432-320]
	movdqa	xmm6, [esp+432-400]
	movdqa	[esp+432-64], xmm5
	movdqa	[esp+432-384], xmm6
	movdqa	xmm5, xmm0
	psubw	xmm5, xmm6
	movdqa	[esp+432-368], xmm5
	movdqa	xmm6, xmm5
	movdqa	xmm5, [esp+432-272]
	paddw	xmm5, [esp+432-304]
	movdqa	xmm7, xmm2
	paddw	xmm7, xmm2
	psubw	xmm5, xmm7
	psraw	xmm5, 1
	pmaxsw	xmm6, xmm5
	movdqa	xmm5, [esp+432-384]
	pminsw	xmm5, xmm6

	pand	xmm5, [esp+432-320]
	pand	xmm5, [esp+432-288]
	movdqa	xmm6, [esp+432-240]
	movdqa	[esp+432-96], xmm5
	movdqa	xmm5, [esp+432-352]
	paddw	xmm5, [esp+432-304]
	movdqa	xmm7, xmm6
	paddw	xmm7, xmm6
	movdqa	xmm6, [esp+432-368]
	psubw	xmm5, xmm7

	movdqa	xmm7, [esp+496-208]
	psraw	xmm5, 1
	pmaxsw	xmm6, xmm5
	movdqa	xmm5, [esp+432-400]
	pminsw	xmm5, xmm6
	pand	xmm5, [esp+432-320]
	pand	xmm5, [esp+432-256]
	movdqa	xmm6, [esp+448-208]
	punpckhbw xmm7, xmm0
	movdqa	[esp+432-352], xmm7

	movdqa	xmm7, [esp+512-208]
	punpckhbw xmm6, xmm0
	movdqa	[esp+432-48], xmm5
	movdqa	xmm5, [esp+432-208]
	movdqa	[esp+432-368], xmm6
	movdqa	xmm6, [esp+464-208]
	punpckhbw xmm7, xmm0
	punpckhbw xmm5, xmm0
	movdqa	[esp+432-384], xmm7
	punpckhbw xmm6, xmm0
	movdqa	[esp+432-400], xmm6

	movdqa	xmm7, [esp+432-400]
	movdqa	xmm6, [esp+480-208]
	psubw	xmm7, xmm5
	movdqa	[esp+432-16], xmm5
	pabsw	xmm7, xmm7
	punpckhbw xmm6, xmm0
	movdqa	xmm5, xmm4
	pcmpgtw	xmm5, xmm7
	movdqa	[esp+432-288], xmm5

	movdqa	xmm7, xmm6
	psubw	xmm7, [esp+432-384]
	pabsw	xmm7, xmm7
	movdqa	xmm5, xmm4
	pcmpgtw	xmm5, xmm7
	movdqa	[esp+432-256], xmm5

	movdqa	xmm5, [esp+432-400]
	movdqa	[esp+432-80], xmm6
	pavgw	xmm5, xmm6
	movdqa	[esp+432-304], xmm5

	movdqa	xmm5, xmm1
	psubw	xmm5, [esp+432-288]
	psubw	xmm5, [esp+432-256]
	movdqa	[esp+432-224], xmm5
	movdqa	xmm5, xmm6
	psubw	xmm5, [esp+432-400]
	psubw	xmm6, [esp+432-352]
	movdqa	[esp+432-272], xmm5
	movdqa	xmm7, xmm5
	movdqa	xmm5, [esp+432-112]
	pabsw	xmm7, xmm7
	pcmpgtw	xmm5, xmm7
	movdqa	xmm7, xmm4
	pabsw	xmm6, xmm6
	pcmpgtw	xmm7, xmm6
	movdqa	xmm6, [esp+432-368]

	pand	xmm5, xmm7
	movdqa	xmm7, [esp+432-400]
	psubw	xmm7, xmm6
	psubw	xmm6, [esp+432-352]
	pabsw	xmm7, xmm7
	pcmpgtw	xmm4, xmm7
	pand	xmm5, xmm4

	paddw	xmm2, [esp+432-96]
	movdqa	xmm4, xmm1
	pcmpgtw	xmm4, xmm0
	movdqa	xmm7, xmm1
	pcmpeqw	xmm7, xmm0
	por	xmm4, xmm7
	pand	xmm5, xmm4
	movdqa	xmm4, [esp+432-224]
	movdqa	[esp+432-320], xmm5
	movdqa	xmm5, [esp+432-272]
	movdqa	xmm7, xmm0
	psubw	xmm7, xmm4
	psubw	xmm0, xmm1
	psllw	xmm5, 2
	paddw	xmm6, xmm5
	paddw	xmm6, [esp+432-336]
	movdqa	xmm5, [esp+432-368]
	movdqa	[esp+432-336], xmm0
	psraw	xmm6, 3
	pmaxsw	xmm7, xmm6
	pminsw	xmm4, xmm7
	pand	xmm4, [esp+432-320]
	movdqa	xmm6, xmm0
	movdqa	xmm0, [esp+432-16]
	paddw	xmm0, [esp+432-304]
	movdqa	[esp+432-272], xmm4
	movdqa	xmm4, [esp+432-368]
	paddw	xmm4, xmm4
	psubw	xmm0, xmm4

	movdqa	xmm4, [esp+432-64]
	psraw	xmm0, 1
	pmaxsw	xmm6, xmm0
	movdqa	xmm0, [esp+432-400]
	movdqa	xmm7, xmm1
	pminsw	xmm7, xmm6
	movdqa	xmm6, [esp+432-320]
	pand	xmm7, xmm6
	pand	xmm7, [esp+432-288]
	paddw	xmm5, xmm7
	packuswb xmm2, xmm5
	movdqa	xmm5, [esp+432-272]
	paddw	xmm0, xmm5
	paddw	xmm3, xmm4
	packuswb xmm3, xmm0

	movdqa	xmm0, [esp+432-32]
	psubw	xmm0, xmm4
	movdqa	xmm4, [esp+432-80]
	psubw	xmm4, xmm5

	movdqa	xmm5, [esp+432-240]
	paddw	xmm5, [esp+432-48]
	packuswb xmm0, xmm4
	movdqa	xmm4, [esp+432-384]
	paddw	xmm4, [esp+432-304]
	movdqa	[esp+480-208], xmm0
	movdqa	xmm0, [esp+432-352]
	movdqa	xmm7, xmm0
	paddw	xmm0, xmm0

	mov	ecx, dword [esp+432-408]

	mov	edx, dword [esp+432-404]
	psubw	xmm4, xmm0
	movdqa	xmm0, [esp+432-336]
	movdqa	[edi], xmm2
	psraw	xmm4, 1
	pmaxsw	xmm0, xmm4
	pminsw	xmm1, xmm0
	movdqa	xmm0, [esp+480-208]

	pop	edi
	pand	xmm1, xmm6
	pand	xmm1, [esp+428-256]
	movdqa	[ecx], xmm3
	paddw	xmm7, xmm1
	pop	esi
	packuswb xmm5, xmm7
	movdqa	[eax], xmm0
	movdqa	[edx], xmm5
	pop	ebx
	mov	esp, ebp
	pop	ebp
	ret


;*******************************************************************************
;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
;                                 int32_t iBeta)
;*******************************************************************************

WELS_EXTERN  DeblockLumaEq4V_sse2

ALIGN  16

DeblockLumaEq4V_sse2:

	push	ebp
	mov	ebp, esp
	and	esp, -16				; fffffff0H
	sub	esp, 628				; 00000274H
	mov	eax, dword [ebp+8]
	mov	ecx, dword [ebp+12]
	push	ebx
	push	esi

	lea	edx, [ecx*4]
	pxor	xmm0, xmm0
	movdqa	xmm2, xmm0

	movdqa	xmm0, [ecx+eax]
	mov	esi, eax
	sub	esi, edx
	movdqa	xmm3, [esi]
	movdqa	xmm5, [eax]
	push	edi
	lea	edi, [ecx+ecx]
	lea	ebx, [ecx+ecx*2]
	mov	dword [esp+640-600], edi
	mov	esi, eax
	sub	esi, edi
	movdqa	xmm1, [esi]
	movdqa	 [esp+720-272], xmm0
	mov	edi, eax
	sub	edi, ecx
	movdqa	xmm4, [edi]
	add	ecx, eax
	mov	dword [esp+640-596], ecx

	mov	ecx, dword [esp+640-600]
	movdqa	xmm0, [ecx+eax]
	movdqa	 [esp+736-272], xmm0

	movdqa	xmm0, [eax+ebx]
	mov	edx, eax
	sub	edx, ebx

	movsx	ebx, word [ebp+16]
	movdqa	xmm6, [edx]
	add	ecx, eax
	movdqa	 [esp+752-272], xmm0
	movd	xmm0, ebx

	movsx	ebx, word [ebp+20]
	movdqa	xmm7, xmm0
	punpcklwd xmm7, xmm0
	pshufd	xmm0, xmm7, 0
	movdqa	 [esp+640-320], xmm0
	movd	xmm0, ebx
	movdqa	xmm7, xmm0
	punpcklwd xmm7, xmm0
	pshufd	xmm0, xmm7, 0

	movdqa	xmm7, [esp+736-272]
	punpcklbw xmm7, xmm2
	movdqa	 [esp+640-416], xmm7
	movdqa	 [esp+640-512], xmm0
	movdqa	xmm0, xmm1
	movdqa	 [esp+672-272], xmm1
	movdqa	xmm1, xmm4
	movdqa	 [esp+704-272], xmm5
	punpcklbw xmm5, xmm2
	punpcklbw xmm1, xmm2

	movdqa	xmm7, xmm5
	psubw	xmm7, xmm1
	pabsw	xmm7, xmm7
	movdqa	 [esp+640-560], xmm7
	punpcklbw xmm0, xmm2
	movdqa	 [esp+688-272], xmm4
	movdqa	xmm4, [esp+720-272]
	movdqa	 [esp+640-480], xmm0

	movdqa	xmm7, xmm1
	psubw	xmm7, xmm0

	movdqa	xmm0, [esp+640-512]
	pabsw	xmm7, xmm7
	punpcklbw xmm4, xmm2
	pcmpgtw	xmm0, xmm7
	movdqa	 [esp+640-384], xmm4
	movdqa	xmm7, xmm5
	psubw	xmm7, xmm4
	movdqa	xmm4, [esp+640-512]
	movdqa	 [esp+656-272], xmm6
	punpcklbw xmm6, xmm2
	pabsw	xmm7, xmm7
	movdqa	 [esp+640-48], xmm2
	movdqa	 [esp+640-368], xmm6
	movdqa	 [esp+640-144], xmm1
	movdqa	 [esp+640-400], xmm5
	pcmpgtw	xmm4, xmm7
	pand	xmm0, xmm4
	movdqa	xmm4, [esp+640-320]
	pcmpgtw	xmm4, [esp+640-560]
	pand	xmm0, xmm4

	mov	ebx, 2
	movsx	ebx, bx
	movd	xmm4, ebx
	movdqa	xmm7, xmm4
	punpcklwd xmm7, xmm4
	movdqa	xmm4, [esp+640-320]
	psraw	xmm4, 2
	pshufd	xmm7, xmm7, 0
	paddw	xmm4, xmm7
	movdqa	 [esp+640-576], xmm4
	pcmpgtw	xmm4, [esp+640-560]
	movdqa	 [esp+640-560], xmm4

	movdqa	xmm4, [esp+640-512]
	movdqa	 [esp+640-624], xmm7
	movdqa	xmm7, xmm1
	psubw	xmm7, xmm6
	pabsw	xmm7, xmm7
	pcmpgtw	xmm4, xmm7

	pand	xmm4, [esp+640-560]
	movdqa	 [esp+640-544], xmm4
	movdqa	xmm4, [esp+640-512]
	movdqa	xmm7, xmm5
	psubw	xmm7, [esp+640-416]
	pabsw	xmm7, xmm7
	pcmpgtw	xmm4, xmm7

	pand	xmm4, [esp+640-560]
	movdqa	 [esp+640-560], xmm4

	movdqa	xmm4, [esp+640-544]
	pandn	xmm4, xmm6
	movdqa	 [esp+640-16], xmm4
	mov	ebx, 4
	movsx	ebx, bx
	movd	xmm4, ebx
	movdqa	xmm7, xmm4
	punpcklwd xmm7, xmm4
	movdqa	xmm4, xmm3
	punpcklbw xmm4, xmm2
	psllw	xmm4, 1
	paddw	xmm4, xmm6
	paddw	xmm4, xmm6
	paddw	xmm4, xmm6
	paddw	xmm4, [esp+640-480]

	movdqa	xmm6, [esp+640-560]
	pshufd	xmm7, xmm7, 0
	paddw	xmm4, xmm1
	movdqa	 [esp+640-592], xmm7
	paddw	xmm4, xmm5
	paddw	xmm4, xmm7
	movdqa	xmm7, [esp+640-416]
	pandn	xmm6, xmm7
	movdqa	 [esp+640-80], xmm6
	movdqa	xmm6, [esp+752-272]
	punpcklbw xmm6, xmm2
	psllw	xmm6, 1
	paddw	xmm6, xmm7
	paddw	xmm6, xmm7
	paddw	xmm6, xmm7
	paddw	xmm6, [esp+640-384]

	movdqa	xmm7, [esp+640-480]
	paddw	xmm6, xmm5
	paddw	xmm6, xmm1
	paddw	xmm6, [esp+640-592]
	psraw	xmm6, 3
	pand	xmm6, [esp+640-560]
	movdqa	 [esp+640-112], xmm6
	movdqa	xmm6, [esp+640-544]
	pandn	xmm6, xmm7
	movdqa	 [esp+640-336], xmm6
	movdqa	xmm6, [esp+640-544]
	movdqa	 [esp+640-528], xmm6
	movdqa	xmm6, [esp+640-368]
	paddw	xmm6, xmm7
	movdqa	xmm7, xmm1
	psraw	xmm4, 3
	pand	xmm4, [esp+640-544]
	paddw	xmm7, xmm5
	paddw	xmm6, xmm7
	paddw	xmm6, [esp+640-624]
	movdqa	xmm7, [esp+640-528]

	paddw	xmm5, xmm1
	psraw	xmm6, 2
	pand	xmm7, xmm6

	movdqa	xmm6, [esp+640-384]
	movdqa	 [esp+640-64], xmm7
	movdqa	xmm7, [esp+640-560]
	pandn	xmm7, xmm6
	movdqa	 [esp+640-304], xmm7
	movdqa	xmm7, [esp+640-560]
	movdqa	 [esp+640-528], xmm7
	movdqa	xmm7, [esp+640-416]
	paddw	xmm7, xmm6
	paddw	xmm7, xmm5
	paddw	xmm7, [esp+640-624]
	movdqa	xmm5, [esp+640-528]
	psraw	xmm7, 2
	pand	xmm5, xmm7
	movdqa	 [esp+640-32], xmm5

	movdqa	xmm5, [esp+640-544]
	movdqa	 [esp+640-528], xmm5
	movdqa	xmm5, [esp+640-480]
	movdqa	xmm7, xmm5
	paddw	xmm7, xmm5
	movdqa	xmm5, xmm1
	paddw	xmm5, xmm6
	paddw	xmm6, [esp+640-592]
	paddw	xmm7, xmm5
	paddw	xmm7, [esp+640-624]
	movdqa	xmm5, [esp+640-528]
	psraw	xmm7, 2
	pandn	xmm5, xmm7
	movdqa	xmm7, [esp+640-480]
	paddw	xmm7, xmm1
	paddw	xmm7, [esp+640-400]
	movdqa	xmm1, [esp+640-544]
	movdqa	 [esp+640-352], xmm5
	movdqa	xmm5, [esp+640-368]
	psllw	xmm7, 1
	paddw	xmm7, xmm6
	paddw	xmm5, xmm7

	movdqa	xmm7, [esp+640-400]
	psraw	xmm5, 3
	pand	xmm1, xmm5
	movdqa	xmm5, [esp+640-480]
	movdqa	 [esp+640-96], xmm1
	movdqa	xmm1, [esp+640-560]
	movdqa	 [esp+640-528], xmm1
	movdqa	xmm1, [esp+640-384]
	movdqa	xmm6, xmm1
	paddw	xmm6, xmm1
	paddw	xmm1, [esp+640-400]
	paddw	xmm1, [esp+640-144]
	paddw	xmm7, xmm5
	paddw	xmm5, [esp+640-592]
	paddw	xmm6, xmm7
	paddw	xmm6, [esp+640-624]
	movdqa	xmm7, [esp+640-528]
	psraw	xmm6, 2
	psllw	xmm1, 1
	paddw	xmm1, xmm5

	movdqa	xmm5, [esp+656-272]
	pandn	xmm7, xmm6
	movdqa	xmm6, [esp+640-416]
	paddw	xmm6, xmm1
	movdqa	xmm1, [esp+640-560]
	psraw	xmm6, 3
	pand	xmm1, xmm6

	movdqa	xmm6, [esp+704-272]
	movdqa	 [esp+640-128], xmm1
	movdqa	xmm1, [esp+672-272]
	punpckhbw xmm1, xmm2
	movdqa	 [esp+640-448], xmm1
	movdqa	xmm1, [esp+688-272]
	punpckhbw xmm1, xmm2
	punpckhbw xmm6, xmm2
	movdqa	 [esp+640-288], xmm7
	punpckhbw xmm5, xmm2
	movdqa	 [esp+640-496], xmm1
	movdqa	 [esp+640-432], xmm6

	movdqa	xmm7, [esp+720-272]
	punpckhbw xmm7, xmm2
	movdqa	 [esp+640-464], xmm7

	movdqa	xmm7, [esp+736-272]
	punpckhbw xmm7, xmm2
	movdqa	 [esp+640-528], xmm7

	movdqa	xmm7, xmm6

	psubw	xmm6, [esp+640-464]
	psubw	xmm7, xmm1
	pabsw	xmm7, xmm7
	movdqa	 [esp+640-560], xmm7
	por	xmm4, [esp+640-16]
	pabsw	xmm6, xmm6
	movdqa	xmm7, xmm1
	psubw	xmm7, [esp+640-448]

	movdqa	xmm1, [esp+640-512]
	pabsw	xmm7, xmm7
	pcmpgtw	xmm1, xmm7
	movdqa	xmm7, [esp+640-512]
	pcmpgtw	xmm7, xmm6
	movdqa	xmm6, [esp+640-320]
	pand	xmm1, xmm7
	movdqa	xmm7, [esp+640-560]
	pcmpgtw	xmm6, xmm7
	pand	xmm1, xmm6

	movdqa	xmm6, [esp+640-576]
	pcmpgtw	xmm6, xmm7

	movdqa	xmm7, [esp+640-496]
	punpckhbw xmm3, xmm2
	movdqa	 [esp+640-560], xmm6
	movdqa	xmm6, [esp+640-512]
	psubw	xmm7, xmm5
	pabsw	xmm7, xmm7
	pcmpgtw	xmm6, xmm7

	pand	xmm6, [esp+640-560]
	movdqa	xmm7, [esp+640-432]
	psubw	xmm7, [esp+640-528]

	psllw	xmm3, 1
	movdqa	 [esp+640-544], xmm6
	movdqa	xmm6, [esp+640-512]

	movdqa	xmm2, [esp+640-544]
	paddw	xmm3, xmm5
	paddw	xmm3, xmm5
	paddw	xmm3, xmm5
	paddw	xmm3, [esp+640-448]
	paddw	xmm3, [esp+640-496]
	pabsw	xmm7, xmm7
	pcmpgtw	xmm6, xmm7
	pand	xmm6, [esp+640-560]
	movdqa	 [esp+640-560], xmm6

	movdqa	xmm6, xmm0
	pand	xmm6, xmm4
	movdqa	xmm4, xmm0
	pandn	xmm4, [esp+640-368]
	por	xmm6, xmm4
	movdqa	xmm4, [esp+640-432]
	paddw	xmm3, xmm4
	paddw	xmm3, [esp+640-592]
	psraw	xmm3, 3
	pand	xmm3, xmm2
	pandn	xmm2, xmm5
	por	xmm3, xmm2
	movdqa	xmm7, xmm1
	pand	xmm7, xmm3
	movdqa	xmm3, [esp+640-64]
	por	xmm3, [esp+640-336]
	movdqa	xmm2, xmm1
	pandn	xmm2, xmm5
	por	xmm7, xmm2

	movdqa	xmm2, xmm0
	pand	xmm2, xmm3
	movdqa	xmm3, xmm0
	pandn	xmm3, [esp+640-480]
	por	xmm2, xmm3
	packuswb xmm6, xmm7
	movdqa	 [esp+640-336], xmm2
	movdqa	 [esp+656-272], xmm6
	movdqa	xmm6, [esp+640-544]
	movdqa	xmm2, xmm5
	paddw	xmm2, [esp+640-448]
	movdqa	xmm3, xmm1
	movdqa	xmm7, [esp+640-496]
	paddw	xmm7, xmm4
	paddw	xmm2, xmm7
	paddw	xmm2, [esp+640-624]
	movdqa	xmm7, [esp+640-544]
	psraw	xmm2, 2
	pand	xmm6, xmm2
	movdqa	xmm2, [esp+640-448]
	pandn	xmm7, xmm2
	por	xmm6, xmm7
	pand	xmm3, xmm6
	movdqa	xmm6, xmm1
	pandn	xmm6, xmm2
	paddw	xmm2, [esp+640-496]
	paddw	xmm2, xmm4
	por	xmm3, xmm6
	movdqa	xmm6, [esp+640-336]
	packuswb xmm6, xmm3
	psllw	xmm2, 1
	movdqa	 [esp+672-272], xmm6
	movdqa	xmm6, [esp+640-96]
	por	xmm6, [esp+640-352]

	movdqa	xmm3, xmm0
	pand	xmm3, xmm6
	movdqa	xmm6, xmm0
	pandn	xmm6, [esp+640-144]
	por	xmm3, xmm6
	movdqa	xmm6, [esp+640-544]
	movdqa	 [esp+640-352], xmm3
	movdqa	xmm3, [esp+640-464]
	paddw	xmm3, [esp+640-592]
	paddw	xmm2, xmm3
	movdqa	xmm3, [esp+640-448]
	paddw	xmm5, xmm2
	movdqa	xmm2, [esp+640-496]
	psraw	xmm5, 3
	pand	xmm6, xmm5
	movdqa	xmm5, [esp+640-464]
	paddw	xmm2, xmm5
	paddw	xmm5, [esp+640-432]
	movdqa	xmm4, xmm3
	paddw	xmm4, xmm3
	paddw	xmm4, xmm2
	paddw	xmm4, [esp+640-624]
	movdqa	xmm2, [esp+640-544]
	paddw	xmm3, [esp+640-592]
	psraw	xmm4, 2
	pandn	xmm2, xmm4
	por	xmm6, xmm2
	movdqa	xmm7, xmm1
	pand	xmm7, xmm6
	movdqa	xmm6, [esp+640-496]
	movdqa	xmm2, xmm1
	pandn	xmm2, xmm6
	por	xmm7, xmm2
	movdqa	xmm2, [esp+640-352]
	packuswb xmm2, xmm7
	movdqa	 [esp+688-272], xmm2
	movdqa	xmm2, [esp+640-128]
	por	xmm2, [esp+640-288]

	movdqa	xmm4, xmm0
	pand	xmm4, xmm2
	paddw	xmm5, xmm6
	movdqa	xmm2, xmm0
	pandn	xmm2, [esp+640-400]
	por	xmm4, xmm2
	movdqa	xmm2, [esp+640-528]
	psllw	xmm5, 1
	paddw	xmm5, xmm3
	movdqa	xmm3, [esp+640-560]
	paddw	xmm2, xmm5
	psraw	xmm2, 3
	movdqa	 [esp+640-288], xmm4
	movdqa	xmm4, [esp+640-560]
	pand	xmm4, xmm2
	movdqa	xmm2, [esp+640-464]
	movdqa	xmm5, xmm2
	paddw	xmm5, xmm2
	movdqa	xmm2, [esp+640-432]
	paddw	xmm2, [esp+640-448]
	movdqa	xmm7, xmm1
	paddw	xmm5, xmm2
	paddw	xmm5, [esp+640-624]
	movdqa	xmm6, [esp+640-560]
	psraw	xmm5, 2
	pandn	xmm3, xmm5
	por	xmm4, xmm3
	movdqa	xmm3, [esp+640-32]
	por	xmm3, [esp+640-304]
	pand	xmm7, xmm4
	movdqa	xmm4, [esp+640-432]
	movdqa	xmm5, [esp+640-464]
	movdqa	xmm2, xmm1
	pandn	xmm2, xmm4
	paddw	xmm4, [esp+640-496]
	por	xmm7, xmm2
	movdqa	xmm2, [esp+640-288]
	packuswb xmm2, xmm7
	movdqa	 [esp+704-272], xmm2

	movdqa	xmm2, xmm0
	pand	xmm2, xmm3
	movdqa	xmm3, xmm0
	pandn	xmm3, [esp+640-384]
	por	xmm2, xmm3
	movdqa	 [esp+640-304], xmm2
	movdqa	xmm2, [esp+640-528]
	movdqa	xmm3, xmm2
	paddw	xmm3, [esp+640-464]
	paddw	xmm3, xmm4
	paddw	xmm3, [esp+640-624]
	psraw	xmm3, 2
	pand	xmm6, xmm3
	movdqa	xmm3, [esp+640-560]
	movdqa	xmm4, xmm3
	pandn	xmm4, xmm5
	por	xmm6, xmm4
	movdqa	xmm7, xmm1
	pand	xmm7, xmm6
	movdqa	xmm6, [esp+640-304]
	movdqa	xmm4, xmm1
	pandn	xmm4, xmm5
	por	xmm7, xmm4

	movdqa	xmm4, xmm0
	pandn	xmm0, [esp+640-416]
	packuswb xmm6, xmm7
	movdqa	xmm7, [esp+640-112]
	por	xmm7, [esp+640-80]
	pand	xmm4, xmm7
	por	xmm4, xmm0
	movdqa	xmm0, [esp+752-272]
	punpckhbw xmm0, [esp+640-48]
	psllw	xmm0, 1
	paddw	xmm0, xmm2
	paddw	xmm0, xmm2
	paddw	xmm0, xmm2
	paddw	xmm0, xmm5
	paddw	xmm0, [esp+640-432]
	paddw	xmm0, [esp+640-496]
	paddw	xmm0, [esp+640-592]
	psraw	xmm0, 3
	pand	xmm0, xmm3
	movdqa	xmm7, xmm1
	pandn	xmm3, xmm2
	por	xmm0, xmm3
	pand	xmm7, xmm0

	movdqa	xmm0, [esp+656-272]
	movdqa	 [edx], xmm0

	movdqa	xmm0, [esp+672-272]

	mov	edx, dword [esp+640-596]
	movdqa	 [esi], xmm0
	movdqa	xmm0, [esp+688-272]
	movdqa	 [edi], xmm0
	movdqa	xmm0, [esp+704-272]

	pop	edi
	pandn	xmm1, xmm2
	movdqa	 [eax], xmm0
	por	xmm7, xmm1
	pop	esi
	packuswb xmm4, xmm7
	movdqa	 [edx], xmm6
	movdqa	 [ecx], xmm4
	pop	ebx
	mov	esp, ebp
	pop	ebp
	ret


;********************************************************************************
;
;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
;
;********************************************************************************

WELS_EXTERN  DeblockLumaTransposeH2V_sse2

ALIGN  16

DeblockLumaTransposeH2V_sse2:
    push    ebp
    push    ebx
    mov     ebp,   esp
    and     esp,0FFFFFFF0h
    sub     esp,   10h

    mov     eax,   [ebp + 0Ch]
    mov     ecx,   [ebp + 10h]
    lea     edx,   [eax + ecx * 8]
    lea     ebx,   [ecx*3]

    movq    xmm0,  [eax]
    movq    xmm7,  [edx]
    punpcklqdq   xmm0,  xmm7
    movq    xmm1,  [eax + ecx]
    movq    xmm7,  [edx + ecx]
    punpcklqdq   xmm1,  xmm7
    movq    xmm2,  [eax + ecx*2]
    movq    xmm7,  [edx + ecx*2]
    punpcklqdq   xmm2,  xmm7
    movq    xmm3,  [eax + ebx]
    movq    xmm7,  [edx + ebx]
    punpcklqdq   xmm3,  xmm7

    lea     eax,   [eax + ecx * 4]
    lea     edx,   [edx + ecx * 4]
    movq    xmm4,  [eax]
    movq    xmm7,  [edx]
    punpcklqdq   xmm4,  xmm7
    movq    xmm5,  [eax + ecx]
    movq    xmm7,  [edx + ecx]
    punpcklqdq   xmm5,  xmm7
    movq    xmm6,  [eax + ecx*2]
    movq    xmm7,  [edx + ecx*2]
    punpcklqdq   xmm6,  xmm7

    movdqa  [esp],   xmm0
    movq    xmm7,  [eax + ebx]
    movq    xmm0,  [edx + ebx]
    punpcklqdq   xmm7,  xmm0
    movdqa  xmm0,   [esp]

    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1

    mov    eax,   [ebp + 14h]
    movdqa  [eax],    xmm4
    movdqa  [eax + 10h],  xmm2
    movdqa  [eax + 20h],  xmm3
    movdqa  [eax + 30h],  xmm7
    movdqa  [eax + 40h],  xmm5
    movdqa  [eax + 50h],  xmm1
    movdqa  [eax + 60h],  xmm6
    movdqa  [eax + 70h],  xmm0

    mov     esp,   ebp
    pop     ebx
    pop     ebp
    ret



;*******************************************************************************************
;
;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
;
;*******************************************************************************************

WELS_EXTERN   DeblockLumaTransposeV2H_sse2

ALIGN  16

DeblockLumaTransposeV2H_sse2:
    push     ebp
    mov      ebp,   esp

    and     esp,  0FFFFFFF0h
    sub     esp,   10h

    mov      eax,   [ebp + 10h]
    mov      ecx,   [ebp + 0Ch]
    mov      edx,   [ebp + 08h]

    movdqa   xmm0,  [eax]
    movdqa   xmm1,  [eax + 10h]
    movdqa   xmm2,  [eax + 20h]
    movdqa   xmm3,	[eax + 30h]
    movdqa   xmm4,	[eax + 40h]
    movdqa   xmm5,	[eax + 50h]
    movdqa   xmm6,	[eax + 60h]
    movdqa   xmm7,	[eax + 70h]

    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1

    lea      eax,   [ecx * 3]

    movq     [edx],  xmm4
    movq     [edx + ecx],  xmm2
    movq     [edx + ecx*2],  xmm3
    movq     [edx + eax],  xmm7

    lea      edx,   [edx + ecx*4]
    movq     [edx],  xmm5
    movq     [edx + ecx],  xmm1
    movq     [edx + ecx*2],  xmm6
    movq     [edx + eax],  xmm0

    psrldq    xmm4,   8
    psrldq    xmm2,   8
    psrldq    xmm3,   8
    psrldq    xmm7,   8
    psrldq    xmm5,   8
    psrldq    xmm1,   8
    psrldq    xmm6,   8
    psrldq    xmm0,   8

    lea       edx,  [edx + ecx*4]
    movq     [edx],  xmm4
    movq     [edx + ecx],  xmm2
    movq     [edx + ecx*2],  xmm3
    movq     [edx + eax],  xmm7

    lea      edx,   [edx + ecx*4]
    movq     [edx],  xmm5
    movq     [edx + ecx],  xmm1
    movq     [edx + ecx*2],  xmm6
    movq     [edx + eax],  xmm0


    mov      esp,   ebp
    pop      ebp
    ret