; Source: openh264 repository
; ref:    416bb224e095c91b83f2730ca2b2197bf3429c59
; path:   /codec/common/x86/deblock.asm
;*!
;* \copy
;*     Copyright (c)  2009-2013, Cisco Systems
;*     All rights reserved.
;*
;*     Redistribution and use in source and binary forms, with or without
;*     modification, are permitted provided that the following conditions
;*     are met:
;*
;*        * Redistributions of source code must retain the above copyright
;*          notice, this list of conditions and the following disclaimer.
;*
;*        * Redistributions in binary form must reproduce the above copyright
;*          notice, this list of conditions and the following disclaimer in
;*          the documentation and/or other materials provided with the
;*          distribution.
;*
;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;*     POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  deblock.asm
;*
;*  Abstract
;*      edge loop
;*
;*  History
;*      08/07/2009 Created
;*
;*
;*************************************************************************/
%include "asm_inc.asm"

;*******************************************************************************
; Macros and other preprocessor constants
;*******************************************************************************

SECTION .rodata align=16

ALIGN   16
; Eight 16-bit words, each holding the value 4 (one full 128-bit vector).
; DeblockLumaLt4V_ssse3 adds it with paddw as the rounding term immediately
; before its arithmetic shift right by 3 (psraw ..,3).
FOUR_16B_SSE2:   dw   4, 4, 4, 4, 4, 4, 4, 4


SECTION .text

; Everything from here up to the matching %endif (not visible in this part of
; the file) is assembled only when WIN64 is defined. The routines that follow
; read their first four arguments from rcx, rdx, r8 and r9 and the remaining
; ones from the stack.
%ifdef  WIN64


;*******************************************************************************
; DeblockLumaLt4V_ssse3  (Win64 variant)
;
; Filters 16 pixels across one edge between rows of a picture, using a
; clamped delta (normal / "tc" filter). Six rows of 16 bytes are read with
; aligned loads; four rows are written back.
;
; Inputs, as consumed by the code below:
;   rcx            pointer to the row called q0 here (16-byte aligned, movdqa)
;   edx            row stride in bytes (sign-extended into r12)
;   r8d            threshold broadcast into xmm4 ("alpha" below)
;   r9d            threshold broadcast into xmm0 ("beta" below)
;   5th stack arg  pTC: pointer to four signed bytes tc[0..3]
;   NOTE(review): presumably matches a C prototype of the form
;   (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta,
;    int8_t* pTC) -- confirm against the C declaration.
;
; Row naming used in the comments:
;   p2 = [rcx-3*stride]  p1 = [rcx-2*stride]  p0 = [rcx-stride]
;   q0 = [rcx]           q1 = [rcx+stride]    q2 = [rcx+2*stride]
;
; Per pixel (all arithmetic on 16-bit words, low 8 and high 8 pixels handled
; as two separate halves):
;   mask  = alpha > |q0-p0|  &  beta > |q0-q1|  &  beta > |p0-p1|  &  tc >= 0
;   ap    = beta > |p2-p0|          aq = beta > |q2-q0|
;   tc'   = tc + (ap ? 1 : 0) + (aq ? 1 : 0)
;   delta = clamp((4*(q0-p0) + (p1-q1) + 4) >> 3, -tc', tc') & mask
;   p0 += delta                      q0 -= delta
;   p1 += clamp((p2 + avg(p0,q0) - 2*p1) >> 1, -tc, tc) & mask & ap
;   q1 += clamp((q2 + avg(p0,q0) - 2*q1) >> 1, -tc, tc) & mask & aq
;   where avg(a,b) is pavgw, i.e. (a + b + 1) >> 1.
; tc[0] applies to pixels 0-3, tc[1] to 4-7, tc[2] to 8-11, tc[3] to 12-15.
;
; Outputs: rows p1, p0, q0, q1 are overwritten (results saturated to bytes by
; packuswb). No value is returned in rax.
;
; Stack: PUSH_XMM 16 / POP_XMM are macros defined outside this file
; (presumably save/restore xmm6-xmm15 -- confirm in asm_inc.asm). r12 is saved
; at [rbp+180h] and restored before returning. The movaps stores of
; xmm6-xmm15 into [rbp+0h..90h] below are never read back in this routine.
;*******************************************************************************
WELS_EXTERN   DeblockLumaLt4V_ssse3
  push        rbp
  mov         r11,[rsp + 16 + 20h]  ; pTC
  PUSH_XMM 16
  sub         rsp,1B0h
  ; rbp = base of the local frame; all spills below are rbp-relative.
  lea         rbp,[rsp+20h]
  ; xmm4 <- r8d (alpha), xmm2 <- r9d (beta); both are broadcast to 8 words later.
  movd        xmm4,r8d
  movd        xmm2,r9d
  mov         qword [rbp+180h],r12
  ; r12 = stride, rdx = 2*stride, r10 = address of row p0.
  mov         r10,rcx
  movsxd      r12,edx
  add         edx,edx
  movsxd      rdx,edx
  sub         r10,r12
  ; Read tc[0..3] as sign-extended bytes; xmm3 = 0 is used for byte->word widening.
  movsx       r8d,byte [r11]
  pxor        xmm3,xmm3
  punpcklwd   xmm2,xmm2
  movaps      [rbp+50h],xmm14
  lea         rax,[r12+r12*2]
  ; xmm14 = row q2.
  movdqa      xmm14,[rdx+rcx]
  neg         rax
  ; xmm0 = beta broadcast to all 8 words.
  pshufd      xmm0,xmm2,0
  movd        xmm2,r8d
  movsx       edx,byte [r11+1]
  movsx       r8d,byte [r11+2]
  movsx       r11d,byte [r11+3]
  movaps      [rbp+70h],xmm12
  movd        xmm1,edx
  movaps      [rbp+80h],xmm11
  movd        xmm12,r8d
  movd        xmm11,r11d
  ; xmm5 = row p2 (rax = -3*stride).
  movdqa      xmm5, [rax+rcx]
  lea         rax,[r12+r12]
  punpcklwd   xmm12,xmm12
  ; rax = -2*stride from here on; [rax+rcx] addresses row p1.
  neg         rax
  punpcklwd   xmm11,xmm11
  movaps      [rbp],xmm8
  ; xmm8 = row p0.
  movdqa      xmm8, [r10]
  punpcklwd   xmm2,xmm2
  punpcklwd   xmm1,xmm1
  punpcklqdq  xmm12,xmm12
  punpcklqdq  xmm11,xmm11
  punpcklqdq  xmm2,xmm2
  punpcklqdq  xmm1,xmm1
  ; xmm12 = {tc[2] x4, tc[3] x4} as words: tc vector for the high 8 pixels.
  shufps      xmm12,xmm11,88h
  movdqa      xmm11,xmm8
  movaps      [rbp+30h],xmm9
  ; xmm9 = row q0.
  movdqa      xmm9,[rcx]
  ; xmm2 = {tc[0] x4, tc[1] x4} as words: tc vector for the low 8 pixels.
  shufps      xmm2,xmm1,88h
  movdqa      xmm1,xmm5
  ; ---- low 8 pixels: widen bytes to words (xmm11=p0, xmm1=p2, xmm10=q0,
  ;      xmm7=p1, xmm12=q1, xmm15=q2) ----
  punpcklbw   xmm11,xmm3
  movaps      [rbp+20h],xmm6
  movaps      [rbp+60h],xmm13
  movdqa      xmm13,xmm11
  movaps      [rbp+90h],xmm10
  movdqa      xmm10,xmm9
  ; xmm6 = row p1.
  movdqa      xmm6,[rax+rcx]
  punpcklbw   xmm1,xmm3
  ; Spill the high-half tc vector; it is reloaded at the start of the high-half pass.
  movaps      [rbp+0A0h],xmm12
  psubw       xmm13,xmm1
  movaps      [rbp+40h],xmm15
  movdqa      xmm15,xmm14
  movaps      [rbp+10h],xmm7
  movdqa      xmm7,xmm6
  punpcklbw   xmm10,xmm3
  ; xmm12 = row q1.
  movdqa      xmm12,[r12+rcx]
  punpcklbw   xmm7,xmm3
  punpcklbw   xmm12,xmm3
  punpcklbw   xmm15,xmm3
  ; xmm3 = |p0-p2| (low half); the zero in xmm3 is no longer needed.
  pabsw       xmm3,xmm13
  movdqa      xmm13,xmm10
  psubw       xmm13,xmm15
  movdqa      [rbp+0F0h],xmm15
  ; xmm15 = |q0-q2| (low half).
  pabsw       xmm15,xmm13
  movdqa      xmm13,xmm11
  movdqa      [rbp+0B0h],xmm1
  movdqa      xmm1,xmm0
  ; xmm13 = avg(p0,q0), kept at [rbp+120h] for the p1/q1 updates.
  pavgw       xmm13,xmm10
  ; xmm1 = ap mask (beta > |p2-p0|), kept at [rbp+100h].
  pcmpgtw     xmm1,xmm3
  movdqa      [rbp+120h],xmm13
  movaps      xmm13,xmm2
  punpcklwd   xmm4,xmm4
  movdqa      xmm3,xmm0
  movdqa      [rbp+100h],xmm1
  ; Subtracting an all-ones mask adds 1: xmm13 becomes tc + ap (+ aq below).
  psubw       xmm13,xmm1
  movdqa      xmm1,xmm10
  ; xmm3 = aq mask (beta > |q2-q0|), kept at [rbp+110h].
  pcmpgtw     xmm3,xmm15
  ; xmm4 = alpha broadcast to all 8 words.
  pshufd      xmm4,xmm4,0
  psubw       xmm1,xmm11
  movdqa      [rbp+0D0h],xmm10
  psubw       xmm13,xmm3
  movdqa      [rbp+110h],xmm3
  ; Build the filter mask in xmm3: alpha > |q0-p0| ...
  pabsw       xmm15,xmm1
  movdqa      xmm3,xmm4
  psubw       xmm10,xmm12
  pcmpgtw     xmm3,xmm15
  pabsw       xmm15,xmm10
  movdqa      xmm10,xmm0
  ; xmm1 = 4*(q0-p0).
  psllw       xmm1,2
  movdqa      [rbp+0C0h],xmm11
  psubw       xmm11,xmm7
  ; ... & beta > |q0-q1| ...
  pcmpgtw     xmm10,xmm15
  pabsw       xmm11,xmm11
  movdqa      xmm15,xmm0
  pand        xmm3,xmm10
  ; ... & beta > |p0-p1| ...
  pcmpgtw     xmm15,xmm11
  movaps      xmm11,xmm2
  pxor        xmm10,xmm10
  pand        xmm3,xmm15
  ; ... & (tc > 0 | tc == 0), i.e. tc >= 0.
  pcmpgtw     xmm11,xmm10
  pcmpeqw     xmm10,xmm2
  por         xmm11,xmm10
  pand        xmm3,xmm11
  ; delta (low half) = clamp((p1 - q1 + 4*(q0-p0) + 4) >> 3, -tc', tc') & mask.
  movdqa      xmm11,xmm7
  psubw       xmm11,xmm12
  pxor        xmm15,xmm15
  paddw       xmm11,xmm1
  psubw       xmm15,xmm13
  movdqa      [rbp+0E0h],xmm12
  paddw       xmm11,[FOUR_16B_SSE2]
  pxor        xmm12,xmm12
  psraw       xmm11,3
  ; ---- high 8 pixels: widen with punpckhbw (xmm8=p0, xmm5=p2, xmm9=q0,
  ;      xmm14=q2, xmm6=p1, xmm1=q1); interleaved with the low-half clamp ----
  punpckhbw   xmm8,xmm12
  pmaxsw      xmm15,xmm11
  punpckhbw   xmm5,xmm12
  movdqa      xmm11,xmm8
  pminsw      xmm13,xmm15
  psubw       xmm11,xmm5
  punpckhbw   xmm9,xmm12
  pand        xmm13,xmm3
  ; Low-half delta is kept at [rbp+130h] until the final p0/q0 update.
  movdqa      [rbp+130h],xmm13
  pabsw       xmm13,xmm11
  punpckhbw   xmm14,xmm12
  movdqa      xmm11,xmm9
  psubw       xmm11,xmm14
  movdqa      xmm15,xmm0
  movdqa      [rbp+140h],xmm14
  pabsw       xmm14,xmm11
  movdqa      xmm11,xmm8
  ; xmm15 = aq mask (high half), kept at [rbp+160h].
  pcmpgtw     xmm15,xmm14
  movdqa      xmm1,[r12+rcx]
  ; avg(p0,q0) for the high half, kept at [rbp+170h].
  pavgw       xmm11,xmm9
  movdqa      [rbp+170h],xmm11
  movdqa      xmm10,xmm9
  punpckhbw   xmm6,xmm12
  psubw       xmm10,xmm8
  punpckhbw   xmm1,xmm12
  movdqa      xmm12,xmm0
  ; xmm11 = tc vector for the high half ({tc[2] x4, tc[3] x4}).
  movaps      xmm11,[rbp+0A0h]
  ; xmm12 = ap mask (high half), kept at [rbp+150h].
  pcmpgtw     xmm12,xmm13
  movaps      xmm13,xmm11
  ; xmm13 = tc' = tc + ap + aq for the high half.
  psubw       xmm13,xmm12
  movdqa      [rbp+160h],xmm15
  psubw       xmm13,xmm15
  movdqa      xmm15,xmm9
  psubw       xmm15,xmm1
  movdqa      [rbp+150h],xmm12
  ; High-half filter mask accumulates in xmm4 (alpha is consumed here).
  pabsw       xmm12,xmm10
  pabsw       xmm14,xmm15
  movdqa      xmm15,xmm8
  pcmpgtw     xmm4,xmm12
  movdqa      xmm12,xmm0
  psubw       xmm15,xmm6
  pcmpgtw     xmm12,xmm14
  pabsw       xmm14,xmm15
  psllw       xmm10,2
  pcmpgtw     xmm0,xmm14
  movdqa      xmm14,xmm6
  psubw       xmm14,xmm1
  pand        xmm4,xmm12
  paddw       xmm14,xmm10
  pand        xmm4,xmm0
  paddw       xmm14,[FOUR_16B_SSE2]
  pxor        xmm15,xmm15
  movaps      xmm12,xmm11
  psubw       xmm15,xmm13
  pxor        xmm0,xmm0
  psraw       xmm14,3
  ; tc >= 0 test for the high half.
  pcmpgtw     xmm12,xmm0
  pcmpeqw     xmm0,xmm11
  pmaxsw      xmm15,xmm14
  por         xmm12,xmm0
  movdqa      xmm0,[rbp+120h]
  ; xmm13 = clamped delta for the high half (masked at the pand below).
  pminsw      xmm13,xmm15
  ; ---- p1 update, low half: (p2 + avg(p0,q0) - 2*p1) >> 1, clamped to +-tc ----
  movdqa      xmm15,[rbp+0B0h]
  movdqa      xmm10,xmm7
  pand        xmm4,xmm12
  paddw       xmm15,xmm0
  pxor        xmm12,xmm12
  paddw       xmm10,xmm7
  movdqa      xmm14,xmm12
  psubw       xmm15,xmm10
  ; xmm14 = -tc (low half); reused for the q1 clamp further down.
  psubw       xmm14,xmm2
  psraw       xmm15,1
  pmaxsw      xmm15,xmm14
  movdqa      xmm10,xmm6
  pminsw      xmm15,xmm2
  paddw       xmm10,xmm6
  pand        xmm15,xmm3
  ; xmm12 = -tc (high half).
  psubw       xmm12,xmm11
  pand        xmm15,[rbp+100h]
  pand        xmm13,xmm4
  paddw       xmm7,xmm15
  ; p0 += delta, q0 -= delta (high half).
  paddw       xmm8,xmm13
  movdqa      xmm15,[rbp+170h]
  psubw       xmm9,xmm13
  ; ---- p1 update, high half ----
  paddw       xmm5,xmm15
  psubw       xmm5,xmm10
  psraw       xmm5,1
  pmaxsw      xmm5,xmm12
  pminsw      xmm5,xmm11
  pand        xmm5,xmm4
  pand        xmm5,[rbp+150h]
  paddw       xmm6,xmm5
  movdqa      xmm5,[rbp+0C0h]
  ; xmm7 = new p1 row (low|high packed to bytes with unsigned saturation).
  packuswb    xmm7,xmm6
  ; p0 += delta, q0 -= delta (low half), then pack the p0 and q0 rows.
  movdqa      xmm6,[rbp+130h]
  paddw       xmm5,xmm6
  packuswb    xmm5,xmm8
  movdqa      xmm8,[rbp+0D0h]
  psubw       xmm8,xmm6
  ; ---- q1 update, low half: (q2 + avg(p0,q0) - 2*q1) >> 1, clamped to +-tc ----
  movdqa      xmm6,[rbp+0F0h]
  paddw       xmm6,xmm0
  movdqa      xmm0,[rbp+0E0h]
  packuswb    xmm8,xmm9
  movdqa      xmm9,xmm0
  paddw       xmm9,xmm0
  psubw       xmm6,xmm9
  psraw       xmm6,1
  pmaxsw      xmm14,xmm6
  pminsw      xmm2,xmm14
  pand        xmm2,xmm3
  pand        xmm2,[rbp+110h]
  paddw       xmm0,xmm2
  ; ---- q1 update, high half ----
  movdqa      xmm2,[rbp+140h]
  paddw       xmm2,xmm15
  movdqa      xmm15,xmm1
  paddw       xmm15,xmm1
  psubw       xmm2,xmm15
  psraw       xmm2,1
  pmaxsw      xmm12,xmm2
  pminsw      xmm11,xmm12
  pand        xmm11,xmm4
  pand        xmm11,[rbp+160h]
  paddw       xmm1,xmm11
  ; ---- write back rows p1, p0, q0, q1 ----
  movdqa      [rax+rcx],xmm7
  movdqa      [r10],xmm5
  packuswb    xmm0,xmm1
  movdqa      [rcx],xmm8
  movdqa      [r12+rcx],xmm0
  ; Epilogue: restore r12, release the 1B0h-byte frame (rbp+190h == rsp+1B0h).
  mov         r12,qword [rbp+180h]
  lea         rsp,[rbp+190h]
  POP_XMM
  pop         rbp
  ret


;*******************************************************************************
; DeblockLumaEq4V_ssse3  (Win64 variant)
;
; Filters 16 pixels across one edge between rows of a picture with the
; unclamped ("strong"/"weak" selectable) filter. Eight rows of 16 bytes are
; read with aligned loads; six rows are written back.
;
; Inputs, as consumed by the code below:
;   rcx   pointer to the row called q0 here (16-byte aligned, movdqa)
;   edx   row stride in bytes (sign-extended into r10)
;   r8d   threshold "alpha" (low 16 bits used, broadcast into xmm11)
;   r9d   threshold "beta"  (low 16 bits used, broadcast into xmm7)
;   NOTE(review): presumably (uint8_t* pPix, int32_t iStride, int32_t iAlpha,
;   int32_t iBeta) -- confirm against the C declaration.
;
; Row pointers set up in the prologue:
;   rdx = p3 (rcx-4*stride)   rdi = p2 (rcx-3*stride)   rbx = p1 (rcx-2*stride)
;   r8  = p0 (rcx-stride)     rbp = q0 (rcx)            rbp+r10 = q1
;   rbp+rsi = q2 (rsi = 2*stride)                       rbp+rcx = q3 (rcx = 3*stride)
;
; Per pixel (16-bit word arithmetic; low 8 and high 8 pixels are two passes):
;   base   = beta > |p0-p1|  &  beta > |q0-q1|  &  alpha > |q0-p0|
;   small  = ((alpha >> 2) + 2) > |q0-p0|
;   pstrong = beta > |p0-p2| & small        qstrong = beta > |q0-q2| & small
;   where pstrong:  p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
;                   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
;                   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
;   otherwise:      p0' = (2*p1 + p0 + q1 + 2) >> 2,  p1 and p2 unchanged
;   The q side mirrors this with p and q exchanged. Where base is false every
;   pixel keeps its original value (pand / pandn / por blending).
;
; Outputs: rows p2, p1, p0, q0, q1, q2 are overwritten. No value in rax.
;
; Stack: rbx, rbp, rsi, rdi are pushed/popped; xmm6-xmm15 are saved by hand
; just below the pushed registers (addressed via rax = entry rsp) and
; restored via r11 = rsp+1D8h in the epilogue. Locals live at [rsp+0..12Fh].
;*******************************************************************************
WELS_EXTERN   DeblockLumaEq4V_ssse3
  mov         rax,rsp
  push        rbx
  push        rbp
  push        rsi
  push        rdi
  sub         rsp,1D8h
  ; Save callee-saved xmm registers (rax still holds the entry rsp).
  movaps      [rax-38h],xmm6
  movaps      [rax-48h],xmm7
  movaps      [rax-58h],xmm8
  ; xmm1 = 0, used to widen bytes to words.
  pxor        xmm1,xmm1
  ; r10 = stride, rbp = q0 row, r11d = alpha.
  movsxd      r10,edx
  mov         rbp,rcx
  mov         r11d,r8d
  mov         rdx,rcx
  mov         rdi,rbp
  mov         rbx,rbp
  ; xmm5 = q0 (low 8 pixels after the punpcklbw below).
  movdqa      xmm5,[rbp]
  movaps      [rax-68h],xmm9
  movaps      [rax-78h],xmm10
  punpcklbw   xmm5,xmm1
  movaps      [rax-88h],xmm11
  movaps      [rax-98h],xmm12
  movaps      [rax-0A8h],xmm13
  movaps      [rax-0B8h],xmm14
  ; xmm14 = q1.
  movdqa      xmm14,[r10+rbp]
  movaps      [rax-0C8h],xmm15
  ; r8 = 4*stride, rcx = 3*stride, rsi = 2*stride (rax is scratch from here).
  lea         eax,[r10*4]
  movsxd      r8,eax
  lea         eax,[r10+r10*2]
  movsxd      rcx,eax
  lea         eax,[r10+r10]
  ; rdx = address of row p3.
  sub         rdx,r8
  punpcklbw   xmm14,xmm1
  ; Spill q0 and q1 (low halves) to [rsp+90h] and [rsp+30h].
  movdqa      [rsp+90h],xmm5
  movdqa      [rsp+30h],xmm14
  movsxd      rsi,eax
  movsx       eax,r11w
  ; rdi = p2 row, rbx = p1 row, r8 = p0 row.
  sub         rdi,rcx
  sub         rbx,rsi
  mov         r8,rbp
  sub         r8,r10
  movd        xmm0,eax
  movsx       eax,r9w
  movdqa      xmm12,[rdi]
  movdqa      xmm6, [rsi+rbp]
  movdqa      xmm13,[rbx]
  punpcklwd   xmm0,xmm0
  ; xmm11 = alpha broadcast to 8 words.
  pshufd      xmm11,xmm0,0
  punpcklbw   xmm13,xmm1
  punpcklbw   xmm6,xmm1
  movdqa      xmm8,[r8]
  movd        xmm0,eax
  movdqa      xmm10,xmm11
  ; Constant 2; cwde sign-extends ax into eax, which leaves the value 2 unchanged.
  mov         eax,2
  punpcklbw   xmm8,xmm1
  punpcklbw   xmm12,xmm1
  cwde
  punpcklwd   xmm0,xmm0
  ; xmm10 = alpha >> 2 (the +2 is added below).
  psraw       xmm10,2
  movdqa      xmm1,xmm8
  ; Low-half words now: xmm12=p2, xmm13=p1, xmm8=p0, xmm5=q0, xmm14=q1, xmm6=q2.
  movdqa      [rsp+0F0h],xmm13
  movdqa      [rsp+0B0h],xmm8
  ; xmm7 = beta broadcast to 8 words (also kept at [rsp+40h]).
  pshufd      xmm7,xmm0,0
  psubw       xmm1,xmm13
  movdqa      xmm0,xmm5
  movdqa      xmm4,xmm7
  movdqa      xmm2,xmm7
  psubw       xmm0,xmm8
  ; xmm3 = |q0-p0|, xmm0 = |p0-p1|.
  pabsw       xmm3,xmm0
  pabsw       xmm0,xmm1
  movdqa      xmm1,xmm5
  movdqa      [rsp+40h],xmm7
  movdqa      [rsp+60h],xmm6
  pcmpgtw     xmm4,xmm0
  psubw       xmm1,xmm14
  pabsw       xmm0,xmm1
  pcmpgtw     xmm2,xmm0
  pand        xmm4,xmm2
  movdqa      xmm0,xmm11
  pcmpgtw     xmm0,xmm3
  pand        xmm4,xmm0
  movd        xmm0,eax
  ; [rsp+20h] = base mask for the low half.
  movdqa      [rsp+20h],xmm4
  punpcklwd   xmm0,xmm0
  ; xmm2 = constant 2 in every word (rounding term for >>2), kept at [rsp+0A0h].
  pshufd      xmm2,xmm0,0
  ; xmm10 = (alpha >> 2) + 2.
  paddw       xmm10,xmm2
  movdqa      [rsp+0A0h],xmm2
  movdqa      xmm15,xmm7
  pxor        xmm4,xmm4
  movdqa      xmm0,xmm8
  psubw       xmm0,xmm12
  ; Constant 4; as above, cwde leaves it unchanged.
  mov         eax,4
  pabsw       xmm0,xmm0
  movdqa      xmm1,xmm10
  cwde
  ; xmm15 = pstrong (low half): beta > |p0-p2|  &  (alpha>>2)+2 > |q0-p0|.
  pcmpgtw     xmm15,xmm0
  pcmpgtw     xmm1,xmm3
  movdqa      xmm3,xmm7
  ; xmm7 = row p3 (beta stays available at [rsp+40h]).
  movdqa      xmm7,[rdx]
  movdqa      xmm0,xmm5
  psubw       xmm0,xmm6
  pand        xmm15,xmm1
  punpcklbw   xmm7,xmm4
  movdqa      xmm9,xmm15
  pabsw       xmm0,xmm0
  ; p2' accumulation starts: xmm7 = 2*p3, then + 3*p2 + p1 + p0 + q0 + 4, >> 3.
  psllw       xmm7,1
  ; xmm9 = ~pstrong & p2 (original p2 where the strong filter is off).
  pandn       xmm9,xmm12
  ; xmm3 = qstrong (low half): beta > |q0-q2|  &  (alpha>>2)+2 > |q0-p0|.
  pcmpgtw     xmm3,xmm0
  paddw       xmm7,xmm12
  movd        xmm0,eax
  pand        xmm3,xmm1
  paddw       xmm7,xmm12
  punpcklwd   xmm0,xmm0
  paddw       xmm7,xmm12
  ; xmm1 = constant 4 in every word (rounding term for >>3), kept at [rsp+70h].
  pshufd      xmm1,xmm0,0
  paddw       xmm7,xmm13
  movdqa      xmm0,xmm3
  pandn       xmm0,xmm6
  paddw       xmm7,xmm8
  movdqa      [rsp+70h],xmm1
  paddw       xmm7,xmm5
  ; [rsp+120h] = ~qstrong & q2.
  movdqa      [rsp+120h],xmm0
  ; q2' accumulation: 2*q3 + 3*q2 + q1 + q0 + p0 + 4, >> 3.
  movdqa      xmm0,[rcx+rbp]
  punpcklbw   xmm0,xmm4
  paddw       xmm7,xmm1
  movdqa      xmm4,xmm15
  psllw       xmm0,1
  psraw       xmm7,3
  paddw       xmm0,xmm6
  pand        xmm7,xmm15
  paddw       xmm0,xmm6
  paddw       xmm0,xmm6
  paddw       xmm0,xmm14
  movdqa      xmm6,xmm15
  paddw       xmm0,xmm5
  pandn       xmm6,xmm13
  paddw       xmm0,xmm8
  paddw       xmm0,xmm1
  psraw       xmm0,3
  ; p1' = (p2 + p1 + p0 + q0 + 2) >> 2, selected by pstrong (xmm4).
  movdqa      xmm1,xmm12
  paddw       xmm1,xmm13
  pand        xmm0,xmm3
  ; [rsp+100h] = qstrong & q2'.
  movdqa      [rsp+100h],xmm0
  movdqa      xmm0,xmm8
  paddw       xmm0,xmm5
  paddw       xmm1,xmm0
  movdqa      xmm0,xmm3
  paddw       xmm1,xmm2
  psraw       xmm1,2
  pandn       xmm0,xmm14
  pand        xmm4,xmm1
  ; [rsp+0E0h] = ~qstrong & q1.
  movdqa      [rsp+0E0h],xmm0
  ; q1' = (q2 + q1 + q0 + p0 + 2) >> 2, selected by qstrong (xmm14 after pand).
  movdqa      xmm0,xmm5
  paddw       xmm0,xmm8
  movdqa      xmm1,[rsp+60h]
  paddw       xmm1,xmm14
  movdqa      xmm14,xmm3
  paddw       xmm1,xmm0
  movdqa      xmm0,xmm8
  paddw       xmm0,[rsp+30h]
  paddw       xmm1,xmm2
  psraw       xmm1,2
  pand        xmm14,xmm1
  ; Weak p0' = (2*p1 + p0 + q1 + 2) >> 2, used where pstrong is false (xmm5).
  movdqa      xmm1,xmm13
  paddw       xmm1,xmm13
  paddw       xmm1,xmm0
  paddw       xmm1,xmm2
  psraw       xmm1,2
  movdqa      xmm0,[rsp+30h]
  movdqa      xmm2,xmm13
  movdqa      xmm5,xmm15
  paddw       xmm0,[rsp+70h]
  pandn       xmm5,xmm1
  ; Strong p0' = (p2 + 2*(p1 + p0 + q0) + q1 + 4) >> 3, selected by pstrong (xmm15).
  paddw       xmm2,xmm8
  movdqa      xmm8,[rsp+90h]
  movdqa      xmm1,xmm12
  paddw       xmm2,xmm8
  psllw       xmm2,1
  paddw       xmm2,xmm0
  paddw       xmm1,xmm2
  movdqa      xmm0,xmm8
  movdqa      xmm8,xmm3
  movdqa      xmm2,[rsp+30h]
  paddw       xmm0,xmm13
  psraw       xmm1,3
  pand        xmm15,xmm1
  ; Weak q0' = (2*q1 + q0 + p1 + 2) >> 2, used where qstrong is false (xmm8).
  movdqa      xmm1,xmm2
  paddw       xmm1,xmm2
  ; Strong q0' = (q2 + 2*(q1 + q0 + p0) + p1 + 4) >> 3, selected by qstrong (xmm3).
  paddw       xmm2,[rsp+90h]
  paddw       xmm2,[rsp+0B0h]
  paddw       xmm1,xmm0
  movdqa      xmm0,xmm13
  ; ---- high 8 pixels: rows are reloaded and widened with punpckhbw ----
  movdqa      xmm13,[r8]
  paddw       xmm0, [rsp+70h]
  paddw       xmm1, [rsp+0A0h]
  psllw       xmm2,1
  paddw       xmm2,xmm0
  psraw       xmm1,2
  movdqa      xmm0, [rdi]
  pandn       xmm8,xmm1
  movdqa      xmm1, [rsp+60h]
  paddw       xmm1,xmm2
  movdqa      xmm2, [rbx]
  psraw       xmm1,3
  pand        xmm3,xmm1
  movdqa      xmm1, [rbp]
  ; [rsp+0D0h] = qstrong & strong q0' (low half).
  movdqa      [rsp+0D0h],xmm3
  pxor        xmm3,xmm3
  punpckhbw   xmm0,xmm3
  punpckhbw   xmm1,xmm3
  punpckhbw   xmm13,xmm3
  ; High-half spills: [rsp+0C0h]=p2, [rsp]=q0, [rsp+80h]=q1, [rsp+10h]=p0,
  ; [rsp+50h]=q2, [rsp+110h]=p1.
  movdqa      [rsp+0C0h],xmm0
  movdqa      xmm0,[r10+rbp]
  movdqa      [rsp],xmm1
  punpckhbw   xmm0,xmm3
  punpckhbw   xmm2,xmm3
  movdqa      [rsp+80h],xmm0
  movdqa      xmm0,[rsi+rbp]
  movdqa      [rsp+10h],xmm13
  punpckhbw   xmm0,xmm3
  movdqa      [rsp+50h],xmm0
  movdqa      xmm0,xmm1
  movdqa      xmm1,xmm13
  psubw       xmm0,xmm13
  psubw       xmm1,xmm2
  ; xmm3 = |q0-p0|, xmm0 = |p0-p1| (high half).
  pabsw       xmm3,xmm0
  pabsw       xmm0,xmm1
  movdqa      xmm1,[rsp]
  movdqa      xmm13,[rsp+40h]
  movdqa      [rsp+110h],xmm2
  psubw       xmm1, [rsp+80h]
  ; xmm13 accumulates the base mask for the high half.
  pcmpgtw     xmm13,xmm0
  pcmpgtw     xmm11,xmm3
  pabsw       xmm0,xmm1
  ; xmm10 = "small" mask for the high half: (alpha>>2)+2 > |q0-p0|.
  pcmpgtw     xmm10,xmm3
  movdqa      xmm1, [rsp+40h]
  movdqa      xmm2,xmm1
  movdqa      xmm3,xmm1
  pcmpgtw     xmm2,xmm0
  movdqa      xmm0, [rsp+10h]
  pand        xmm13,xmm2
  pand        xmm13,xmm11
  movdqa      xmm11,[rsp+0C0h]
  psubw       xmm0,xmm11
  pabsw       xmm0,xmm0
  ; xmm3 = pstrong for the high half.
  pcmpgtw     xmm3,xmm0
  pand        xmm3,xmm10
  movdqa      xmm0,[rsp]
  psubw       xmm0,[rsp+50h]
  movdqa      xmm2,[rdx]
  pabsw       xmm0,xmm0
  ; Low half, p2: merge strong/original, then gate with the base mask -> xmm9.
  por         xmm7,xmm9
  movdqa      xmm9,[rsp+20h]
  ; xmm1 = qstrong for the high half; stored over beta at [rsp+40h] below.
  pcmpgtw     xmm1,xmm0
  pand        xmm9,xmm7
  movdqa      xmm7,[rsp+20h]
  movdqa      xmm0,xmm7
  pandn       xmm0,xmm12
  movdqa      xmm12,[rsp+110h]
  pand        xmm1,xmm10
  movdqa      xmm10,[rsp+70h]
  movdqa      [rsp+40h],xmm1
  movdqa      xmm1,xmm13
  por         xmm9,xmm0
  pxor        xmm0,xmm0
  ; Low half, p1 (xmm4|xmm6) and p0 (xmm15|xmm5): merge strong/alternative.
  por         xmm4,xmm6
  movdqa      xmm6,xmm7
  punpckhbw   xmm2,xmm0
  por         xmm15,xmm5
  movdqa      xmm5,[rsp+20h]
  movdqa      xmm0,xmm3
  ; High half p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3.
  psllw       xmm2,1
  pandn       xmm0,xmm11
  pand        xmm6,xmm4
  movdqa      xmm4,[rsp]
  paddw       xmm2,xmm11
  pand        xmm5,xmm15
  movdqa      xmm15,[rsp+20h]
  paddw       xmm2,xmm11
  paddw       xmm2,xmm11
  paddw       xmm2,xmm12
  paddw       xmm2,[rsp+10h]
  paddw       xmm2,[rsp]
  paddw       xmm2,xmm10
  psraw       xmm2,3
  pand        xmm2,xmm3
  por         xmm2,xmm0
  pand        xmm1,xmm2
  movdqa      xmm0,xmm13
  movdqa      xmm2,xmm11
  pandn       xmm0,xmm11
  paddw       xmm2,xmm12
  por         xmm1,xmm0
  ; xmm9 = final p2 row (low | high packed to bytes).
  packuswb    xmm9,xmm1
  movdqa      xmm0,xmm7
  movdqa      xmm7,[rsp+0A0h]
  pandn       xmm0,[rsp+0F0h]
  movdqa      xmm1,xmm3
  por         xmm6,xmm0
  ; High half p1' = (p2 + p1 + p0 + q0 + 2) >> 2.
  movdqa      xmm0,[rsp+10h]
  paddw       xmm0,xmm4
  paddw       xmm2,xmm0
  paddw       xmm2,xmm7
  movdqa      xmm0,xmm3
  pandn       xmm0,xmm12
  psraw       xmm2,2
  pand        xmm1,xmm2
  por         xmm1,xmm0
  movdqa      xmm2,xmm13
  movdqa      xmm0,xmm13
  pand        xmm2,xmm1
  pandn       xmm0,xmm12
  movdqa      xmm1,xmm12
  paddw       xmm1,[rsp+10h]
  por         xmm2,xmm0
  movdqa      xmm0,xmm15
  pandn       xmm0,[rsp+0B0h]
  paddw       xmm1,xmm4
  ; xmm6 = final p1 row.
  packuswb    xmm6,xmm2
  movdqa      xmm2,xmm3
  ; High half strong p0' = (p2 + 2*(p1 + p0 + q0) + q1 + 4) >> 3.
  psllw       xmm1,1
  por         xmm5,xmm0
  movdqa      xmm0,[rsp+80h]
  paddw       xmm0,xmm10
  paddw       xmm1,xmm0
  paddw       xmm11,xmm1
  psraw       xmm11,3
  ; High half weak p0' = (2*p1 + p0 + q1 + 2) >> 2.
  movdqa      xmm1,xmm12
  pand        xmm2,xmm11
  paddw       xmm1,xmm12
  movdqa      xmm11,[rsp+80h]
  movdqa      xmm0, [rsp+10h]
  ; Low half q1: merge strong/original.
  por         xmm14,[rsp+0E0h]
  paddw       xmm0,xmm11
  movdqa      xmm4,xmm15
  paddw       xmm1,xmm0
  movdqa      xmm0,xmm13
  paddw       xmm1,xmm7
  psraw       xmm1,2
  pandn       xmm3,xmm1
  por         xmm2,xmm3
  movdqa      xmm1,xmm13
  movdqa      xmm3,[rsp+10h]
  pandn       xmm0,xmm3
  pand        xmm1,xmm2
  movdqa      xmm2,xmm11
  paddw       xmm2,[rsp]
  por         xmm1,xmm0
  ; Low half q0: merge strong/weak.
  movdqa      xmm0,[rsp+0D0h]
  por         xmm0,xmm8
  paddw       xmm2,xmm3
  ; xmm5 = final p0 row.
  packuswb    xmm5,xmm1
  ; xmm8 = qstrong mask for the high half.
  movdqa      xmm8,[rsp+40h]
  movdqa      xmm1,[rsp+50h]
  movdqa      xmm3,xmm8
  pand        xmm4,xmm0
  ; High half strong q0' = (q2 + 2*(q1 + q0 + p0) + p1 + 4) >> 3.
  psllw       xmm2,1
  movdqa      xmm0,xmm15
  pandn       xmm0,[rsp+90h]
  por         xmm4,xmm0
  movdqa      xmm0,xmm12
  paddw       xmm0,xmm10
  paddw       xmm2,xmm0
  paddw       xmm1,xmm2
  ; High half weak q0' = (2*q1 + q0 + p1 + 2) >> 2.
  movdqa      xmm0,[rsp]
  movdqa      xmm2,xmm11
  paddw       xmm0,xmm12
  movdqa      xmm12,[rsp]
  paddw       xmm2,xmm11
  paddw       xmm2,xmm0
  psraw       xmm1,3
  movdqa      xmm0,xmm8
  pand        xmm3,xmm1
  paddw       xmm2,xmm7
  movdqa      xmm1,xmm13
  psraw       xmm2,2
  pandn       xmm0,xmm2
  por         xmm3,xmm0
  movdqa      xmm2,[rsp+50h]
  movdqa      xmm0,xmm13
  pandn       xmm0,xmm12
  pand        xmm1,xmm3
  paddw       xmm2,xmm11
  movdqa      xmm3,xmm15
  por         xmm1,xmm0
  pand        xmm3,xmm14
  movdqa      xmm14,[rsp+10h]
  movdqa      xmm0,xmm15
  pandn       xmm0,[rsp+30h]
  ; xmm4 = final q0 row.
  packuswb    xmm4,xmm1
  movdqa      xmm1,xmm8
  por         xmm3,xmm0
  ; High half q1' = (q2 + q1 + q0 + p0 + 2) >> 2.
  movdqa      xmm0,xmm12
  paddw       xmm0,xmm14
  paddw       xmm2,xmm0
  paddw       xmm2,xmm7
  movdqa      xmm0,xmm8
  pandn       xmm0,xmm11
  psraw       xmm2,2
  pand        xmm1,xmm2
  por         xmm1,xmm0
  movdqa      xmm2,xmm13
  movdqa      xmm0,xmm13
  pandn       xmm0,xmm11
  pand        xmm2,xmm1
  movdqa      xmm1,xmm15
  por         xmm2,xmm0
  ; xmm3 = final q1 row.
  packuswb    xmm3,xmm2
  ; Low half q2: merge strong/original, then gate with the base mask.
  movdqa      xmm0,[rsp+100h]
  por         xmm0,[rsp+120h]
  pand        xmm1,xmm0
  movdqa      xmm2,[rcx+rbp]
  movdqa      xmm7,[rsp+50h]
  pandn       xmm15,[rsp+60h]
  ; r11 = rsp before "sub rsp,1D8h"; the xmm restores below are relative to it
  ; and are interleaved with the remaining high-half q2 computation and stores.
  lea         r11,[rsp+1D8h]
  pxor        xmm0,xmm0
  por         xmm1,xmm15
  movaps      xmm15,[r11-0A8h]
  ; Store row p2.
  movdqa      [rdi],xmm9
  movaps      xmm9,[r11-48h]
  ; High half q2' = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3.
  punpckhbw   xmm2,xmm0
  psllw       xmm2,1
  paddw       xmm2,xmm7
  paddw       xmm2,xmm7
  ; Store row p1.
  movdqa      [rbx],xmm6
  movaps      xmm6,[r11-18h]
  paddw       xmm2,xmm7
  paddw       xmm2,xmm11
  movaps      xmm11,[r11-68h]
  paddw       xmm2,xmm12
  movaps      xmm12,[r11-78h]
  paddw       xmm2,xmm14
  paddw       xmm2,xmm10
  psraw       xmm2,3
  movaps      xmm10,[r11-58h]
  movaps      xmm14,[r11-98h]
  movdqa      xmm0,xmm13
  pand        xmm2,xmm8
  pandn       xmm8,xmm7
  pandn       xmm13,xmm7
  por         xmm2,xmm8
  movaps      xmm7,[r11-28h]
  movaps      xmm8,[r11-38h]
  ; Store row p0.
  movdqa      [r8],xmm5
  pand        xmm0,xmm2
  por         xmm0,xmm13
  ; xmm1 = final q2 row.
  packuswb    xmm1,xmm0
  movaps      xmm13,[r11-88h]
  ; Store rows q0, q1, q2.
  movdqa      [rbp],xmm4
  movdqa      [r10+rbp],xmm3
  movdqa      [rsi+rbp],xmm1
  ; Release locals and restore the pushed integer registers.
  mov         rsp,r11
  pop         rdi
  pop         rsi
  pop         rbp
  pop         rbx
  ret


;*******************************************************************************
; DeblockChromaLt4V_ssse3  (Win64 variant)
;
; Applies the clamped-delta filter across one edge between rows for two
; planes at once: 8 pixels from the plane at rcx and 8 pixels from the plane
; at rdx are packed into one 128-bit register (first plane in the low qword,
; second plane in the high qword). Rows are accessed with movq, so no
; 16-byte alignment of the pixel pointers is required by this code.
;
; Inputs, as consumed by the code below:
;   rcx            first plane: pointer to row q0   (kept in rbx)
;   rdx            second plane: pointer to row q0  (kept in rdi)
;   r8d            row stride in bytes (sign-extended into r11)
;   r9d            threshold "alpha" (low 16 bits used)
;   5th stack arg  iBeta (read as a 16-bit value)
;   6th stack arg  pTC: pointer to four signed bytes tc[0..3]
;   NOTE(review): the two planes are presumably Cb and Cr -- confirm against
;   the C declaration.
;
; Row naming: p1 = q0-2*stride, p0 = q0-stride, q0, q1 = q0+stride.
;
; Per pixel (16-bit word arithmetic):
;   mask  = alpha > |p0-q0|  &  beta > |p1-p0|  &  beta > |q1-q0|  &  tc > 0
;   delta = clamp((4*(q0-p0) + (p1-q1) + 4) >> 3, -tc, tc) & mask
;   p0 += delta          q0 -= delta
; The tc vector is {tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3} and is applied identically
; to both planes.
;
; Outputs: rows p0 and q0 of both planes are overwritten (8 bytes each).
;
; Stack: rbx and rdi are pushed/popped. PUSH_XMM 16 / POP_XMM are macros
; defined outside this file (presumably save/restore xmm6-xmm15 -- confirm).
; NOTE(review): the "movaps xmmN,[r11-..]" / "[rsp+20h..40h]" loads of
; xmm6-xmm15 in the second half of the routine read stack slots that this
; routine never writes, and the loaded values are not used afterwards here.
; They look like leftovers from a hand-written save/restore that PUSH_XMM /
; POP_XMM replaced; they are harmless only if POP_XMM reloads those registers.
;*******************************************************************************
WELS_EXTERN  DeblockChromaLt4V_ssse3
  mov         rax,rsp
  push        rbx
  push        rdi
  PUSH_XMM 16
  sub         rsp,0C8h
  ; rax still holds the entry rsp: [rax+30h] is the 6th argument.
  mov         r10,qword [rax + 30h]  ; pTC
  ; xmm1 = 0, used for byte->word widening and as the zero for the tc tests.
  pxor        xmm1,xmm1
  mov         rbx,rcx
  movsxd      r11,r8d
  ; Read tc[0..3] as sign-extended bytes and build the 8-word tc vector at [rsp].
  movsx       ecx,byte [r10]
  movsx       r8d,byte [r10+2]
  mov         rdi,rdx
  ; xmm2 = q0 (first plane), xmm9 = q1 (first plane); second plane is merged in later.
  movq        xmm2,[rbx]
  movq        xmm9,[r11+rbx]
  movsx       edx,byte [r10+1]
  mov         word [rsp+2],cx
  mov         word [rsp],cx
  movsx       eax,byte [r10+3]
  mov         word [rsp+6],dx
  mov         word [rsp+4],dx
  movdqa      xmm11,xmm1
  mov         word [rsp+0Eh],ax
  mov         word [rsp+0Ch],ax
  ; rcx = 2*stride.
  lea         eax,[r11+r11]
  movsxd      rcx,eax
  mov         rax,rbx
  mov         rdx,rdi
  sub         rax,rcx
  mov         word [rsp+0Ah],r8w
  mov         word [rsp+8],r8w
  ; xmm6 = tc vector, xmm7 = (tc > 0) mask, xmm11 = -tc.
  movdqa      xmm6,[rsp]
  movdqa      xmm7,xmm6
  ; xmm13 = p1 (first plane).
  movq        xmm13, [rax]
  mov         rax,rdi
  sub         rax,rcx
  mov         rcx,rbx
  pcmpgtw     xmm7,xmm1
  psubw       xmm11,xmm6
  ; rcx = p0 row of the first plane, rdx = p0 row of the second plane.
  sub         rcx,r11
  sub         rdx,r11
  movq        xmm0,[rax]
  movsx       eax,r9w
  movq        xmm15,[rcx]
  ; Merge the second plane into the high qword of each row register:
  ; xmm13 = p1, xmm15 = p0, xmm2 = q0, xmm9 = q1.
  punpcklqdq  xmm13,xmm0
  movq        xmm0, [rdx]
  movdqa      xmm4,xmm13
  punpcklqdq  xmm15,xmm0
  movq        xmm0, [rdi]
  punpcklbw   xmm4,xmm1
  movdqa      xmm12,xmm15
  punpcklqdq  xmm2,xmm0
  movq        xmm0, [r11+rdi]
  punpcklbw   xmm12,xmm1
  movdqa      xmm14,xmm2
  punpcklqdq  xmm9,xmm0
  ; Widen to words. Low half (first plane): xmm4=p1, xmm12=p0, xmm14=q0, xmm3=q1.
  ; High half (second plane): xmm13=p1, xmm15=p0, [rsp+10h]=q0, xmm9=q1.
  punpckhbw   xmm2,xmm1
  punpcklbw   xmm14,xmm1
  movd        xmm0,eax
  ; 0C8h locals + 160 bytes of PUSH_XMM + 2 pushes (10h) + 28h = 5th argument.
  movsx       eax,word [rsp + 0C8h + 38h + 160] ; iBeta
  punpckhbw   xmm13,xmm1
  punpckhbw   xmm15,xmm1
  movdqa      xmm3,xmm9
  movdqa      [rsp+10h],xmm2
  punpcklwd   xmm0,xmm0
  punpckhbw   xmm9,xmm1
  punpcklbw   xmm3,xmm1
  movdqa      xmm1,xmm14
  ; xmm10 = alpha broadcast to 8 words.
  pshufd      xmm10,xmm0,0
  movd        xmm0,eax
  ; Constant 4; cwde sign-extends ax into eax, which leaves the value unchanged.
  mov         eax,4
  cwde
  punpcklwd   xmm0,xmm0
  ; xmm8 = beta broadcast to 8 words.
  pshufd      xmm8,xmm0,0
  movd        xmm0,eax
  punpcklwd   xmm0,xmm0
  ; xmm5 = constant 4 in every word (rounding term).
  pshufd      xmm5,xmm0,0
  ; ---- first plane (low half): delta = ((q0-p0)*4 + (p1-q1) + 4) >> 3 ----
  psubw       xmm1,xmm12
  movdqa      xmm2,xmm10
  ; r11 = rsp before "sub rsp,0C8h"; the stride is no longer needed.
  lea         r11,[rsp+0C8h]
  psllw       xmm1,2
  movdqa      xmm0,xmm4
  psubw       xmm4,xmm12
  psubw       xmm0,xmm3
  psubw       xmm3,xmm14
  paddw       xmm1,xmm0
  paddw       xmm1,xmm5
  movdqa      xmm0,xmm11
  psraw       xmm1,3
  ; Clamp to [-tc, tc]; result in xmm6.
  pmaxsw      xmm0,xmm1
  pminsw      xmm6,xmm0
  ; Mask in xmm2: alpha > |p0-q0| & beta > |p1-p0| & beta > |q1-q0| & tc > 0.
  movdqa      xmm1,xmm8
  movdqa      xmm0,xmm12
  psubw       xmm0,xmm14
  pabsw       xmm0,xmm0
  pcmpgtw     xmm2,xmm0
  pabsw       xmm0,xmm4
  pcmpgtw     xmm1,xmm0
  pabsw       xmm0,xmm3
  ; xmm3 = tc vector again, for the second plane.
  movdqa      xmm3,[rsp]
  pand        xmm2,xmm1
  movdqa      xmm1,xmm8
  pcmpgtw     xmm1,xmm0
  movdqa      xmm0,xmm13
  pand        xmm2,xmm1
  psubw       xmm0,xmm9
  psubw       xmm13,xmm15
  pand        xmm2,xmm7
  pand        xmm6,xmm2
  ; p0 += delta, q0 -= delta (first plane).
  paddw       xmm12,xmm6
  psubw       xmm14,xmm6
  ; ---- second plane (high half): same computation ----
  movdqa      xmm2,[rsp+10h]
  ; NOTE(review): load from a slot never written here (see header).
  movaps      xmm6,[r11-18h]
  movdqa      xmm1,xmm2
  psubw       xmm1,xmm15
  psubw       xmm9,xmm2
  psllw       xmm1,2
  paddw       xmm1,xmm0
  paddw       xmm1,xmm5
  movdqa      xmm0,xmm15
  psubw       xmm0,xmm2
  psraw       xmm1,3
  pmaxsw      xmm11,xmm1
  pabsw       xmm0,xmm0
  movdqa      xmm1,xmm8
  pcmpgtw     xmm10,xmm0
  pabsw       xmm0,xmm13
  ; xmm3 = delta clamped to [-tc, tc] (second plane).
  pminsw      xmm3,xmm11
  ; NOTE(review): loads from slots never written here (see header).
  movaps      xmm11,[r11-68h]
  movaps      xmm13,[rsp+40h]
  pcmpgtw     xmm1,xmm0
  pabsw       xmm0,xmm9
  movaps      xmm9, [r11-48h]
  pand        xmm10,xmm1
  pcmpgtw     xmm8,xmm0
  pand        xmm10,xmm8
  pand        xmm10,xmm7
  movaps      xmm8,[r11-38h]
  movaps      xmm7,[r11-28h]
  pand        xmm3,xmm10
  ; p0 += delta, q0 -= delta (second plane).
  paddw       xmm15,xmm3
  psubw       xmm2,xmm3
  movaps      xmm10,[r11-58h]
  ; Pack both planes back to bytes: xmm12 = new p0 rows, xmm14 = new q0 rows.
  packuswb    xmm12,xmm15
  movaps      xmm15,[rsp+20h]
  packuswb    xmm14,xmm2
  ; Store the first plane (low qword), then shift down and store the second.
  movq        [rcx],xmm12
  movq        [rbx],xmm14
  psrldq      xmm12,8
  psrldq      xmm14,8
  movq        [rdx],xmm12
  movaps      xmm12,[r11-78h]
  movq        [rdi],xmm14
  movaps      xmm14,[rsp+30h]
  ; Release locals, then undo PUSH_XMM and the two pushes.
  mov         rsp,r11
  POP_XMM
  pop         rdi
  pop         rbx
  ret


;*******************************************************************************
; DeblockChromaEq4V_ssse3  (Win64 variant)
;
; Applies the unclamped averaging filter across one edge between rows for two
; planes at once: 8 pixels from the plane at rcx and 8 pixels from the plane
; at rdx are packed into one 128-bit register (first plane in the low qword,
; second plane in the high qword). Rows are accessed with movq.
;
; Inputs, as consumed by the code below:
;   rcx            first plane: pointer to row q0   (kept in r11)
;   rdx            second plane: pointer to row q0  (kept in rbx)
;   r8d            row stride in bytes
;   r9d            threshold "alpha" (low 16 bits used)
;   5th stack arg  iBeta (read as a 16-bit value)
;   NOTE(review): the two planes are presumably Cb and Cr -- confirm against
;   the C declaration.
;
; Row naming: p1 = q0-2*stride, p0 = q0-stride, q0, q1 = q0+stride.
;
; Per pixel (16-bit word arithmetic):
;   mask = alpha > |p0-q0|  &  beta > |p1-p0|  &  beta > |q1-q0|
;   p0'  = mask ? (2*p1 + p0 + q1 + 2) >> 2 : p0
;   q0'  = mask ? (2*q1 + q0 + p1 + 2) >> 2 : q0
;
; Outputs: rows p0 and q0 of both planes are overwritten (8 bytes each).
;
; Stack: rbx is pushed/popped. PUSH_XMM 15 / POP_XMM are macros defined
; outside this file (presumably save/restore xmm6-xmm14 -- confirm).
; NOTE(review): this routine never stores to its 90h bytes of locals, yet it
; loads xmm6-xmm14 from them ("movaps xmm14,[rsp]", "movaps xmm7,[rsp+70h]",
; and the [r11-10h..80h] group near the end). The loaded values are not used
; afterwards here. These look like leftovers from a hand-written restore that
; PUSH_XMM / POP_XMM replaced; they are harmless only if POP_XMM reloads
; those registers.
;*******************************************************************************
WELS_EXTERN   DeblockChromaEq4V_ssse3
  mov         rax,rsp
  push        rbx
  PUSH_XMM 15
  sub         rsp,90h
  ; xmm1 = 0 for byte->word widening.
  pxor        xmm1,xmm1
  mov         r11,rcx
  mov         rbx,rdx
  ; r10d = alpha.
  mov         r10d,r9d
  ; xmm13 = q0 (first plane).
  movq        xmm13,[r11]
  ; r9 = 2*stride.
  lea         eax,[r8+r8]
  movsxd      r9,eax
  mov         rax,rcx
  sub         rax,r9
  ; xmm14 = p1 (first plane), xmm0 = p1 (second plane).
  movq        xmm14,[rax]
  mov         rax,rdx
  sub         rax,r9
  movq        xmm0,[rax]
  ; rax = stride; rcx / rdx become the p0 row pointers of the two planes.
  movsxd      rax,r8d
  sub         rcx,rax
  sub         rdx,rax
  ; xmm12 = q1 (first plane), xmm10 = p0 (first plane).
  movq        xmm12,[rax+r11]
  movq        xmm10,[rcx]
  ; Merge the second plane into the high qword of each row register and widen:
  ; low half (first plane):  xmm8=p1, xmm5=p0, xmm9=q0, xmm7=q1
  ; high half (second plane): xmm14=p1, xmm10=p0, xmm13=q0, xmm12=q1
  punpcklqdq  xmm14,xmm0
  movdqa      xmm8,xmm14
  movq        xmm0,[rdx]
  punpcklbw   xmm8,xmm1
  punpckhbw   xmm14,xmm1
  punpcklqdq  xmm10,xmm0
  movq        xmm0,[rbx]
  movdqa      xmm5,xmm10
  punpcklqdq  xmm13,xmm0
  movq        xmm0, [rax+rbx]
  punpcklbw   xmm5,xmm1
  movsx       eax,r10w
  movdqa      xmm9,xmm13
  punpcklqdq  xmm12,xmm0
  punpcklbw   xmm9,xmm1
  punpckhbw   xmm10,xmm1
  movd        xmm0,eax
  ; 90h locals + 8 (push rbx) + 144 bytes of PUSH_XMM + 28h = 5th argument.
  movsx       eax,word [rsp + 90h + 8h + 28h + 144]   ; iBeta
  punpckhbw   xmm13,xmm1
  movdqa      xmm7,xmm12
  punpcklwd   xmm0,xmm0
  punpckhbw   xmm12,xmm1
  ; xmm11 = alpha broadcast to 8 words.
  pshufd      xmm11,xmm0,0
  punpcklbw   xmm7,xmm1
  movd        xmm0,eax
  movdqa      xmm1,xmm8
  psubw       xmm1,xmm5
  punpcklwd   xmm0,xmm0
  movdqa      xmm6,xmm11
  ; xmm3 = beta broadcast to 8 words.
  pshufd      xmm3,xmm0,0
  ; ---- first plane mask in xmm6:
  ;      alpha > |p0-q0| & beta > |p1-p0| & beta > |q1-q0| ----
  movdqa      xmm0,xmm5
  psubw       xmm0,xmm9
  movdqa      xmm2,xmm3
  pabsw       xmm0,xmm0
  pcmpgtw     xmm6,xmm0
  pabsw       xmm0,xmm1
  movdqa      xmm1,xmm3
  pcmpgtw     xmm2,xmm0
  pand        xmm6,xmm2
  movdqa      xmm0,xmm7
  movdqa      xmm2,xmm3
  psubw       xmm0,xmm9
  pabsw       xmm0,xmm0
  pcmpgtw     xmm1,xmm0
  pand        xmm6,xmm1
  ; ---- second plane mask in xmm11 (same three tests on the high half) ----
  movdqa      xmm0,xmm10
  movdqa      xmm1,xmm14
  psubw       xmm0,xmm13
  psubw       xmm1,xmm10
  pabsw       xmm0,xmm0
  pcmpgtw     xmm11,xmm0
  pabsw       xmm0,xmm1
  pcmpgtw     xmm2,xmm0
  pand        xmm11,xmm2
  movdqa      xmm0,xmm12
  movdqa      xmm4,xmm6
  movdqa      xmm1,xmm8
  ; Constant 2; cwde sign-extends ax into eax, which leaves the value unchanged.
  mov         eax,2
  cwde
  ; xmm1 accumulates 2*p1 + p0 + q1 (first plane) -> p0'.
  paddw       xmm1,xmm8
  psubw       xmm0,xmm13
  paddw       xmm1,xmm5
  pabsw       xmm0,xmm0
  ; xmm2 accumulates 2*p1 + p0 + q1 (second plane) -> p0'.
  movdqa      xmm2,xmm14
  paddw       xmm1,xmm7
  pcmpgtw     xmm3,xmm0
  paddw       xmm2,xmm14
  movd        xmm0,eax
  pand        xmm11,xmm3
  ; xmm7 accumulates 2*q1 + q0 + p1 (first plane) -> q0'.
  paddw       xmm7,xmm7
  paddw       xmm2,xmm10
  punpcklwd   xmm0,xmm0
  paddw       xmm2,xmm12
  ; xmm12 accumulates 2*q1 + q0 + p1 (second plane) -> q0'.
  paddw       xmm12,xmm12
  ; xmm3 = constant 2 in every word (rounding term for >>2).
  pshufd      xmm3,xmm0,0
  paddw       xmm7,xmm9
  paddw       xmm12,xmm13
  movdqa      xmm0,xmm6
  paddw       xmm1,xmm3
  ; xmm0 = ~mask & p0: original p0 where the filter is off (first plane).
  pandn       xmm0,xmm5
  paddw       xmm7,xmm8
  psraw       xmm1,2
  paddw       xmm12,xmm14
  paddw       xmm7,xmm3
  ; NOTE(review): load from a slot never written here (see header).
  movaps      xmm14,[rsp]
  pand        xmm4,xmm1
  paddw       xmm12,xmm3
  psraw       xmm7,2
  movdqa      xmm1,xmm11
  ; xmm4 = blended p0 (first plane).
  por         xmm4,xmm0
  psraw       xmm12,2
  paddw       xmm2,xmm3
  movdqa      xmm0,xmm11
  pandn       xmm0,xmm10
  psraw       xmm2,2
  pand        xmm1,xmm2
  por         xmm1,xmm0
  ; xmm4 = new p0 rows of both planes packed to bytes.
  packuswb    xmm4,xmm1
  movdqa      xmm0,xmm11
  movdqa      xmm1,xmm6
  pand        xmm1,xmm7
  ; NOTE(review): load from a slot never written here (see header).
  movaps      xmm7,[rsp+70h]
  ; Store p0 of the first plane.
  movq        [rcx],xmm4
  ; Blend q0' with the original q0 for both planes.
  pandn       xmm6,xmm9
  pandn       xmm11,xmm13
  pand        xmm0,xmm12
  por         xmm1,xmm6
  por         xmm0,xmm11
  psrldq      xmm4,8
  ; xmm1 = new q0 rows of both planes packed to bytes.
  packuswb    xmm1,xmm0
  ; Store q0 of the first plane, then p0 of the second plane.
  movq        [r11],xmm1
  psrldq      xmm1,8
  movq        [rdx],xmm4
  ; r11 = rsp before "sub rsp,90h".
  lea         r11,[rsp+90h]
  ; NOTE(review): loads from slots never written here (see header).
  movaps      xmm6,[r11-10h]
  movaps      xmm8,[r11-30h]
  movaps      xmm9,[r11-40h]
  ; Store q0 of the second plane.
  movq        [rbx],xmm1
  movaps      xmm10,[r11-50h]
  movaps      xmm11,[r11-60h]
  movaps      xmm12,[r11-70h]
  movaps      xmm13,[r11-80h]
  ; Release locals, then undo PUSH_XMM and the push of rbx.
  mov         rsp,r11
  POP_XMM
  pop         rbx
  ret





WELS_EXTERN   DeblockChromaEq4H_ssse3
  mov         rax,rsp
  mov         [rax+20h],rbx
  push        rdi
  PUSH_XMM 16
  sub         rsp,140h
  mov         rdi,rdx
  lea         eax,[r8*4]
  movsxd      r10,eax
  mov         eax,[rcx-2]
  mov         [rsp+10h],eax
  lea         rbx,[r10+rdx-2]
  lea         r11,[r10+rcx-2]
  movdqa      xmm5,[rsp+10h]
  movsxd      r10,r8d
  mov         eax,[r10+rcx-2]
  lea         rdx,[r10+r10*2]
  mov         [rsp+20h],eax
  mov         eax,[rcx+r10*2-2]
  mov         [rsp+30h],eax
  mov         eax,[rdx+rcx-2]
  movdqa      xmm2,[rsp+20h]
  mov         [rsp+40h],eax
  mov         eax, [rdi-2]
  movdqa      xmm4,[rsp+30h]
  mov         [rsp+50h],eax
  mov         eax,[r10+rdi-2]
  movdqa      xmm3,[rsp+40h]
  mov         [rsp+60h],eax
  mov         eax,[rdi+r10*2-2]
  punpckldq   xmm5,[rsp+50h]
  mov         [rsp+70h],eax
  mov         eax, [rdx+rdi-2]
  punpckldq   xmm2, [rsp+60h]
  mov          [rsp+80h],eax
  mov         eax,[r11]
  punpckldq   xmm4, [rsp+70h]
  mov         [rsp+50h],eax
  mov         eax,[rbx]
  punpckldq   xmm3,[rsp+80h]
  mov         [rsp+60h],eax
  mov         eax,[r10+r11]
  movdqa      xmm0, [rsp+50h]
  punpckldq   xmm0, [rsp+60h]
  punpcklqdq  xmm5,xmm0
  movdqa      [rsp+50h],xmm0
  mov         [rsp+50h],eax
  mov         eax,[r10+rbx]
  movdqa      xmm0,[rsp+50h]
  movdqa      xmm1,xmm5
  mov         [rsp+60h],eax
  mov         eax,[r11+r10*2]
  punpckldq   xmm0, [rsp+60h]
  punpcklqdq  xmm2,xmm0
  punpcklbw   xmm1,xmm2
  punpckhbw   xmm5,xmm2
  movdqa      [rsp+50h],xmm0
  mov         [rsp+50h],eax
  mov         eax,[rbx+r10*2]
  movdqa      xmm0,[rsp+50h]
  mov         [rsp+60h],eax
  mov         eax, [rdx+r11]
  movdqa      xmm15,xmm1
  punpckldq   xmm0,[rsp+60h]
  punpcklqdq  xmm4,xmm0
  movdqa      [rsp+50h],xmm0
  mov         [rsp+50h],eax
  mov         eax, [rdx+rbx]
  movdqa      xmm0,[rsp+50h]
  mov         [rsp+60h],eax
  punpckldq   xmm0, [rsp+60h]
  punpcklqdq  xmm3,xmm0
  movdqa      xmm0,xmm4
  punpcklbw   xmm0,xmm3
  punpckhbw   xmm4,xmm3
  punpcklwd   xmm15,xmm0
  punpckhwd   xmm1,xmm0
  movdqa      xmm0,xmm5
  movdqa      xmm12,xmm15
  punpcklwd   xmm0,xmm4
  punpckhwd   xmm5,xmm4
  punpckldq   xmm12,xmm0
  punpckhdq   xmm15,xmm0
  movdqa      xmm0,xmm1
  movdqa      xmm11,xmm12
  punpckldq   xmm0,xmm5
  punpckhdq   xmm1,xmm5
  punpcklqdq  xmm11,xmm0
  punpckhqdq  xmm12,xmm0
  movsx       eax,r9w
  movdqa      xmm14,xmm15
  punpcklqdq  xmm14,xmm1
  punpckhqdq  xmm15,xmm1
  pxor        xmm1,xmm1
  movd        xmm0,eax
  movdqa      xmm4,xmm12
  movdqa      xmm8,xmm11
  movsx       eax,word [rsp+170h + 160] ; iBeta
  punpcklwd   xmm0,xmm0
  punpcklbw   xmm4,xmm1
  punpckhbw   xmm12,xmm1
  movdqa      xmm9,xmm14
  movdqa      xmm7,xmm15
  movdqa      xmm10,xmm15
  pshufd      xmm13,xmm0,0
  punpcklbw   xmm9,xmm1
  punpckhbw   xmm14,xmm1
  movdqa      xmm6,xmm13
  movd        xmm0,eax
  movdqa      [rsp],xmm11
  mov         eax,2
  cwde
  punpckhbw   xmm11,xmm1
  punpckhbw   xmm10,xmm1
  punpcklbw   xmm7,xmm1
  punpcklwd   xmm0,xmm0
  punpcklbw   xmm8,xmm1
  pshufd      xmm3,xmm0,0
  movdqa      xmm1,xmm8
  movdqa      xmm0,xmm4
  psubw       xmm0,xmm9
  psubw       xmm1,xmm4
  movdqa      xmm2,xmm3
  pabsw       xmm0,xmm0
  pcmpgtw     xmm6,xmm0
  pabsw       xmm0,xmm1
  movdqa      xmm1,xmm3
  pcmpgtw     xmm2,xmm0
  pand        xmm6,xmm2
  movdqa      xmm0,xmm7
  movdqa      xmm2,xmm3
  psubw       xmm0,xmm9
  pabsw       xmm0,xmm0
  pcmpgtw     xmm1,xmm0
  pand        xmm6,xmm1
  movdqa      xmm0,xmm12
  movdqa      xmm1,xmm11
  psubw       xmm0,xmm14
  psubw       xmm1,xmm12
  movdqa      xmm5,xmm6
  pabsw       xmm0,xmm0
  pcmpgtw     xmm13,xmm0
  pabsw       xmm0,xmm1
  movdqa      xmm1,xmm8
  pcmpgtw     xmm2,xmm0
  paddw       xmm1,xmm8
  movdqa      xmm0,xmm10
  pand        xmm13,xmm2
  psubw       xmm0,xmm14
  paddw       xmm1,xmm4
  movdqa      xmm2,xmm11
  pabsw       xmm0,xmm0
  paddw       xmm2,xmm11
  paddw       xmm1,xmm7
  pcmpgtw     xmm3,xmm0
  paddw       xmm2,xmm12
  movd        xmm0,eax
  pand        xmm13,xmm3
  paddw       xmm2,xmm10
  punpcklwd   xmm0,xmm0
  pshufd      xmm3,xmm0,0
  movdqa      xmm0,xmm6
  paddw       xmm1,xmm3
  pandn       xmm0,xmm4
  paddw       xmm2,xmm3
  psraw       xmm1,2
  pand        xmm5,xmm1
  por         xmm5,xmm0
  paddw       xmm7,xmm7
  paddw       xmm10,xmm10
  psraw       xmm2,2
  movdqa      xmm1,xmm13
  movdqa      xmm0,xmm13
  pandn       xmm0,xmm12
  pand        xmm1,xmm2
  paddw       xmm7,xmm9
  por         xmm1,xmm0
  paddw       xmm10,xmm14
  paddw       xmm7,xmm8
  movdqa      xmm0,xmm13
  packuswb    xmm5,xmm1
  paddw       xmm7,xmm3
  paddw       xmm10,xmm11
  movdqa      xmm1,xmm6
  paddw       xmm10,xmm3
  pandn       xmm6,xmm9
  psraw       xmm7,2
  pand        xmm1,xmm7
  psraw       xmm10,2
  pandn       xmm13,xmm14
  pand        xmm0,xmm10
  por         xmm1,xmm6
  movdqa      xmm6,[rsp]
  movdqa      xmm4,xmm6
  por         xmm0,xmm13
  punpcklbw   xmm4,xmm5
  punpckhbw   xmm6,xmm5
  movdqa      xmm3,xmm4
  packuswb    xmm1,xmm0
  movdqa      xmm0,xmm1
  punpckhbw   xmm1,xmm15
  punpcklbw   xmm0,xmm15
  punpcklwd   xmm3,xmm0
  punpckhwd   xmm4,xmm0
  movdqa      xmm0,xmm6
  movdqa      xmm2,xmm3
  punpcklwd   xmm0,xmm1
  punpckhwd   xmm6,xmm1
  movdqa      xmm1,xmm4
  punpckldq   xmm2,xmm0
  punpckhdq   xmm3,xmm0
  punpckldq   xmm1,xmm6
  movdqa      xmm0,xmm2
  punpcklqdq  xmm0,xmm1
  punpckhdq   xmm4,xmm6
  punpckhqdq  xmm2,xmm1
  movdqa      [rsp+10h],xmm0
  movdqa      [rsp+60h],xmm2
  movdqa      xmm0,xmm3
  mov         eax,[rsp+10h]
  mov         [rcx-2],eax
  mov         eax,[rsp+60h]
  punpcklqdq  xmm0,xmm4
  punpckhqdq  xmm3,xmm4
  mov         [r10+rcx-2],eax
  movdqa      [rsp+20h],xmm0
  mov         eax, [rsp+20h]
  movdqa      [rsp+70h],xmm3
  mov         [rcx+r10*2-2],eax
  mov         eax,[rsp+70h]
  mov         [rdx+rcx-2],eax
  mov         eax,[rsp+18h]
  mov         [r11],eax
  mov         eax,[rsp+68h]
  mov         [r10+r11],eax
  mov         eax,[rsp+28h]
  mov         [r11+r10*2],eax
  mov         eax,[rsp+78h]
  mov         [rdx+r11],eax
  mov         eax,[rsp+14h]
  mov         [rdi-2],eax
  mov         eax,[rsp+64h]
  mov         [r10+rdi-2],eax
  mov         eax,[rsp+24h]
  mov         [rdi+r10*2-2],eax
  mov         eax, [rsp+74h]
  mov         [rdx+rdi-2],eax
  mov         eax, [rsp+1Ch]
  mov         [rbx],eax
  mov         eax, [rsp+6Ch]
  mov         [r10+rbx],eax
  mov         eax,[rsp+2Ch]
  mov         [rbx+r10*2],eax
  mov         eax,[rsp+7Ch]
  mov         [rdx+rbx],eax
  lea         rsp,[rsp+140h]
  POP_XMM
  mov         rbx, [rsp+28h]
  pop         rdi
  ret



;*******************************************************************************
; DeblockChromaLt4H_ssse3
;   tc-clamped deblocking of the column boundary between byte offsets -1 and 0
;   of 8 consecutive rows in each of two pixel planes (presumably the Cb and
;   Cr planes -- the parameter names are not visible in this file section).
;
;   Inputs as read by the code (this is the variant that precedes
;   `%elifdef UNIX64`; argument registers match the Microsoft x64 convention):
;     rcx                   pointer into the first plane
;     rdx                   pointer into the second plane
;     r8d                   row stride in bytes
;     r9w                   threshold tested against |p0 - q0|
;     word [rsp+1C0h+160]   iBeta, tested against |p1 - p0| and |q1 - q0|
;     qword [rsp+1C8h+160]  pTC, pointer to four signed bytes; tc[k] is used
;                           for rows 2k and 2k+1 of both planes
;   (stack offsets are relative to rsp after the prologue below)
;
;   For every row the four bytes at [ptr-2 .. ptr+1] are treated as
;   p1 p0 q0 q1.  The routine computes
;     delta = clamp(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc)
;   and writes p0 + delta / q0 - delta (saturated to bytes) wherever
;   |p0-q0| < r9w, |p1-p0| < iBeta, |q1-q0| < iBeta and tc > 0.
;   p1 and q1 are written back unchanged.
;
;   rbx, rbp, rsi, rdi and r12 are pushed/popped here; XMM preservation is
;   delegated to PUSH_XMM / POP_XMM (defined outside this section).
;*******************************************************************************
WELS_EXTERN DeblockChromaLt4H_ssse3
  ; NOTE(review): rax is overwritten by `lea eax,[r8*4]` below; whether
  ; PUSH_XMM relies on it cannot be seen from this section.
  mov         rax,rsp
  push        rbx
  push        rbp
  push        rsi
  push        rdi
  push        r12
  PUSH_XMM 16
  sub         rsp,170h

  ; rsi = stride, r10 = 4*stride, r11d = r9 threshold, r12 = second plane,
  ; rdi = first plane, rbx/rbp = [plane + 4*stride - 2] for plane 1/2,
  ; r10 (reloaded below) = 3*stride.
  ;
  ; Each row contributes one dword read from [row - 2].  The dword is bounced
  ; through a 16-byte stack slot and fetched with movdqa; only the low dword
  ; of a slot is meaningful, the upper lanes are dropped by the
  ; punpckldq/punpcklqdq that follow.
  movsxd      rsi,r8d
  lea         eax,[r8*4]
  mov         r11d,r9d
  movsxd      r10,eax
  mov         eax, [rcx-2]
  mov         r12,rdx
  mov         [rsp+40h],eax
  mov         eax, [rsi+rcx-2]
  lea         rbx,[r10+rcx-2]
  movdqa      xmm5,[rsp+40h]
  mov         [rsp+50h],eax
  mov         eax, [rcx+rsi*2-2]
  lea         rbp,[r10+rdx-2]
  movdqa      xmm2, [rsp+50h]
  mov         [rsp+60h],eax
  lea         r10,[rsi+rsi*2]
  mov         rdi,rcx
  mov         eax,[r10+rcx-2]
  movdqa      xmm4,[rsp+60h]
  mov         [rsp+70h],eax
  mov         eax,[rdx-2]
  mov         [rsp+80h],eax
  mov         eax, [rsi+rdx-2]
  movdqa      xmm3,[rsp+70h]
  mov         [rsp+90h],eax
  mov         eax,[rdx+rsi*2-2]
  punpckldq   xmm5,[rsp+80h]
  mov         [rsp+0A0h],eax
  mov         eax, [r10+rdx-2]
  punpckldq   xmm2,[rsp+90h]
  mov         [rsp+0B0h],eax
  mov         eax, [rbx]
  punpckldq   xmm4,[rsp+0A0h]
  mov         [rsp+80h],eax
  mov         eax,[rbp]
  punpckldq   xmm3,[rsp+0B0h]
  mov         [rsp+90h],eax
  mov         eax,[rsi+rbx]
  ; xmm5 = rows 0 and 4, xmm2 = rows 1 and 5, xmm4 = rows 2 and 6,
  ; xmm3 = rows 3 and 7 (dword order: plane1 row k, plane2 row k,
  ; plane1 row k+4, plane2 row k+4).
  movdqa      xmm0,[rsp+80h]
  punpckldq   xmm0,[rsp+90h]
  punpcklqdq  xmm5,xmm0
  movdqa      [rsp+80h],xmm0
  mov         [rsp+80h],eax
  mov         eax,[rsi+rbp]
  movdqa      xmm0,[rsp+80h]
  movdqa      xmm1,xmm5
  mov         [rsp+90h],eax
  mov         eax,[rbx+rsi*2]
  punpckldq   xmm0,[rsp+90h]
  punpcklqdq  xmm2,xmm0
  punpcklbw   xmm1,xmm2
  punpckhbw   xmm5,xmm2
  movdqa      [rsp+80h],xmm0
  mov         [rsp+80h],eax
  mov         eax,[rbp+rsi*2]
  movdqa      xmm0, [rsp+80h]
  mov         [rsp+90h],eax
  mov         eax,[r10+rbx]
  movdqa      xmm7,xmm1
  punpckldq   xmm0,[rsp+90h]
  punpcklqdq  xmm4,xmm0
  movdqa      [rsp+80h],xmm0
  mov         [rsp+80h],eax
  mov         eax, [r10+rbp]
  movdqa      xmm0,[rsp+80h]
  mov         [rsp+90h],eax
  punpckldq   xmm0,[rsp+90h]
  punpcklqdq  xmm3,xmm0
  ; Byte transpose (bw -> wd -> dq -> qdq).  Afterwards every register holds
  ; one pixel column: low 8 bytes = plane 1 rows 0..7, high 8 bytes = plane 2
  ; rows 0..7.
  ;   xmm9 = p1 (offset -2)   xmm6 = p0 (offset -1)
  ;   xmm2 = q0 (offset  0)   xmm7 = q1 (offset +1)
  movdqa      xmm0,xmm4
  punpcklbw   xmm0,xmm3
  punpckhbw   xmm4,xmm3
  punpcklwd   xmm7,xmm0
  punpckhwd   xmm1,xmm0
  movdqa      xmm0,xmm5
  movdqa      xmm6,xmm7
  punpcklwd   xmm0,xmm4
  punpckhwd   xmm5,xmm4
  punpckldq   xmm6,xmm0
  punpckhdq   xmm7,xmm0
  movdqa      xmm0,xmm1
  punpckldq   xmm0,xmm5
  mov         rax, [rsp+1C8h+160]    ; pTC
  punpckhdq   xmm1,xmm5
  movdqa      xmm9,xmm6
  punpckhqdq  xmm6,xmm0
  punpcklqdq  xmm9,xmm0
  movdqa      xmm2,xmm7
  movdqa      xmm13,xmm6
  movdqa      xmm4,xmm9
  ; [rsp+10h] keeps the untouched p1 column for the write-back transpose.
  movdqa      [rsp+10h],xmm9
  punpcklqdq  xmm2,xmm1
  punpckhqdq  xmm7,xmm1
  pxor        xmm1,xmm1
  ; ecx/edx/r8d/r9d = tc[3]/tc[2]/tc[1]/tc[0], sign-extended.
  movsx       ecx,byte [rax+3]
  movsx       edx,byte [rax+2]
  movsx       r8d,byte [rax+1]
  movsx       r9d,byte [rax]
  movdqa      xmm10,xmm1
  movdqa      xmm15,xmm2
  ; Widen columns to words: punpcklbw gives the plane-1 half, punpckhbw the
  ; plane-2 half.  Plane 1: xmm4 = p1, xmm13 = p0, xmm15 = q0, xmm3 = q1.
  ; Plane 2: xmm9 = p1, [rsp+30h] = p0, [rsp] (stored below) = q0, xmm8 = q1.
  punpckhbw   xmm2,xmm1
  punpckhbw   xmm6,xmm1
  punpcklbw   xmm4,xmm1
  movsx       eax,r11w
  ; Build the word vector tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3 at [rsp].
  mov         word [rsp+0Eh],cx
  mov         word [rsp+0Ch],cx
  movdqa      xmm3,xmm7
  movdqa      xmm8,xmm7
  ; [rsp+20h] keeps the untouched q1 column for the write-back transpose.
  movdqa      [rsp+20h],xmm7
  punpcklbw   xmm15,xmm1
  punpcklbw   xmm13,xmm1
  punpcklbw   xmm3,xmm1
  mov         word [rsp+0Ah],dx
  mov         word [rsp+8],dx
  mov         word [rsp+6],r8w
  movd        xmm0,eax
  movdqa      [rsp+30h],xmm6
  punpckhbw   xmm9,xmm1
  punpckhbw   xmm8,xmm1
  punpcklwd   xmm0,xmm0
  movsx       eax,word [rsp+1C0h+160]   ; iBeta
  mov         word [rsp+4],r8w
  mov         word [rsp+2],r9w
  ; xmm12 = r9w threshold broadcast to 8 words.
  pshufd      xmm12,xmm0,0
  mov         word [rsp],r9w
  movd        xmm0,eax
  ; cwde changes nothing here: eax already holds 4.
  mov         eax,4
  cwde
  ; xmm14 = tc vector; the [rsp] slot is then reused for the plane-2 q0 words.
  movdqa      xmm14, [rsp]
  movdqa      [rsp],xmm2
  movdqa      xmm2,xmm12
  punpcklwd   xmm0,xmm0
  ; xmm11 = iBeta broadcast, xmm10 = -tc, xmm7 = (tc > 0) mask,
  ; xmm5 = rounding constant 4 broadcast.
  pshufd      xmm11,xmm0,0
  psubw       xmm10,xmm14
  movd        xmm0,eax
  movdqa      xmm7,xmm14
  movdqa      xmm6,xmm14
  pcmpgtw     xmm7,xmm1
  punpcklwd   xmm0,xmm0
  pshufd      xmm5,xmm0,0
  ; ---- plane 1: delta = clamp(((q0-p0)<<2 + (p1-q1) + 4) >> 3, -tc, tc)
  movdqa      xmm0,xmm4
  movdqa      xmm1,xmm15
  psubw       xmm4,xmm13
  psubw       xmm0,xmm3
  psubw       xmm1,xmm13
  psubw       xmm3,xmm15
  psllw       xmm1,2
  paddw       xmm1,xmm0
  paddw       xmm1,xmm5
  movdqa      xmm0,xmm10
  psraw       xmm1,3
  pmaxsw      xmm0,xmm1
  pminsw      xmm6,xmm0
  ; mask = (thr > |p0-q0|) & (iBeta > |p1-p0|) & (iBeta > |q1-q0|) & (tc > 0)
  movdqa      xmm1,xmm11
  movdqa      xmm0,xmm13
  psubw       xmm0,xmm15
  pabsw       xmm0,xmm0
  pcmpgtw     xmm2,xmm0
  pabsw       xmm0,xmm4
  pcmpgtw     xmm1,xmm0
  pabsw       xmm0,xmm3
  pand        xmm2,xmm1
  movdqa      xmm1,xmm11
  movdqa      xmm3,[rsp+30h]
  pcmpgtw     xmm1,xmm0
  movdqa      xmm0,xmm9
  pand        xmm2,xmm1
  psubw       xmm0,xmm8
  psubw       xmm9,xmm3
  pand        xmm2,xmm7
  pand        xmm6,xmm2
  ; q0 -= delta, p0 += delta (plane 1)
  psubw       xmm15,xmm6
  paddw       xmm13,xmm6
  ; ---- plane 2: same computation with xmm3 = p0, xmm2 = q0
  movdqa      xmm2,[rsp]
  movdqa      xmm1,xmm2
  psubw       xmm1,xmm3
  psubw       xmm8,xmm2
  psllw       xmm1,2
  paddw       xmm1,xmm0
  paddw       xmm1,xmm5
  movdqa      xmm0,xmm3
  movdqa      xmm5,[rsp+10h]
  psubw       xmm0,xmm2
  psraw       xmm1,3
  movdqa      xmm4,xmm5
  pabsw       xmm0,xmm0
  pmaxsw      xmm10,xmm1
  movdqa      xmm1,xmm11
  pcmpgtw     xmm12,xmm0
  pabsw       xmm0,xmm9
  pminsw      xmm14,xmm10
  pcmpgtw     xmm1,xmm0
  pabsw       xmm0,xmm8
  pcmpgtw     xmm11,xmm0
  pand        xmm12,xmm1
  movdqa      xmm1,[rsp+20h]
  pand        xmm12,xmm11
  pand        xmm12,xmm7
  pand        xmm14,xmm12
  paddw       xmm3,xmm14
  psubw       xmm2,xmm14
  ; Pack back to bytes with unsigned saturation: xmm13 = new p0 column,
  ; xmm15 = new q0 column (plane 1 low, plane 2 high).
  packuswb    xmm13,xmm3
  packuswb    xmm15,xmm2
  ; Inverse transpose: interleave p1 | p0' | q0' | q1 back into 4-byte rows.
  punpcklbw   xmm4,xmm13
  punpckhbw   xmm5,xmm13
  movdqa      xmm0,xmm15
  punpcklbw   xmm0,xmm1
  punpckhbw   xmm15,xmm1
  movdqa      xmm3,xmm4
  punpcklwd   xmm3,xmm0
  punpckhwd   xmm4,xmm0
  movdqa      xmm0,xmm5
  movdqa      xmm2,xmm3
  movdqa      xmm1,xmm4
  punpcklwd   xmm0,xmm15
  punpckhwd   xmm5,xmm15
  punpckldq   xmm2,xmm0
  punpckhdq   xmm3,xmm0
  punpckldq   xmm1,xmm5
  movdqa      xmm0,xmm2
  punpcklqdq  xmm0,xmm1
  punpckhdq   xmm4,xmm5
  punpckhqdq  xmm2,xmm1
  ; Spill the four result vectors to [rsp+40h], [rsp+90h], [rsp+50h] and
  ; [rsp+0A0h], then copy one dword per row back to memory.
  movdqa      [rsp+40h],xmm0
  movdqa      xmm0,xmm3
  movdqa      [rsp+90h],xmm2
  ; plane 1, rows 0..3
  mov         eax,[rsp+40h]
  mov         [rdi-2],eax
  mov         eax, [rsp+90h]
  punpcklqdq  xmm0,xmm4
  punpckhqdq  xmm3,xmm4
  mov         [rsi+rdi-2],eax
  movdqa      [rsp+50h],xmm0
  mov         eax,[rsp+50h]
  movdqa      [rsp+0A0h],xmm3
  mov         [rdi+rsi*2-2],eax
  mov         eax,[rsp+0A0h]
  mov         [r10+rdi-2],eax
  ; plane 1, rows 4..7 (rbx = plane 1 + 4*stride - 2)
  mov         eax,[rsp+48h]
  mov         [rbx],eax
  mov         eax,[rsp+98h]
  mov         [rsi+rbx],eax
  mov         eax,[rsp+58h]
  mov         [rbx+rsi*2],eax
  mov         eax, [rsp+0A8h]
  mov         [r10+rbx],eax
  ; plane 2, rows 0..3 (r12 = second plane pointer)
  mov         eax, [rsp+44h]
  mov         [r12-2],eax
  mov         eax,[rsp+94h]
  mov         [rsi+r12-2],eax
  mov         eax,[rsp+54h]
  mov         [r12+rsi*2-2],eax
  mov         eax, [rsp+0A4h]
  mov         [r10+r12-2],eax
  ; plane 2, rows 4..7 (rbp = plane 2 + 4*stride - 2)
  mov         eax,[rsp+4Ch]
  mov         [rbp],eax
  mov         eax,[rsp+9Ch]
  mov         [rsi+rbp],eax
  mov         eax, [rsp+5Ch]
  mov         [rbp+rsi*2],eax
  mov         eax,[rsp+0ACh]
  mov         [r10+rbp],eax
  ; Epilogue: release the 170h-byte scratch area, then undo PUSH_XMM and the
  ; five pushes in reverse order.
  lea         r11,[rsp+170h]
  mov         rsp,r11
  POP_XMM
  pop         r12
  pop         rdi
  pop         rsi
  pop         rbp
  pop         rbx
  ret



%elifdef  UNIX64


;*******************************************************************************
; DeblockLumaLt4V_ssse3                                   (UNIX64 / SysV AMD64)
;   tc-clamped luma deblocking across a horizontal edge, 16 columns wide.
;
;   In:  rdi = pPix, address of row q0 (rows are fetched/stored with movdqa,
;              so every accessed row address must be 16-byte aligned)
;        esi = stride in bytes
;        edx = iAlpha, tested against |q0 - p0|
;        ecx = iBeta,  tested against |p1 - p0|, |q1 - q0|, |p2 - p0|, |q2 - q0|
;        r8  = pTC, four signed bytes; tc[k] applies to columns 4k..4k+3
;   Out: rows p1, p0, q0 and q1 are rewritten in place.
;   Preserves r12 (spilled at [rbp+180h]) and rbp.
;
;   Row registers: xmm5 = p2, xmm6 = p1, xmm8 = p0, xmm9 = q0,
;                  [r12+rdi] = q1, xmm14 = q2.
;   The low 8 columns are processed first, then the high 8 columns.
;
;   The earlier revision also spilled xmm6..xmm15 to [rbp+0..90h] without
;   ever reloading them.  Those registers are caller-saved under the SysV ABI
;   and nothing read the slots, so the ten dead stores were dropped.  The
;   frame size is left unchanged.
;*******************************************************************************
WELS_EXTERN   DeblockLumaLt4V_ssse3
  push        rbp
  mov         r11,r8  ; pTC
  sub         rsp,1B0h
  lea         rbp,[rsp+20h]
  movd        xmm4,edx                ; iAlpha
  movd        xmm2,ecx                ; iBeta
  mov         qword [rbp+180h],r12
  mov         r10,rdi
  movsxd      r12,esi                 ; r12 = stride
  add         rsi,rsi
  movsxd      rdx,esi                 ; rdx = 2*stride
  sub         r10,r12                 ; r10 = address of row p0
  movsx       r8d,byte [r11]          ; tc[0]
  pxor        xmm3,xmm3
  punpcklwd   xmm2,xmm2
  lea         rax,[r12+r12*2]
  movdqa      xmm14,[rdx+rdi]         ; q2
  neg         rax                     ; rax = -3*stride
  pshufd      xmm0,xmm2,0             ; xmm0 = iBeta in all 8 words
  movd        xmm2,r8d
  movsx       rsi,byte [r11+1]        ; tc[1]
  movsx       r8d,byte [r11+2]        ; tc[2]
  movsx       r11d,byte [r11+3]       ; tc[3]
  movd        xmm1,esi
  movd        xmm12,r8d
  movd        xmm11,r11d
  movdqa      xmm5, [rax+rdi]         ; p2
  lea         rax,[r12+r12]
  punpcklwd   xmm12,xmm12
  neg         rax                     ; rax = -2*stride (row p1), kept to the end
  punpcklwd   xmm11,xmm11
  movdqa      xmm8, [r10]             ; p0
  punpcklwd   xmm2,xmm2
  punpcklwd   xmm1,xmm1
  punpcklqdq  xmm12,xmm12
  punpcklqdq  xmm11,xmm11
  punpcklqdq  xmm2,xmm2
  punpcklqdq  xmm1,xmm1
  shufps      xmm12,xmm11,88h         ; xmm12 = tc[2] x4, tc[3] x4 (words)
  movdqa      xmm11,xmm8
  movdqa      xmm9,[rdi]              ; q0
  shufps      xmm2,xmm1,88h           ; xmm2  = tc[0] x4, tc[1] x4 (words)
  movdqa      xmm1,xmm5
  punpcklbw   xmm11,xmm3              ; p0, low 8 columns as words
  movdqa      xmm13,xmm11
  movdqa      xmm10,xmm9
  movdqa      xmm6,[rax+rdi]          ; p1
  punpcklbw   xmm1,xmm3               ; p2 low
  movaps      [rbp+0A0h],xmm12        ; live local: tc vector for the high half
  psubw       xmm13,xmm1
  movdqa      xmm15,xmm14
  movdqa      xmm7,xmm6
  punpcklbw   xmm10,xmm3              ; q0 low
  movdqa      xmm12,[r12+rdi]         ; q1
  punpcklbw   xmm7,xmm3               ; p1 low
  punpcklbw   xmm12,xmm3              ; q1 low
  punpcklbw   xmm15,xmm3              ; q2 low
  ; ---- low 8 columns -------------------------------------------------------
  pabsw       xmm3,xmm13              ; |p0 - p2|
  movdqa      xmm13,xmm10
  psubw       xmm13,xmm15
  movdqa      [rbp+0F0h],xmm15        ; q2 low
  pabsw       xmm15,xmm13             ; |q0 - q2|
  movdqa      xmm13,xmm11
  movdqa      [rbp+0B0h],xmm1         ; p2 low
  movdqa      xmm1,xmm0
  pavgw       xmm13,xmm10             ; (p0 + q0 + 1) >> 1
  pcmpgtw     xmm1,xmm3               ; ap = iBeta > |p0 - p2|
  movdqa      [rbp+120h],xmm13
  movaps      xmm13,xmm2
  punpcklwd   xmm4,xmm4
  movdqa      xmm3,xmm0
  movdqa      [rbp+100h],xmm1
  psubw       xmm13,xmm1              ; tc + (ap ? 1 : 0): the mask is -1
  movdqa      xmm1,xmm10
  pcmpgtw     xmm3,xmm15              ; aq = iBeta > |q0 - q2|
  pshufd      xmm4,xmm4,0             ; xmm4 = iAlpha in all 8 words
  psubw       xmm1,xmm11              ; q0 - p0
  movdqa      [rbp+0D0h],xmm10        ; q0 low
  psubw       xmm13,xmm3              ; tc' = tc + ap + aq
  movdqa      [rbp+110h],xmm3
  pabsw       xmm15,xmm1
  movdqa      xmm3,xmm4
  psubw       xmm10,xmm12
  pcmpgtw     xmm3,xmm15              ; iAlpha > |q0 - p0|
  pabsw       xmm15,xmm10
  movdqa      xmm10,xmm0
  psllw       xmm1,2                  ; 4 * (q0 - p0)
  movdqa      [rbp+0C0h],xmm11        ; p0 low
  psubw       xmm11,xmm7
  pcmpgtw     xmm10,xmm15             ; iBeta > |q0 - q1|
  pabsw       xmm11,xmm11
  movdqa      xmm15,xmm0
  pand        xmm3,xmm10
  pcmpgtw     xmm15,xmm11             ; iBeta > |p0 - p1|
  movaps      xmm11,xmm2
  pxor        xmm10,xmm10
  pand        xmm3,xmm15
  pcmpgtw     xmm11,xmm10
  pcmpeqw     xmm10,xmm2
  por         xmm11,xmm10             ; tc >= 0
  pand        xmm3,xmm11              ; xmm3 = filter-enable mask (low half)
  movdqa      xmm11,xmm7
  psubw       xmm11,xmm12             ; p1 - q1
  pxor        xmm15,xmm15
  paddw       xmm11,xmm1
  psubw       xmm15,xmm13             ; -tc'
  movdqa      [rbp+0E0h],xmm12        ; q1 low
  paddw       xmm11,[FOUR_16B_SSE2]
  pxor        xmm12,xmm12
  psraw       xmm11,3
  punpckhbw   xmm8,xmm12              ; p0 high
  pmaxsw      xmm15,xmm11
  punpckhbw   xmm5,xmm12              ; p2 high
  movdqa      xmm11,xmm8
  pminsw      xmm13,xmm15             ; delta clamped to [-tc', tc']
  psubw       xmm11,xmm5
  punpckhbw   xmm9,xmm12              ; q0 high
  pand        xmm13,xmm3
  movdqa      [rbp+130h],xmm13        ; masked delta, low half
  ; ---- high 8 columns ------------------------------------------------------
  pabsw       xmm13,xmm11             ; |p0 - p2|
  punpckhbw   xmm14,xmm12             ; q2 high
  movdqa      xmm11,xmm9
  psubw       xmm11,xmm14
  movdqa      xmm15,xmm0
  movdqa      [rbp+140h],xmm14
  pabsw       xmm14,xmm11             ; |q0 - q2|
  movdqa      xmm11,xmm8
  pcmpgtw     xmm15,xmm14             ; aq (high)
  movdqa      xmm1,[r12+rdi]
  pavgw       xmm11,xmm9
  movdqa      [rbp+170h],xmm11        ; (p0 + q0 + 1) >> 1, high half
  movdqa      xmm10,xmm9
  punpckhbw   xmm6,xmm12              ; p1 high
  psubw       xmm10,xmm8              ; q0 - p0
  punpckhbw   xmm1,xmm12              ; q1 high
  movdqa      xmm12,xmm0
  movaps      xmm11,[rbp+0A0h]        ; tc for columns 8..15
  pcmpgtw     xmm12,xmm13             ; ap (high)
  movaps      xmm13,xmm11
  psubw       xmm13,xmm12
  movdqa      [rbp+160h],xmm15
  psubw       xmm13,xmm15             ; tc' (high)
  movdqa      xmm15,xmm9
  psubw       xmm15,xmm1
  movdqa      [rbp+150h],xmm12
  pabsw       xmm12,xmm10
  pabsw       xmm14,xmm15
  movdqa      xmm15,xmm8
  pcmpgtw     xmm4,xmm12              ; iAlpha > |q0 - p0|
  movdqa      xmm12,xmm0
  psubw       xmm15,xmm6
  pcmpgtw     xmm12,xmm14             ; iBeta > |q0 - q1|
  pabsw       xmm14,xmm15
  psllw       xmm10,2
  pcmpgtw     xmm0,xmm14              ; iBeta > |p0 - p1|
  movdqa      xmm14,xmm6
  psubw       xmm14,xmm1
  pand        xmm4,xmm12
  paddw       xmm14,xmm10
  pand        xmm4,xmm0
  paddw       xmm14,[FOUR_16B_SSE2]
  pxor        xmm15,xmm15
  movaps      xmm12,xmm11
  psubw       xmm15,xmm13
  pxor        xmm0,xmm0
  psraw       xmm14,3
  pcmpgtw     xmm12,xmm0
  pcmpeqw     xmm0,xmm11
  pmaxsw      xmm15,xmm14
  por         xmm12,xmm0              ; tc >= 0 (high)
  movdqa      xmm0,[rbp+120h]
  pminsw      xmm13,xmm15             ; clamped delta (high)
  ; ---- p1' = p1 + clamp((p2 + avg - 2*p1) >> 1, -tc, tc) where ap ----------
  movdqa      xmm15,[rbp+0B0h]
  movdqa      xmm10,xmm7
  pand        xmm4,xmm12              ; xmm4 = filter-enable mask (high half)
  paddw       xmm15,xmm0
  pxor        xmm12,xmm12
  paddw       xmm10,xmm7
  movdqa      xmm14,xmm12
  psubw       xmm15,xmm10
  psubw       xmm14,xmm2              ; -tc (low)
  psraw       xmm15,1
  pmaxsw      xmm15,xmm14
  movdqa      xmm10,xmm6
  pminsw      xmm15,xmm2
  paddw       xmm10,xmm6
  pand        xmm15,xmm3
  psubw       xmm12,xmm11             ; -tc (high)
  pand        xmm15,[rbp+100h]
  pand        xmm13,xmm4
  paddw       xmm7,xmm15              ; p1' low
  paddw       xmm8,xmm13              ; p0' high = p0 + delta
  movdqa      xmm15,[rbp+170h]
  psubw       xmm9,xmm13              ; q0' high = q0 - delta
  paddw       xmm5,xmm15
  psubw       xmm5,xmm10
  psraw       xmm5,1
  pmaxsw      xmm5,xmm12
  pminsw      xmm5,xmm11
  pand        xmm5,xmm4
  pand        xmm5,[rbp+150h]
  paddw       xmm6,xmm5               ; p1' high
  movdqa      xmm5,[rbp+0C0h]
  packuswb    xmm7,xmm6               ; xmm7 = new p1 row
  movdqa      xmm6,[rbp+130h]
  paddw       xmm5,xmm6               ; p0' low
  packuswb    xmm5,xmm8               ; xmm5 = new p0 row
  movdqa      xmm8,[rbp+0D0h]
  psubw       xmm8,xmm6               ; q0' low
  ; ---- q1' = q1 + clamp((q2 + avg - 2*q1) >> 1, -tc, tc) where aq ----------
  movdqa      xmm6,[rbp+0F0h]
  paddw       xmm6,xmm0
  movdqa      xmm0,[rbp+0E0h]
  packuswb    xmm8,xmm9               ; xmm8 = new q0 row
  movdqa      xmm9,xmm0
  paddw       xmm9,xmm0
  psubw       xmm6,xmm9
  psraw       xmm6,1
  pmaxsw      xmm14,xmm6
  pminsw      xmm2,xmm14
  pand        xmm2,xmm3
  pand        xmm2,[rbp+110h]
  paddw       xmm0,xmm2               ; q1' low
  movdqa      xmm2,[rbp+140h]
  paddw       xmm2,xmm15
  movdqa      xmm15,xmm1
  paddw       xmm15,xmm1
  psubw       xmm2,xmm15
  psraw       xmm2,1
  pmaxsw      xmm12,xmm2
  pminsw      xmm11,xmm12
  pand        xmm11,xmm4
  pand        xmm11,[rbp+160h]
  paddw       xmm1,xmm11              ; q1' high
  ; ---- write back ----------------------------------------------------------
  movdqa      [rax+rdi],xmm7          ; p1
  movdqa      [r10],xmm5              ; p0
  packuswb    xmm0,xmm1
  movdqa      [rdi],xmm8              ; q0
  movdqa      [r12+rdi],xmm0          ; q1
  mov         r12,qword [rbp+180h]
  lea         rsp,[rbp+190h]
  pop         rbp
  ret


;*******************************************************************************
; DeblockLumaEq4V_ssse3                                   (UNIX64 / SysV AMD64)
;   Strong (no tc) luma deblocking across a horizontal edge, 16 columns wide.
;
;   In:  rdi = pPix, address of row q0 (rows are accessed with movdqa, so
;              every accessed row address must be 16-byte aligned)
;        esi = stride in bytes
;        edx = iAlpha, tested against |q0 - p0|
;        ecx = iBeta,  tested against |p1 - p0|, |q1 - q0|, |p2 - p0|, |q2 - q0|
;   Out: rows p2, p1, p0, q0, q1 and q2 are rewritten in place.
;   Preserves rbx and rbp (push/pop); rdi and rsi are reused as scratch.
;
;   Row pointers after setup:
;     rdx = p3   rdi = p2   rbx = p1   r8 = p0
;     rbp = q0   r10+rbp = q1   rsi+rbp = q2   rcx+rbp = q3
;   The low 8 columns are computed first (results parked in registers and
;   stack slots), then the high 8 columns; both halves are packed together
;   at the end.
;
;   The earlier revision saved xmm6..xmm15 through [rax-38h..-0C8h] and
;   reloaded them through [r11-18h..-0A8h].  With only two pushes in this
;   variant those offsets do not line up (every reload read a different slot
;   than the matching save, and the xmm15 slot aliased the local at
;   [rsp+120h]).  The xmm registers are caller-saved under the SysV ABI, so
;   the saves and reloads were pure overhead and have been removed together
;   with the `mov rax,rsp` that only fed them.  The frame size is unchanged.
;*******************************************************************************
WELS_EXTERN DeblockLumaEq4V_ssse3
  push        rbx
  push        rbp
  ; Move the SysV arguments into the registers the body expects.
  mov         r8,   rdx               ; iAlpha
  mov         r9,   rcx               ; iBeta
  mov         rcx,  rdi               ; pPix
  mov         rdx,  rsi               ; stride
  sub         rsp,1D8h
  pxor        xmm1,xmm1
  movsxd      r10,edx                 ; r10 = stride
  mov         rbp,rcx                 ; rbp = q0 row
  mov         r11d,r8d                ; r11d = iAlpha
  mov         rdx,rcx
  mov         rdi,rbp
  mov         rbx,rbp
  movdqa      xmm5,[rbp]              ; q0
  punpcklbw   xmm5,xmm1               ; q0 low as words
  movdqa      xmm14,[r10+rbp]         ; q1
  lea         eax,[r10*4]
  movsxd      r8,eax
  lea         eax,[r10+r10*2]
  movsxd      rcx,eax                 ; rcx = 3*stride
  lea         eax,[r10+r10]
  sub         rdx,r8                  ; rdx = p3 row
  punpcklbw   xmm14,xmm1              ; q1 low
  movdqa      [rsp+90h],xmm5          ; q0 low
  movdqa      [rsp+30h],xmm14         ; q1 low
  movsxd      rsi,eax                 ; rsi = 2*stride
  movsx       eax,r11w
  sub         rdi,rcx                 ; rdi = p2 row
  sub         rbx,rsi                 ; rbx = p1 row
  mov         r8,rbp
  sub         r8,r10                  ; r8 = p0 row
  movd        xmm0,eax
  movsx       eax,r9w
  movdqa      xmm12,[rdi]             ; p2
  movdqa      xmm6, [rsi+rbp]         ; q2
  movdqa      xmm13,[rbx]             ; p1
  punpcklwd   xmm0,xmm0
  pshufd      xmm11,xmm0,0            ; xmm11 = iAlpha in all 8 words
  punpcklbw   xmm13,xmm1              ; p1 low
  punpcklbw   xmm6,xmm1               ; q2 low
  movdqa      xmm8,[r8]               ; p0
  movd        xmm0,eax
  movdqa      xmm10,xmm11
  mov         eax,2
  punpcklbw   xmm8,xmm1               ; p0 low
  punpcklbw   xmm12,xmm1              ; p2 low
  cwde                                ; no effect: eax already holds 2
  punpcklwd   xmm0,xmm0
  psraw       xmm10,2                 ; iAlpha >> 2
  movdqa      xmm1,xmm8
  movdqa      [rsp+0F0h],xmm13        ; p1 low
  movdqa      [rsp+0B0h],xmm8         ; p0 low
  pshufd      xmm7,xmm0,0             ; xmm7 = iBeta in all 8 words
  ; ---- low 8 columns: decision masks ---------------------------------------
  psubw       xmm1,xmm13
  movdqa      xmm0,xmm5
  movdqa      xmm4,xmm7
  movdqa      xmm2,xmm7
  psubw       xmm0,xmm8
  pabsw       xmm3,xmm0               ; |q0 - p0|
  pabsw       xmm0,xmm1               ; |p0 - p1|
  movdqa      xmm1,xmm5
  movdqa      [rsp+40h],xmm7          ; iBeta vector
  movdqa      [rsp+60h],xmm6          ; q2 low
  pcmpgtw     xmm4,xmm0
  psubw       xmm1,xmm14
  pabsw       xmm0,xmm1               ; |q0 - q1|
  pcmpgtw     xmm2,xmm0
  pand        xmm4,xmm2
  movdqa      xmm0,xmm11
  pcmpgtw     xmm0,xmm3
  pand        xmm4,xmm0
  movd        xmm0,eax
  movdqa      [rsp+20h],xmm4          ; filter-enable mask, low half
  punpcklwd   xmm0,xmm0
  pshufd      xmm2,xmm0,0             ; constant 2 in all words
  paddw       xmm10,xmm2              ; (iAlpha >> 2) + 2
  movdqa      [rsp+0A0h],xmm2
  movdqa      xmm15,xmm7
  pxor        xmm4,xmm4
  movdqa      xmm0,xmm8
  psubw       xmm0,xmm12
  mov         eax,4
  pabsw       xmm0,xmm0               ; |p0 - p2|
  movdqa      xmm1,xmm10
  cwde                                ; no effect: eax already holds 4
  pcmpgtw     xmm15,xmm0
  pcmpgtw     xmm1,xmm3               ; (iAlpha >> 2) + 2 > |q0 - p0|
  movdqa      xmm3,xmm7
  movdqa      xmm7,[rdx]              ; p3
  movdqa      xmm0,xmm5
  psubw       xmm0,xmm6
  pand        xmm15,xmm1              ; xmm15 = strong-filter mask, p side
  punpcklbw   xmm7,xmm4
  movdqa      xmm9,xmm15
  pabsw       xmm0,xmm0               ; |q0 - q2|
  ; ---- low 8 columns: candidate values -------------------------------------
  ; p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
  psllw       xmm7,1
  pandn       xmm9,xmm12
  pcmpgtw     xmm3,xmm0
  paddw       xmm7,xmm12
  movd        xmm0,eax
  pand        xmm3,xmm1               ; xmm3 = strong-filter mask, q side
  paddw       xmm7,xmm12
  punpcklwd   xmm0,xmm0
  paddw       xmm7,xmm12
  pshufd      xmm1,xmm0,0             ; constant 4 in all words
  paddw       xmm7,xmm13
  movdqa      xmm0,xmm3
  pandn       xmm0,xmm6
  paddw       xmm7,xmm8
  movdqa      [rsp+70h],xmm1
  paddw       xmm7,xmm5
  movdqa      [rsp+120h],xmm0
  ; q2' = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
  movdqa      xmm0,[rcx+rbp]          ; q3
  punpcklbw   xmm0,xmm4
  paddw       xmm7,xmm1
  movdqa      xmm4,xmm15
  psllw       xmm0,1
  psraw       xmm7,3
  paddw       xmm0,xmm6
  pand        xmm7,xmm15
  paddw       xmm0,xmm6
  paddw       xmm0,xmm6
  paddw       xmm0,xmm14
  movdqa      xmm6,xmm15
  paddw       xmm0,xmm5
  pandn       xmm6,xmm13
  paddw       xmm0,xmm8
  paddw       xmm0,xmm1
  psraw       xmm0,3
  ; p1' = (p2 + p1 + p0 + q0 + 2) >> 2
  movdqa      xmm1,xmm12
  paddw       xmm1,xmm13
  pand        xmm0,xmm3
  movdqa      [rsp+100h],xmm0
  movdqa      xmm0,xmm8
  paddw       xmm0,xmm5
  paddw       xmm1,xmm0
  movdqa      xmm0,xmm3
  paddw       xmm1,xmm2
  psraw       xmm1,2
  pandn       xmm0,xmm14
  pand        xmm4,xmm1
  movdqa      [rsp+0E0h],xmm0
  ; q1' = (q2 + q1 + q0 + p0 + 2) >> 2
  movdqa      xmm0,xmm5
  paddw       xmm0,xmm8
  movdqa      xmm1,[rsp+60h]
  paddw       xmm1,xmm14
  movdqa      xmm14,xmm3
  paddw       xmm1,xmm0
  movdqa      xmm0,xmm8
  paddw       xmm0,[rsp+30h]
  paddw       xmm1,xmm2
  psraw       xmm1,2
  pand        xmm14,xmm1
  ; p0' weak = (2*p1 + p0 + q1 + 2) >> 2,
  ; p0' strong = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
  movdqa      xmm1,xmm13
  paddw       xmm1,xmm13
  paddw       xmm1,xmm0
  paddw       xmm1,xmm2
  psraw       xmm1,2
  movdqa      xmm0,[rsp+30h]
  movdqa      xmm2,xmm13
  movdqa      xmm5,xmm15
  paddw       xmm0,[rsp+70h]
  pandn       xmm5,xmm1
  paddw       xmm2,xmm8
  movdqa      xmm8,[rsp+90h]
  movdqa      xmm1,xmm12
  paddw       xmm2,xmm8
  psllw       xmm2,1
  paddw       xmm2,xmm0
  paddw       xmm1,xmm2
  movdqa      xmm0,xmm8
  movdqa      xmm8,xmm3
  movdqa      xmm2,[rsp+30h]
  paddw       xmm0,xmm13
  psraw       xmm1,3
  pand        xmm15,xmm1
  ; q0' weak = (2*q1 + q0 + p1 + 2) >> 2,
  ; q0' strong = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3
  movdqa      xmm1,xmm2
  paddw       xmm1,xmm2
  paddw       xmm2,[rsp+90h]
  paddw       xmm2,[rsp+0B0h]
  paddw       xmm1,xmm0
  movdqa      xmm0,xmm13
  movdqa      xmm13,[r8]              ; p0 (reloaded for the high half)
  paddw       xmm0, [rsp+70h]
  paddw       xmm1, [rsp+0A0h]
  psllw       xmm2,1
  paddw       xmm2,xmm0
  psraw       xmm1,2
  movdqa      xmm0, [rdi]             ; p2
  pandn       xmm8,xmm1
  movdqa      xmm1, [rsp+60h]
  paddw       xmm1,xmm2
  movdqa      xmm2, [rbx]             ; p1
  psraw       xmm1,3
  pand        xmm3,xmm1
  movdqa      xmm1, [rbp]             ; q0
  movdqa      [rsp+0D0h],xmm3
  ; ---- high 8 columns: unpack and decision masks ---------------------------
  pxor        xmm3,xmm3
  punpckhbw   xmm0,xmm3
  punpckhbw   xmm1,xmm3
  punpckhbw   xmm13,xmm3
  movdqa      [rsp+0C0h],xmm0         ; p2 high
  movdqa      xmm0,[r10+rbp]
  movdqa      [rsp],xmm1              ; q0 high
  punpckhbw   xmm0,xmm3
  punpckhbw   xmm2,xmm3
  movdqa      [rsp+80h],xmm0          ; q1 high
  movdqa      xmm0,[rsi+rbp]
  movdqa      [rsp+10h],xmm13         ; p0 high
  punpckhbw   xmm0,xmm3
  movdqa      [rsp+50h],xmm0          ; q2 high
  movdqa      xmm0,xmm1
  movdqa      xmm1,xmm13
  psubw       xmm0,xmm13
  psubw       xmm1,xmm2
  pabsw       xmm3,xmm0               ; |q0 - p0|
  pabsw       xmm0,xmm1               ; |p0 - p1|
  movdqa      xmm1,[rsp]
  movdqa      xmm13,[rsp+40h]
  movdqa      [rsp+110h],xmm2         ; p1 high
  psubw       xmm1, [rsp+80h]
  pcmpgtw     xmm13,xmm0
  pcmpgtw     xmm11,xmm3
  pabsw       xmm0,xmm1               ; |q0 - q1|
  pcmpgtw     xmm10,xmm3
  movdqa      xmm1, [rsp+40h]
  movdqa      xmm2,xmm1
  movdqa      xmm3,xmm1
  pcmpgtw     xmm2,xmm0
  movdqa      xmm0, [rsp+10h]
  pand        xmm13,xmm2
  pand        xmm13,xmm11             ; xmm13 = filter-enable mask, high half
  movdqa      xmm11,[rsp+0C0h]
  psubw       xmm0,xmm11
  pabsw       xmm0,xmm0               ; |p0 - p2|
  pcmpgtw     xmm3,xmm0
  pand        xmm3,xmm10              ; xmm3 = strong mask, p side (high)
  movdqa      xmm0,[rsp]
  psubw       xmm0,[rsp+50h]
  movdqa      xmm2,[rdx]              ; p3
  pabsw       xmm0,xmm0               ; |q0 - q2|
  ; ---- merge low-half candidates, compute high-half candidates -------------
  por         xmm7,xmm9
  movdqa      xmm9,[rsp+20h]
  pcmpgtw     xmm1,xmm0
  pand        xmm9,xmm7
  movdqa      xmm7,[rsp+20h]
  movdqa      xmm0,xmm7
  pandn       xmm0,xmm12
  movdqa      xmm12,[rsp+110h]
  pand        xmm1,xmm10
  movdqa      xmm10,[rsp+70h]
  movdqa      [rsp+40h],xmm1          ; slot reused: strong mask, q side (high)
  movdqa      xmm1,xmm13
  por         xmm9,xmm0               ; p2 result, low half
  pxor        xmm0,xmm0
  por         xmm4,xmm6
  movdqa      xmm6,xmm7
  punpckhbw   xmm2,xmm0
  por         xmm15,xmm5
  movdqa      xmm5,[rsp+20h]
  movdqa      xmm0,xmm3
  psllw       xmm2,1
  pandn       xmm0,xmm11
  pand        xmm6,xmm4
  movdqa      xmm4,[rsp]
  paddw       xmm2,xmm11
  pand        xmm5,xmm15
  movdqa      xmm15,[rsp+20h]
  paddw       xmm2,xmm11
  paddw       xmm2,xmm11
  paddw       xmm2,xmm12
  paddw       xmm2,[rsp+10h]
  paddw       xmm2,[rsp]
  paddw       xmm2,xmm10
  psraw       xmm2,3
  pand        xmm2,xmm3
  por         xmm2,xmm0
  pand        xmm1,xmm2
  movdqa      xmm0,xmm13
  movdqa      xmm2,xmm11
  pandn       xmm0,xmm11
  paddw       xmm2,xmm12
  por         xmm1,xmm0
  packuswb    xmm9,xmm1               ; xmm9 = new p2 row
  movdqa      xmm0,xmm7
  movdqa      xmm7,[rsp+0A0h]
  pandn       xmm0,[rsp+0F0h]
  movdqa      xmm1,xmm3
  por         xmm6,xmm0
  movdqa      xmm0,[rsp+10h]
  paddw       xmm0,xmm4
  paddw       xmm2,xmm0
  paddw       xmm2,xmm7
  movdqa      xmm0,xmm3
  pandn       xmm0,xmm12
  psraw       xmm2,2
  pand        xmm1,xmm2
  por         xmm1,xmm0
  movdqa      xmm2,xmm13
  movdqa      xmm0,xmm13
  pand        xmm2,xmm1
  pandn       xmm0,xmm12
  movdqa      xmm1,xmm12
  paddw       xmm1,[rsp+10h]
  por         xmm2,xmm0
  movdqa      xmm0,xmm15
  pandn       xmm0,[rsp+0B0h]
  paddw       xmm1,xmm4
  packuswb    xmm6,xmm2               ; xmm6 = new p1 row
  movdqa      xmm2,xmm3
  psllw       xmm1,1
  por         xmm5,xmm0
  movdqa      xmm0,[rsp+80h]
  paddw       xmm0,xmm10
  paddw       xmm1,xmm0
  paddw       xmm11,xmm1
  psraw       xmm11,3
  movdqa      xmm1,xmm12
  pand        xmm2,xmm11
  paddw       xmm1,xmm12
  movdqa      xmm11,[rsp+80h]
  movdqa      xmm0, [rsp+10h]
  por         xmm14,[rsp+0E0h]
  paddw       xmm0,xmm11
  movdqa      xmm4,xmm15
  paddw       xmm1,xmm0
  movdqa      xmm0,xmm13
  paddw       xmm1,xmm7
  psraw       xmm1,2
  pandn       xmm3,xmm1
  por         xmm2,xmm3
  movdqa      xmm1,xmm13
  movdqa      xmm3,[rsp+10h]
  pandn       xmm0,xmm3
  pand        xmm1,xmm2
  movdqa      xmm2,xmm11
  paddw       xmm2,[rsp]
  por         xmm1,xmm0
  movdqa      xmm0,[rsp+0D0h]
  por         xmm0,xmm8
  paddw       xmm2,xmm3
  packuswb    xmm5,xmm1               ; xmm5 = new p0 row
  movdqa      xmm8,[rsp+40h]
  movdqa      xmm1,[rsp+50h]
  movdqa      xmm3,xmm8
  pand        xmm4,xmm0
  psllw       xmm2,1
  movdqa      xmm0,xmm15
  pandn       xmm0,[rsp+90h]
  por         xmm4,xmm0
  movdqa      xmm0,xmm12
  paddw       xmm0,xmm10
  paddw       xmm2,xmm0
  paddw       xmm1,xmm2
  movdqa      xmm0,[rsp]
  movdqa      xmm2,xmm11
  paddw       xmm0,xmm12
  movdqa      xmm12,[rsp]
  paddw       xmm2,xmm11
  paddw       xmm2,xmm0
  psraw       xmm1,3
  movdqa      xmm0,xmm8
  pand        xmm3,xmm1
  paddw       xmm2,xmm7
  movdqa      xmm1,xmm13
  psraw       xmm2,2
  pandn       xmm0,xmm2
  por         xmm3,xmm0
  movdqa      xmm2,[rsp+50h]
  movdqa      xmm0,xmm13
  pandn       xmm0,xmm12
  pand        xmm1,xmm3
  paddw       xmm2,xmm11
  movdqa      xmm3,xmm15
  por         xmm1,xmm0
  pand        xmm3,xmm14
  movdqa      xmm14,[rsp+10h]
  movdqa      xmm0,xmm15
  pandn       xmm0,[rsp+30h]
  packuswb    xmm4,xmm1               ; xmm4 = new q0 row
  movdqa      xmm1,xmm8
  por         xmm3,xmm0
  movdqa      xmm0,xmm12
  paddw       xmm0,xmm14
  paddw       xmm2,xmm0
  paddw       xmm2,xmm7
  movdqa      xmm0,xmm8
  pandn       xmm0,xmm11
  psraw       xmm2,2
  pand        xmm1,xmm2
  por         xmm1,xmm0
  movdqa      xmm2,xmm13
  movdqa      xmm0,xmm13
  pandn       xmm0,xmm11
  pand        xmm2,xmm1
  movdqa      xmm1,xmm15
  por         xmm2,xmm0
  packuswb    xmm3,xmm2               ; xmm3 = new q1 row
  movdqa      xmm0,[rsp+100h]
  por         xmm0,[rsp+120h]
  pand        xmm1,xmm0
  movdqa      xmm2,[rcx+rbp]          ; q3
  movdqa      xmm7,[rsp+50h]
  pandn       xmm15,[rsp+60h]
  pxor        xmm0,xmm0
  por         xmm1,xmm15
  movdqa      [rdi],xmm9              ; store p2
  punpckhbw   xmm2,xmm0
  psllw       xmm2,1
  paddw       xmm2,xmm7
  paddw       xmm2,xmm7
  movdqa      [rbx],xmm6              ; store p1
  paddw       xmm2,xmm7
  paddw       xmm2,xmm11
  paddw       xmm2,xmm12
  paddw       xmm2,xmm14
  paddw       xmm2,xmm10
  psraw       xmm2,3
  movdqa      xmm0,xmm13
  pand        xmm2,xmm8
  pandn       xmm8,xmm7
  pandn       xmm13,xmm7
  por         xmm2,xmm8
  movdqa      [r8],xmm5               ; store p0
  pand        xmm0,xmm2
  por         xmm0,xmm13
  packuswb    xmm1,xmm0               ; xmm1 = new q2 row
  movdqa      [rbp],xmm4              ; store q0
  movdqa      [r10+rbp],xmm3          ; store q1
  movdqa      [rsi+rbp],xmm1          ; store q2
  add         rsp,1D8h
  pop         rbp
  pop         rbx
  ret

;*******************************************************************************
; DeblockChromaLt4V_ssse3                                 (UNIX64 / SysV AMD64)
;   tc-clamped chroma deblocking across a horizontal edge, 8 columns in each
;   of two planes.
;
;   In:  rdi = first plane pointer (row q0)
;        rsi = second plane pointer (row q0)
;        edx = stride in bytes
;        ecx = threshold tested against |p0 - q0|
;        r8d = iBeta, tested against |p1 - p0| and |q1 - q0|
;        r9  = pTC, four signed bytes; tc[k] applies to columns 2k and 2k+1
;              of both planes
;   Out: rows p0 (ptr - stride) and q0 (ptr) of both planes are rewritten
;        with 8-byte stores.
;   Preserves rbx and rbp (push/pop).
;
;   Per column:
;     delta = clamp(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc)
;     p0 += delta, q0 -= delta
;   applied where all three threshold tests pass and tc > 0.
;
;   The earlier revision ended with ten `movaps xmmN,[...]` reloads of
;   xmm6..xmm15 from stack slots that this variant never writes.  xmm
;   registers are caller-saved under the SysV ABI, so those loads (and the
;   `mov rax,rsp` / `lea r11,[rsp+0C8h]` left over from the save/restore
;   scheme) were removed.  The frame size is unchanged.
;*******************************************************************************
WELS_EXTERN  DeblockChromaLt4V_ssse3
  push        rbx
  push        rbp
  ; Move the SysV arguments into the registers the body expects:
  ;   rcx = plane 1, rdx = plane 2, r8 = stride, r9 = |p0-q0| threshold,
  ;   rbp = iBeta, r10 = pTC.
  mov         r10,  rdx
  mov         r11,  rcx
  mov         rcx,  rdi
  mov         rdx,  rsi
  mov         rsi,  r10
  mov         r10,  r9
  mov         rbp,  r8
  mov         r8,   rsi
  mov         r9,   r11
  sub         rsp,0C8h
  pxor        xmm1,xmm1
  mov         rbx,rcx                 ; rbx = plane 1, row q0
  movsxd      r11,r8d                 ; r11 = stride
  movsx       ecx,byte [r10]          ; tc[0]
  movsx       r8d,byte [r10+2]        ; tc[2]
  mov         rdi,rdx                 ; rdi = plane 2, row q0
  movq        xmm2,[rbx]              ; plane 1 q0
  movq        xmm9,[r11+rbx]          ; plane 1 q1
  movsx       edx,byte [r10+1]        ; tc[1]
  ; Build the word vector tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3 at [rsp].
  mov         word [rsp+2],cx
  mov         word [rsp],cx
  movsx       eax,byte [r10+3]        ; tc[3]
  mov         word [rsp+6],dx
  mov         word [rsp+4],dx
  movdqa      xmm11,xmm1
  mov         word [rsp+0Eh],ax
  mov         word [rsp+0Ch],ax
  lea         eax,[r11+r11]
  movsxd      rcx,eax                 ; rcx = 2*stride
  mov         rax,rbx
  mov         rdx,rdi
  sub         rax,rcx                 ; rax = plane 1, row p1
  mov         word [rsp+0Ah],r8w
  mov         word [rsp+8],r8w
  movdqa      xmm6,[rsp]              ; xmm6 = tc vector
  movdqa      xmm7,xmm6
  movq        xmm13, [rax]            ; plane 1 p1
  mov         rax,rdi
  sub         rax,rcx                 ; rax = plane 2, row p1
  mov         rcx,rbx
  pcmpgtw     xmm7,xmm1               ; xmm7 = (tc > 0) mask
  psubw       xmm11,xmm6              ; xmm11 = -tc
  sub         rcx,r11                 ; rcx = plane 1, row p0
  sub         rdx,r11                 ; rdx = plane 2, row p0
  movq        xmm0,[rax]
  movsx       eax,r9w
  movq        xmm15,[rcx]
  ; Combine both planes per row: low 8 bytes = plane 1, high 8 = plane 2.
  punpcklqdq  xmm13,xmm0              ; p1
  movq        xmm0, [rdx]
  movdqa      xmm4,xmm13
  punpcklqdq  xmm15,xmm0              ; p0
  movq        xmm0, [rdi]
  punpcklbw   xmm4,xmm1               ; p1, plane 1 as words
  movdqa      xmm12,xmm15
  punpcklqdq  xmm2,xmm0               ; q0
  movq        xmm0, [r11+rdi]
  punpcklbw   xmm12,xmm1              ; p0, plane 1
  movdqa      xmm14,xmm2
  punpcklqdq  xmm9,xmm0               ; q1
  punpckhbw   xmm2,xmm1               ; q0, plane 2
  punpcklbw   xmm14,xmm1              ; q0, plane 1
  movd        xmm0,eax
  mov         eax, ebp ; iBeta
  punpckhbw   xmm13,xmm1              ; p1, plane 2
  punpckhbw   xmm15,xmm1              ; p0, plane 2
  movdqa      xmm3,xmm9
  movdqa      [rsp+10h],xmm2          ; q0, plane 2
  punpcklwd   xmm0,xmm0
  punpckhbw   xmm9,xmm1               ; q1, plane 2
  punpcklbw   xmm3,xmm1               ; q1, plane 1
  movdqa      xmm1,xmm14
  pshufd      xmm10,xmm0,0            ; |p0-q0| threshold in all 8 words
  movd        xmm0,eax
  mov         eax,4
  cwde                                ; no effect: eax already holds 4
  punpcklwd   xmm0,xmm0
  pshufd      xmm8,xmm0,0             ; iBeta in all 8 words
  movd        xmm0,eax
  punpcklwd   xmm0,xmm0
  pshufd      xmm5,xmm0,0             ; rounding constant 4
  ; ---- plane 1 -------------------------------------------------------------
  psubw       xmm1,xmm12              ; q0 - p0
  movdqa      xmm2,xmm10
  psllw       xmm1,2
  movdqa      xmm0,xmm4
  psubw       xmm4,xmm12              ; p1 - p0
  psubw       xmm0,xmm3               ; p1 - q1
  psubw       xmm3,xmm14              ; q1 - q0
  paddw       xmm1,xmm0
  paddw       xmm1,xmm5
  movdqa      xmm0,xmm11
  psraw       xmm1,3
  pmaxsw      xmm0,xmm1
  pminsw      xmm6,xmm0               ; delta clamped to [-tc, tc]
  movdqa      xmm1,xmm8
  movdqa      xmm0,xmm12
  psubw       xmm0,xmm14
  pabsw       xmm0,xmm0
  pcmpgtw     xmm2,xmm0               ; threshold > |p0 - q0|
  pabsw       xmm0,xmm4
  pcmpgtw     xmm1,xmm0               ; iBeta > |p1 - p0|
  pabsw       xmm0,xmm3
  movdqa      xmm3,[rsp]              ; tc vector again, for plane 2
  pand        xmm2,xmm1
  movdqa      xmm1,xmm8
  pcmpgtw     xmm1,xmm0               ; iBeta > |q1 - q0|
  movdqa      xmm0,xmm13
  pand        xmm2,xmm1
  psubw       xmm0,xmm9               ; plane 2: p1 - q1
  psubw       xmm13,xmm15             ; plane 2: p1 - p0
  pand        xmm2,xmm7
  pand        xmm6,xmm2
  paddw       xmm12,xmm6              ; p0 += delta
  psubw       xmm14,xmm6              ; q0 -= delta
  ; ---- plane 2 -------------------------------------------------------------
  movdqa      xmm2,[rsp+10h]
  movdqa      xmm1,xmm2
  psubw       xmm1,xmm15              ; q0 - p0
  psubw       xmm9,xmm2               ; q1 - q0
  psllw       xmm1,2
  paddw       xmm1,xmm0
  paddw       xmm1,xmm5
  movdqa      xmm0,xmm15
  psubw       xmm0,xmm2
  psraw       xmm1,3
  pmaxsw      xmm11,xmm1
  pabsw       xmm0,xmm0
  movdqa      xmm1,xmm8
  pcmpgtw     xmm10,xmm0              ; threshold > |p0 - q0|
  pabsw       xmm0,xmm13
  pminsw      xmm3,xmm11              ; delta clamped to [-tc, tc]
  pcmpgtw     xmm1,xmm0               ; iBeta > |p1 - p0|
  pabsw       xmm0,xmm9
  pand        xmm10,xmm1
  pcmpgtw     xmm8,xmm0               ; iBeta > |q1 - q0|
  pand        xmm10,xmm8
  pand        xmm10,xmm7
  pand        xmm3,xmm10
  paddw       xmm15,xmm3              ; p0 += delta
  psubw       xmm2,xmm3               ; q0 -= delta
  ; ---- pack and store ------------------------------------------------------
  packuswb    xmm12,xmm15             ; new p0: plane 1 low, plane 2 high
  packuswb    xmm14,xmm2              ; new q0: plane 1 low, plane 2 high
  movq        [rcx],xmm12
  movq        [rbx],xmm14
  psrldq      xmm12,8
  psrldq      xmm14,8
  movq        [rdx],xmm12
  movq        [rdi],xmm14
  add         rsp,0C8h
  pop         rbp
  pop         rbx
  ret

WELS_EXTERN DeblockChromaEq4V_ssse3
;*******************************************************************************
;  64-bit variant of the routine whose prototype is given in the X86_32 part
;  of this file:
;      void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr,
;                                   int32_t iStride, int32_t iAlpha, int32_t iBeta)
;  The arguments are taken from rdi, rsi, rdx, rcx, r8 (in that order) and
;  copied to rcx, rdx, r8, r9, rbp before use.
;
;  Eight pixels of each plane are processed together: the first plane occupies
;  the low half of each vector, the second plane the high half.  With, per plane,
;      p1 = [pPix - 2*iStride]      p0 = [pPix - iStride]
;      q0 = [pPix]                  q1 = [pPix + iStride]
;  a pixel column is filtered when (signed 16-bit compares)
;      iAlpha > |p0-q0|  and  iBeta > |p1-p0|  and  iBeta > |q1-q0|
;  and then receives
;      p0' = (2*p1 + p0 + q1 + 2) >> 2
;      q0' = (2*q1 + q0 + p1 + 2) >> 2
;  Columns failing the test keep their original p0/q0.  Only the p0 and q0
;  rows are written back.
;
;  rbx and rbp are saved and restored; rax, rcx, rdx, r8-r11 and xmm0-xmm14
;  are overwritten.
;*******************************************************************************
  mov         rax,rsp
  push        rbx
  push        rbp

  ; Re-home the arguments: rbp = iBeta, r8 = iStride, r9 = iAlpha,
  ; rcx = pPixCb, rdx = pPixCr.
  mov         rbp, r8
  mov         r8, rdx
  mov         r9, rcx
  mov         rcx, rdi
  mov         rdx, rsi

  ; 90h bytes are reserved here, but no instruction below addresses them
  ; (the only [rsp] access is the commented-out movaps).
  sub         rsp,90h
  pxor        xmm1,xmm1                ; xmm1 = 0, used to widen bytes to words
  mov         r11,rcx                  ; r11 = pPixCb (q0 row of first plane)
  mov         rbx,rdx                  ; rbx = pPixCr (q0 row of second plane)
  mov         r10d,r9d                 ; r10d = iAlpha
  movq        xmm13,[r11]              ; q0, first plane
  lea         eax,[r8+r8]
  movsxd      r9,eax                   ; r9 = 2*iStride
  mov         rax,rcx
  sub         rax,r9
  movq        xmm14,[rax]              ; p1, first plane
  mov         rax,rdx
  sub         rax,r9
  movq        xmm0,[rax]               ; p1, second plane
  movsxd      rax,r8d                  ; rax = iStride
  sub         rcx,rax                  ; rcx -> p0 row, first plane
  sub         rdx,rax                  ; rdx -> p0 row, second plane
  movq        xmm12,[rax+r11]          ; q1, first plane
  movq        xmm10,[rcx]              ; p0, first plane
  ; From here the two planes are joined (punpcklqdq) and each row is widened
  ; to words: xmm8/xmm14 = p1 lo/hi, xmm5/xmm10 = p0 lo/hi,
  ; xmm9/xmm13 = q0 lo/hi, xmm7/xmm12 = q1 lo/hi.
  punpcklqdq  xmm14,xmm0
  movdqa      xmm8,xmm14
  movq        xmm0,[rdx]
  punpcklbw   xmm8,xmm1
  punpckhbw   xmm14,xmm1
  punpcklqdq  xmm10,xmm0
  movq        xmm0,[rbx]
  movdqa      xmm5,xmm10
  punpcklqdq  xmm13,xmm0
  movq        xmm0, [rax+rbx]
  punpcklbw   xmm5,xmm1
  movsx       eax,r10w                 ; low 16 bits of iAlpha
  movdqa      xmm9,xmm13
  punpcklqdq  xmm12,xmm0
  punpcklbw   xmm9,xmm1
  punpckhbw   xmm10,xmm1
  movd        xmm0,eax
  mov         eax, ebp   ; iBeta
  punpckhbw   xmm13,xmm1
  movdqa      xmm7,xmm12
  punpcklwd   xmm0,xmm0
  punpckhbw   xmm12,xmm1
  pshufd      xmm11,xmm0,0             ; xmm11 = iAlpha in all 8 words
  punpcklbw   xmm7,xmm1
  movd        xmm0,eax
  movdqa      xmm1,xmm8
  psubw       xmm1,xmm5                ; p1 - p0 (lo)
  punpcklwd   xmm0,xmm0
  movdqa      xmm6,xmm11
  pshufd      xmm3,xmm0,0              ; xmm3 = iBeta in all 8 words
  ; Low-half mask in xmm6: alpha > |p0-q0| & beta > |p1-p0| & beta > |q1-q0|.
  movdqa      xmm0,xmm5
  psubw       xmm0,xmm9
  movdqa      xmm2,xmm3
  pabsw       xmm0,xmm0
  pcmpgtw     xmm6,xmm0
  pabsw       xmm0,xmm1
  movdqa      xmm1,xmm3
  pcmpgtw     xmm2,xmm0
  pand        xmm6,xmm2
  movdqa      xmm0,xmm7
  movdqa      xmm2,xmm3
  psubw       xmm0,xmm9
  pabsw       xmm0,xmm0
  pcmpgtw     xmm1,xmm0
  pand        xmm6,xmm1
  ; High-half mask in xmm11, same three tests on the hi words.
  movdqa      xmm0,xmm10
  movdqa      xmm1,xmm14
  psubw       xmm0,xmm13
  psubw       xmm1,xmm10
  pabsw       xmm0,xmm0
  pcmpgtw     xmm11,xmm0
  pabsw       xmm0,xmm1
  pcmpgtw     xmm2,xmm0
  pand        xmm11,xmm2
  movdqa      xmm0,xmm12
  movdqa      xmm4,xmm6
  movdqa      xmm1,xmm8
  ; eax = 2 is the rounding term; cwde sign-extends ax into eax and leaves 2.
  mov         eax,2
  cwde
  ; The four sums 2*p1+p0+q1 (lo in xmm1, hi in xmm2) and 2*q1+q0+p1
  ; (lo in xmm7, hi in xmm12) are accumulated below, interleaved with the
  ; last beta test for the high half.
  paddw       xmm1,xmm8
  psubw       xmm0,xmm13
  paddw       xmm1,xmm5
  pabsw       xmm0,xmm0
  movdqa      xmm2,xmm14
  paddw       xmm1,xmm7
  pcmpgtw     xmm3,xmm0
  paddw       xmm2,xmm14
  movd        xmm0,eax
  pand        xmm11,xmm3
  paddw       xmm7,xmm7
  paddw       xmm2,xmm10
  punpcklwd   xmm0,xmm0
  paddw       xmm2,xmm12
  paddw       xmm12,xmm12
  pshufd      xmm3,xmm0,0              ; xmm3 = 2 in all 8 words (beta no longer needed)
  paddw       xmm7,xmm9
  paddw       xmm12,xmm13
  movdqa      xmm0,xmm6
  paddw       xmm1,xmm3
  pandn       xmm0,xmm5                ; unfiltered p0 (lo) where mask is clear
  paddw       xmm7,xmm8
  psraw       xmm1,2
  paddw       xmm12,xmm14
  paddw       xmm7,xmm3
  ;movaps      xmm14,[rsp]
  pand        xmm4,xmm1                ; filtered p0 (lo) where mask is set
  paddw       xmm12,xmm3
  psraw       xmm7,2
  movdqa      xmm1,xmm11
  por         xmm4,xmm0
  psraw       xmm12,2
  paddw       xmm2,xmm3
  movdqa      xmm0,xmm11
  pandn       xmm0,xmm10
  psraw       xmm2,2
  pand        xmm1,xmm2
  por         xmm1,xmm0
  packuswb    xmm4,xmm1                ; xmm4 = new p0 row, both planes, as bytes
  movdqa      xmm0,xmm11
  movdqa      xmm1,xmm6
  pand        xmm1,xmm7
  movq        [rcx],xmm4               ; store p0, first plane
  pandn       xmm6,xmm9
  pandn       xmm11,xmm13
  pand        xmm0,xmm12
  por         xmm1,xmm6
  por         xmm0,xmm11
  psrldq      xmm4,8
  packuswb    xmm1,xmm0                ; xmm1 = new q0 row, both planes, as bytes
  movq        [r11],xmm1               ; store q0, first plane
  psrldq      xmm1,8
  movq        [rdx],xmm4               ; store p0, second plane
  lea         r11,[rsp+90h]            ; r11 = rsp before the 90h reservation
  movq        [rbx],xmm1               ; store q0, second plane
  mov         rsp,r11
  pop         rbp
  pop         rbx
  ret

WELS_EXTERN DeblockChromaEq4H_ssse3
;*******************************************************************************
;  64-bit variant of the routine whose prototype is given in the X86_32 part
;  of this file:
;      void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr,
;                                   int32_t iStride, int32_t iAlpha, int32_t iBeta)
;  The arguments are taken from rdi, rsi, rdx, rcx, r8 (in that order) and
;  copied to rcx, rdx(later rdi), r8, r9, rbp before use.
;
;  For 8 consecutive rows of each plane the 4 bytes at [row - 2] are read;
;  they are the pixels p1, p0, q0, q1 on either side of a vertical edge.
;  The 16 four-byte groups are transposed into four 16-byte vectors (one per
;  pixel position, first plane in the low half, second plane in the high half),
;  filtered, transposed back and stored to the same 16 locations.
;
;  A row is filtered when (signed 16-bit compares)
;      iAlpha > |p0-q0|  and  iBeta > |p1-p0|  and  iBeta > |q1-q0|
;  and then receives
;      p0' = (2*p1 + p0 + q1 + 2) >> 2
;      q0' = (2*q1 + q0 + p1 + 2) >> 2
;  p1 and q1 are stored back unchanged.
;
;  Stack: 140h bytes of scratch.  Return address plus three pushes make 32
;  bytes, so rsp keeps the 16-byte alignment the caller had at the call;
;  the movdqa accesses to [rsp+..] depend on that.
;  rbx, rbp and r12 are saved by push and restored by pop; rax, rcx, rdx, rdi,
;  r8-r11 and xmm0-xmm15 are overwritten.
;*******************************************************************************
  mov         rax,rsp
  push        rbx
  push        rbp
  push        r12

  ; Re-home the arguments: rbp = iBeta, r8 = iStride, r9 = iAlpha,
  ; rcx = pPixCb, rdx = rdi = pPixCr.
  mov         rbp,   r8
  mov         r8,    rdx
  mov         r9,    rcx
  mov         rcx,   rdi
  mov         rdx,   rsi
  mov         rdi,   rdx

  sub         rsp,140h
  lea         eax,[r8*4]
  movsxd      r10,eax                  ; r10 = 4*iStride
  mov         eax,[rcx-2]
  mov         [rsp+10h],eax
  lea         rbx,[r10+rdx-2]          ; rbx -> row 4 of second plane, column -2
  lea         r11,[r10+rcx-2]          ; r11 -> row 4 of first plane, column -2

  ; Gather: each 4-byte group goes through a stack slot and is loaded with
  ; movdqa; only the low dword of each such load is consumed by the
  ; punpckldq/punpcklqdq merges that follow.
  ; After the gather: xmm5 = rows 0 and 4, xmm2 = rows 1 and 5,
  ; xmm4 = rows 2 and 6, xmm3 = rows 3 and 7 (dword order: plane1 row n,
  ; plane2 row n, plane1 row n+4, plane2 row n+4).
  movdqa      xmm5,[rsp+10h]
  movsxd      r10,r8d                  ; r10 = iStride
  mov         eax,[r10+rcx-2]
  lea         rdx,[r10+r10*2]          ; rdx = 3*iStride (pPixCr now lives in rdi)
  mov         [rsp+20h],eax
  mov         eax,[rcx+r10*2-2]
  mov         [rsp+30h],eax
  mov         eax,[rdx+rcx-2]
  movdqa      xmm2,[rsp+20h]
  mov         [rsp+40h],eax
  mov         eax, [rdi-2]
  movdqa      xmm4,[rsp+30h]
  mov         [rsp+50h],eax
  mov         eax,[r10+rdi-2]
  movdqa      xmm3,[rsp+40h]
  mov         [rsp+60h],eax
  mov         eax,[rdi+r10*2-2]
  punpckldq   xmm5,[rsp+50h]
  mov         [rsp+70h],eax
  mov         eax, [rdx+rdi-2]
  punpckldq   xmm2, [rsp+60h]
  mov          [rsp+80h],eax
  mov         eax,[r11]
  punpckldq   xmm4, [rsp+70h]
  mov         [rsp+50h],eax
  mov         eax,[rbx]
  punpckldq   xmm3,[rsp+80h]
  mov         [rsp+60h],eax
  mov         eax,[r10+r11]
  movdqa      xmm0, [rsp+50h]
  punpckldq   xmm0, [rsp+60h]
  punpcklqdq  xmm5,xmm0
  movdqa      [rsp+50h],xmm0
  mov         [rsp+50h],eax
  mov         eax,[r10+rbx]
  movdqa      xmm0,[rsp+50h]
  movdqa      xmm1,xmm5
  mov         [rsp+60h],eax
  mov         eax,[r11+r10*2]
  punpckldq   xmm0, [rsp+60h]
  punpcklqdq  xmm2,xmm0
  punpcklbw   xmm1,xmm2
  punpckhbw   xmm5,xmm2
  movdqa      [rsp+50h],xmm0
  mov         [rsp+50h],eax
  mov         eax,[rbx+r10*2]
  movdqa      xmm0,[rsp+50h]
  mov         [rsp+60h],eax
  mov         eax, [rdx+r11]
  movdqa      xmm15,xmm1
  punpckldq   xmm0,[rsp+60h]
  punpcklqdq  xmm4,xmm0
  movdqa      [rsp+50h],xmm0
  mov         [rsp+50h],eax
  mov         eax, [rdx+rbx]
  movdqa      xmm0,[rsp+50h]
  mov         [rsp+60h],eax
  punpckldq   xmm0, [rsp+60h]
  punpcklqdq  xmm3,xmm0
  ; Byte/word/dword/qword unpack ladder = 16x4 byte transpose.
  ; Result: xmm11 = p1, xmm12 = p0, xmm14 = q0, xmm15 = q1 (16 bytes each).
  movdqa      xmm0,xmm4
  punpcklbw   xmm0,xmm3
  punpckhbw   xmm4,xmm3
  punpcklwd   xmm15,xmm0
  punpckhwd   xmm1,xmm0
  movdqa      xmm0,xmm5
  movdqa      xmm12,xmm15
  punpcklwd   xmm0,xmm4
  punpckhwd   xmm5,xmm4
  punpckldq   xmm12,xmm0
  punpckhdq   xmm15,xmm0
  movdqa      xmm0,xmm1
  movdqa      xmm11,xmm12
  punpckldq   xmm0,xmm5
  punpckhdq   xmm1,xmm5
  punpcklqdq  xmm11,xmm0
  punpckhqdq  xmm12,xmm0
  movsx       eax,r9w                  ; low 16 bits of iAlpha
  movdqa      xmm14,xmm15
  punpcklqdq  xmm14,xmm1
  punpckhqdq  xmm15,xmm1
  pxor        xmm1,xmm1                ; xmm1 = 0, used to widen bytes to words
  movd        xmm0,eax
  ; Widen to words: xmm8/xmm11 = p1 lo/hi, xmm4/xmm12 = p0 lo/hi,
  ; xmm9/xmm14 = q0 lo/hi, xmm7/xmm10 = q1 lo/hi.  xmm15 keeps q1 as bytes
  ; and [rsp] keeps p1 as bytes for the transpose back.
  movdqa      xmm4,xmm12
  movdqa      xmm8,xmm11
  mov         eax, ebp ; iBeta
  punpcklwd   xmm0,xmm0
  punpcklbw   xmm4,xmm1
  punpckhbw   xmm12,xmm1
  movdqa      xmm9,xmm14
  movdqa      xmm7,xmm15
  movdqa      xmm10,xmm15
  pshufd      xmm13,xmm0,0             ; xmm13 = iAlpha in all 8 words
  punpcklbw   xmm9,xmm1
  punpckhbw   xmm14,xmm1
  movdqa      xmm6,xmm13
  movd        xmm0,eax
  movdqa      [rsp],xmm11
  ; eax = 2 is the rounding term; cwde sign-extends ax into eax and leaves 2.
  mov         eax,2
  cwde
  punpckhbw   xmm11,xmm1
  punpckhbw   xmm10,xmm1
  punpcklbw   xmm7,xmm1
  punpcklwd   xmm0,xmm0
  punpcklbw   xmm8,xmm1
  pshufd      xmm3,xmm0,0              ; xmm3 = iBeta in all 8 words
  ; Low-half mask in xmm6: alpha > |p0-q0| & beta > |p1-p0| & beta > |q1-q0|.
  movdqa      xmm1,xmm8
  movdqa      xmm0,xmm4
  psubw       xmm0,xmm9
  psubw       xmm1,xmm4
  movdqa      xmm2,xmm3
  pabsw       xmm0,xmm0
  pcmpgtw     xmm6,xmm0
  pabsw       xmm0,xmm1
  movdqa      xmm1,xmm3
  pcmpgtw     xmm2,xmm0
  pand        xmm6,xmm2
  movdqa      xmm0,xmm7
  movdqa      xmm2,xmm3
  psubw       xmm0,xmm9
  pabsw       xmm0,xmm0
  pcmpgtw     xmm1,xmm0
  pand        xmm6,xmm1
  ; High-half mask in xmm13, interleaved with the start of the p0' sums.
  movdqa      xmm0,xmm12
  movdqa      xmm1,xmm11
  psubw       xmm0,xmm14
  psubw       xmm1,xmm12
  movdqa      xmm5,xmm6
  pabsw       xmm0,xmm0
  pcmpgtw     xmm13,xmm0
  pabsw       xmm0,xmm1
  movdqa      xmm1,xmm8
  pcmpgtw     xmm2,xmm0
  paddw       xmm1,xmm8
  movdqa      xmm0,xmm10
  pand        xmm13,xmm2
  psubw       xmm0,xmm14
  paddw       xmm1,xmm4
  movdqa      xmm2,xmm11
  pabsw       xmm0,xmm0
  paddw       xmm2,xmm11
  paddw       xmm1,xmm7
  pcmpgtw     xmm3,xmm0
  paddw       xmm2,xmm12
  movd        xmm0,eax
  pand        xmm13,xmm3
  paddw       xmm2,xmm10
  punpcklwd   xmm0,xmm0
  pshufd      xmm3,xmm0,0              ; xmm3 = 2 in all 8 words (beta no longer needed)
  ; p0' = (2*p1 + p0 + q1 + 2) >> 2, blended with the original p0 by mask.
  movdqa      xmm0,xmm6
  paddw       xmm1,xmm3
  pandn       xmm0,xmm4
  paddw       xmm2,xmm3
  psraw       xmm1,2
  pand        xmm5,xmm1
  por         xmm5,xmm0
  paddw       xmm7,xmm7
  paddw       xmm10,xmm10
  psraw       xmm2,2
  movdqa      xmm1,xmm13
  movdqa      xmm0,xmm13
  pandn       xmm0,xmm12
  pand        xmm1,xmm2
  paddw       xmm7,xmm9
  por         xmm1,xmm0
  paddw       xmm10,xmm14
  paddw       xmm7,xmm8
  movdqa      xmm0,xmm13
  packuswb    xmm5,xmm1                ; xmm5 = new p0, 16 bytes
  ; q0' = (2*q1 + q0 + p1 + 2) >> 2, blended with the original q0 by mask.
  paddw       xmm7,xmm3
  paddw       xmm10,xmm11
  movdqa      xmm1,xmm6
  paddw       xmm10,xmm3
  pandn       xmm6,xmm9
  psraw       xmm7,2
  pand        xmm1,xmm7
  psraw       xmm10,2
  pandn       xmm13,xmm14
  pand        xmm0,xmm10
  por         xmm1,xmm6
  ; Transpose back: inputs are p1 ([rsp]), p0' (xmm5), q0' (xmm1), q1 (xmm15).
  movdqa      xmm6,[rsp]
  movdqa      xmm4,xmm6
  por         xmm0,xmm13
  punpcklbw   xmm4,xmm5
  punpckhbw   xmm6,xmm5
  movdqa      xmm3,xmm4
  packuswb    xmm1,xmm0                ; xmm1 = new q0, 16 bytes
  movdqa      xmm0,xmm1
  punpckhbw   xmm1,xmm15
  punpcklbw   xmm0,xmm15
  punpcklwd   xmm3,xmm0
  punpckhwd   xmm4,xmm0
  movdqa      xmm0,xmm6
  movdqa      xmm2,xmm3
  punpcklwd   xmm0,xmm1
  punpckhwd   xmm6,xmm1
  movdqa      xmm1,xmm4
  punpckldq   xmm2,xmm0
  punpckhdq   xmm3,xmm0
  punpckldq   xmm1,xmm6
  movdqa      xmm0,xmm2
  punpcklqdq  xmm0,xmm1
  punpckhdq   xmm4,xmm6
  punpckhqdq  xmm2,xmm1
  ; Scatter: the four result vectors are spilled to [rsp+10h], [rsp+60h],
  ; [rsp+20h], [rsp+70h] (rows n = 0,1,2,3) and written out dword by dword.
  ; Within each spill: +0 = plane1 row n, +4 = plane2 row n,
  ; +8 = plane1 row n+4, +0Ch = plane2 row n+4.
  movdqa      [rsp+10h],xmm0
  movdqa      [rsp+60h],xmm2
  movdqa      xmm0,xmm3
  mov         eax,[rsp+10h]
  mov         [rcx-2],eax
  mov         eax,[rsp+60h]
  punpcklqdq  xmm0,xmm4
  punpckhqdq  xmm3,xmm4
  mov         [r10+rcx-2],eax
  movdqa      [rsp+20h],xmm0
  mov         eax, [rsp+20h]
  movdqa      [rsp+70h],xmm3
  mov         [rcx+r10*2-2],eax
  mov         eax,[rsp+70h]
  mov         [rdx+rcx-2],eax
  mov         eax,[rsp+18h]
  mov         [r11],eax
  mov         eax,[rsp+68h]
  mov         [r10+r11],eax
  mov         eax,[rsp+28h]
  mov         [r11+r10*2],eax
  mov         eax,[rsp+78h]
  mov         [rdx+r11],eax
  mov         eax,[rsp+14h]
  mov         [rdi-2],eax
  mov         eax,[rsp+64h]
  mov         [r10+rdi-2],eax
  mov         eax,[rsp+24h]
  mov         [rdi+r10*2-2],eax
  mov         eax, [rsp+74h]
  mov         [rdx+rdi-2],eax
  mov         eax, [rsp+1Ch]
  mov         [rbx],eax
  mov         eax, [rsp+6Ch]
  mov         [r10+rbx],eax
  mov         eax,[rsp+2Ch]
  mov         [rbx+r10*2],eax
  mov         eax,[rsp+7Ch]
  mov         [rdx+rbx],eax
  ; Epilogue: release the scratch area; the callee-saved registers come back
  ; exclusively from the three pops (nothing is read from above the return
  ; address).
  lea         r11,[rsp+140h]
  mov         rsp,r11
  pop         r12
  pop         rbp
  pop         rbx
  ret


WELS_EXTERN DeblockChromaLt4H_ssse3
;*******************************************************************************
;  64-bit variant; arguments are taken from rdi, rsi, rdx, rcx, r8, r9.
;  The in-code notes below identify r8 as iBeta and r9 as pTC; presumably the
;  prototype mirrors DeblockChromaLt4V_ssse3:
;      (uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;       int32_t iAlpha, int32_t iBeta, int8_t * pTC)   -- TODO confirm
;
;  For 8 consecutive rows of each plane the 4 bytes at [row - 2] are read
;  (pixels p1, p0, q0, q1 around a vertical edge), transposed into one vector
;  per pixel position, filtered, transposed back and stored.
;
;  tc is built from the four signed bytes at pTC, each used for two
;  consecutive rows, and the same 8-word vector is applied to both planes.
;  Per row (signed 16-bit arithmetic):
;      delta = min(tc, max(-tc, ((q0 - p0)*4 + (p1 - q1) + 4) >> 3))
;      if iAlpha > |p0-q0| and iBeta > |p1-p0| and iBeta > |q1-q0| and tc > 0:
;          p0 += delta ;  q0 -= delta
;  p1 and q1 are stored back unchanged.
;
;  Note: the first pointer argument is placed in rdx and the second in rcx
;  (the opposite of DeblockChromaEq4H_ssse3 above).  Each plane is stored back
;  through the same pointer it was loaded from and both halves get identical
;  alpha/beta/tc, so only the half of the vectors a plane occupies differs.
;
;  Stack: 170h bytes of scratch.  Return address plus five pushes make 48
;  bytes, so rsp keeps the 16-byte alignment the caller had at the call;
;  the movdqa accesses to [rsp+..] depend on that.
;  rbx, rbp, r12, r13, r14 are saved and restored.
;*******************************************************************************
  mov         rax,rsp
  push        rbx
  push        rbp
  push        r12
  push        r13
  push        r14
  sub         rsp,170h

  ; Re-home the arguments: r13 = iBeta, r14 = pTC, r8 = iStride, r9 = iAlpha,
  ; rdx = first pointer argument, rcx = second pointer argument.
  mov         r13,   r8
  mov         r14,   r9
  mov         r8,    rdx
  mov         r9,    rcx
  mov         rdx,   rdi
  mov         rcx,   rsi

  movsxd      rsi,r8d                  ; rsi = iStride
  lea         eax,[r8*4]
  mov         r11d,r9d                 ; r11d = iAlpha
  movsxd      r10,eax                  ; r10 = 4*iStride
  ; Gather: each 4-byte group goes through a stack slot and is loaded with
  ; movdqa; only the low dword of each such load is consumed by the
  ; punpckldq/punpcklqdq merges that follow.
  mov         eax, [rcx-2]
  mov         r12,rdx                  ; r12 = plane in rdx (rows 0..3 base)
  mov         [rsp+40h],eax
  mov         eax, [rsi+rcx-2]
  lea         rbx,[r10+rcx-2]          ; rbx -> row 4 of rcx plane, column -2
  movdqa      xmm5,[rsp+40h]
  mov         [rsp+50h],eax
  mov         eax, [rcx+rsi*2-2]
  lea         rbp,[r10+rdx-2]          ; rbp -> row 4 of rdx plane, column -2
  movdqa      xmm2, [rsp+50h]
  mov         [rsp+60h],eax
  lea         r10,[rsi+rsi*2]          ; r10 = 3*iStride
  mov         rdi,rcx                  ; rdi = plane in rcx (rcx is reused for tc below)
  mov         eax,[r10+rcx-2]
  movdqa      xmm4,[rsp+60h]
  mov         [rsp+70h],eax
  mov         eax,[rdx-2]
  mov         [rsp+80h],eax
  mov         eax, [rsi+rdx-2]
  movdqa      xmm3,[rsp+70h]
  mov         [rsp+90h],eax
  mov         eax,[rdx+rsi*2-2]
  punpckldq   xmm5,[rsp+80h]
  mov         [rsp+0A0h],eax
  mov         eax, [r10+rdx-2]
  punpckldq   xmm2,[rsp+90h]
  mov         [rsp+0B0h],eax
  mov         eax, [rbx]
  punpckldq   xmm4,[rsp+0A0h]
  mov         [rsp+80h],eax
  mov         eax,[rbp]
  punpckldq   xmm3,[rsp+0B0h]
  mov         [rsp+90h],eax
  mov         eax,[rsi+rbx]
  movdqa      xmm0,[rsp+80h]
  punpckldq   xmm0,[rsp+90h]
  punpcklqdq  xmm5,xmm0
  movdqa      [rsp+80h],xmm0
  mov         [rsp+80h],eax
  mov         eax,[rsi+rbp]
  movdqa      xmm0,[rsp+80h]
  movdqa      xmm1,xmm5
  mov         [rsp+90h],eax
  mov         eax,[rbx+rsi*2]
  punpckldq   xmm0,[rsp+90h]
  punpcklqdq  xmm2,xmm0
  punpcklbw   xmm1,xmm2
  punpckhbw   xmm5,xmm2
  movdqa      [rsp+80h],xmm0
  mov         [rsp+80h],eax
  mov         eax,[rbp+rsi*2]
  movdqa      xmm0, [rsp+80h]
  mov         [rsp+90h],eax
  mov         eax,[r10+rbx]
  movdqa      xmm7,xmm1
  punpckldq   xmm0,[rsp+90h]
  punpcklqdq  xmm4,xmm0
  movdqa      [rsp+80h],xmm0
  mov         [rsp+80h],eax
  mov         eax, [r10+rbp]
  movdqa      xmm0,[rsp+80h]
  mov         [rsp+90h],eax
  punpckldq   xmm0,[rsp+90h]
  punpcklqdq  xmm3,xmm0
  ; Byte/word/dword/qword unpack ladder = 16x4 byte transpose.
  ; Result: xmm9 = p1, xmm6 = p0, xmm2 = q0, xmm7 = q1 (16 bytes each).
  movdqa      xmm0,xmm4
  punpcklbw   xmm0,xmm3
  punpckhbw   xmm4,xmm3
  punpcklwd   xmm7,xmm0
  punpckhwd   xmm1,xmm0
  movdqa      xmm0,xmm5
  movdqa      xmm6,xmm7
  punpcklwd   xmm0,xmm4
  punpckhwd   xmm5,xmm4
  punpckldq   xmm6,xmm0
  punpckhdq   xmm7,xmm0
  movdqa      xmm0,xmm1
  punpckldq   xmm0,xmm5
  mov         rax, r14    ; pTC
  punpckhdq   xmm1,xmm5
  movdqa      xmm9,xmm6
  punpckhqdq  xmm6,xmm0
  punpcklqdq  xmm9,xmm0
  movdqa      xmm2,xmm7
  movdqa      xmm13,xmm6
  movdqa      xmm4,xmm9
  movdqa      [rsp+10h],xmm9           ; keep p1 bytes for the transpose back
  punpcklqdq  xmm2,xmm1
  punpckhqdq  xmm7,xmm1
  pxor        xmm1,xmm1                ; xmm1 = 0, used to widen bytes to words
  ; Four signed tc bytes; each is written twice below so that [rsp] holds the
  ; words tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3.
  movsx       ecx,byte [rax+3]
  movsx       edx,byte [rax+2]
  movsx       r8d,byte [rax+1]
  movsx       r9d,byte [rax]
  movdqa      xmm10,xmm1
  movdqa      xmm15,xmm2
  punpckhbw   xmm2,xmm1
  punpckhbw   xmm6,xmm1
  punpcklbw   xmm4,xmm1
  movsx       eax,r11w                 ; low 16 bits of iAlpha
  mov         word [rsp+0Eh],cx
  mov         word [rsp+0Ch],cx
  movdqa      xmm3,xmm7
  movdqa      xmm8,xmm7
  movdqa      [rsp+20h],xmm7           ; keep q1 bytes for the transpose back
  punpcklbw   xmm15,xmm1
  punpcklbw   xmm13,xmm1
  punpcklbw   xmm3,xmm1
  mov         word [rsp+0Ah],dx
  mov         word [rsp+8],dx
  mov         word [rsp+6],r8w
  movd        xmm0,eax
  movdqa      [rsp+30h],xmm6           ; p0 hi words
  punpckhbw   xmm9,xmm1
  punpckhbw   xmm8,xmm1
  punpcklwd   xmm0,xmm0
  mov         eax, r13d   ; iBeta
  mov         word [rsp+4],r8w
  mov         word [rsp+2],r9w
  pshufd      xmm12,xmm0,0             ; xmm12 = iAlpha in all 8 words
  mov         word [rsp],r9w
  movd        xmm0,eax
  ; eax = 4 is the rounding term; cwde sign-extends ax into eax and leaves 4.
  mov         eax,4
  cwde
  movdqa      xmm14, [rsp]             ; xmm14 = tc vector
  movdqa      [rsp],xmm2               ; slot reused: q0 hi words
  movdqa      xmm2,xmm12
  punpcklwd   xmm0,xmm0
  pshufd      xmm11,xmm0,0             ; xmm11 = iBeta in all 8 words
  psubw       xmm10,xmm14              ; xmm10 = -tc
  movd        xmm0,eax
  movdqa      xmm7,xmm14
  movdqa      xmm6,xmm14
  pcmpgtw     xmm7,xmm1                ; xmm7 = (tc > 0) mask
  punpcklwd   xmm0,xmm0
  pshufd      xmm5,xmm0,0              ; xmm5 = 4 in all 8 words
  ; Low half: xmm4 = p1, xmm13 = p0, xmm15 = q0, xmm3 = q1 (words).
  movdqa      xmm0,xmm4
  movdqa      xmm1,xmm15
  psubw       xmm4,xmm13               ; p1 - p0
  psubw       xmm0,xmm3                ; p1 - q1
  psubw       xmm1,xmm13               ; q0 - p0
  psubw       xmm3,xmm15               ; q1 - q0
  psllw       xmm1,2
  paddw       xmm1,xmm0
  paddw       xmm1,xmm5
  movdqa      xmm0,xmm10
  psraw       xmm1,3
  pmaxsw      xmm0,xmm1                ; clamp below by -tc
  pminsw      xmm6,xmm0                ; clamp above by tc -> delta (lo)
  movdqa      xmm1,xmm11
  movdqa      xmm0,xmm13
  psubw       xmm0,xmm15
  pabsw       xmm0,xmm0
  pcmpgtw     xmm2,xmm0                ; alpha > |p0-q0|
  pabsw       xmm0,xmm4
  pcmpgtw     xmm1,xmm0                ; beta > |p1-p0|
  pabsw       xmm0,xmm3
  pand        xmm2,xmm1
  movdqa      xmm1,xmm11
  movdqa      xmm3,[rsp+30h]           ; xmm3 = p0 hi
  pcmpgtw     xmm1,xmm0                ; beta > |q1-q0|
  movdqa      xmm0,xmm9
  pand        xmm2,xmm1
  psubw       xmm0,xmm8                ; p1 - q1 (hi)
  psubw       xmm9,xmm3                ; p1 - p0 (hi)
  pand        xmm2,xmm7
  pand        xmm6,xmm2                ; delta (lo) zeroed where not filtered
  psubw       xmm15,xmm6               ; q0 -= delta
  paddw       xmm13,xmm6               ; p0 += delta
  ; High half: xmm2 = q0, xmm3 = p0, xmm8 = q1, xmm9 = p1 - p0 (words).
  movdqa      xmm2,[rsp]
  movdqa      xmm1,xmm2
  psubw       xmm1,xmm3
  psubw       xmm8,xmm2                ; q1 - q0 (hi)
  psllw       xmm1,2
  paddw       xmm1,xmm0
  paddw       xmm1,xmm5
  movdqa      xmm0,xmm3
  movdqa      xmm5,[rsp+10h]           ; xmm5 = saved p1 bytes
  psubw       xmm0,xmm2
  psraw       xmm1,3
  movdqa      xmm4,xmm5
  pabsw       xmm0,xmm0
  pmaxsw      xmm10,xmm1
  movdqa      xmm1,xmm11
  pcmpgtw     xmm12,xmm0
  pabsw       xmm0,xmm9
  pminsw      xmm14,xmm10              ; delta (hi)
  pcmpgtw     xmm1,xmm0
  pabsw       xmm0,xmm8
  pcmpgtw     xmm11,xmm0
  pand        xmm12,xmm1
  movdqa      xmm1,[rsp+20h]           ; xmm1 = saved q1 bytes
  pand        xmm12,xmm11
  pand        xmm12,xmm7
  pand        xmm14,xmm12
  paddw       xmm3,xmm14
  psubw       xmm2,xmm14
  packuswb    xmm13,xmm3               ; xmm13 = new p0, 16 bytes
  packuswb    xmm15,xmm2               ; xmm15 = new q0, 16 bytes
  ; Transpose back: inputs are p1 (xmm4/xmm5), p0' (xmm13), q0' (xmm15), q1 (xmm1).
  punpcklbw   xmm4,xmm13
  punpckhbw   xmm5,xmm13
  movdqa      xmm0,xmm15
  punpcklbw   xmm0,xmm1
  punpckhbw   xmm15,xmm1
  movdqa      xmm3,xmm4
  punpcklwd   xmm3,xmm0
  punpckhwd   xmm4,xmm0
  movdqa      xmm0,xmm5
  movdqa      xmm2,xmm3
  movdqa      xmm1,xmm4
  punpcklwd   xmm0,xmm15
  punpckhwd   xmm5,xmm15
  punpckldq   xmm2,xmm0
  punpckhdq   xmm3,xmm0
  punpckldq   xmm1,xmm5
  movdqa      xmm0,xmm2
  punpcklqdq  xmm0,xmm1
  punpckhdq   xmm4,xmm5
  punpckhqdq  xmm2,xmm1
  ; Scatter: the four result vectors are spilled to [rsp+40h], [rsp+90h],
  ; [rsp+50h], [rsp+0A0h] (rows n = 0,1,2,3) and written out dword by dword.
  ; Within each spill: +0 = rdi plane row n, +4 = r12 plane row n,
  ; +8 = rdi plane row n+4 (via rbx), +0Ch = r12 plane row n+4 (via rbp).
  movdqa      [rsp+40h],xmm0
  movdqa      xmm0,xmm3
  movdqa      [rsp+90h],xmm2
  mov         eax,[rsp+40h]
  mov         [rdi-2],eax
  mov         eax, [rsp+90h]
  punpcklqdq  xmm0,xmm4
  punpckhqdq  xmm3,xmm4
  mov         [rsi+rdi-2],eax
  movdqa      [rsp+50h],xmm0
  mov         eax,[rsp+50h]
  movdqa      [rsp+0A0h],xmm3
  mov         [rdi+rsi*2-2],eax
  mov         eax,[rsp+0A0h]
  mov         [r10+rdi-2],eax
  mov         eax,[rsp+48h]
  mov         [rbx],eax
  mov         eax,[rsp+98h]
  mov         [rsi+rbx],eax
  mov         eax,[rsp+58h]
  mov         [rbx+rsi*2],eax
  mov         eax, [rsp+0A8h]
  mov         [r10+rbx],eax
  mov         eax, [rsp+44h]
  mov         [r12-2],eax
  mov         eax,[rsp+94h]
  mov         [rsi+r12-2],eax
  mov         eax,[rsp+54h]
  mov         [r12+rsi*2-2],eax
  mov         eax, [rsp+0A4h]
  mov         [r10+r12-2],eax
  mov         eax,[rsp+4Ch]
  mov         [rbp],eax
  mov         eax,[rsp+9Ch]
  mov         [rsi+rbp],eax
  mov         eax, [rsp+5Ch]
  mov         [rbp+rsi*2],eax
  mov         eax,[rsp+0ACh]
  mov         [r10+rbp],eax
  ; Epilogue: release the scratch area and restore the callee-saved registers.
  lea         r11,[rsp+170h]
  mov         rsp,r11
  pop         r14
  pop         r13
  pop         r12
  pop         rbp
  pop         rbx
  ret



%elifdef  X86_32

;********************************************************************************
;  void DeblockChromaEq4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                             int32_t iAlpha, int32_t iBeta)
;********************************************************************************
WELS_EXTERN   DeblockChromaEq4V_ssse3
; 32-bit implementation; all arguments are read from the stack via ebp.
; Eight pixels of each plane are processed together (Cb in the low half of a
; vector, Cr in the high half).  With, per plane,
;     p1 = [pPix - 2*iStride]   p0 = [pPix - iStride]
;     q0 = [pPix]               q1 = [pPix + iStride]
; a pixel column is filtered when (signed 16-bit compares)
;     iAlpha > |p0-q0|  and  iBeta > |p1-p0|  and  iBeta > |q1-q0|
; and then receives
;     p0' = (2*p1 + p0 + q1 + 2) >> 2
;     q0' = (2*q1 + q0 + p1 + 2) >> 2
; Only the p0 and q0 rows are written back.
; Stack: esp is rounded down to 16 bytes, then 68h bytes plus the two pushes
; (esi, edi) give 70h, so [esp+..] is 16-byte aligned for every movdqa below.
; ebp, esi and edi are saved and restored; eax, ecx, edx and xmm0-xmm7 are
; overwritten.
  push        ebp
  mov         ebp,esp
  and         esp,0FFFFFFF0h
  sub         esp,68h
  mov         edx,[ebp+10h]      ;  iStride
  mov         eax,[ebp+8]        ;  pPixCb
  mov         ecx,[ebp+0Ch]      ;  pPixCr
  movq        xmm4,[ecx]         ;  q0, Cr
  movq        xmm5,[edx+ecx]     ;  q1, Cr
  push        esi
  push        edi
  lea         esi,[edx+edx]      ;  2*iStride
  mov         edi,eax
  sub         edi,esi
  movq        xmm1,[edi]         ;  p1, Cb
  mov         edi,ecx
  sub         edi,esi
  movq        xmm2,[edi]         ;  p1, Cr
  punpcklqdq  xmm1,xmm2          ;  xmm1 = p1 (Cb | Cr)
  mov         esi,eax
  sub         esi,edx            ;  esi -> p0 row, Cb (kept for the store)
  movq        xmm2,[esi]
  mov         edi,ecx
  sub         edi,edx            ;  edi -> p0 row, Cr (kept for the store)
  movq        xmm3,[edi]
  punpcklqdq  xmm2,xmm3          ;  xmm2 = p0 (Cb | Cr)
  movq        xmm3,[eax]
  punpcklqdq  xmm3,xmm4          ;  xmm3 = q0 (Cb | Cr)
  movq        xmm4,[edx+eax]
  mov       edx, [ebp + 14h]     ;  iAlpha
  punpcklqdq  xmm4,xmm5          ;  xmm4 = q1 (Cb | Cr)
  movd        xmm5,edx
  mov       edx, [ebp + 18h]     ;  iBeta
  pxor        xmm0,xmm0          ;  xmm0 = 0, used to widen bytes to words
  ; Broadcast the low word of iAlpha into xmm5 and of iBeta into xmm6.
  movdqa      xmm6,xmm5
  punpcklwd   xmm6,xmm5
  pshufd      xmm5,xmm6,0
  movd        xmm6,edx
  movdqa      xmm7,xmm6
  punpcklwd   xmm7,xmm6
  pshufd      xmm6,xmm7,0
  ; Widen every row to words.  Spill map after this section:
  ;   [esp+60h] = p1 lo   [esp+40h] = p1 hi
  ;   [esp+10h] = p0 lo   xmm2      = p0 hi
  ;   [esp+50h] = q0 lo   [esp+30h] = q0 hi
  ;   xmm7      = q1 lo   [esp+20h] = q1 hi
  movdqa      xmm7,xmm1
  punpckhbw   xmm1,xmm0
  punpcklbw   xmm7,xmm0
  movdqa      [esp+40h],xmm1
  movdqa      [esp+60h],xmm7
  movdqa      xmm7,xmm2
  punpcklbw   xmm7,xmm0
  movdqa      [esp+10h],xmm7
  movdqa      xmm7,xmm3
  punpcklbw   xmm7,xmm0
  punpckhbw   xmm3,xmm0
  movdqa      [esp+50h],xmm7
  movdqa      xmm7,xmm4
  punpckhbw   xmm4,xmm0
  punpckhbw   xmm2,xmm0
  punpcklbw   xmm7,xmm0
  movdqa      [esp+30h],xmm3
  movdqa      xmm3,[esp+10h]     ;  xmm3 = p0 lo
  ; Low-half mask in xmm0: alpha > |p0-q0| & beta > |p1-p0| & beta > |q1-q0|.
  movdqa      xmm1,xmm3
  psubw       xmm1,[esp+50h]
  pabsw       xmm1,xmm1
  movdqa      [esp+20h],xmm4
  movdqa      xmm0,xmm5
  pcmpgtw     xmm0,xmm1
  movdqa      xmm1,[esp+60h]
  psubw       xmm1,xmm3
  pabsw       xmm1,xmm1
  movdqa      xmm4,xmm6
  pcmpgtw     xmm4,xmm1
  pand        xmm0,xmm4
  movdqa      xmm1,xmm7
  psubw       xmm1,[esp+50h]
  pabsw       xmm1,xmm1
  movdqa      xmm4,xmm6
  pcmpgtw     xmm4,xmm1
  ; High-half mask in xmm5, same three tests on the hi words.
  movdqa      xmm1,xmm2
  psubw       xmm1,[esp+30h]
  pabsw       xmm1,xmm1
  pcmpgtw     xmm5,xmm1
  movdqa      xmm1,[esp+40h]
  pand        xmm0,xmm4
  psubw       xmm1,xmm2
  pabsw       xmm1,xmm1
  movdqa      xmm4,xmm6
  pcmpgtw     xmm4,xmm1
  movdqa      xmm1,[esp+20h]
  psubw       xmm1,[esp+30h]
  pand        xmm5,xmm4
  pabsw       xmm1,xmm1
  pcmpgtw     xmm6,xmm1
  pand        xmm5,xmm6
  ; Rounding term 2 broadcast to all words; it overwrites the p0-lo spill at
  ; [esp+10h] (p0 lo is still live in xmm3).  movsx edx,dx leaves edx = 2.
  mov         edx,2
  movsx       edx,dx
  movd        xmm1,edx
  movdqa      xmm4,xmm1
  punpcklwd   xmm4,xmm1
  pshufd      xmm1,xmm4,0
  ; p0' lo = (2*p1 + p0 + q1 + 2) >> 2, blended by mask into xmm1.
  movdqa      xmm4,[esp+60h]
  movdqa      xmm6,xmm4
  paddw       xmm6,xmm4
  paddw       xmm6,xmm3
  paddw       xmm6,xmm7
  movdqa      [esp+10h],xmm1
  paddw       xmm6,[esp+10h]
  psraw       xmm6,2
  movdqa      xmm4,xmm0
  pandn       xmm4,xmm3
  movdqa      xmm3,[esp+40h]
  movdqa      xmm1,xmm0
  pand        xmm1,xmm6
  por         xmm1,xmm4
  ; p0' hi, blended by mask into xmm4, then packed with the low half.
  movdqa      xmm6,xmm3
  paddw       xmm6,xmm3
  movdqa      xmm3,[esp+10h]     ;  xmm3 = rounding term from here on
  paddw       xmm6,xmm2
  paddw       xmm6,[esp+20h]
  paddw       xmm6,xmm3
  psraw       xmm6,2
  movdqa      xmm4,xmm5
  pand        xmm4,xmm6
  movdqa      xmm6,xmm5
  pandn       xmm6,xmm2
  por         xmm4,xmm6
  packuswb    xmm1,xmm4          ;  xmm1 = new p0 row (Cb | Cr) as bytes
  ; q0' lo = (2*q1 + q0 + p1 + 2) >> 2, blended by mask into xmm2.
  movdqa      xmm4,[esp+50h]
  movdqa      xmm6,xmm7
  paddw       xmm6,xmm7
  paddw       xmm6,xmm4
  paddw       xmm6,[esp+60h]
  paddw       xmm6,xmm3
  psraw       xmm6,2
  movdqa      xmm2,xmm0
  pand        xmm2,xmm6
  pandn       xmm0,xmm4
  por         xmm2,xmm0
  ; q0' hi, blended by mask into xmm4, then packed with the low half.
  movdqa      xmm0,[esp+20h]
  movdqa      xmm6,xmm0
  paddw       xmm6,xmm0
  movdqa      xmm0,[esp+30h]
  paddw       xmm6,xmm0
  paddw       xmm6,[esp+40h]
  movdqa      xmm4,xmm5
  paddw       xmm6,xmm3
  movq        [esi],xmm1         ;  store p0, Cb
  psraw       xmm6,2
  pand        xmm4,xmm6
  pandn       xmm5,xmm0
  por         xmm4,xmm5
  packuswb    xmm2,xmm4          ;  xmm2 = new q0 row (Cb | Cr) as bytes
  movq        [eax],xmm2         ;  store q0, Cb
  psrldq      xmm1,8
  movq        [edi],xmm1         ;  store p0, Cr
  pop         edi
  psrldq      xmm2,8
  movq        [ecx],xmm2         ;  store q0, Cr
  pop         esi
  mov         esp,ebp
  pop         ebp
  ret

;******************************************************************************
; void DeblockChromaLt4V_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;*******************************************************************************

WELS_EXTERN  DeblockChromaLt4V_ssse3
; 32-bit implementation; all arguments are read from the stack via ebp.
; Eight pixels of each plane are processed together (Cb in the low half of a
; vector, Cr in the high half).  With, per plane,
;     p1 = [pPix - 2*iStride]   p0 = [pPix - iStride]
;     q0 = [pPix]               q1 = [pPix + iStride]
; and tc = the words tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3 built from the four
; signed bytes at pTC (the same vector is used for both planes):
;     delta = min(tc, max(-tc, ((q0 - p0)*4 + (p1 - q1) + 4) >> 3))
;     if iAlpha > |p0-q0| and iBeta > |p1-p0| and iBeta > |q1-q0| and tc > 0:
;         p0 += delta ;  q0 -= delta
; Only the p0 and q0 rows are written back.
; Stack: esp is rounded down to 16 bytes, then 0E4h bytes plus the three
; pushes (ebx, esi, edi) give 0F0h, so once edi is pushed [esp+..] is 16-byte
; aligned; every [esp+..] access below happens after that push.
; ebp, ebx, esi and edi are saved and restored; eax, ecx, edx and xmm0-xmm7
; are overwritten.
  push        ebp
  mov         ebp,esp
  and         esp,0FFFFFFF0h
  sub         esp,0E4h
  push        ebx
  push        esi
  mov         esi, [ebp+1Ch]      ;  pTC
  movsx       ebx, byte [esi+2]
  push        edi
  ; tc bytes: di = tc3, [esp+0Ch] = tc2, bx = tc1, [esp+0Eh] = tc0.
  movsx       di,byte [esi+3]
  mov         word [esp+0Ch],bx
  movsx       bx,byte  [esi+1]
  movsx       esi,byte  [esi]
  mov         word  [esp+0Eh],si
  ; Each tc value is placed in the low word of two xmm registers; the
  ; punpcklwd cascade further down interleaves them into the tc vector (xmm0).
  movzx       esi,di
  movd        xmm1,esi
  movzx       esi,di
  movd        xmm2,esi
  mov         si,word  [esp+0Ch]
  mov         edx, [ebp + 10h]    ;  iStride
  mov         eax, [ebp + 08h]    ;  pPixCb
  movzx       edi,si
  movzx       esi,si
  mov         ecx, [ebp + 0Ch]    ;  pPixCr
  movd        xmm4,esi
  movzx       esi,bx
  movd        xmm5,esi
  movd        xmm3,edi
  movzx       esi,bx
  movd        xmm6,esi
  mov         si,word [esp+0Eh]
  movzx       edi,si
  movzx       esi,si
  punpcklwd   xmm6,xmm2
  pxor        xmm0,xmm0
  movdqa      [esp+40h],xmm0      ;  zero vector (reloaded into xmm1 below)
  movd        xmm7,edi
  movd        xmm0,esi
  lea         esi,[edx+edx]       ;  2*iStride
  mov         edi,eax
  sub         edi,esi             ;  edi -> p1 row, Cb
  punpcklwd   xmm5,xmm1
  movdqa      xmm1,[esp+40h]      ;  xmm1 = 0
  punpcklwd   xmm0,xmm4
  movq        xmm4,[edx+ecx]      ;  q1, Cr
  punpcklwd   xmm7,xmm3
  movq        xmm3,[eax]          ;  q0, Cb
  punpcklwd   xmm0,xmm6
  movq        xmm6,[edi]          ;  p1, Cb
  punpcklwd   xmm7,xmm5
  punpcklwd   xmm0,xmm7           ;  xmm0 = tc vector
  mov         edi,ecx
  sub         edi,esi
  movdqa      xmm2,xmm1
  psubw       xmm2,xmm0
  movdqa      [esp+60h],xmm2      ;  [esp+60h] = -tc
  movq        xmm2, [edi]         ;  p1, Cr
  punpcklqdq  xmm6,xmm2           ;  xmm6 = p1 (Cb | Cr)
  mov         esi,eax
  sub         esi,edx             ;  esi -> p0 row, Cb (kept for the store)
  movq        xmm7,[esi]
  mov         edi,ecx
  sub         edi,edx             ;  edi -> p0 row, Cr (kept for the store)
  movq        xmm2,[edi]
  punpcklqdq  xmm7,xmm2           ;  xmm7 = p0 (Cb | Cr)
  movq        xmm2,[ecx]
  punpcklqdq  xmm3,xmm2           ;  xmm3 = q0 (Cb | Cr)
  movq        xmm2,[edx+eax]
  movsx       edx,word [ebp + 14h]   ;  low 16 bits of iAlpha
  punpcklqdq  xmm2,xmm4           ;  xmm2 = q1 (Cb | Cr)
  movdqa      [esp+0E0h],xmm2
  movd        xmm2,edx
  movsx       edx,word [ebp + 18h]   ;  low 16 bits of iBeta
  movdqa      xmm4,xmm2
  punpcklwd   xmm4,xmm2
  movd        xmm2,edx
  movdqa      xmm5,xmm2
  punpcklwd   xmm5,xmm2
  pshufd      xmm2,xmm5,0
  movdqa      [esp+50h],xmm2      ;  [esp+50h] = iBeta in all 8 words
  ; Widen every row to words.  Spill map after this section:
  ;   [esp+30h]  = p1 lo   [esp+80h]  = p1 hi
  ;   xmm2       = p0 lo   [esp+0A0h] = p0 hi
  ;   xmm3       = q0 lo   [esp+70h]  = q0 hi
  ;   xmm5       = q1 lo   [esp+90h]  = q1 hi
  movdqa      xmm2,xmm6
  punpcklbw   xmm2,xmm1
  movdqa      [esp+0D0h],xmm3
  pshufd      xmm4,xmm4,0         ;  xmm4 = iAlpha in all 8 words
  movdqa      [esp+30h],xmm2
  punpckhbw   xmm6,xmm1
  movdqa      [esp+80h],xmm6
  movdqa      xmm6,[esp+0D0h]
  punpckhbw   xmm6,xmm1
  movdqa      [esp+70h],xmm6
  movdqa      xmm6, [esp+0E0h]
  punpckhbw   xmm6,xmm1
  movdqa     [esp+90h],xmm6
  movdqa      xmm5, [esp+0E0h]
  movdqa      xmm2,xmm7
  punpckhbw   xmm7,xmm1
  punpcklbw   xmm5,xmm1
  movdqa       [esp+0A0h],xmm7
  punpcklbw   xmm3,xmm1
  ; Rounding term 4 broadcast to all words -> [esp+20h]; movsx edx,dx leaves edx = 4.
  mov         edx,4
  punpcklbw   xmm2,xmm1
  movsx       edx,dx
  movd        xmm6,edx
  movdqa      xmm7,xmm6
  punpcklwd   xmm7,xmm6
  pshufd      xmm6,xmm7,0
  movdqa      xmm7,[esp+30h]
  movdqa      [esp+20h],xmm6
  psubw       xmm7,xmm5           ;  p1 - q1 (lo)
  movdqa      xmm6,xmm0
  pcmpgtw     xmm6,xmm1           ;  (tc > 0) mask
  movdqa      xmm1,[esp+60h]
  movdqa      [esp+40h],xmm6      ;  [esp+40h] = (tc > 0) mask
  ; Low half: delta = clamp(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc).
  movdqa      xmm6,xmm3
  psubw       xmm6,xmm2
  psllw       xmm6,2
  paddw       xmm6,xmm7
  paddw       xmm6, [esp+20h]
  movdqa      xmm7, [esp+50h]
  psraw       xmm6,3
  pmaxsw      xmm1,xmm6
  movdqa      [esp+10h],xmm0
  movdqa      xmm6, [esp+10h]
  pminsw      xmm6,xmm1
  movdqa      [esp+10h],xmm6      ;  [esp+10h] = clamped delta (lo)
  ; Low-half mask in xmm6: alpha > |p0-q0| & beta > |p1-p0| & beta > |q1-q0| & tc > 0.
  movdqa      xmm1,xmm2
  psubw       xmm1,xmm3
  pabsw       xmm1,xmm1
  movdqa      xmm6,xmm4
  pcmpgtw     xmm6,xmm1
  movdqa      xmm1, [esp+30h]
  psubw       xmm1,xmm2
  pabsw       xmm1,xmm1
  pcmpgtw     xmm7,xmm1
  movdqa      xmm1,[esp+50h]
  pand        xmm6,xmm7
  movdqa      xmm7,[esp+50h]
  psubw       xmm5,xmm3
  pabsw       xmm5,xmm5
  pcmpgtw     xmm1,xmm5
  movdqa      xmm5,[esp+80h]
  psubw       xmm5,[esp+90h]      ;  p1 - q1 (hi)
  pand        xmm6,xmm1
  pand        xmm6,[esp+40h]
  movdqa      xmm1,[esp+10h]
  pand        xmm1,xmm6
  movdqa      xmm6,[esp+70h]
  movdqa      [esp+30h],xmm1      ;  [esp+30h] = masked delta (lo)
  ; High half: same delta computation; xmm1 = p0 hi, xmm6 = q0 hi.
  movdqa      xmm1,[esp+0A0h]
  psubw       xmm6,xmm1
  psllw       xmm6,2
  paddw       xmm6,xmm5
  paddw       xmm6,[esp+20h]
  movdqa      xmm5,[esp+60h]
  psraw       xmm6,3
  pmaxsw      xmm5,xmm6
  pminsw      xmm0,xmm5           ;  xmm0 = clamped delta (hi)
  ; High-half mask in xmm4, same four tests.
  movdqa      xmm5,[esp+70h]
  movdqa      xmm6,xmm1
  psubw       xmm6,xmm5
  pabsw       xmm6,xmm6
  pcmpgtw     xmm4,xmm6
  movdqa      xmm6,[esp+80h]
  psubw       xmm6,xmm1
  pabsw       xmm6,xmm6
  pcmpgtw     xmm7,xmm6
  movdqa      xmm6,[esp+90h]
  pand        xmm4,xmm7
  movdqa      xmm7,[esp+50h]
  psubw       xmm6,xmm5
  pabsw       xmm6,xmm6
  pcmpgtw     xmm7,xmm6
  pand        xmm4,xmm7
  pand        xmm4,[esp+40h]
  pand        xmm0,xmm4           ;  xmm0 = masked delta (hi)
  ; Apply: p0 += delta, q0 -= delta; pack to bytes with unsigned saturation; store.
  movdqa      xmm4,[esp+30h]
  paddw       xmm2,xmm4
  paddw       xmm1,xmm0
  packuswb    xmm2,xmm1
  movq        [esi],xmm2          ;  store p0, Cb
  psubw       xmm3,xmm4
  psubw       xmm5,xmm0
  packuswb    xmm3,xmm5
  movq        [eax],xmm3          ;  store q0, Cb
  psrldq      xmm2,8
  movq        [edi],xmm2          ;  store p0, Cr
  pop         edi
  pop         esi
  psrldq      xmm3,8
  movq        [ecx],xmm3          ;  store q0, Cr
  pop         ebx
  mov         esp,ebp
  pop         ebp
  ret

;***************************************************************************
;  void DeblockChromaEq4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;          int32_t iAlpha, int32_t iBeta)
;***************************************************************************

WELS_EXTERN     DeblockChromaEq4H_ssse3
  push        ebp
  mov         ebp,esp
  and         esp,0FFFFFFF0h
  sub         esp,0C8h
  mov         ecx,dword [ebp+8]
  mov         edx,dword [ebp+0Ch]
  mov         eax,dword [ebp+10h]
  sub         ecx,2
  sub         edx,2
  push        esi
  lea         esi,[eax+eax*2]
  mov         dword [esp+18h],ecx
  mov         dword [esp+4],edx
  lea         ecx,[ecx+eax*4]
  lea         edx,[edx+eax*4]
  lea         eax,[esp+7Ch]
  push        edi
  mov         dword [esp+14h],esi
  mov         dword [esp+18h],ecx
  mov         dword [esp+0Ch],edx
  mov         dword [esp+10h],eax
  mov         esi,dword [esp+1Ch]
  mov         ecx,dword [ebp+10h]
  mov         edx,dword [esp+14h]
  movd        xmm0,dword [esi]
  movd        xmm1,dword [esi+ecx]
  movd        xmm2,dword [esi+ecx*2]
  movd        xmm3,dword [esi+edx]
  mov         esi,dword  [esp+8]
  movd        xmm4,dword [esi]
  movd        xmm5,dword [esi+ecx]
  movd        xmm6,dword [esi+ecx*2]
  movd        xmm7,dword [esi+edx]
  punpckldq   xmm0,xmm4
  punpckldq   xmm1,xmm5
  punpckldq   xmm2,xmm6
  punpckldq   xmm3,xmm7
  mov         esi,dword [esp+18h]
  mov         edi,dword [esp+0Ch]
  movd        xmm4,dword [esi]
  movd        xmm5,dword [edi]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm0,xmm4
  movd        xmm4,dword [esi+ecx]
  movd        xmm5,dword [edi+ecx]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm1,xmm4
  movd        xmm4,dword [esi+ecx*2]
  movd        xmm5,dword [edi+ecx*2]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm2,xmm4
  movd        xmm4,dword [esi+edx]
  movd        xmm5,dword [edi+edx]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm3,xmm4
  movdqa      xmm6,xmm0
  punpcklbw   xmm0,xmm1
  punpckhbw   xmm6,xmm1
  movdqa      xmm7,xmm2
  punpcklbw   xmm2,xmm3
  punpckhbw   xmm7,xmm3
  movdqa      xmm4,xmm0
  movdqa      xmm5,xmm6
  punpcklwd   xmm0,xmm2
  punpckhwd   xmm4,xmm2
  punpcklwd   xmm6,xmm7
  punpckhwd   xmm5,xmm7
  movdqa      xmm1,xmm0
  movdqa      xmm2,xmm4
  punpckldq   xmm0,xmm6
  punpckhdq   xmm1,xmm6
  punpckldq   xmm4,xmm5
  punpckhdq   xmm2,xmm5
  movdqa      xmm5,xmm0
  movdqa      xmm6,xmm1
  punpcklqdq  xmm0,xmm4
  punpckhqdq  xmm5,xmm4
  punpcklqdq  xmm1,xmm2
  punpckhqdq  xmm6,xmm2
  mov         edi,dword [esp+10h]
  movdqa      [edi],xmm0
  movdqa      [edi+10h],xmm5
  movdqa      [edi+20h],xmm1
  movdqa      [edi+30h],xmm6
  movsx       ecx,word [ebp+14h]
  movsx       edx,word [ebp+18h]
  movdqa      xmm6,[esp+80h]
  movdqa      xmm4,[esp+90h]
  movdqa      xmm5,[esp+0A0h]
  movdqa      xmm7,[esp+0B0h]
  pxor        xmm0,xmm0
  movd        xmm1,ecx
  movdqa      xmm2,xmm1
  punpcklwd   xmm2,xmm1
  pshufd      xmm1,xmm2,0
  movd        xmm2,edx
  movdqa      xmm3,xmm2
  punpcklwd   xmm3,xmm2
  pshufd      xmm2,xmm3,0
  movdqa      xmm3,xmm6
  punpckhbw   xmm6,xmm0
  movdqa      [esp+60h],xmm6
  movdqa      xmm6,[esp+90h]
  punpckhbw   xmm6,xmm0
  movdqa      [esp+30h],xmm6
  movdqa      xmm6,[esp+0A0h]
  punpckhbw   xmm6,xmm0
  movdqa      [esp+40h],xmm6
  movdqa      xmm6,[esp+0B0h]
  punpckhbw   xmm6,xmm0
  movdqa      [esp+70h],xmm6
  punpcklbw   xmm7,xmm0
  punpcklbw   xmm4,xmm0
  punpcklbw   xmm5,xmm0
  punpcklbw   xmm3,xmm0
  movdqa      [esp+50h],xmm7
  movdqa      xmm6,xmm4
  psubw       xmm6,xmm5
  pabsw       xmm6,xmm6
  movdqa      xmm0,xmm1
  pcmpgtw     xmm0,xmm6
  movdqa      xmm6,xmm3
  psubw       xmm6,xmm4
  pabsw       xmm6,xmm6
  movdqa      xmm7,xmm2
  pcmpgtw     xmm7,xmm6
  movdqa      xmm6,[esp+50h]
  psubw       xmm6,xmm5
  pabsw       xmm6,xmm6
  pand        xmm0,xmm7
  movdqa      xmm7,xmm2
  pcmpgtw     xmm7,xmm6
  movdqa      xmm6,[esp+30h]
  psubw       xmm6,[esp+40h]
  pabsw       xmm6,xmm6
  pcmpgtw     xmm1,xmm6
  movdqa      xmm6,[esp+60h]
  psubw       xmm6,[esp+30h]
  pabsw       xmm6,xmm6
  pand        xmm0,xmm7
  movdqa      xmm7,xmm2
  pcmpgtw     xmm7,xmm6
  movdqa      xmm6,[esp+70h]
  psubw       xmm6,[esp+40h]
  pabsw       xmm6,xmm6
  pand        xmm1,xmm7
  pcmpgtw     xmm2,xmm6
  pand        xmm1,xmm2
  mov         eax,2
  movsx       ecx,ax
  movd        xmm2,ecx
  movdqa      xmm6,xmm2
  punpcklwd   xmm6,xmm2
  pshufd      xmm2,xmm6,0
  movdqa      [esp+20h],xmm2
  movdqa      xmm2,xmm3
  paddw       xmm2,xmm3
  paddw       xmm2,xmm4
  paddw       xmm2,[esp+50h]
  paddw       xmm2,[esp+20h]
  psraw       xmm2,2
  movdqa      xmm6,xmm0
  pand        xmm6,xmm2
  movdqa      xmm2,xmm0
  pandn       xmm2,xmm4
  por         xmm6,xmm2
  movdqa      xmm2,[esp+60h]
  movdqa      xmm7,xmm2
  paddw       xmm7,xmm2
  paddw       xmm7,[esp+30h]
  paddw       xmm7,[esp+70h]
  paddw       xmm7,[esp+20h]
  movdqa      xmm4,xmm1
  movdqa      xmm2,xmm1
  pandn       xmm2,[esp+30h]
  psraw       xmm7,2
  pand        xmm4,xmm7
  por         xmm4,xmm2
  movdqa      xmm2,[esp+50h]
  packuswb    xmm6,xmm4
  movdqa      [esp+90h],xmm6
  movdqa      xmm6,xmm2
  paddw       xmm6,xmm2
  movdqa      xmm2,[esp+20h]
  paddw       xmm6,xmm5
  paddw       xmm6,xmm3
  movdqa      xmm4,xmm0
  pandn       xmm0,xmm5
  paddw       xmm6,xmm2
  psraw       xmm6,2
  pand        xmm4,xmm6
  por         xmm4,xmm0
  movdqa      xmm0,[esp+70h]
  movdqa      xmm5,xmm0
  paddw       xmm5,xmm0
  movdqa      xmm0,[esp+40h]
  paddw       xmm5,xmm0
  paddw       xmm5,[esp+60h]
  movdqa      xmm3,xmm1
  paddw       xmm5,xmm2
  psraw       xmm5,2
  pand        xmm3,xmm5
  pandn       xmm1,xmm0
  por         xmm3,xmm1
  packuswb    xmm4,xmm3
  movdqa      [esp+0A0h],xmm4
  mov         esi,dword [esp+10h]
  movdqa      xmm0,[esi]
  movdqa      xmm1,[esi+10h]
  movdqa      xmm2,[esi+20h]
  movdqa      xmm3,[esi+30h]
  movdqa      xmm6,xmm0
  punpcklbw   xmm0,xmm1
  punpckhbw   xmm6,xmm1
  movdqa      xmm7,xmm2
  punpcklbw   xmm2,xmm3
  punpckhbw   xmm7,xmm3
  movdqa      xmm4,xmm0
  movdqa      xmm5,xmm6
  punpcklwd   xmm0,xmm2
  punpckhwd   xmm4,xmm2
  punpcklwd   xmm6,xmm7
  punpckhwd   xmm5,xmm7
  movdqa      xmm1,xmm0
  movdqa      xmm2,xmm4
  punpckldq   xmm0,xmm6
  punpckhdq   xmm1,xmm6
  punpckldq   xmm4,xmm5
  punpckhdq   xmm2,xmm5
  movdqa      xmm5,xmm0
  movdqa      xmm6,xmm1
  punpcklqdq  xmm0,xmm4
  punpckhqdq  xmm5,xmm4
  punpcklqdq  xmm1,xmm2
  punpckhqdq  xmm6,xmm2
  mov         esi,dword [esp+1Ch]
  mov         ecx,dword [ebp+10h]
  mov         edx,dword [esp+14h]
  mov         edi,dword [esp+8]
  movd        dword [esi],xmm0
  movd        dword [esi+ecx],xmm5
  movd        dword [esi+ecx*2],xmm1
  movd        dword [esi+edx],xmm6
  psrldq      xmm0,4
  psrldq      xmm5,4
  psrldq      xmm1,4
  psrldq      xmm6,4
  mov         esi,dword [esp+18h]
  movd        dword [edi],xmm0
  movd        dword [edi+ecx],xmm5
  movd        dword [edi+ecx*2],xmm1
  movd        dword [edi+edx],xmm6
  psrldq      xmm0,4
  psrldq      xmm5,4
  psrldq      xmm1,4
  psrldq      xmm6,4
  movd        dword [esi],xmm0
  movd        dword [esi+ecx],xmm5
  movd        dword [esi+ecx*2],xmm1
  movd        dword [esi+edx],xmm6
  psrldq      xmm0,4
  psrldq      xmm5,4
  psrldq      xmm1,4
  psrldq      xmm6,4
  mov         edi,dword [esp+0Ch]
  movd        dword [edi],xmm0
  movd        dword [edi+ecx],xmm5
  movd        dword [edi+ecx*2],xmm1
  movd        dword [edi+edx],xmm6
  pop         edi
  pop         esi
  mov         esp,ebp
  pop         ebp
  ret

;*******************************************************************************
;    void DeblockChromaLt4H_ssse3(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
;
;    Filters 8 rows of the Cb plane and 8 rows of the Cr plane in one pass.
;    For every row the 4 bytes starting at pPix-2 are used; below they are
;    named  p1 = pix[-2], p0 = pix[-1], q0 = pix[0], q1 = pix[1].
;
;    Arguments are read from the stack through ebp:
;      [ebp+8]   pPixCb     [ebp+0Ch] pPixCr     [ebp+10h] iStride
;      [ebp+14h] iAlpha     [ebp+18h] iBeta      [ebp+1Ch] pTC
;    Only the low 16 bits of iAlpha / iBeta are used (movsx from word).
;
;    Per row, in signed 16-bit lanes (tc = (int8_t)pTC[row/2], the same tc
;    value is applied to the Cb row and to the Cr row):
;      delta = min(tc, max(-tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3))
;      if (iAlpha > |p0-q0| && iBeta > |p1-p0| && iBeta > |q1-q0| && tc > 0)
;          p0 += delta;  q0 -= delta;       (packuswb saturates to 0..255)
;    p1 and q1 are read back from the buffer unmodified and rewritten as is.
;
;    Stack slots (offsets valid once both esi and edi have been pushed):
;      [esp+8]    pPixCr-2               [esp+0Ch]  3*iStride
;      [esp+10h]  pPixCr-2+4*iStride     [esp+14h]  pPixCb-2
;      [esp+18h]  pPixCb-2+4*iStride     [esp+1Ch]  address of esp+70h
;      [esp+70h] p1, [esp+80h] p0, [esp+90h] q0, [esp+0A0h] q1 :
;                 transposed pixels, bytes 0..7 = Cb rows 0..7,
;                 bytes 8..15 = Cr rows 0..7
;    Saves/restores ebp, esi, edi; uses eax, ecx, edx and xmm0-xmm7 freely.
;*******************************************************************************

WELS_EXTERN  DeblockChromaLt4H_ssse3
  ; Frame setup: keep the caller's esp in ebp, align esp to 16 bytes so the
  ; movdqa spills below are legal, and reserve the local area.
  push        ebp
  mov         ebp,esp
  and         esp,0FFFFFFF0h
  sub         esp,108h
  mov         ecx,dword [ebp+8]
  mov         edx,dword [ebp+0Ch]
  mov         eax,dword [ebp+10h]
  ; step two bytes to the left so that a 4-byte load covers p1 p0 q0 q1
  sub         ecx,2
  sub         edx,2
  push        esi
  lea         esi,[eax+eax*2]
  ; NOTE: the stores between "push esi" and "push edi" use offsets that are
  ; 4 smaller than the slot names in the header (edi is not pushed yet).
  mov         dword [esp+10h],ecx
  mov         dword [esp+4],edx
  lea         ecx,[ecx+eax*4]
  lea         edx,[edx+eax*4]
  lea         eax,[esp+6Ch]
  push        edi
  mov         dword [esp+0Ch],esi
  mov         dword [esp+18h],ecx
  mov         dword [esp+10h],edx
  mov         dword [esp+1Ch],eax
  ; Gather: xmm0..xmm3 receive rows 0..3 (dword 0 = Cb, dword 1 = Cr) ...
  mov         esi,dword [esp+14h]
  mov         ecx,dword [ebp+10h]
  mov         edx,dword [esp+0Ch]
  movd        xmm0,dword [esi]
  movd        xmm1,dword [esi+ecx]
  movd        xmm2,dword [esi+ecx*2]
  movd        xmm3,dword [esi+edx]
  mov         esi,dword [esp+8]
  movd        xmm4,dword [esi]
  movd        xmm5,dword [esi+ecx]
  movd        xmm6,dword [esi+ecx*2]
  movd        xmm7,dword [esi+edx]
  punpckldq   xmm0,xmm4
  punpckldq   xmm1,xmm5
  punpckldq   xmm2,xmm6
  punpckldq   xmm3,xmm7
  ; ... and rows 4..7 go to the upper qword (dword 2 = Cb, dword 3 = Cr).
  mov         esi,dword [esp+18h]
  mov         edi,dword [esp+10h]
  movd        xmm4,dword [esi]
  movd        xmm5,dword [edi]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm0,xmm4
  movd        xmm4,dword [esi+ecx]
  movd        xmm5,dword [edi+ecx]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm1,xmm4
  movd        xmm4,dword [esi+ecx*2]
  movd        xmm5,dword [edi+ecx*2]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm2,xmm4
  movd        xmm4,dword [esi+edx]
  movd        xmm5,dword [edi+edx]
  punpckldq   xmm4,xmm5
  punpcklqdq  xmm3,xmm4
  ; Byte transpose (bw -> wd -> dq -> qdq unpack stages).  Result:
  ;   xmm0 = p1, xmm5 = p0, xmm1 = q0, xmm6 = q1,
  ; each holding Cb rows 0..7 in bytes 0..7 and Cr rows 0..7 in bytes 8..15.
  movdqa      xmm6,xmm0
  punpcklbw   xmm0,xmm1
  punpckhbw   xmm6,xmm1
  movdqa      xmm7,xmm2
  punpcklbw   xmm2,xmm3
  punpckhbw   xmm7,xmm3
  movdqa      xmm4,xmm0
  movdqa      xmm5,xmm6
  punpcklwd   xmm0,xmm2
  punpckhwd   xmm4,xmm2
  punpcklwd   xmm6,xmm7
  punpckhwd   xmm5,xmm7
  movdqa      xmm1,xmm0
  movdqa      xmm2,xmm4
  punpckldq   xmm0,xmm6
  punpckhdq   xmm1,xmm6
  punpckldq   xmm4,xmm5
  punpckhdq   xmm2,xmm5
  movdqa      xmm5,xmm0
  movdqa      xmm6,xmm1
  punpcklqdq  xmm0,xmm4
  punpckhqdq  xmm5,xmm4
  punpcklqdq  xmm1,xmm2
  punpckhqdq  xmm6,xmm2
  ; park the four transposed vectors in the buffer at esp+70h
  mov         edi,dword [esp+1Ch]
  movdqa      [edi],xmm0
  movdqa      [edi+10h],xmm5
  movdqa      [edi+20h],xmm1
  movdqa      [edi+30h],xmm6
  ; Build the tc vector: the four signed bytes at pTC are widened to words
  ; and interleaved so that xmm0 = {tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3}.
  ; The loads of iAlpha (eax), iBeta (ecx) and of the buffered pixel
  ; vectors are interleaved with this sequence.
  mov         eax,dword [ebp+1Ch]
  movsx       cx,byte [eax+3]
  movsx       dx,byte [eax+2]
  movsx       si,byte [eax+1]
  movsx       ax,byte [eax]
  movzx       edi,cx
  movzx       ecx,cx
  movd        xmm2,ecx
  movzx       ecx,dx
  movzx       edx,dx
  movd        xmm3,ecx
  movd        xmm4,edx
  movzx       ecx,si
  movzx       edx,si
  movd        xmm5,ecx
  pxor        xmm0,xmm0
  movd        xmm6,edx
  movzx       ecx,ax
  ; [esp+60h] holds an all-zero vector for now (reloaded into xmm1 below);
  ; the slot is reused later for the tc > 0 mask
  movdqa      [esp+60h],xmm0
  movzx       edx,ax
  movsx       eax,word [ebp+14h]
  punpcklwd   xmm6,xmm2
  movd        xmm1,edi
  movd        xmm7,ecx
  movsx       ecx,word [ebp+18h]
  movd        xmm0,edx
  punpcklwd   xmm7,xmm3
  punpcklwd   xmm5,xmm1
  movdqa      xmm1,[esp+60h]
  punpcklwd   xmm7,xmm5
  movdqa      xmm5,[esp+0A0h]
  punpcklwd   xmm0,xmm4
  punpcklwd   xmm0,xmm6
  movdqa      xmm6, [esp+70h]
  punpcklwd   xmm0,xmm7
  movdqa      xmm7,[esp+80h]
  ; [esp+0D0h] = -tc (lower clamp bound)
  movdqa      xmm2,xmm1
  psubw       xmm2,xmm0
  movdqa      [esp+0D0h],xmm2
  ; xmm4 = iAlpha broadcast to 8 words, [esp+50h] = iBeta broadcast
  movd        xmm2,eax
  movdqa      xmm3,xmm2
  punpcklwd   xmm3,xmm2
  pshufd      xmm4,xmm3,0
  movd        xmm2,ecx
  movdqa      xmm3,xmm2
  punpcklwd   xmm3,xmm2
  pshufd      xmm2,xmm3,0
  movdqa      xmm3, [esp+90h]
  movdqa      [esp+50h],xmm2
  ; Widen pixels to 16 bit (xmm1 = 0).  Low halves (Cb rows) stay in
  ; xmm2 = p0, xmm3 = q0, xmm5 = q1, [esp+40h] = p1; high halves (Cr rows)
  ; are spilled: [esp+0B0h] = p1, [esp+0F0h] = p0, [esp+0C0h] = q0,
  ; [esp+0E0h] = q1.
  movdqa      xmm2,xmm6
  punpcklbw   xmm2,xmm1
  punpckhbw   xmm6,xmm1
  movdqa      [esp+40h],xmm2
  movdqa      [esp+0B0h],xmm6
  movdqa      xmm6,[esp+90h]
  movdqa      xmm2,xmm7
  punpckhbw   xmm7,xmm1
  punpckhbw   xmm6,xmm1
  punpcklbw   xmm2,xmm1
  punpcklbw   xmm3,xmm1
  punpcklbw   xmm5,xmm1
  movdqa      [esp+0F0h],xmm7
  movdqa      [esp+0C0h],xmm6
  movdqa      xmm6, [esp+0A0h]
  punpckhbw   xmm6,xmm1
  movdqa      [esp+0E0h],xmm6
  ; [esp+30h] = rounding constant 4 in every word
  mov         edx,4
  movsx       eax,dx
  movd        xmm6,eax
  movdqa      xmm7,xmm6
  punpcklwd   xmm7,xmm6
  pshufd      xmm6,xmm7,0
  movdqa      [esp+30h],xmm6
  ; ---- low half (Cb rows) ----
  ; xmm7 = p1 - q1
  movdqa      xmm7, [esp+40h]
  psubw       xmm7,xmm5
  ; [esp+60h] = (tc > 0) mask, shared by both halves
  movdqa      xmm6,xmm0
  pcmpgtw     xmm6,xmm1
  movdqa      [esp+60h],xmm6
  ; xmm1 = max(-tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
  movdqa      xmm1, [esp+0D0h]
  movdqa      xmm6,xmm3
  psubw       xmm6,xmm2
  psllw       xmm6,2
  paddw       xmm6,xmm7
  paddw       xmm6,[esp+30h]
  psraw       xmm6,3
  pmaxsw      xmm1,xmm6
  movdqa      xmm7,[esp+50h]
  ; [esp+20h] = min(tc, xmm1) = clamped delta
  movdqa      [esp+20h],xmm0
  movdqa      xmm6, [esp+20h]
  pminsw      xmm6,xmm1
  movdqa      [esp+20h],xmm6
  ; xmm6 = (alpha > |p0-q0|) & (beta > |p1-p0|) & (beta > |q1-q0|) & (tc > 0)
  movdqa      xmm6,xmm4
  movdqa      xmm1,xmm2
  psubw       xmm1,xmm3
  pabsw       xmm1,xmm1
  pcmpgtw     xmm6,xmm1
  movdqa      xmm1, [esp+40h]
  psubw       xmm1,xmm2
  pabsw       xmm1,xmm1
  pcmpgtw     xmm7,xmm1
  movdqa      xmm1, [esp+50h]
  pand        xmm6,xmm7
  movdqa      xmm7, [esp+50h]
  psubw       xmm5,xmm3
  pabsw       xmm5,xmm5
  pcmpgtw     xmm1,xmm5
  ; (the next two instructions already start the high half: xmm5 = p1 - q1)
  movdqa      xmm5, [esp+0B0h]
  psubw       xmm5,[esp+0E0h]
  pand        xmm6,xmm1
  pand        xmm6, [esp+60h]
  ; [esp+40h] = masked delta for the low half
  movdqa      xmm1, [esp+20h]
  pand        xmm1,xmm6
  ; ---- high half (Cr rows): same computation on the spilled vectors ----
  movdqa      xmm6, [esp+0C0h]
  movdqa      [esp+40h],xmm1
  movdqa      xmm1, [esp+0F0h]
  psubw       xmm6,xmm1
  psllw       xmm6,2
  paddw       xmm6,xmm5
  paddw       xmm6, [esp+30h]
  movdqa      xmm5, [esp+0D0h]
  psraw       xmm6,3
  pmaxsw      xmm5,xmm6
  ; xmm0 (tc) becomes the clamped delta of the high half
  pminsw      xmm0,xmm5
  movdqa      xmm5,[esp+0C0h]
  movdqa      xmm6,xmm1
  psubw       xmm6,xmm5
  pabsw       xmm6,xmm6
  pcmpgtw     xmm4,xmm6
  movdqa      xmm6,[esp+0B0h]
  psubw       xmm6,xmm1
  pabsw       xmm6,xmm6
  pcmpgtw     xmm7,xmm6
  movdqa      xmm6, [esp+0E0h]
  pand        xmm4,xmm7
  movdqa      xmm7, [esp+50h]
  psubw       xmm6,xmm5
  pabsw       xmm6,xmm6
  pcmpgtw     xmm7,xmm6
  pand        xmm4,xmm7
  pand        xmm4,[esp+60h]
  pand        xmm0,xmm4
  ; Apply: p0 += delta, q0 -= delta for both halves, then pack back to
  ; bytes with unsigned saturation and replace p0 / q0 in the buffer.
  movdqa      xmm4, [esp+40h]
  paddw       xmm2,xmm4
  paddw       xmm1,xmm0
  psubw       xmm3,xmm4
  psubw       xmm5,xmm0
  packuswb    xmm2,xmm1
  packuswb    xmm3,xmm5
  movdqa      [esp+80h],xmm2
  movdqa      [esp+90h],xmm3
  ; Reload the buffer (p1, p0', q0', q1) and transpose back.  Result:
  ;   xmm0 = rows 0 and 4, xmm5 = rows 1 and 5,
  ;   xmm1 = rows 2 and 6, xmm6 = rows 3 and 7,
  ; with dword order {Cb row n, Cr row n, Cb row n+4, Cr row n+4}.
  mov         esi,dword [esp+1Ch]
  movdqa      xmm0, [esi]
  movdqa      xmm1, [esi+10h]
  movdqa      xmm2, [esi+20h]
  movdqa      xmm3, [esi+30h]
  movdqa      xmm6,xmm0
  punpcklbw   xmm0,xmm1
  punpckhbw   xmm6,xmm1
  movdqa      xmm7,xmm2
  punpcklbw   xmm2,xmm3
  punpckhbw   xmm7,xmm3
  movdqa      xmm4,xmm0
  movdqa      xmm5,xmm6
  punpcklwd   xmm0,xmm2
  punpckhwd   xmm4,xmm2
  punpcklwd   xmm6,xmm7
  punpckhwd   xmm5,xmm7
  movdqa      xmm1,xmm0
  movdqa      xmm2,xmm4
  punpckldq   xmm0,xmm6
  punpckhdq   xmm1,xmm6
  punpckldq   xmm4,xmm5
  punpckhdq   xmm2,xmm5
  movdqa      xmm5,xmm0
  movdqa      xmm6,xmm1
  punpcklqdq  xmm0,xmm4
  punpckhqdq  xmm5,xmm4
  punpcklqdq  xmm1,xmm2
  punpckhqdq  xmm6,xmm2
  ; Scatter: write one dword per row, shifting each vector right by 4 bytes
  ; after every group.  Order: Cb rows 0..3, Cr rows 0..3, Cb rows 4..7,
  ; Cr rows 4..7.
  mov         esi,dword [esp+14h]
  mov         ecx,dword [ebp+10h]
  mov         edx,dword [esp+0Ch]
  mov         edi,dword [esp+8]
  movd        dword [esi],xmm0
  movd        dword [esi+ecx],xmm5
  movd        dword [esi+ecx*2],xmm1
  movd        dword [esi+edx],xmm6
  psrldq      xmm0,4
  psrldq      xmm5,4
  psrldq      xmm1,4
  psrldq      xmm6,4
  mov         esi,dword [esp+18h]
  movd        dword [edi],xmm0
  movd        dword [edi+ecx],xmm5
  movd        dword [edi+ecx*2],xmm1
  movd        dword [edi+edx],xmm6
  psrldq      xmm0,4
  psrldq      xmm5,4
  psrldq      xmm1,4
  psrldq      xmm6,4
  movd        dword [esi],xmm0
  movd        dword [esi+ecx],xmm5
  movd        dword [esi+ecx*2],xmm1
  movd        dword [esi+edx],xmm6
  psrldq      xmm0,4
  psrldq      xmm5,4
  psrldq      xmm1,4
  psrldq      xmm6,4
  mov         edi,dword [esp+10h]
  movd        dword [edi],xmm0
  movd        dword [edi+ecx],xmm5
  movd        dword [edi+ecx*2],xmm1
  movd        dword [edi+edx],xmm6
  ; Epilogue: restore callee-saved registers and drop the aligned frame.
  pop         edi
  pop         esi
  mov         esp,ebp
  pop         ebp
  ret



;*******************************************************************************
;    void DeblockLumaLt4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
;                                 int32_t iBeta, int8_t * pTC)
;
;    Filters 16 adjacent pixel columns across the boundary between the rows
;    pPix-iStride and pPix.  Row naming used in the comments below:
;      p2 = pPix-3*iStride, p1 = pPix-2*iStride, p0 = pPix-iStride,
;      q0 = pPix,           q1 = pPix+iStride,   q2 = pPix+2*iStride
;    All six rows are accessed with movdqa, so each row address must be
;    16-byte aligned.
;
;    Arguments are read from the stack through ebp:
;      [ebp+8] pPix  [ebp+12] iStride  [ebp+16] iAlpha  [ebp+20] iBeta
;      [ebp+24] pTC
;    Only the low 16 bits of iAlpha / iBeta are used (movsx from word).
;    pTC[k] (signed byte) applies to columns 4k..4k+3.
;
;    Per column, in signed 16-bit lanes:
;      ap    = iBeta > |p0 - p2|          aq = iBeta > |q0 - q2|
;      mask  = iAlpha > |q0-p0| && iBeta > |q0-q1| && iBeta > |p0-p1| && tc >= 0
;      tc2   = tc + (ap ? 1 : 0) + (aq ? 1 : 0)
;      avg   = (p0 + q0 + 1) >> 1                               (pavgw)
;      delta = min(tc2, max(-tc2, (((q0-p0) << 2) + (p1-q1) + 4) >> 3))
;      dp1   = min(tc,  max(-tc,  (p2 + avg - 2*p1) >> 1))
;      dq1   = min(tc,  max(-tc,  (q2 + avg - 2*q1) >> 1))
;      if (mask) { p0 += delta; q0 -= delta;
;                  if (ap) p1 += dp1;  if (aq) q1 += dq1; }
;    Results are packed with unsigned saturation and written to the rows
;    p1, p0, q0 and q1.
;
;    Stack slots are written as esp+432-N (valid once ebx, esi and edi are
;    pushed).  Raw input rows: [432-208] p2, [448-208] p1, [464-208] p0,
;    [480-208] q0, [496-208] q1, [512-208] q2.  Saved pointers:
;    [432-408] = pPix-iStride, [432-404] = pPix+iStride.  Several other
;    slots are reused for different values during the routine.
;    Saves/restores ebp, ebx, esi, edi; uses eax, ecx, edx, xmm0-xmm7 freely.
;*******************************************************************************


WELS_EXTERN  DeblockLumaLt4V_ssse3
    push	ebp
	mov	ebp, esp
	and	esp, -16				; fffffff0H
	sub	esp, 420				; 000001a4H
	mov	eax, dword [ebp+8]
	mov	ecx, dword [ebp+12]

	pxor	xmm0, xmm0
	push	ebx
	mov	edx, dword [ebp+24]
	; zero vector goes to slot 432-384; written as 424-384 here because
	; esi and edi have not been pushed yet
	movdqa	[esp+424-384], xmm0
	push	esi

	; ---- copy the six input rows to the stack ----
	lea	esi, [ecx+ecx*2]
	push	edi
	mov	edi, eax
	sub	edi, esi
	movdqa	xmm0, [edi]

	lea	esi, [ecx+ecx]
	movdqa	[esp+432-208], xmm0
	; edi = pPix - 2*iStride; it keeps this value until the final store
	mov	edi, eax
	sub	edi, esi
	movdqa	xmm0, [edi]
	movdqa	[esp+448-208], xmm0

	mov	ebx, eax
	sub	ebx, ecx
	movdqa	xmm0, [ebx]
	movdqa	[esp+464-208], xmm0

	movdqa	xmm0, [eax]

	add	ecx, eax
	movdqa	[esp+480-208], xmm0
	movdqa	xmm0, [ecx]
	mov	dword [esp+432-404], ecx

	movsx	ecx, word [ebp+16]
	movdqa	[esp+496-208], xmm0
	movdqa	xmm0, [esi+eax]

	; ---- broadcast alpha -> [432-112], beta -> [432-336]; build tc ----
	; [432-400] = {tc0 x4, tc1 x4} (columns 0..7)
	movsx	si, byte [edx]
	movdqa	[esp+512-208], xmm0
	movd	xmm0, ecx
	movsx	ecx, word [ebp+20]
	movdqa	xmm1, xmm0
	punpcklwd xmm1, xmm0
	pshufd	xmm0, xmm1, 0
	movdqa	[esp+432-112], xmm0
	movd	xmm0, ecx
	movsx	cx, byte [edx+1]
	movdqa	xmm1, xmm0
	punpcklwd xmm1, xmm0
	mov	dword [esp+432-408], ebx
	movzx	ebx, cx
	pshufd	xmm0, xmm1, 0
	movd	xmm1, ebx
	movzx	ebx, cx
	movd	xmm2, ebx
	movzx	ebx, cx
	movzx	ecx, cx
	movd	xmm4, ecx
	movzx	ecx, si
	movd	xmm5, ecx
	movzx	ecx, si
	movd	xmm6, ecx
	movzx	ecx, si
	movd	xmm7, ecx
	movzx	ecx, si
	movdqa	[esp+432-336], xmm0
	movd	xmm0, ecx

	movsx	cx, byte [edx+3]
	movsx	dx, byte [edx+2]
	movd	xmm3, ebx
	punpcklwd xmm0, xmm4
	movzx	esi, cx
	punpcklwd xmm6, xmm2
	punpcklwd xmm5, xmm1
	punpcklwd xmm0, xmm6
	punpcklwd xmm7, xmm3
	punpcklwd xmm7, xmm5
	punpcklwd xmm0, xmm7
	movdqa	[esp+432-400], xmm0
	; same interleave for pTC[2], pTC[3]: xmm1 ends up as {tc2 x4, tc3 x4}
	; (columns 8..15) and stays live until the high half
	movd	xmm0, esi
	movzx	esi, cx
	movd	xmm2, esi
	movzx	esi, cx
	movzx	ecx, cx
	movd	xmm4, ecx
	movzx	ecx, dx
	movd	xmm3, esi
	movd	xmm5, ecx
	punpcklwd xmm5, xmm0

	; xmm0 = 0 from here on (until it is consumed near the end)
	movdqa	xmm0, [esp+432-384]
	movzx	ecx, dx
	movd	xmm6, ecx
	movzx	ecx, dx
	movzx	edx, dx
	punpcklwd xmm6, xmm2
	movd	xmm7, ecx
	movd	xmm1, edx

	; ---- low half (columns 0..7): widen pixels to 16 bit ----
	; xmm2 = p1, xmm3 = p0, xmm6 = q0, xmm4 = p2,
	; [432-240] = q1, [432-352] = q2
	movdqa	xmm2, [esp+448-208]
	punpcklbw xmm2, xmm0

	; edx = 4, the rounding constant broadcast further down
	mov	ecx, 4
	movsx	edx, cx
	punpcklwd xmm7, xmm3
	punpcklwd xmm7, xmm5
	movdqa	xmm5, [esp+496-208]
	movdqa	xmm3, [esp+464-208]
	punpcklbw xmm5, xmm0
	movdqa	[esp+432-240], xmm5
	movdqa	xmm5, [esp+512-208]
	punpcklbw xmm5, xmm0
	movdqa	[esp+432-352], xmm5
	punpcklwd xmm1, xmm4
	movdqa	xmm4, [esp+432-208]
	punpcklwd xmm1, xmm6
	movdqa	xmm6, [esp+480-208]
	punpcklwd xmm1, xmm7
	punpcklbw xmm6, xmm0
	punpcklbw xmm3, xmm0
	punpcklbw xmm4, xmm0
	; [432-288] = ap mask (beta > |p0 - p2|); [432-272] = p2
	movdqa	xmm7, xmm3
	psubw	xmm7, xmm4
	pabsw	xmm7, xmm7
	movdqa	[esp+432-272], xmm4
	; xmm4 = beta; it stays in xmm4 through both halves
	movdqa	xmm4, [esp+432-336]
	movdqa	xmm5, xmm4
	pcmpgtw	xmm5, xmm7
	movdqa	[esp+432-288], xmm5
	; [432-256] = aq mask (beta > |q0 - q2|)
	movdqa	xmm7, xmm6
	psubw	xmm7, [esp+432-352]
	pabsw	xmm7, xmm7
	movdqa	xmm5, xmm4
	pcmpgtw	xmm5, xmm7
	movdqa	[esp+432-256], xmm5
	; [432-304] = avg = (p0 + q0 + 1) >> 1
	movdqa	xmm5, xmm3
	pavgw	xmm5, xmm6
	movdqa	[esp+432-304], xmm5
	; [432-224] = tc2 = tc - ap_mask - aq_mask (masks are 0 or -1)
	movdqa	xmm5, [esp+432-400]
	psubw	xmm5, [esp+432-288]
	psubw	xmm5, [esp+432-256]
	movdqa	[esp+432-224], xmm5
	; [432-384] = q0 - p0, [432-32] = q0;
	; xmm5 = (alpha > |q0-p0|) & (beta > |q0-q1|)
	movdqa	xmm5, xmm6
	psubw	xmm5, xmm3
	movdqa	[esp+432-32], xmm6
	psubw	xmm6, [esp+432-240]
	movdqa	xmm7, xmm5
	movdqa	[esp+432-384], xmm5
	movdqa	xmm5, [esp+432-112]
	pabsw	xmm7, xmm7
	pcmpgtw	xmm5, xmm7
	pabsw	xmm6, xmm6
	movdqa	xmm7, xmm4
	pcmpgtw	xmm7, xmm6

	; ... & (beta > |p0-p1|) & (tc >= 0)  ->  [432-320] = filter mask
	pand	xmm5, xmm7
	movdqa	xmm6, xmm3
	psubw	xmm6, xmm2
	pabsw	xmm6, xmm6
	movdqa	xmm7, xmm4
	pcmpgtw	xmm7, xmm6
	movdqa	xmm6, [esp+432-400]
	pand	xmm5, xmm7
	movdqa	xmm7, xmm6
	pcmpeqw	xmm6, xmm0
	pcmpgtw	xmm7, xmm0
	por	xmm7, xmm6
	pand	xmm5, xmm7
	movdqa	[esp+432-320], xmm5
	; slot 432-336 is reused: it now holds the constant 4 in every word
	movd	xmm5, edx
	movdqa	xmm6, xmm5
	punpcklwd xmm6, xmm5
	pshufd	xmm5, xmm6, 0
	movdqa	[esp+432-336], xmm5
	; delta = min(tc2, max(-tc2, (((q0-p0) << 2) + (p1-q1) + 4) >> 3)) & mask
	;   -> [432-64]
	movdqa	xmm5, [esp+432-224]
	movdqa	[esp+432-368], xmm5
	movdqa	xmm6, xmm0
	psubw	xmm6, xmm5
	movdqa	xmm5, [esp+432-384]
	psllw	xmm5, 2
	movdqa	xmm7, xmm2
	psubw	xmm7, [esp+432-240]
	paddw	xmm7, xmm5
	paddw	xmm7, [esp+432-336]
	movdqa	xmm5, [esp+432-368]
	psraw	xmm7, 3
	pmaxsw	xmm6, xmm7
	pminsw	xmm5, xmm6

	pand	xmm5, [esp+432-320]
	movdqa	xmm6, [esp+432-400]
	movdqa	[esp+432-64], xmm5
	; dp1 = min(tc, max(-tc, (p2 + avg - 2*p1) >> 1)) & mask & ap -> [432-96]
	; ([432-384] = tc, [432-368] = -tc for this step)
	movdqa	[esp+432-384], xmm6
	movdqa	xmm5, xmm0
	psubw	xmm5, xmm6
	movdqa	[esp+432-368], xmm5
	movdqa	xmm6, xmm5
	movdqa	xmm5, [esp+432-272]
	paddw	xmm5, [esp+432-304]
	movdqa	xmm7, xmm2
	paddw	xmm7, xmm2
	psubw	xmm5, xmm7
	psraw	xmm5, 1
	pmaxsw	xmm6, xmm5
	movdqa	xmm5, [esp+432-384]
	pminsw	xmm5, xmm6

	pand	xmm5, [esp+432-320]
	pand	xmm5, [esp+432-288]
	movdqa	xmm6, [esp+432-240]
	movdqa	[esp+432-96], xmm5
	; dq1 = min(tc, max(-tc, (q2 + avg - 2*q1) >> 1)) & mask & aq -> [432-48]
	movdqa	xmm5, [esp+432-352]
	paddw	xmm5, [esp+432-304]
	movdqa	xmm7, xmm6
	paddw	xmm7, xmm6
	movdqa	xmm6, [esp+432-368]
	psubw	xmm5, xmm7

	movdqa	xmm7, [esp+496-208]
	psraw	xmm5, 1
	pmaxsw	xmm6, xmm5
	movdqa	xmm5, [esp+432-400]
	pminsw	xmm5, xmm6
	pand	xmm5, [esp+432-320]
	pand	xmm5, [esp+432-256]
	; ---- high half (columns 8..15): widen pixels, reusing the slots ----
	; [432-352] = q1, [432-368] = p1, [432-384] = q2, [432-400] = p0,
	; xmm5 = p2 (also [432-16]), xmm6 = q0 (also [432-80])
	movdqa	xmm6, [esp+448-208]
	punpckhbw xmm7, xmm0
	movdqa	[esp+432-352], xmm7

	movdqa	xmm7, [esp+512-208]
	punpckhbw xmm6, xmm0
	movdqa	[esp+432-48], xmm5
	movdqa	xmm5, [esp+432-208]
	movdqa	[esp+432-368], xmm6
	movdqa	xmm6, [esp+464-208]
	punpckhbw xmm7, xmm0
	punpckhbw xmm5, xmm0
	movdqa	[esp+432-384], xmm7
	punpckhbw xmm6, xmm0
	movdqa	[esp+432-400], xmm6

	; [432-288] = ap mask for the high half
	movdqa	xmm7, [esp+432-400]
	movdqa	xmm6, [esp+480-208]
	psubw	xmm7, xmm5
	movdqa	[esp+432-16], xmm5
	pabsw	xmm7, xmm7
	punpckhbw xmm6, xmm0
	movdqa	xmm5, xmm4
	pcmpgtw	xmm5, xmm7
	movdqa	[esp+432-288], xmm5

	; [432-256] = aq mask for the high half
	movdqa	xmm7, xmm6
	psubw	xmm7, [esp+432-384]
	pabsw	xmm7, xmm7
	movdqa	xmm5, xmm4
	pcmpgtw	xmm5, xmm7
	movdqa	[esp+432-256], xmm5

	; [432-304] = avg(p0, q0) for the high half
	movdqa	xmm5, [esp+432-400]
	movdqa	[esp+432-80], xmm6
	pavgw	xmm5, xmm6
	movdqa	[esp+432-304], xmm5

	; [432-224] = tc2 for the high half (xmm1 = tc of columns 8..15)
	movdqa	xmm5, xmm1
	psubw	xmm5, [esp+432-288]
	psubw	xmm5, [esp+432-256]
	movdqa	[esp+432-224], xmm5
	; [432-272] = q0 - p0; build the filter mask in xmm5
	movdqa	xmm5, xmm6
	psubw	xmm5, [esp+432-400]
	psubw	xmm6, [esp+432-352]
	movdqa	[esp+432-272], xmm5
	movdqa	xmm7, xmm5
	movdqa	xmm5, [esp+432-112]
	pabsw	xmm7, xmm7
	pcmpgtw	xmm5, xmm7
	movdqa	xmm7, xmm4
	pabsw	xmm6, xmm6
	pcmpgtw	xmm7, xmm6
	movdqa	xmm6, [esp+432-368]

	pand	xmm5, xmm7
	movdqa	xmm7, [esp+432-400]
	psubw	xmm7, xmm6
	; xmm6 = p1 - q1 (input to the delta of the high half)
	psubw	xmm6, [esp+432-352]
	pabsw	xmm7, xmm7
	pcmpgtw	xmm4, xmm7
	pand	xmm5, xmm4

	; low half: xmm2 = p1 + dp1
	paddw	xmm2, [esp+432-96]
	; tc >= 0 test for the high half; [432-320] = high-half filter mask
	movdqa	xmm4, xmm1
	pcmpgtw	xmm4, xmm0
	movdqa	xmm7, xmm1
	pcmpeqw	xmm7, xmm0
	por	xmm4, xmm7
	pand	xmm5, xmm4
	movdqa	xmm4, [esp+432-224]
	movdqa	[esp+432-320], xmm5
	movdqa	xmm5, [esp+432-272]
	movdqa	xmm7, xmm0
	psubw	xmm7, xmm4
	; xmm0 stops being zero here: xmm0 = -tc (high half)
	psubw	xmm0, xmm1
	psllw	xmm5, 2
	paddw	xmm6, xmm5
	paddw	xmm6, [esp+432-336]
	movdqa	xmm5, [esp+432-368]
	; slot 432-336 is reused once more: it now holds -tc (high half)
	movdqa	[esp+432-336], xmm0
	psraw	xmm6, 3
	pmaxsw	xmm7, xmm6
	pminsw	xmm4, xmm7
	pand	xmm4, [esp+432-320]
	; [432-272] = masked delta of the high half; next: dp1 of the high half
	movdqa	xmm6, xmm0
	movdqa	xmm0, [esp+432-16]
	paddw	xmm0, [esp+432-304]
	movdqa	[esp+432-272], xmm4
	movdqa	xmm4, [esp+432-368]
	paddw	xmm4, xmm4
	psubw	xmm0, xmm4

	movdqa	xmm4, [esp+432-64]
	psraw	xmm0, 1
	pmaxsw	xmm6, xmm0
	movdqa	xmm0, [esp+432-400]
	movdqa	xmm7, xmm1
	pminsw	xmm7, xmm6
	movdqa	xmm6, [esp+432-320]
	pand	xmm7, xmm6
	pand	xmm7, [esp+432-288]
	; ---- combine halves and pack: xmm2 = new p1 row, xmm3 = new p0 row ----
	paddw	xmm5, xmm7
	packuswb xmm2, xmm5
	movdqa	xmm5, [esp+432-272]
	paddw	xmm0, xmm5
	paddw	xmm3, xmm4
	packuswb xmm3, xmm0

	; new q0 row = q0 - delta (both halves) -> [480-208]
	movdqa	xmm0, [esp+432-32]
	psubw	xmm0, xmm4
	movdqa	xmm4, [esp+432-80]
	psubw	xmm4, xmm5

	; xmm5 = q1 + dq1 (low half); then dq1 of the high half is computed
	movdqa	xmm5, [esp+432-240]
	paddw	xmm5, [esp+432-48]
	packuswb xmm0, xmm4
	movdqa	xmm4, [esp+432-384]
	paddw	xmm4, [esp+432-304]
	movdqa	[esp+480-208], xmm0
	movdqa	xmm0, [esp+432-352]
	movdqa	xmm7, xmm0
	paddw	xmm0, xmm0

	; ecx = pPix - iStride, edx = pPix + iStride (saved during loading)
	mov	ecx, dword [esp+432-408]

	mov	edx, dword [esp+432-404]
	psubw	xmm4, xmm0
	movdqa	xmm0, [esp+432-336]
	; store the p1 row (edi still equals pPix - 2*iStride)
	movdqa	[edi], xmm2
	psraw	xmm4, 1
	pmaxsw	xmm0, xmm4
	pminsw	xmm1, xmm0
	movdqa	xmm0, [esp+480-208]

	pop	edi
	pand	xmm1, xmm6
	; esp moved up by 4 with the pop above, so slot 432-256 (aq mask of the
	; high half) is now addressed as 428-256
	pand	xmm1, [esp+428-256]
	; store the p0 row
	movdqa	[ecx], xmm3
	paddw	xmm7, xmm1
	pop	esi
	packuswb xmm5, xmm7
	; store the q0 and q1 rows
	movdqa	[eax], xmm0
	movdqa	[edx], xmm5
	pop	ebx
	mov	esp, ebp
	pop	ebp
	ret


;*******************************************************************************
;    void DeblockLumaEq4V_ssse3(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
;                                 int32_t iBeta)
;
;    Filters 16 adjacent pixel columns across the boundary between the rows
;    pPix-iStride and pPix.  Row naming used in the comments below:
;      p3 = pPix-4*iStride ... p0 = pPix-iStride, q0 = pPix ... q3 = pPix+3*iStride
;    All eight rows are read with movdqa, so each row address must be
;    16-byte aligned.  Rows p2..q2 are rewritten; p3 and q3 are only read.
;
;    Arguments are read from the stack through ebp:
;      [ebp+8] pPix  [ebp+12] iStride  [ebp+16] iAlpha  [ebp+20] iBeta
;    Only the low 16 bits of iAlpha / iBeta are used (movsx from word).
;
;    Per column, in signed 16-bit lanes:
;      mask  = iBeta > |p0-p1| && iBeta > |q0-q1| && iAlpha > |q0-p0|
;      small = ((iAlpha >> 2) + 2) > |q0-p0|
;      mp    = small && iBeta > |p0-p2|        mq = small && iBeta > |q0-q2|
;      if (mask) {
;        if (mp) { p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
;                  p1' = (p2 + p1 + p0 + q0 + 2) >> 2
;                  p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3 }
;        else    { p0' = (2*p1 + p0 + q1 + 2) >> 2;  p1, p2 unchanged }
;        the q side uses the same expressions with p and q swapped and mq
;        in place of mp
;      }
;    Outputs are blended with pand/pandn/por and packed with unsigned
;    saturation.
;
;    Stack slots are written as esp+640-N (valid once ebx, esi and edi are
;    pushed).  Raw rows (later overwritten by packed results):
;      [656-272] p2, [672-272] p1, [688-272] p0, [704-272] q0,
;      [720-272] q1, [736-272] q2, [752-272] q3; p3 stays in xmm3.
;    Constants: [640-320] alpha, [640-512] beta, [640-576] (alpha>>2)+2,
;      [640-624] 2, [640-592] 4, [640-48] zero.
;    Saved values: [640-600] = 2*iStride, [640-596] = pPix+iStride.
;    Saves/restores ebp, ebx, esi, edi; uses eax, ecx, edx, xmm0-xmm7 freely.
;*******************************************************************************


WELS_EXTERN  DeblockLumaEq4V_ssse3

	push	ebp
	mov	ebp, esp
	and	esp, -16				; fffffff0H
	sub	esp, 628				; 00000274H
	mov	eax, dword [ebp+8]
	mov	ecx, dword [ebp+12]
	push	ebx
	push	esi

	lea	edx, [ecx*4]
	; xmm2 = 0 for the whole routine (used to widen bytes to words)
	pxor	xmm0, xmm0
	movdqa	xmm2, xmm0

	; ---- load the eight rows; keep the row pointers for the final stores:
	;      esi = pPix-2*iStride, edi = pPix-iStride, edx = pPix-3*iStride,
	;      ecx = pPix+2*iStride, eax = pPix
	movdqa	xmm0, [ecx+eax]
	mov	esi, eax
	sub	esi, edx
	movdqa	xmm3, [esi]
	movdqa	xmm5, [eax]
	push	edi
	lea	edi, [ecx+ecx]
	lea	ebx, [ecx+ecx*2]
	mov	dword [esp+640-600], edi
	mov	esi, eax
	sub	esi, edi
	movdqa	xmm1, [esi]
	movdqa	 [esp+720-272], xmm0
	mov	edi, eax
	sub	edi, ecx
	movdqa	xmm4, [edi]
	add	ecx, eax
	mov	dword [esp+640-596], ecx

	mov	ecx, dword [esp+640-600]
	movdqa	xmm0, [ecx+eax]
	movdqa	 [esp+736-272], xmm0

	movdqa	xmm0, [eax+ebx]
	mov	edx, eax
	sub	edx, ebx

	movsx	ebx, word [ebp+16]
	movdqa	xmm6, [edx]
	add	ecx, eax
	movdqa	 [esp+752-272], xmm0
	; broadcast alpha -> [640-320] and beta -> [640-512]
	movd	xmm0, ebx

	movsx	ebx, word [ebp+20]
	movdqa	xmm7, xmm0
	punpcklwd xmm7, xmm0
	pshufd	xmm0, xmm7, 0
	movdqa	 [esp+640-320], xmm0
	movd	xmm0, ebx
	movdqa	xmm7, xmm0
	punpcklwd xmm7, xmm0
	pshufd	xmm0, xmm7, 0

	; ---- low half (columns 0..7): widen pixels to 16 bit ----
	; [640-416] = q2, [640-480] = p1, [640-384] = q1, [640-368] = p2,
	; [640-144] = p0 (also xmm1), [640-400] = q0 (also xmm5)
	movdqa	xmm7, [esp+736-272]
	punpcklbw xmm7, xmm2
	movdqa	 [esp+640-416], xmm7
	movdqa	 [esp+640-512], xmm0
	movdqa	xmm0, xmm1
	movdqa	 [esp+672-272], xmm1
	movdqa	xmm1, xmm4
	movdqa	 [esp+704-272], xmm5
	punpcklbw xmm5, xmm2
	punpcklbw xmm1, xmm2

	; [640-560] = |q0 - p0|
	movdqa	xmm7, xmm5
	psubw	xmm7, xmm1
	pabsw	xmm7, xmm7
	movdqa	 [esp+640-560], xmm7
	punpcklbw xmm0, xmm2
	movdqa	 [esp+688-272], xmm4
	movdqa	xmm4, [esp+720-272]
	movdqa	 [esp+640-480], xmm0

	movdqa	xmm7, xmm1
	psubw	xmm7, xmm0

	; xmm0 = mask = (beta > |p0-p1|) & (beta > |q0-q1|) & (alpha > |q0-p0|)
	movdqa	xmm0, [esp+640-512]
	pabsw	xmm7, xmm7
	punpcklbw xmm4, xmm2
	pcmpgtw	xmm0, xmm7
	movdqa	 [esp+640-384], xmm4
	movdqa	xmm7, xmm5
	psubw	xmm7, xmm4
	movdqa	xmm4, [esp+640-512]
	movdqa	 [esp+656-272], xmm6
	punpcklbw xmm6, xmm2
	pabsw	xmm7, xmm7
	movdqa	 [esp+640-48], xmm2
	movdqa	 [esp+640-368], xmm6
	movdqa	 [esp+640-144], xmm1
	movdqa	 [esp+640-400], xmm5
	pcmpgtw	xmm4, xmm7
	pand	xmm0, xmm4
	movdqa	xmm4, [esp+640-320]
	pcmpgtw	xmm4, [esp+640-560]
	pand	xmm0, xmm4

	; [640-576] = (alpha >> 2) + 2;  [640-560] = small = that > |q0-p0|
	mov	ebx, 2
	movsx	ebx, bx
	movd	xmm4, ebx
	movdqa	xmm7, xmm4
	punpcklwd xmm7, xmm4
	movdqa	xmm4, [esp+640-320]
	psraw	xmm4, 2
	pshufd	xmm7, xmm7, 0
	paddw	xmm4, xmm7
	movdqa	 [esp+640-576], xmm4
	pcmpgtw	xmm4, [esp+640-560]
	movdqa	 [esp+640-560], xmm4

	; [640-624] = constant 2;  [640-544] = mp = small & (beta > |p0-p2|)
	movdqa	xmm4, [esp+640-512]
	movdqa	 [esp+640-624], xmm7
	movdqa	xmm7, xmm1
	psubw	xmm7, xmm6
	pabsw	xmm7, xmm7
	pcmpgtw	xmm4, xmm7

	pand	xmm4, [esp+640-560]
	movdqa	 [esp+640-544], xmm4
	; [640-560] = mq = small & (beta > |q0-q2|)
	movdqa	xmm4, [esp+640-512]
	movdqa	xmm7, xmm5
	psubw	xmm7, [esp+640-416]
	pabsw	xmm7, xmm7
	pcmpgtw	xmm4, xmm7

	pand	xmm4, [esp+640-560]
	movdqa	 [esp+640-560], xmm4

	; [640-16] = ~mp & p2 (p2 kept where the strong p filter is off)
	movdqa	xmm4, [esp+640-544]
	pandn	xmm4, xmm6
	movdqa	 [esp+640-16], xmm4
	; [640-592] = constant 4;
	; xmm4 accumulates 2*p3 + 3*p2 + p1 + p0 + q0 + 4 (shifted/masked later)
	mov	ebx, 4
	movsx	ebx, bx
	movd	xmm4, ebx
	movdqa	xmm7, xmm4
	punpcklwd xmm7, xmm4
	movdqa	xmm4, xmm3
	punpcklbw xmm4, xmm2
	psllw	xmm4, 1
	paddw	xmm4, xmm6
	paddw	xmm4, xmm6
	paddw	xmm4, xmm6
	paddw	xmm4, [esp+640-480]

	movdqa	xmm6, [esp+640-560]
	pshufd	xmm7, xmm7, 0
	paddw	xmm4, xmm1
	movdqa	 [esp+640-592], xmm7
	paddw	xmm4, xmm5
	paddw	xmm4, xmm7
	; [640-80] = ~mq & q2;
	; [640-112] = mq & ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3)
	movdqa	xmm7, [esp+640-416]
	pandn	xmm6, xmm7
	movdqa	 [esp+640-80], xmm6
	movdqa	xmm6, [esp+752-272]
	punpcklbw xmm6, xmm2
	psllw	xmm6, 1
	paddw	xmm6, xmm7
	paddw	xmm6, xmm7
	paddw	xmm6, xmm7
	paddw	xmm6, [esp+640-384]

	movdqa	xmm7, [esp+640-480]
	paddw	xmm6, xmm5
	paddw	xmm6, xmm1
	paddw	xmm6, [esp+640-592]
	psraw	xmm6, 3
	pand	xmm6, [esp+640-560]
	movdqa	 [esp+640-112], xmm6
	; [640-336] = ~mp & p1;  [640-64] = mp & ((p2 + p1 + p0 + q0 + 2) >> 2);
	; xmm4 = mp & (p2 strong sum >> 3)
	movdqa	xmm6, [esp+640-544]
	pandn	xmm6, xmm7
	movdqa	 [esp+640-336], xmm6
	movdqa	xmm6, [esp+640-544]
	movdqa	 [esp+640-528], xmm6
	movdqa	xmm6, [esp+640-368]
	paddw	xmm6, xmm7
	movdqa	xmm7, xmm1
	psraw	xmm4, 3
	pand	xmm4, [esp+640-544]
	paddw	xmm7, xmm5
	paddw	xmm6, xmm7
	paddw	xmm6, [esp+640-624]
	movdqa	xmm7, [esp+640-528]

	paddw	xmm5, xmm1
	psraw	xmm6, 2
	pand	xmm7, xmm6

	; [640-304] = ~mq & q1;  [640-32] = mq & ((q2 + q1 + q0 + p0 + 2) >> 2)
	movdqa	xmm6, [esp+640-384]
	movdqa	 [esp+640-64], xmm7
	movdqa	xmm7, [esp+640-560]
	pandn	xmm7, xmm6
	movdqa	 [esp+640-304], xmm7
	movdqa	xmm7, [esp+640-560]
	movdqa	 [esp+640-528], xmm7
	movdqa	xmm7, [esp+640-416]
	paddw	xmm7, xmm6
	paddw	xmm7, xmm5
	paddw	xmm7, [esp+640-624]
	movdqa	xmm5, [esp+640-528]
	psraw	xmm7, 2
	pand	xmm5, xmm7
	movdqa	 [esp+640-32], xmm5

	; [640-352] = ~mp & ((2*p1 + p0 + q1 + 2) >> 2)          (weak p0)
	; [640-96]  = mp & ((p2 + 2*(p1+p0+q0) + q1 + 4) >> 3)    (strong p0)
	movdqa	xmm5, [esp+640-544]
	movdqa	 [esp+640-528], xmm5
	movdqa	xmm5, [esp+640-480]
	movdqa	xmm7, xmm5
	paddw	xmm7, xmm5
	movdqa	xmm5, xmm1
	paddw	xmm5, xmm6
	paddw	xmm6, [esp+640-592]
	paddw	xmm7, xmm5
	paddw	xmm7, [esp+640-624]
	movdqa	xmm5, [esp+640-528]
	psraw	xmm7, 2
	pandn	xmm5, xmm7
	movdqa	xmm7, [esp+640-480]
	paddw	xmm7, xmm1
	paddw	xmm7, [esp+640-400]
	movdqa	xmm1, [esp+640-544]
	movdqa	 [esp+640-352], xmm5
	movdqa	xmm5, [esp+640-368]
	psllw	xmm7, 1
	paddw	xmm7, xmm6
	paddw	xmm5, xmm7

	movdqa	xmm7, [esp+640-400]
	psraw	xmm5, 3
	pand	xmm1, xmm5
	movdqa	xmm5, [esp+640-480]
	movdqa	 [esp+640-96], xmm1
	; [640-288] = ~mq & ((2*q1 + q0 + p1 + 2) >> 2)           (weak q0)
	; [640-128] = mq & ((q2 + 2*(q1+q0+p0) + p1 + 4) >> 3)    (strong q0)
	movdqa	xmm1, [esp+640-560]
	movdqa	 [esp+640-528], xmm1
	movdqa	xmm1, [esp+640-384]
	movdqa	xmm6, xmm1
	paddw	xmm6, xmm1
	paddw	xmm1, [esp+640-400]
	paddw	xmm1, [esp+640-144]
	paddw	xmm7, xmm5
	paddw	xmm5, [esp+640-592]
	paddw	xmm6, xmm7
	paddw	xmm6, [esp+640-624]
	movdqa	xmm7, [esp+640-528]
	psraw	xmm6, 2
	psllw	xmm1, 1
	paddw	xmm1, xmm5

	movdqa	xmm5, [esp+656-272]
	pandn	xmm7, xmm6
	movdqa	xmm6, [esp+640-416]
	paddw	xmm6, xmm1
	movdqa	xmm1, [esp+640-560]
	psraw	xmm6, 3
	pand	xmm1, xmm6

	; ---- high half (columns 8..15): widen pixels to 16 bit ----
	; [640-448] = p1, [640-496] = p0, [640-432] = q0, [640-464] = q1,
	; [640-528] = q2, xmm5 = p2; p3 is widened from xmm3 further down
	movdqa	xmm6, [esp+704-272]
	movdqa	 [esp+640-128], xmm1
	movdqa	xmm1, [esp+672-272]
	punpckhbw xmm1, xmm2
	movdqa	 [esp+640-448], xmm1
	movdqa	xmm1, [esp+688-272]
	punpckhbw xmm1, xmm2
	punpckhbw xmm6, xmm2
	movdqa	 [esp+640-288], xmm7
	punpckhbw xmm5, xmm2
	movdqa	 [esp+640-496], xmm1
	movdqa	 [esp+640-432], xmm6

	movdqa	xmm7, [esp+720-272]
	punpckhbw xmm7, xmm2
	movdqa	 [esp+640-464], xmm7

	movdqa	xmm7, [esp+736-272]
	punpckhbw xmm7, xmm2
	movdqa	 [esp+640-528], xmm7

	movdqa	xmm7, xmm6

	; xmm1 = mask for the high half (same three comparisons as above);
	; the "por xmm4, [640-16]" finishes the low-half p2 candidate
	psubw	xmm6, [esp+640-464]
	psubw	xmm7, xmm1
	pabsw	xmm7, xmm7
	movdqa	 [esp+640-560], xmm7
	por	xmm4, [esp+640-16]
	pabsw	xmm6, xmm6
	movdqa	xmm7, xmm1
	psubw	xmm7, [esp+640-448]

	movdqa	xmm1, [esp+640-512]
	pabsw	xmm7, xmm7
	pcmpgtw	xmm1, xmm7
	movdqa	xmm7, [esp+640-512]
	pcmpgtw	xmm7, xmm6
	movdqa	xmm6, [esp+640-320]
	pand	xmm1, xmm7
	movdqa	xmm7, [esp+640-560]
	pcmpgtw	xmm6, xmm7
	pand	xmm1, xmm6

	; [640-544] = mp and [640-560] = mq for the high half
	movdqa	xmm6, [esp+640-576]
	pcmpgtw	xmm6, xmm7

	movdqa	xmm7, [esp+640-496]
	punpckhbw xmm3, xmm2
	movdqa	 [esp+640-560], xmm6
	movdqa	xmm6, [esp+640-512]
	psubw	xmm7, xmm5
	pabsw	xmm7, xmm7
	pcmpgtw	xmm6, xmm7

	pand	xmm6, [esp+640-560]
	movdqa	xmm7, [esp+640-432]
	psubw	xmm7, [esp+640-528]

	psllw	xmm3, 1
	movdqa	 [esp+640-544], xmm6
	movdqa	xmm6, [esp+640-512]

	; xmm2 stops being zero here (the zero vector remains in [640-48])
	movdqa	xmm2, [esp+640-544]
	paddw	xmm3, xmm5
	paddw	xmm3, xmm5
	paddw	xmm3, xmm5
	paddw	xmm3, [esp+640-448]
	paddw	xmm3, [esp+640-496]
	pabsw	xmm7, xmm7
	pcmpgtw	xmm6, xmm7
	pand	xmm6, [esp+640-560]
	movdqa	 [esp+640-560], xmm6

	; ---- new p2 row: blend by mask (low: xmm0, high: xmm1), pack ----
	movdqa	xmm6, xmm0
	pand	xmm6, xmm4
	movdqa	xmm4, xmm0
	pandn	xmm4, [esp+640-368]
	por	xmm6, xmm4
	movdqa	xmm4, [esp+640-432]
	paddw	xmm3, xmm4
	paddw	xmm3, [esp+640-592]
	psraw	xmm3, 3
	pand	xmm3, xmm2
	pandn	xmm2, xmm5
	por	xmm3, xmm2
	movdqa	xmm7, xmm1
	pand	xmm7, xmm3
	movdqa	xmm3, [esp+640-64]
	por	xmm3, [esp+640-336]
	movdqa	xmm2, xmm1
	pandn	xmm2, xmm5
	por	xmm7, xmm2

	movdqa	xmm2, xmm0
	pand	xmm2, xmm3
	movdqa	xmm3, xmm0
	pandn	xmm3, [esp+640-480]
	por	xmm2, xmm3
	packuswb xmm6, xmm7
	movdqa	 [esp+640-336], xmm2
	; packed p2 row replaces the raw copy
	movdqa	 [esp+656-272], xmm6
	; ---- new p1 row -> [672-272] ----
	movdqa	xmm6, [esp+640-544]
	movdqa	xmm2, xmm5
	paddw	xmm2, [esp+640-448]
	movdqa	xmm3, xmm1
	movdqa	xmm7, [esp+640-496]
	paddw	xmm7, xmm4
	paddw	xmm2, xmm7
	paddw	xmm2, [esp+640-624]
	movdqa	xmm7, [esp+640-544]
	psraw	xmm2, 2
	pand	xmm6, xmm2
	movdqa	xmm2, [esp+640-448]
	pandn	xmm7, xmm2
	por	xmm6, xmm7
	pand	xmm3, xmm6
	movdqa	xmm6, xmm1
	pandn	xmm6, xmm2
	paddw	xmm2, [esp+640-496]
	paddw	xmm2, xmm4
	por	xmm3, xmm6
	movdqa	xmm6, [esp+640-336]
	packuswb xmm6, xmm3
	psllw	xmm2, 1
	movdqa	 [esp+672-272], xmm6
	; ---- new p0 row -> [688-272] ----
	movdqa	xmm6, [esp+640-96]
	por	xmm6, [esp+640-352]

	movdqa	xmm3, xmm0
	pand	xmm3, xmm6
	movdqa	xmm6, xmm0
	pandn	xmm6, [esp+640-144]
	por	xmm3, xmm6
	movdqa	xmm6, [esp+640-544]
	movdqa	 [esp+640-352], xmm3
	movdqa	xmm3, [esp+640-464]
	paddw	xmm3, [esp+640-592]
	paddw	xmm2, xmm3
	movdqa	xmm3, [esp+640-448]
	paddw	xmm5, xmm2
	movdqa	xmm2, [esp+640-496]
	psraw	xmm5, 3
	pand	xmm6, xmm5
	movdqa	xmm5, [esp+640-464]
	paddw	xmm2, xmm5
	paddw	xmm5, [esp+640-432]
	movdqa	xmm4, xmm3
	paddw	xmm4, xmm3
	paddw	xmm4, xmm2
	paddw	xmm4, [esp+640-624]
	movdqa	xmm2, [esp+640-544]
	paddw	xmm3, [esp+640-592]
	psraw	xmm4, 2
	pandn	xmm2, xmm4
	por	xmm6, xmm2
	movdqa	xmm7, xmm1
	pand	xmm7, xmm6
	movdqa	xmm6, [esp+640-496]
	movdqa	xmm2, xmm1
	pandn	xmm2, xmm6
	por	xmm7, xmm2
	movdqa	xmm2, [esp+640-352]
	packuswb xmm2, xmm7
	movdqa	 [esp+688-272], xmm2
	; ---- new q0 row -> [704-272] ----
	movdqa	xmm2, [esp+640-128]
	por	xmm2, [esp+640-288]

	movdqa	xmm4, xmm0
	pand	xmm4, xmm2
	paddw	xmm5, xmm6
	movdqa	xmm2, xmm0
	pandn	xmm2, [esp+640-400]
	por	xmm4, xmm2
	movdqa	xmm2, [esp+640-528]
	psllw	xmm5, 1
	paddw	xmm5, xmm3
	movdqa	xmm3, [esp+640-560]
	paddw	xmm2, xmm5
	psraw	xmm2, 3
	movdqa	 [esp+640-288], xmm4
	movdqa	xmm4, [esp+640-560]
	pand	xmm4, xmm2
	movdqa	xmm2, [esp+640-464]
	movdqa	xmm5, xmm2
	paddw	xmm5, xmm2
	movdqa	xmm2, [esp+640-432]
	paddw	xmm2, [esp+640-448]
	movdqa	xmm7, xmm1
	paddw	xmm5, xmm2
	paddw	xmm5, [esp+640-624]
	movdqa	xmm6, [esp+640-560]
	psraw	xmm5, 2
	pandn	xmm3, xmm5
	por	xmm4, xmm3
	movdqa	xmm3, [esp+640-32]
	por	xmm3, [esp+640-304]
	pand	xmm7, xmm4
	movdqa	xmm4, [esp+640-432]
	movdqa	xmm5, [esp+640-464]
	movdqa	xmm2, xmm1
	pandn	xmm2, xmm4
	paddw	xmm4, [esp+640-496]
	por	xmm7, xmm2
	movdqa	xmm2, [esp+640-288]
	packuswb xmm2, xmm7
	movdqa	 [esp+704-272], xmm2

	; ---- new q1 row -> xmm6 ----
	movdqa	xmm2, xmm0
	pand	xmm2, xmm3
	movdqa	xmm3, xmm0
	pandn	xmm3, [esp+640-384]
	por	xmm2, xmm3
	movdqa	 [esp+640-304], xmm2
	movdqa	xmm2, [esp+640-528]
	movdqa	xmm3, xmm2
	paddw	xmm3, [esp+640-464]
	paddw	xmm3, xmm4
	paddw	xmm3, [esp+640-624]
	psraw	xmm3, 2
	pand	xmm6, xmm3
	movdqa	xmm3, [esp+640-560]
	movdqa	xmm4, xmm3
	pandn	xmm4, xmm5
	por	xmm6, xmm4
	movdqa	xmm7, xmm1
	pand	xmm7, xmm6
	movdqa	xmm6, [esp+640-304]
	movdqa	xmm4, xmm1
	pandn	xmm4, xmm5
	por	xmm7, xmm4

	; ---- new q2 row -> xmm4 (packed after the stores below) ----
	movdqa	xmm4, xmm0
	pandn	xmm0, [esp+640-416]
	packuswb xmm6, xmm7
	movdqa	xmm7, [esp+640-112]
	por	xmm7, [esp+640-80]
	pand	xmm4, xmm7
	por	xmm4, xmm0
	movdqa	xmm0, [esp+752-272]
	; widen the high half of q3 against the saved zero vector
	punpckhbw xmm0, [esp+640-48]
	psllw	xmm0, 1
	paddw	xmm0, xmm2
	paddw	xmm0, xmm2
	paddw	xmm0, xmm2
	paddw	xmm0, xmm5
	paddw	xmm0, [esp+640-432]
	paddw	xmm0, [esp+640-496]
	paddw	xmm0, [esp+640-592]
	psraw	xmm0, 3
	pand	xmm0, xmm3
	movdqa	xmm7, xmm1
	pandn	xmm3, xmm2
	por	xmm0, xmm3
	pand	xmm7, xmm0

	; ---- store the six rows: edx = p2, esi = p1, edi = p0, eax = q0,
	;      then edx = pPix+iStride (q1) and ecx = pPix+2*iStride (q2) ----
	movdqa	xmm0, [esp+656-272]
	movdqa	 [edx], xmm0

	movdqa	xmm0, [esp+672-272]

	mov	edx, dword [esp+640-596]
	movdqa	 [esi], xmm0
	movdqa	xmm0, [esp+688-272]
	movdqa	 [edi], xmm0
	movdqa	xmm0, [esp+704-272]

	; no esp-relative access follows the pops below
	pop	edi
	pandn	xmm1, xmm2
	movdqa	 [eax], xmm0
	por	xmm7, xmm1
	pop	esi
	packuswb xmm4, xmm7
	movdqa	 [edx], xmm6
	movdqa	 [ecx], xmm4
	pop	ebx
	mov	esp, ebp
	pop	ebp
	ret

%endif



;********************************************************************************
;
;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
;
;   Reads 8 bytes from each of 16 consecutive rows (row n at pPixY + n*iStride),
;   runs them through SSE2_TransTwo8x8B and writes the eight resulting
;   16-byte vectors to pDst[0..127].  pDst is written with movdqa and must
;   therefore be 16-byte aligned.
;
;   Register roles (after LOAD_3_PARA):
;     r0 = pPixY (advanced by 4 rows half-way), r1 = iStride, r2 = pDst,
;     r3 = r0 + 8*iStride, r4 = 3*iStride, r5 = saved stack pointer,
;     r7 = stack pointer, lowered to an aligned 16-byte scratch slot.
;
;********************************************************************************

WELS_EXTERN  DeblockLumaTransposeH2V_sse2
    push     r3
    push     r4
    push     r5

%assign   push_num   3
    LOAD_3_PARA
    PUSH_XMM 8

    SIGN_EXTENSION   r1, r1d

    ; Keep the incoming stack pointer in r5, then round r7 down to a
    ; 16-byte boundary and step one more slot below it: [r7] is the aligned
    ; scratch area handed to the transpose macro.
    mov      r5,    r7
    and      r7,    -16
    sub      r7,    10h

    lea      r4,    [r1 * 3]
    lea      r3,    [r0 + r1 * 8]

    ; xmm0..xmm3: low qword = rows 0..3, high qword = rows 8..11
    movq     xmm0,  [r0]
    movhps   xmm0,  [r3]
    movq     xmm1,  [r0 + r1]
    movhps   xmm1,  [r3 + r1]
    movq     xmm2,  [r0 + r1*2]
    movhps   xmm2,  [r3 + r1*2]
    movq     xmm3,  [r0 + r4]
    movhps   xmm3,  [r3 + r4]

    ; xmm4..xmm7: low qword = rows 4..7, high qword = rows 12..15
    lea      r0,    [r0 + r1 * 4]
    lea      r3,    [r3 + r1 * 4]
    movq     xmm4,  [r0]
    movhps   xmm4,  [r3]
    movq     xmm5,  [r0 + r1]
    movhps   xmm5,  [r3 + r1]
    movq     xmm6,  [r0 + r1*2]
    movhps   xmm6,  [r3 + r1*2]
    movq     xmm7,  [r0 + r4]
    movhps   xmm7,  [r3 + r4]

    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1

    ; Write the macro outputs in the order given above
    ; (m1..m8 correspond to xmm0..xmm7).
    movdqa   [r2],        xmm4
    movdqa   [r2 + 10h],  xmm2
    movdqa   [r2 + 20h],  xmm3
    movdqa   [r2 + 30h],  xmm7
    movdqa   [r2 + 40h],  xmm5
    movdqa   [r2 + 50h],  xmm1
    movdqa   [r2 + 60h],  xmm6
    movdqa   [r2 + 70h],  xmm0

    ; The stack pointer must be restored before POP_XMM and the pops.
    mov      r7,    r5
    POP_XMM
    pop      r5
    pop      r4
    pop      r3
    ret


;*******************************************************************************************
;
;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
;
;   Reads eight 16-byte vectors from pSrc[0..127] (movdqa, so pSrc must be
;   16-byte aligned), runs them through SSE2_TransTwo8x8B and writes 8 bytes
;   to each of 16 consecutive rows (row n at pPixY + n*iStride).
;
;   Register roles (after LOAD_3_PARA):
;     r0 = pPixY (advanced by 4 rows half-way), r1 = iStride,
;     r2 = pSrc, later 3*iStride, r3 = r0 + 8*iStride,
;     r4 = saved stack pointer, r7 = stack pointer, lowered to an aligned
;     16-byte scratch slot.
;
;*******************************************************************************************

WELS_EXTERN   DeblockLumaTransposeV2H_sse2
    push     r3
    push     r4

%assign  push_num 2
    LOAD_3_PARA
    PUSH_XMM 8

    SIGN_EXTENSION   r1, r1d

    ; Keep the incoming stack pointer in r4, then round r7 down to a
    ; 16-byte boundary and step one more slot below it: [r7] is the aligned
    ; scratch area handed to the transpose macro.
    mov      r4,    r7
    and      r7,    -16
    sub      r7,    10h

    movdqa   xmm7,  [r2 + 70h]
    movdqa   xmm6,  [r2 + 60h]
    movdqa   xmm5,  [r2 + 50h]
    movdqa   xmm4,  [r2 + 40h]
    movdqa   xmm3,  [r2 + 30h]
    movdqa   xmm2,  [r2 + 20h]
    movdqa   xmm1,  [r2 + 10h]
    movdqa   xmm0,  [r2]

    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1

    ; pSrc is no longer needed: r2 becomes 3*iStride, r3 points 8 rows down.
    lea      r2,    [r1 * 3]
    lea      r3,    [r0 + r1*8]

    ; Low qword of each output goes to rows 0..3, high qword to rows 8..11.
    movq     [r0],         xmm4
    movhps   [r3],         xmm4
    movq     [r0 + r1],    xmm2
    movhps   [r3 + r1],    xmm2
    movq     [r0 + r1*2],  xmm3
    movhps   [r3 + r1*2],  xmm3
    movq     [r0 + r2],    xmm7
    movhps   [r3 + r2],    xmm7

    ; Same pattern for rows 4..7 (low qwords) and rows 12..15 (high qwords).
    lea      r0,    [r0 + r1*4]
    lea      r3,    [r3 + r1*4]
    movq     [r0],         xmm5
    movhps   [r3],         xmm5
    movq     [r0 + r1],    xmm1
    movhps   [r3 + r1],    xmm1
    movq     [r0 + r1*2],  xmm6
    movhps   [r3 + r1*2],  xmm6
    movq     [r0 + r2],    xmm0
    movhps   [r3 + r2],    xmm0

    ; The stack pointer must be restored before POP_XMM and the pops.
    mov      r7,    r4
    POP_XMM
    pop      r4
    pop      r3
    ret