shithub: openh264

Download patch

ref: 8f9a5469beb962c22b6d8bbe78f01ec79fb33a55
parent: dcf08c6d413b0c9b56fef0b8602049c6a58b184b
author: Martin Storsjö <martin@martin.st>
date: Fri Dec 13 04:40:57 EST 2013

Convert source files to unix newlines

Most files were converted in ff6b669176, but some (non C++
source files) were left with windows newlines.

--- a/codec/decoder/core/asm/dct.asm
+++ b/codec/decoder/core/asm/dct.asm
@@ -1,129 +1,129 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        ?Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        ?Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  dct.asm
-;*
-;*  Abstract
-;*      WelsDctFourT4_sse2
-;*
-;*  History
-;*      8/4/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-%macro MMX_SumSubDiv2 3
-    movq    %3, %2
-    psraw   %3, $1
-    paddw   %3, %1
-    psraw   %1, $1
-    psubw   %1, %2
-%endmacro
-
-%macro MMX_SumSub 3
-	movq    %3, %2
-    psubw   %2, %1
-    paddw   %1, %3
-%endmacro
-
-%macro MMX_IDCT 6
-    MMX_SumSub      %4, %5, %6
-    MMX_SumSubDiv2  %3, %2, %1
-    MMX_SumSub		%1, %4, %6
-	MMX_SumSub		%3, %5, %6
-%endmacro
-
-
-%macro MMX_StoreDiff4P 5
-    movd       %2, %5
-    punpcklbw  %2, %4
-    paddw      %1, %3
-    psraw      %1, $6
-    paddsw     %1, %2
-    packuswb   %1, %2
-    movd       %5, %1
-%endmacro
-
-;*******************************************************************************
-; Code
-;*******************************************************************************
-
-SECTION .text
-
-WELS_EXTERN IdctResAddPred_mmx
-
-ALIGN 16
-;*******************************************************************************
-;   void_t __cdecl IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
-;*******************************************************************************
-
-IdctResAddPred_mmx:
-
-%define	pushsize	0
-%define pPred       esp+pushsize+4
-%define kiStride     esp+pushsize+8
-%define pRs         esp+pushsize+12
-
-	mov     eax, [pRs   ] 
-    mov     edx, [pPred ]   
-    mov     ecx, [kiStride]   
-    movq    mm0, [eax+ 0]
-    movq    mm1, [eax+ 8]
-    movq    mm2, [eax+16]
-    movq    mm3, [eax+24]
-
-	MMX_Trans4x4W        mm0, mm1, mm2, mm3, mm4
-	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
-    MMX_Trans4x4W        mm1, mm3, mm0, mm4, mm2
-	MMX_IDCT			mm3, mm0, mm4, mm2, mm1, mm6
-
-    WELS_Zero			mm7
-    WELS_DW32			mm6
-    
-    MMX_StoreDiff4P    mm3, mm0, mm6, mm7, [edx]
-    MMX_StoreDiff4P    mm4, mm0, mm6, mm7, [edx+ecx]
-    lea     edx, [edx+2*ecx]
-    MMX_StoreDiff4P    mm1, mm0, mm6, mm7, [edx]
-    MMX_StoreDiff4P    mm2, mm0, mm6, mm7, [edx+ecx]
-    
-%undef	pushsize
-%undef  pPred
-%undef  kiStride
-%undef  pRs
-	emms
-    ret
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  dct.asm
+;*
+;*  Abstract
+;*      IdctResAddPred_mmx
+;*
+;*  History
+;*      8/4/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+%macro MMX_SumSubDiv2 3        ; in: %1 = a, %2 = b   out: %3 = a + (b >> 1), %1 = (a >> 1) - b   (%2 is left unchanged)
+    movq    %3, %2             ; %3 = b
+    psraw   %3, $1             ; %3 = b >> 1 (arithmetic shift of each 16-bit word; $1 is a $-prefixed hex literal, i.e. 1)
+    paddw   %3, %1             ; %3 = a + (b >> 1)
+    psraw   %1, $1             ; %1 = a >> 1
+    psubw   %1, %2             ; %1 = (a >> 1) - b
+%endmacro
+
+%macro MMX_SumSub 3            ; in: %1 = a, %2 = b   out: %1 = a + b, %2 = b - a   (%3 is scratch and ends up holding b)
+	movq    %3, %2             ; keep a copy of b, because %2 is overwritten next
+    psubw   %2, %1             ; %2 = b - a (per 16-bit word, wrapping)
+    paddw   %1, %3             ; %1 = a + b
+%endmacro
+
+%macro MMX_IDCT 6              ; %2..%5 = inputs (%2 is only read, %3..%5 are overwritten), %1 = output only (value on entry is ignored), %6 = scratch
+    MMX_SumSub      %4, %5, %6     ; %4 = %4 + %5, %5 = %5 - %4 (right-hand sides use the values before this line)
+    MMX_SumSubDiv2  %3, %2, %1     ; %1 = %3 + (%2 >> 1), %3 = (%3 >> 1) - %2
+    MMX_SumSub		%1, %4, %6     ; %1 = %1 + %4, %4 = %4 - %1 (right-hand sides use the values before this line)
+	MMX_SumSub		%3, %5, %6     ; %3 = %3 + %5, %5 = %5 - %3 (right-hand sides use the values before this line)
+%endmacro
+
+
+%macro MMX_StoreDiff4P 5       ; %1 = 4 words (clobbered), %2 = scratch, %3 = words added before the shift, %4 = high-byte source for the unpack, %5 = 4-byte memory operand (read, then written)
+    movd       %2, %5          ; %2 = the 4 bytes currently stored at %5
+    punpcklbw  %2, %4          ; interleave them with the low bytes of %4; the caller in this file passes the register set by WELS_Zero (presumably all zero, which widens the bytes to words - confirm)
+    paddw      %1, %3          ; %1 += %3
+    psraw      %1, $6          ; %1 >>= 6, arithmetic, per 16-bit word
+    paddsw     %1, %2          ; add the widened bytes, with signed saturation
+    packuswb   %1, %2          ; pack words to bytes with unsigned saturation; the low 4 bytes come from %1
+    movd       %5, %1          ; write those 4 result bytes back to %5
+%endmacro
+
+;*******************************************************************************
+; Code
+;*******************************************************************************
+
+SECTION .text
+
+WELS_EXTERN IdctResAddPred_mmx
+
+ALIGN 16
+;*******************************************************************************
+;   void_t __cdecl IdctResAddPred_mmx( uint8_t *pPred, const int32_t kiStride, int16_t *pRs )
+;*******************************************************************************
+
+IdctResAddPred_mmx:            ; arguments are read straight off the stack; nothing is pushed (pushsize = 0), so only eax/ecx/edx and mm0-mm7 are touched
+
+%define	pushsize	0
+%define pPred       esp+pushsize+4
+%define kiStride     esp+pushsize+8
+%define pRs         esp+pushsize+12
+
+	mov     eax, [pRs   ]      ; eax = pRs
+    mov     edx, [pPred ]      ; edx = pPred
+    mov     ecx, [kiStride]    ; ecx = kiStride
+    movq    mm0, [eax+ 0]      ; mm0..mm3 = the first 32 bytes at pRs, 8 bytes (four 16-bit words) per register
+    movq    mm1, [eax+ 8]
+    movq    mm2, [eax+16]
+    movq    mm3, [eax+24]
+
+	MMX_Trans4x4W        mm0, mm1, mm2, mm3, mm4     ; macro defined outside this file (presumably asm_inc.asm); by its name a 4x4 word transpose - confirm operand roles there
+	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6     ; first butterfly pass; mm0 receives an output, mm6 is scratch
+    MMX_Trans4x4W        mm1, mm3, mm0, mm4, mm2     ; same external macro applied between the two passes
+	MMX_IDCT			mm3, mm0, mm4, mm2, mm1, mm6     ; second butterfly pass; mm1 receives an output, mm6 is scratch
+
+    WELS_Zero			mm7        ; external macro; presumably clears mm7, which is passed below as the unpack partner - confirm
+    WELS_DW32			mm6        ; external macro; presumably fills mm6 with words of 32 (rounding term for the >> 6 below) - confirm
+    
+    MMX_StoreDiff4P    mm3, mm0, mm6, mm7, [edx]         ; update the 4 bytes at pPred (mm0 is scratch in all four stores)
+    MMX_StoreDiff4P    mm4, mm0, mm6, mm7, [edx+ecx]     ; update the 4 bytes at pPred + kiStride
+    lea     edx, [edx+2*ecx]                             ; edx = pPred + 2*kiStride
+    MMX_StoreDiff4P    mm1, mm0, mm6, mm7, [edx]         ; update the 4 bytes at pPred + 2*kiStride
+    MMX_StoreDiff4P    mm2, mm0, mm6, mm7, [edx+ecx]     ; update the 4 bytes at pPred + 3*kiStride
+    
+%undef	pushsize
+%undef  pPred
+%undef  kiStride
+%undef  pRs
+	emms                       ; clear the MMX state before returning so later x87 code sees an empty FPU tag word
+    ret
--- a/codec/decoder/core/asm/deblock.asm
+++ b/codec/decoder/core/asm/deblock.asm
@@ -1,2113 +1,2113 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  deblock.asm
-;*
-;*  Abstract
-;*      edge loop
-;*
-;*  History
-;*      08/07/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-SECTION .text
-
-;********************************************************************************
-;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;                             int32_t iAlpha, int32_t iBeta)
-;********************************************************************************
-WELS_EXTERN   DeblockChromaEq4V_sse2
-
-ALIGN  16
-DeblockChromaEq4V_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,68h 
-  mov         edx,[ebp+10h]      ;  iStride
-  mov         eax,[ebp+8]        ;  pPixCb
-  mov         ecx,[ebp+0Ch]      ;  pPixCr
-  movq        xmm4,[ecx] 
-  movq        xmm5,[edx+ecx] 
-  push        esi  
-  push        edi  
-  lea         esi,[edx+edx] 
-  mov         edi,eax 
-  sub         edi,esi 
-  movq        xmm1,[edi] 
-  mov         edi,ecx 
-  sub         edi,esi 
-  movq        xmm2,[edi] 
-  punpcklqdq  xmm1,xmm2 
-  mov         esi,eax 
-  sub         esi,edx 
-  movq        xmm2,[esi] 
-  mov         edi,ecx 
-  sub         edi,edx 
-  movq        xmm3,[edi] 
-  punpcklqdq  xmm2,xmm3 
-  movq        xmm3,[eax] 
-  punpcklqdq  xmm3,xmm4 
-  movq        xmm4,[edx+eax] 
-  mov       edx, [ebp + 14h] 
-  punpcklqdq  xmm4,xmm5 
-  movd        xmm5,edx 
-  mov       edx, [ebp + 18h] 
-  pxor        xmm0,xmm0 
-  movdqa      xmm6,xmm5 
-  punpcklwd   xmm6,xmm5 
-  pshufd      xmm5,xmm6,0 
-  movd        xmm6,edx 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      xmm7,xmm1 
-  punpckhbw   xmm1,xmm0 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+40h],xmm1 
-  movdqa      [esp+60h],xmm7 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+10h],xmm7 
-  movdqa      xmm7,xmm3 
-  punpcklbw   xmm7,xmm0 
-  punpckhbw   xmm3,xmm0 
-  movdqa      [esp+50h],xmm7 
-  movdqa      xmm7,xmm4 
-  punpckhbw   xmm4,xmm0 
-  punpckhbw   xmm2,xmm0 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+30h],xmm3 
-  movdqa      xmm3,[esp+10h] 
-  movdqa      xmm1,xmm3 
-  psubw       xmm1,[esp+50h] 
-  pabsw       xmm1,xmm1 
-  movdqa      [esp+20h],xmm4 
-  movdqa      xmm0,xmm5 
-  pcmpgtw     xmm0,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  pand        xmm0,xmm4 
-  movdqa      xmm1,xmm7 
-  psubw       xmm1,[esp+50h] 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,[esp+30h] 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm5,xmm1 
-  movdqa      xmm1,[esp+40h] 
-  pand        xmm0,xmm4 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  movdqa      xmm1,[esp+20h] 
-  psubw       xmm1,[esp+30h] 
-  pand        xmm5,xmm4 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm6,xmm1 
-  pand        xmm5,xmm6 
-  mov         edx,2 
-  movsx       edx,dx 
-  movd        xmm1,edx 
-  movdqa      xmm4,xmm1 
-  punpcklwd   xmm4,xmm1 
-  pshufd      xmm1,xmm4,0 
-  movdqa      xmm4,[esp+60h] 
-  movdqa      xmm6,xmm4 
-  paddw       xmm6,xmm4 
-  paddw       xmm6,xmm3 
-  paddw       xmm6,xmm7 
-  movdqa      [esp+10h],xmm1 
-  paddw       xmm6,[esp+10h] 
-  psraw       xmm6,2 
-  movdqa      xmm4,xmm0 
-  pandn       xmm4,xmm3 
-  movdqa      xmm3,[esp+40h] 
-  movdqa      xmm1,xmm0 
-  pand        xmm1,xmm6 
-  por         xmm1,xmm4 
-  movdqa      xmm6,xmm3 
-  paddw       xmm6,xmm3 
-  movdqa      xmm3,[esp+10h] 
-  paddw       xmm6,xmm2 
-  paddw       xmm6,[esp+20h] 
-  paddw       xmm6,xmm3 
-  psraw       xmm6,2 
-  movdqa      xmm4,xmm5 
-  pand        xmm4,xmm6 
-  movdqa      xmm6,xmm5 
-  pandn       xmm6,xmm2 
-  por         xmm4,xmm6 
-  packuswb    xmm1,xmm4 
-  movdqa      xmm4,[esp+50h] 
-  movdqa      xmm6,xmm7 
-  paddw       xmm6,xmm7 
-  paddw       xmm6,xmm4 
-  paddw       xmm6,[esp+60h] 
-  paddw       xmm6,xmm3 
-  psraw       xmm6,2 
-  movdqa      xmm2,xmm0 
-  pand        xmm2,xmm6 
-  pandn       xmm0,xmm4 
-  por         xmm2,xmm0 
-  movdqa      xmm0,[esp+20h] 
-  movdqa      xmm6,xmm0 
-  paddw       xmm6,xmm0 
-  movdqa      xmm0,[esp+30h] 
-  paddw       xmm6,xmm0 
-  paddw       xmm6,[esp+40h] 
-  movdqa      xmm4,xmm5 
-  paddw       xmm6,xmm3 
-  movq        [esi],xmm1 
-  psraw       xmm6,2 
-  pand        xmm4,xmm6 
-  pandn       xmm5,xmm0 
-  por         xmm4,xmm5 
-  packuswb    xmm2,xmm4 
-  movq        [eax],xmm2 
-  psrldq      xmm1,8 
-  movq        [edi],xmm1 
-  pop         edi  
-  psrldq      xmm2,8 
-  movq        [ecx],xmm2 
-  pop         esi  
-  mov         esp,ebp 
-  pop         ebp  
-  ret              
-
-;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
-;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-
-WELS_EXTERN  DeblockChromaLt4V_sse2
-
-DeblockChromaLt4V_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,0E4h 
-  push        ebx  
-  push        esi  
-  mov         esi, [ebp+1Ch]      ;  pTC
-  movsx       ebx, byte [esi+2] 
-  push        edi  
-  movsx       di,byte [esi+3] 
-  mov         word [esp+0Ch],bx 
-  movsx       bx,byte  [esi+1] 
-  movsx       esi,byte  [esi] 
-  mov         word  [esp+0Eh],si 
-  movzx       esi,di 
-  movd        xmm1,esi 
-  movzx       esi,di 
-  movd        xmm2,esi 
-  mov         si,word  [esp+0Ch] 
-  mov         edx, [ebp + 10h] 
-  mov         eax, [ebp + 08h] 
-  movzx       edi,si 
-  movzx       esi,si 
-  mov         ecx, [ebp + 0Ch] 
-  movd        xmm4,esi 
-  movzx       esi,bx 
-  movd        xmm5,esi 
-  movd        xmm3,edi 
-  movzx       esi,bx 
-  movd        xmm6,esi 
-  mov         si,word [esp+0Eh] 
-  movzx       edi,si 
-  movzx       esi,si 
-  punpcklwd   xmm6,xmm2 
-  pxor        xmm0,xmm0 
-  movdqa      [esp+40h],xmm0 
-  movd        xmm7,edi 
-  movd        xmm0,esi 
-  lea         esi,[edx+edx] 
-  mov         edi,eax 
-  sub         edi,esi 
-  punpcklwd   xmm5,xmm1 
-  movdqa      xmm1,[esp+40h] 
-  punpcklwd   xmm0,xmm4 
-  movq        xmm4,[edx+ecx] 
-  punpcklwd   xmm7,xmm3 
-  movq        xmm3,[eax] 
-  punpcklwd   xmm0,xmm6 
-  movq        xmm6,[edi] 
-  punpcklwd   xmm7,xmm5 
-  punpcklwd   xmm0,xmm7 
-  mov         edi,ecx 
-  sub         edi,esi 
-  movdqa      xmm2,xmm1 
-  psubw       xmm2,xmm0 
-  movdqa      [esp+60h],xmm2 
-  movq        xmm2, [edi] 
-  punpcklqdq  xmm6,xmm2 
-  mov         esi,eax 
-  sub         esi,edx 
-  movq        xmm7,[esi] 
-  mov         edi,ecx 
-  sub         edi,edx 
-  movq        xmm2,[edi] 
-  punpcklqdq  xmm7,xmm2 
-  movq        xmm2,[ecx] 
-  punpcklqdq  xmm3,xmm2 
-  movq        xmm2,[edx+eax] 
-  movsx       edx,word [ebp + 14h] 
-  punpcklqdq  xmm2,xmm4 
-  movdqa      [esp+0E0h],xmm2 
-  movd        xmm2,edx 
-  movsx       edx,word [ebp + 18h] 
-  movdqa      xmm4,xmm2 
-  punpcklwd   xmm4,xmm2 
-  movd        xmm2,edx 
-  movdqa      xmm5,xmm2 
-  punpcklwd   xmm5,xmm2 
-  pshufd      xmm2,xmm5,0 
-  movdqa      [esp+50h],xmm2 
-  movdqa      xmm2,xmm6 
-  punpcklbw   xmm2,xmm1 
-  movdqa      [esp+0D0h],xmm3 
-  pshufd      xmm4,xmm4,0 
-  movdqa      [esp+30h],xmm2 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+80h],xmm6 
-  movdqa      xmm6,[esp+0D0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+70h],xmm6 
-  movdqa      xmm6, [esp+0E0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa     [esp+90h],xmm6 
-  movdqa      xmm5, [esp+0E0h] 
-  movdqa      xmm2,xmm7 
-  punpckhbw   xmm7,xmm1 
-  punpcklbw   xmm5,xmm1 
-  movdqa       [esp+0A0h],xmm7 
-  punpcklbw   xmm3,xmm1 
-  mov         edx,4 
-  punpcklbw   xmm2,xmm1 
-  movsx       edx,dx 
-  movd        xmm6,edx 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      xmm7,[esp+30h] 
-  movdqa      [esp+20h],xmm6 
-  psubw       xmm7,xmm5 
-  movdqa      xmm6,xmm0 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  movdqa      [esp+40h],xmm6 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm2 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm7 
-  paddw       xmm6, [esp+20h] 
-  movdqa      xmm7, [esp+50h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm1,xmm6 
-  movdqa      [esp+10h],xmm0 
-  movdqa      xmm6, [esp+10h] 
-  pminsw      xmm6,xmm1 
-  movdqa      [esp+10h],xmm6 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm6,xmm4 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1, [esp+30h] 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm7,xmm1 
-  movdqa      xmm1,[esp+50h] 
-  pand        xmm6,xmm7 
-  movdqa      xmm7,[esp+50h] 
-  psubw       xmm5,xmm3 
-  pabsw       xmm5,xmm5 
-  pcmpgtw     xmm1,xmm5 
-  movdqa      xmm5,[esp+80h] 
-  psubw       xmm5,[esp+90h] 
-  pand        xmm6,xmm1 
-  pand        xmm6,[esp+40h] 
-  movdqa      xmm1,[esp+10h] 
-  pand        xmm1,xmm6 
-  movdqa      xmm6,[esp+70h] 
-  movdqa      [esp+30h],xmm1 
-  movdqa      xmm1,[esp+0A0h] 
-  psubw       xmm6,xmm1 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm5 
-  paddw       xmm6,[esp+20h] 
-  movdqa      xmm5,[esp+60h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm5,xmm6 
-  pminsw      xmm0,xmm5 
-  movdqa      xmm5,[esp+70h] 
-  movdqa      xmm6,xmm1 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm4,xmm6 
-  movdqa      xmm6,[esp+80h] 
-  psubw       xmm6,xmm1 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+90h] 
-  pand        xmm4,xmm7 
-  movdqa      xmm7,[esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  pand        xmm4,xmm7 
-  pand        xmm4,[esp+40h] 
-  pand        xmm0,xmm4 
-  movdqa      xmm4,[esp+30h] 
-  paddw       xmm2,xmm4 
-  paddw       xmm1,xmm0 
-  packuswb    xmm2,xmm1 
-  movq        [esi],xmm2 
-  psubw       xmm3,xmm4 
-  psubw       xmm5,xmm0 
-  packuswb    xmm3,xmm5 
-  movq        [eax],xmm3 
-  psrldq      xmm2,8 
-  movq        [edi],xmm2 
-  pop         edi  
-  pop         esi  
-  psrldq      xmm3,8 
-  movq        [ecx],xmm3 
-  pop         ebx  
-  mov         esp,ebp 
-  pop         ebp  
-  ret    
-  
-;***************************************************************************
-;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
-;          int32_t iAlpha, int32_t iBeta)
-;***************************************************************************
-
-WELS_EXTERN     DeblockChromaEq4H_sse2
-
-ALIGN  16
-  
-DeblockChromaEq4H_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,0C8h  
-  mov         ecx,dword [ebp+8] 
-  mov         edx,dword [ebp+0Ch] 
-  mov         eax,dword [ebp+10h] 
-  sub         ecx,2 
-  sub         edx,2 
-  push        esi  
-  lea         esi,[eax+eax*2] 
-  mov         dword [esp+18h],ecx 
-  mov         dword [esp+4],edx 
-  lea         ecx,[ecx+eax*4] 
-  lea         edx,[edx+eax*4] 
-  lea         eax,[esp+7Ch] 
-  push        edi  
-  mov         dword [esp+14h],esi 
-  mov         dword [esp+18h],ecx 
-  mov         dword [esp+0Ch],edx 
-  mov         dword [esp+10h],eax 
-  mov         esi,dword [esp+1Ch] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+14h] 
-  movd        xmm0,dword [esi] 
-  movd        xmm1,dword [esi+ecx] 
-  movd        xmm2,dword [esi+ecx*2] 
-  movd        xmm3,dword [esi+edx] 
-  mov         esi,dword  [esp+8] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [esi+ecx] 
-  movd        xmm6,dword [esi+ecx*2] 
-  movd        xmm7,dword [esi+edx] 
-  punpckldq   xmm0,xmm4 
-  punpckldq   xmm1,xmm5 
-  punpckldq   xmm2,xmm6 
-  punpckldq   xmm3,xmm7 
-  mov         esi,dword [esp+18h] 
-  mov         edi,dword [esp+0Ch] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [edi] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm0,xmm4 
-  movd        xmm4,dword [esi+ecx] 
-  movd        xmm5,dword [edi+ecx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm1,xmm4 
-  movd        xmm4,dword [esi+ecx*2] 
-  movd        xmm5,dword [edi+ecx*2] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm2,xmm4 
-  movd        xmm4,dword [esi+edx] 
-  movd        xmm5,dword [edi+edx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm3,xmm4 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         edi,dword [esp+10h] 
-  movdqa      [edi],xmm0 
-  movdqa      [edi+10h],xmm5 
-  movdqa      [edi+20h],xmm1 
-  movdqa      [edi+30h],xmm6 
-  movsx       ecx,word [ebp+14h] 
-  movsx       edx,word [ebp+18h] 
-  movdqa      xmm6,[esp+80h] 
-  movdqa      xmm4,[esp+90h] 
-  movdqa      xmm5,[esp+0A0h] 
-  movdqa      xmm7,[esp+0B0h] 
-  pxor        xmm0,xmm0 
-  movd        xmm1,ecx 
-  movdqa      xmm2,xmm1 
-  punpcklwd   xmm2,xmm1 
-  pshufd      xmm1,xmm2,0 
-  movd        xmm2,edx 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm2,xmm3,0 
-  movdqa      xmm3,xmm6 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+60h],xmm6 
-  movdqa      xmm6,[esp+90h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+30h],xmm6 
-  movdqa      xmm6,[esp+0A0h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+40h],xmm6 
-  movdqa      xmm6,[esp+0B0h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+70h],xmm6 
-  punpcklbw   xmm7,xmm0 
-  punpcklbw   xmm4,xmm0 
-  punpcklbw   xmm5,xmm0 
-  punpcklbw   xmm3,xmm0 
-  movdqa      [esp+50h],xmm7 
-  movdqa      xmm6,xmm4 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  movdqa      xmm0,xmm1 
-  pcmpgtw     xmm0,xmm6 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm4 
-  pabsw       xmm6,xmm6 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pand        xmm0,xmm7 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+30h] 
-  psubw       xmm6,[esp+40h] 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm1,xmm6 
-  movdqa      xmm6,[esp+60h] 
-  psubw       xmm6,[esp+30h] 
-  pabsw       xmm6,xmm6 
-  pand        xmm0,xmm7 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+70h] 
-  psubw       xmm6,[esp+40h] 
-  pabsw       xmm6,xmm6 
-  pand        xmm1,xmm7 
-  pcmpgtw     xmm2,xmm6 
-  pand        xmm1,xmm2 
-  mov         eax,2 
-  movsx       ecx,ax 
-  movd        xmm2,ecx 
-  movdqa      xmm6,xmm2 
-  punpcklwd   xmm6,xmm2 
-  pshufd      xmm2,xmm6,0 
-  movdqa      [esp+20h],xmm2 
-  movdqa      xmm2,xmm3 
-  paddw       xmm2,xmm3 
-  paddw       xmm2,xmm4 
-  paddw       xmm2,[esp+50h] 
-  paddw       xmm2,[esp+20h] 
-  psraw       xmm2,2 
-  movdqa      xmm6,xmm0 
-  pand        xmm6,xmm2 
-  movdqa      xmm2,xmm0 
-  pandn       xmm2,xmm4 
-  por         xmm6,xmm2 
-  movdqa      xmm2,[esp+60h] 
-  movdqa      xmm7,xmm2 
-  paddw       xmm7,xmm2 
-  paddw       xmm7,[esp+30h] 
-  paddw       xmm7,[esp+70h] 
-  paddw       xmm7,[esp+20h] 
-  movdqa      xmm4,xmm1 
-  movdqa      xmm2,xmm1 
-  pandn       xmm2,[esp+30h] 
-  psraw       xmm7,2 
-  pand        xmm4,xmm7 
-  por         xmm4,xmm2 
-  movdqa      xmm2,[esp+50h] 
-  packuswb    xmm6,xmm4 
-  movdqa      [esp+90h],xmm6 
-  movdqa      xmm6,xmm2 
-  paddw       xmm6,xmm2 
-  movdqa      xmm2,[esp+20h] 
-  paddw       xmm6,xmm5 
-  paddw       xmm6,xmm3 
-  movdqa      xmm4,xmm0 
-  pandn       xmm0,xmm5 
-  paddw       xmm6,xmm2 
-  psraw       xmm6,2 
-  pand        xmm4,xmm6 
-  por         xmm4,xmm0 
-  movdqa      xmm0,[esp+70h] 
-  movdqa      xmm5,xmm0 
-  paddw       xmm5,xmm0 
-  movdqa      xmm0,[esp+40h] 
-  paddw       xmm5,xmm0 
-  paddw       xmm5,[esp+60h] 
-  movdqa      xmm3,xmm1 
-  paddw       xmm5,xmm2 
-  psraw       xmm5,2 
-  pand        xmm3,xmm5 
-  pandn       xmm1,xmm0 
-  por         xmm3,xmm1 
-  packuswb    xmm4,xmm3 
-  movdqa      [esp+0A0h],xmm4 
-  mov         esi,dword [esp+10h] 
-  movdqa      xmm0,[esi] 
-  movdqa      xmm1,[esi+10h] 
-  movdqa      xmm2,[esi+20h] 
-  movdqa      xmm3,[esi+30h] 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         esi,dword [esp+1Ch] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+14h] 
-  mov         edi,dword [esp+8] 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         esi,dword [esp+18h] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         edi,dword [esp+0Ch] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  pop         edi  
-  pop         esi  
-  mov         esp,ebp 
-  pop         ebp  
-  ret              
-  
-;*******************************************************************************
-;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
-;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-  
-WELS_EXTERN  DeblockChromaLt4H_sse2
-  
-ALIGN  16
-
-DeblockChromaLt4H_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,108h   
-  mov         ecx,dword [ebp+8] 
-  mov         edx,dword [ebp+0Ch] 
-  mov         eax,dword [ebp+10h] 
-  sub         ecx,2 
-  sub         edx,2 
-  push        esi  
-  lea         esi,[eax+eax*2] 
-  mov         dword [esp+10h],ecx 
-  mov         dword [esp+4],edx 
-  lea         ecx,[ecx+eax*4] 
-  lea         edx,[edx+eax*4] 
-  lea         eax,[esp+6Ch] 
-  push        edi  
-  mov         dword [esp+0Ch],esi 
-  mov         dword [esp+18h],ecx 
-  mov         dword [esp+10h],edx 
-  mov         dword [esp+1Ch],eax 
-  mov         esi,dword [esp+14h] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+0Ch] 
-  movd        xmm0,dword [esi] 
-  movd        xmm1,dword [esi+ecx] 
-  movd        xmm2,dword [esi+ecx*2] 
-  movd        xmm3,dword [esi+edx] 
-  mov         esi,dword [esp+8] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [esi+ecx] 
-  movd        xmm6,dword [esi+ecx*2] 
-  movd        xmm7,dword [esi+edx] 
-  punpckldq   xmm0,xmm4 
-  punpckldq   xmm1,xmm5 
-  punpckldq   xmm2,xmm6 
-  punpckldq   xmm3,xmm7 
-  mov         esi,dword [esp+18h] 
-  mov         edi,dword [esp+10h] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [edi] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm0,xmm4 
-  movd        xmm4,dword [esi+ecx] 
-  movd        xmm5,dword [edi+ecx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm1,xmm4 
-  movd        xmm4,dword [esi+ecx*2] 
-  movd        xmm5,dword [edi+ecx*2] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm2,xmm4 
-  movd        xmm4,dword [esi+edx] 
-  movd        xmm5,dword [edi+edx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm3,xmm4 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         edi,dword [esp+1Ch] 
-  movdqa      [edi],xmm0 
-  movdqa      [edi+10h],xmm5 
-  movdqa      [edi+20h],xmm1 
-  movdqa      [edi+30h],xmm6 
-  mov         eax,dword [ebp+1Ch] 
-  movsx       cx,byte [eax+3] 
-  movsx       dx,byte [eax+2] 
-  movsx       si,byte [eax+1] 
-  movsx       ax,byte [eax] 
-  movzx       edi,cx 
-  movzx       ecx,cx 
-  movd        xmm2,ecx 
-  movzx       ecx,dx 
-  movzx       edx,dx 
-  movd        xmm3,ecx 
-  movd        xmm4,edx 
-  movzx       ecx,si 
-  movzx       edx,si 
-  movd        xmm5,ecx 
-  pxor        xmm0,xmm0 
-  movd        xmm6,edx 
-  movzx       ecx,ax 
-  movdqa      [esp+60h],xmm0 
-  movzx       edx,ax 
-  movsx       eax,word [ebp+14h] 
-  punpcklwd   xmm6,xmm2 
-  movd        xmm1,edi 
-  movd        xmm7,ecx 
-  movsx       ecx,word [ebp+18h] 
-  movd        xmm0,edx 
-  punpcklwd   xmm7,xmm3 
-  punpcklwd   xmm5,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  punpcklwd   xmm7,xmm5 
-  movdqa      xmm5,[esp+0A0h] 
-  punpcklwd   xmm0,xmm4 
-  punpcklwd   xmm0,xmm6 
-  movdqa      xmm6, [esp+70h] 
-  punpcklwd   xmm0,xmm7 
-  movdqa      xmm7,[esp+80h] 
-  movdqa      xmm2,xmm1 
-  psubw       xmm2,xmm0 
-  movdqa      [esp+0D0h],xmm2 
-  movd        xmm2,eax 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm4,xmm3,0 
-  movd        xmm2,ecx 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm2,xmm3,0 
-  movdqa      xmm3, [esp+90h] 
-  movdqa      [esp+50h],xmm2 
-  movdqa      xmm2,xmm6 
-  punpcklbw   xmm2,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+40h],xmm2 
-  movdqa      [esp+0B0h],xmm6 
-  movdqa      xmm6,[esp+90h] 
-  movdqa      xmm2,xmm7 
-  punpckhbw   xmm7,xmm1 
-  punpckhbw   xmm6,xmm1 
-  punpcklbw   xmm2,xmm1 
-  punpcklbw   xmm3,xmm1 
-  punpcklbw   xmm5,xmm1 
-  movdqa      [esp+0F0h],xmm7 
-  movdqa      [esp+0C0h],xmm6 
-  movdqa      xmm6, [esp+0A0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+0E0h],xmm6 
-  mov         edx,4 
-  movsx       eax,dx 
-  movd        xmm6,eax 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      [esp+30h],xmm6 
-  movdqa      xmm7, [esp+40h] 
-  psubw       xmm7,xmm5 
-  movdqa      xmm6,xmm0 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      [esp+60h],xmm6 
-  movdqa      xmm1, [esp+0D0h] 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm2 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm7 
-  paddw       xmm6,[esp+30h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm1,xmm6 
-  movdqa      xmm7,[esp+50h] 
-  movdqa      [esp+20h],xmm0 
-  movdqa      xmm6, [esp+20h] 
-  pminsw      xmm6,xmm1 
-  movdqa      [esp+20h],xmm6 
-  movdqa      xmm6,xmm4 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1, [esp+40h] 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm7,xmm1 
-  movdqa      xmm1, [esp+50h] 
-  pand        xmm6,xmm7 
-  movdqa      xmm7, [esp+50h] 
-  psubw       xmm5,xmm3 
-  pabsw       xmm5,xmm5 
-  pcmpgtw     xmm1,xmm5 
-  movdqa      xmm5, [esp+0B0h] 
-  psubw       xmm5,[esp+0E0h] 
-  pand        xmm6,xmm1 
-  pand        xmm6, [esp+60h] 
-  movdqa      xmm1, [esp+20h] 
-  pand        xmm1,xmm6 
-  movdqa      xmm6, [esp+0C0h] 
-  movdqa      [esp+40h],xmm1 
-  movdqa      xmm1, [esp+0F0h] 
-  psubw       xmm6,xmm1 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm5 
-  paddw       xmm6, [esp+30h] 
-  movdqa      xmm5, [esp+0D0h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm5,xmm6 
-  pminsw      xmm0,xmm5 
-  movdqa      xmm5,[esp+0C0h] 
-  movdqa      xmm6,xmm1 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm4,xmm6 
-  movdqa      xmm6,[esp+0B0h] 
-  psubw       xmm6,xmm1 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6, [esp+0E0h] 
-  pand        xmm4,xmm7 
-  movdqa      xmm7, [esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  pand        xmm4,xmm7 
-  pand        xmm4,[esp+60h] 
-  pand        xmm0,xmm4 
-  movdqa      xmm4, [esp+40h] 
-  paddw       xmm2,xmm4 
-  paddw       xmm1,xmm0 
-  psubw       xmm3,xmm4 
-  psubw       xmm5,xmm0 
-  packuswb    xmm2,xmm1 
-  packuswb    xmm3,xmm5 
-  movdqa      [esp+80h],xmm2 
-  movdqa      [esp+90h],xmm3 
-  mov         esi,dword [esp+1Ch] 
-  movdqa      xmm0, [esi] 
-  movdqa      xmm1, [esi+10h] 
-  movdqa      xmm2, [esi+20h] 
-  movdqa      xmm3, [esi+30h] 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         esi,dword [esp+14h] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+0Ch] 
-  mov         edi,dword [esp+8] 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         esi,dword [esp+18h] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         edi,dword [esp+10h] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6  
-  pop         edi  
-  pop         esi   
-  mov         esp,ebp 
-  pop         ebp  
-  ret     
-  
-  
-  
-;*******************************************************************************
-;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
-;                                 int32_t iBeta, int8_t * pTC)
-;*******************************************************************************
-  
-
-WELS_EXTERN  DeblockLumaLt4V_sse2
-  
-ALIGN  16
-
-DeblockLumaLt4V_sse2:
-    push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H
-	sub	esp, 420				; 000001a4H
-	mov	eax, dword [ebp+8]
-	mov	ecx, dword [ebp+12]
-
-	pxor	xmm0, xmm0
-	push	ebx
-	mov	edx, dword [ebp+24]
-	movdqa	[esp+424-384], xmm0
-	push	esi
-
-	lea	esi, [ecx+ecx*2]
-	push	edi
-	mov	edi, eax
-	sub	edi, esi
-	movdqa	xmm0, [edi]
-
-	lea	esi, [ecx+ecx]
-	movdqa	[esp+432-208], xmm0
-	mov	edi, eax
-	sub	edi, esi
-	movdqa	xmm0, [edi]
-	movdqa	[esp+448-208], xmm0
-
-	mov	ebx, eax
-	sub	ebx, ecx
-	movdqa	xmm0, [ebx]
-	movdqa	[esp+464-208], xmm0
-
-	movdqa	xmm0, [eax]
-
-	add	ecx, eax
-	movdqa	[esp+480-208], xmm0
-	movdqa	xmm0, [ecx]
-	mov	dword [esp+432-404], ecx
-
-	movsx	ecx, word [ebp+16]
-	movdqa	[esp+496-208], xmm0
-	movdqa	xmm0, [esi+eax]
-
-	movsx	si, byte [edx]
-	movdqa	[esp+512-208], xmm0
-	movd	xmm0, ecx
-	movsx	ecx, word [ebp+20]
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	pshufd	xmm0, xmm1, 0
-	movdqa	[esp+432-112], xmm0
-	movd	xmm0, ecx
-	movsx	cx, byte [edx+1]
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	mov	dword [esp+432-408], ebx
-	movzx	ebx, cx
-	pshufd	xmm0, xmm1, 0
-	movd	xmm1, ebx
-	movzx	ebx, cx
-	movd	xmm2, ebx
-	movzx	ebx, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, si
-	movd	xmm5, ecx
-	movzx	ecx, si
-	movd	xmm6, ecx
-	movzx	ecx, si
-	movd	xmm7, ecx
-	movzx	ecx, si
-	movdqa	[esp+432-336], xmm0
-	movd	xmm0, ecx
-
-	movsx	cx, byte [edx+3]
-	movsx	dx, byte [edx+2]
-	movd	xmm3, ebx
-	punpcklwd xmm0, xmm4
-	movzx	esi, cx
-	punpcklwd xmm6, xmm2
-	punpcklwd xmm5, xmm1
-	punpcklwd xmm0, xmm6
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	punpcklwd xmm0, xmm7
-	movdqa	[esp+432-400], xmm0
-	movd	xmm0, esi
-	movzx	esi, cx
-	movd	xmm2, esi
-	movzx	esi, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, dx
-	movd	xmm3, esi
-	movd	xmm5, ecx
-	punpcklwd xmm5, xmm0
-
-	movdqa	xmm0, [esp+432-384]
-	movzx	ecx, dx
-	movd	xmm6, ecx
-	movzx	ecx, dx
-	movzx	edx, dx
-	punpcklwd xmm6, xmm2
-	movd	xmm7, ecx
-	movd	xmm1, edx
-
-	movdqa	xmm2, [esp+448-208]
-	punpcklbw xmm2, xmm0
-
-	mov	ecx, 4
-	movsx	edx, cx
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	movdqa	xmm5, [esp+496-208]
-	movdqa	xmm3, [esp+464-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-240], xmm5
-	movdqa	xmm5, [esp+512-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-352], xmm5
-	punpcklwd xmm1, xmm4
-	movdqa	xmm4, [esp+432-208]
-	punpcklwd xmm1, xmm6
-	movdqa	xmm6, [esp+480-208]
-	punpcklwd xmm1, xmm7
-	punpcklbw xmm6, xmm0
-	punpcklbw xmm3, xmm0
-	punpcklbw xmm4, xmm0
-	movdqa	xmm7, xmm3
-	psubw	xmm7, xmm4
-	pabsw	xmm7, xmm7
-	movdqa	[esp+432-272], xmm4
-	movdqa	xmm4, [esp+432-336]
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-352]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5
-	movdqa	xmm5, xmm3
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5
-	movdqa	xmm5, [esp+432-400]
-	psubw	xmm5, [esp+432-288]
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5
-	movdqa	xmm5, xmm6
-	psubw	xmm5, xmm3
-	movdqa	[esp+432-32], xmm6
-	psubw	xmm6, [esp+432-240]
-	movdqa	xmm7, xmm5
-	movdqa	[esp+432-384], xmm5
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6
-
-	pand	xmm5, xmm7
-	movdqa	xmm6, xmm3
-	psubw	xmm6, xmm2
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-400]
-	pand	xmm5, xmm7
-	movdqa	xmm7, xmm6
-	pcmpeqw	xmm6, xmm0
-	pcmpgtw	xmm7, xmm0
-	por	xmm7, xmm6
-	pand	xmm5, xmm7
-	movdqa	[esp+432-320], xmm5
-	movd	xmm5, edx
-	movdqa	xmm6, xmm5
-	punpcklwd xmm6, xmm5
-	pshufd	xmm5, xmm6, 0
-	movdqa	[esp+432-336], xmm5
-	movdqa	xmm5, [esp+432-224]
-	movdqa	[esp+432-368], xmm5
-	movdqa	xmm6, xmm0
-	psubw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-384]
-	psllw	xmm5, 2
-	movdqa	xmm7, xmm2
-	psubw	xmm7, [esp+432-240]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	psraw	xmm7, 3
-	pmaxsw	xmm6, xmm7
-	pminsw	xmm5, xmm6
-
-	pand	xmm5, [esp+432-320]
-	movdqa	xmm6, [esp+432-400]
-	movdqa	[esp+432-64], xmm5
-	movdqa	[esp+432-384], xmm6
-	movdqa	xmm5, xmm0
-	psubw	xmm5, xmm6
-	movdqa	[esp+432-368], xmm5
-	movdqa	xmm6, xmm5
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm2
-	paddw	xmm7, xmm2
-	psubw	xmm5, xmm7
-	psraw	xmm5, 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-384]
-	pminsw	xmm5, xmm6
-
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-288]
-	movdqa	xmm6, [esp+432-240]
-	movdqa	[esp+432-96], xmm5
-	movdqa	xmm5, [esp+432-352]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm6
-	paddw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
-	psubw	xmm5, xmm7
-
-	movdqa	xmm7, [esp+496-208]
-	psraw	xmm5, 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-400]
-	pminsw	xmm5, xmm6
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-256]
-	movdqa	xmm6, [esp+448-208]
-	punpckhbw xmm7, xmm0
-	movdqa	[esp+432-352], xmm7
-
-	movdqa	xmm7, [esp+512-208]
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-48], xmm5
-	movdqa	xmm5, [esp+432-208]
-	movdqa	[esp+432-368], xmm6
-	movdqa	xmm6, [esp+464-208]
-	punpckhbw xmm7, xmm0
-	punpckhbw xmm5, xmm0
-	movdqa	[esp+432-384], xmm7
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-400], xmm6
-
-	movdqa	xmm7, [esp+432-400]
-	movdqa	xmm6, [esp+480-208]
-	psubw	xmm7, xmm5
-	movdqa	[esp+432-16], xmm5
-	pabsw	xmm7, xmm7
-	punpckhbw xmm6, xmm0
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5
-
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-384]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5
-
-	movdqa	xmm5, [esp+432-400]
-	movdqa	[esp+432-80], xmm6
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5
-
-	movdqa	xmm5, xmm1
-	psubw	xmm5, [esp+432-288]
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5
-	movdqa	xmm5, xmm6
-	psubw	xmm5, [esp+432-400]
-	psubw	xmm6, [esp+432-352]
-	movdqa	[esp+432-272], xmm5
-	movdqa	xmm7, xmm5
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7
-	movdqa	xmm7, xmm4
-	pabsw	xmm6, xmm6
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
-
-	pand	xmm5, xmm7
-	movdqa	xmm7, [esp+432-400]
-	psubw	xmm7, xmm6
-	psubw	xmm6, [esp+432-352]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-	pand	xmm5, xmm4
-
-	paddw	xmm2, [esp+432-96]
-	movdqa	xmm4, xmm1
-	pcmpgtw	xmm4, xmm0
-	movdqa	xmm7, xmm1
-	pcmpeqw	xmm7, xmm0
-	por	xmm4, xmm7
-	pand	xmm5, xmm4
-	movdqa	xmm4, [esp+432-224]
-	movdqa	[esp+432-320], xmm5
-	movdqa	xmm5, [esp+432-272]
-	movdqa	xmm7, xmm0
-	psubw	xmm7, xmm4
-	psubw	xmm0, xmm1
-	psllw	xmm5, 2
-	paddw	xmm6, xmm5
-	paddw	xmm6, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	movdqa	[esp+432-336], xmm0
-	psraw	xmm6, 3
-	pmaxsw	xmm7, xmm6
-	pminsw	xmm4, xmm7
-	pand	xmm4, [esp+432-320]
-	movdqa	xmm6, xmm0
-	movdqa	xmm0, [esp+432-16]
-	paddw	xmm0, [esp+432-304]
-	movdqa	[esp+432-272], xmm4
-	movdqa	xmm4, [esp+432-368]
-	paddw	xmm4, xmm4
-	psubw	xmm0, xmm4
-
-	movdqa	xmm4, [esp+432-64]
-	psraw	xmm0, 1
-	pmaxsw	xmm6, xmm0
-	movdqa	xmm0, [esp+432-400]
-	movdqa	xmm7, xmm1
-	pminsw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-320]
-	pand	xmm7, xmm6
-	pand	xmm7, [esp+432-288]
-	paddw	xmm5, xmm7
-	packuswb xmm2, xmm5
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm0, xmm5
-	paddw	xmm3, xmm4
-	packuswb xmm3, xmm0
-
-	movdqa	xmm0, [esp+432-32]
-	psubw	xmm0, xmm4
-	movdqa	xmm4, [esp+432-80]
-	psubw	xmm4, xmm5
-
-	movdqa	xmm5, [esp+432-240]
-	paddw	xmm5, [esp+432-48]
-	packuswb xmm0, xmm4
-	movdqa	xmm4, [esp+432-384]
-	paddw	xmm4, [esp+432-304]
-	movdqa	[esp+480-208], xmm0
-	movdqa	xmm0, [esp+432-352]
-	movdqa	xmm7, xmm0
-	paddw	xmm0, xmm0
-
-	mov	ecx, dword [esp+432-408]
-
-	mov	edx, dword [esp+432-404]
-	psubw	xmm4, xmm0
-	movdqa	xmm0, [esp+432-336]
-	movdqa	[edi], xmm2
-	psraw	xmm4, 1
-	pmaxsw	xmm0, xmm4
-	pminsw	xmm1, xmm0
-	movdqa	xmm0, [esp+480-208]
-
-	pop	edi
-	pand	xmm1, xmm6
-	pand	xmm1, [esp+428-256]
-	movdqa	[ecx], xmm3
-	paddw	xmm7, xmm1
-	pop	esi
-	packuswb xmm5, xmm7
-	movdqa	[eax], xmm0
-	movdqa	[edx], xmm5
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
-
-
-;*******************************************************************************
-;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
-;                                 int32_t iBeta)
-;*******************************************************************************
-
-WELS_EXTERN  DeblockLumaEq4V_sse2
-  
-ALIGN  16
-
-DeblockLumaEq4V_sse2:
-
-	push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H
-	sub	esp, 628				; 00000274H
-	mov	eax, dword [ebp+8]
-	mov	ecx, dword [ebp+12]
-	push	ebx
-	push	esi
-
-	lea	edx, [ecx*4]
-	pxor	xmm0, xmm0
-	movdqa	xmm2, xmm0
-
-	movdqa	xmm0, [ecx+eax]
-	mov	esi, eax
-	sub	esi, edx
-	movdqa	xmm3, [esi]
-	movdqa	xmm5, [eax]
-	push	edi
-	lea	edi, [ecx+ecx]
-	lea	ebx, [ecx+ecx*2]
-	mov	dword [esp+640-600], edi
-	mov	esi, eax
-	sub	esi, edi
-	movdqa	xmm1, [esi]
-	movdqa	 [esp+720-272], xmm0
-	mov	edi, eax
-	sub	edi, ecx
-	movdqa	xmm4, [edi]
-	add	ecx, eax
-	mov	dword [esp+640-596], ecx
-
-	mov	ecx, dword [esp+640-600]
-	movdqa	xmm0, [ecx+eax]
-	movdqa	 [esp+736-272], xmm0
-
-	movdqa	xmm0, [eax+ebx]
-	mov	edx, eax
-	sub	edx, ebx
-
-	movsx	ebx, word [ebp+16]
-	movdqa	xmm6, [edx]
-	add	ecx, eax
-	movdqa	 [esp+752-272], xmm0
-	movd	xmm0, ebx
-
-	movsx	ebx, word [ebp+20]
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
-	movdqa	 [esp+640-320], xmm0
-	movd	xmm0, ebx
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
-
-	movdqa	xmm7, [esp+736-272]
-	punpcklbw xmm7, xmm2
-	movdqa	 [esp+640-416], xmm7
-	movdqa	 [esp+640-512], xmm0
-	movdqa	xmm0, xmm1
-	movdqa	 [esp+672-272], xmm1
-	movdqa	xmm1, xmm4
-	movdqa	 [esp+704-272], xmm5
-	punpcklbw xmm5, xmm2
-	punpcklbw xmm1, xmm2
-
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7
-	punpcklbw xmm0, xmm2
-	movdqa	 [esp+688-272], xmm4
-	movdqa	xmm4, [esp+720-272]
-	movdqa	 [esp+640-480], xmm0
-
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm0
-
-	movdqa	xmm0, [esp+640-512]
-	pabsw	xmm7, xmm7
-	punpcklbw xmm4, xmm2
-	pcmpgtw	xmm0, xmm7
-	movdqa	 [esp+640-384], xmm4
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm4
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+656-272], xmm6
-	punpcklbw xmm6, xmm2
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-48], xmm2
-	movdqa	 [esp+640-368], xmm6
-	movdqa	 [esp+640-144], xmm1
-	movdqa	 [esp+640-400], xmm5
-	pcmpgtw	xmm4, xmm7
-	pand	xmm0, xmm4
-	movdqa	xmm4, [esp+640-320]
-	pcmpgtw	xmm4, [esp+640-560]
-	pand	xmm0, xmm4
-
-	mov	ebx, 2
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, [esp+640-320]
-	psraw	xmm4, 2
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm7
-	movdqa	 [esp+640-576], xmm4
-	pcmpgtw	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4
-
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+640-624], xmm7
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm6
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-544], xmm4
-	movdqa	xmm4, [esp+640-512]
-	movdqa	xmm7, xmm5
-	psubw	xmm7, [esp+640-416]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4
-
-	movdqa	xmm4, [esp+640-544]
-	pandn	xmm4, xmm6
-	movdqa	 [esp+640-16], xmm4
-	mov	ebx, 4
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, xmm3
-	punpcklbw xmm4, xmm2
-	psllw	xmm4, 1
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, [esp+640-480]
-
-	movdqa	xmm6, [esp+640-560]
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm1
-	movdqa	 [esp+640-592], xmm7
-	paddw	xmm4, xmm5
-	paddw	xmm4, xmm7
-	movdqa	xmm7, [esp+640-416]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-80], xmm6
-	movdqa	xmm6, [esp+752-272]
-	punpcklbw xmm6, xmm2
-	psllw	xmm6, 1
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-384]
-
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm6, xmm5
-	paddw	xmm6, xmm1
-	paddw	xmm6, [esp+640-592]
-	psraw	xmm6, 3
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-112], xmm6
-	movdqa	xmm6, [esp+640-544]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-336], xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-528], xmm6
-	movdqa	xmm6, [esp+640-368]
-	paddw	xmm6, xmm7
-	movdqa	xmm7, xmm1
-	psraw	xmm4, 3
-	pand	xmm4, [esp+640-544]
-	paddw	xmm7, xmm5
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-
-	paddw	xmm5, xmm1
-	psraw	xmm6, 2
-	pand	xmm7, xmm6
-
-	movdqa	xmm6, [esp+640-384]
-	movdqa	 [esp+640-64], xmm7
-	movdqa	xmm7, [esp+640-560]
-	pandn	xmm7, xmm6
-	movdqa	 [esp+640-304], xmm7
-	movdqa	xmm7, [esp+640-560]
-	movdqa	 [esp+640-528], xmm7
-	movdqa	xmm7, [esp+640-416]
-	paddw	xmm7, xmm6
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pand	xmm5, xmm7
-	movdqa	 [esp+640-32], xmm5
-
-	movdqa	xmm5, [esp+640-544]
-	movdqa	 [esp+640-528], xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	xmm7, xmm5
-	paddw	xmm7, xmm5
-	movdqa	xmm5, xmm1
-	paddw	xmm5, xmm6
-	paddw	xmm6, [esp+640-592]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pandn	xmm5, xmm7
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm7, xmm1
-	paddw	xmm7, [esp+640-400]
-	movdqa	xmm1, [esp+640-544]
-	movdqa	 [esp+640-352], xmm5
-	movdqa	xmm5, [esp+640-368]
-	psllw	xmm7, 1
-	paddw	xmm7, xmm6
-	paddw	xmm5, xmm7
-
-	movdqa	xmm7, [esp+640-400]
-	psraw	xmm5, 3
-	pand	xmm1, xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	 [esp+640-96], xmm1
-	movdqa	xmm1, [esp+640-560]
-	movdqa	 [esp+640-528], xmm1
-	movdqa	xmm1, [esp+640-384]
-	movdqa	xmm6, xmm1
-	paddw	xmm6, xmm1
-	paddw	xmm1, [esp+640-400]
-	paddw	xmm1, [esp+640-144]
-	paddw	xmm7, xmm5
-	paddw	xmm5, [esp+640-592]
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-	psraw	xmm6, 2
-	psllw	xmm1, 1
-	paddw	xmm1, xmm5
-
-	movdqa	xmm5, [esp+656-272]
-	pandn	xmm7, xmm6
-	movdqa	xmm6, [esp+640-416]
-	paddw	xmm6, xmm1
-	movdqa	xmm1, [esp+640-560]
-	psraw	xmm6, 3
-	pand	xmm1, xmm6
-
-	movdqa	xmm6, [esp+704-272]
-	movdqa	 [esp+640-128], xmm1
-	movdqa	xmm1, [esp+672-272]
-	punpckhbw xmm1, xmm2
-	movdqa	 [esp+640-448], xmm1
-	movdqa	xmm1, [esp+688-272]
-	punpckhbw xmm1, xmm2
-	punpckhbw xmm6, xmm2
-	movdqa	 [esp+640-288], xmm7
-	punpckhbw xmm5, xmm2
-	movdqa	 [esp+640-496], xmm1
-	movdqa	 [esp+640-432], xmm6
-
-	movdqa	xmm7, [esp+720-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-464], xmm7
-
-	movdqa	xmm7, [esp+736-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-528], xmm7
-
-	movdqa	xmm7, xmm6
-
-	psubw	xmm6, [esp+640-464]
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7
-	por	xmm4, [esp+640-16]
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm1
-	psubw	xmm7, [esp+640-448]
-
-	movdqa	xmm1, [esp+640-512]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm1, xmm7
-	movdqa	xmm7, [esp+640-512]
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+640-320]
-	pand	xmm1, xmm7
-	movdqa	xmm7, [esp+640-560]
-	pcmpgtw	xmm6, xmm7
-	pand	xmm1, xmm6
-
-	movdqa	xmm6, [esp+640-576]
-	pcmpgtw	xmm6, xmm7
-
-	movdqa	xmm7, [esp+640-496]
-	punpckhbw xmm3, xmm2
-	movdqa	 [esp+640-560], xmm6
-	movdqa	xmm6, [esp+640-512]
-	psubw	xmm7, xmm5
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
-
-	pand	xmm6, [esp+640-560]
-	movdqa	xmm7, [esp+640-432]
-	psubw	xmm7, [esp+640-528]
-
-	psllw	xmm3, 1
-	movdqa	 [esp+640-544], xmm6
-	movdqa	xmm6, [esp+640-512]
-
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, [esp+640-448]
-	paddw	xmm3, [esp+640-496]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-560], xmm6
-
-	movdqa	xmm6, xmm0
-	pand	xmm6, xmm4
-	movdqa	xmm4, xmm0
-	pandn	xmm4, [esp+640-368]
-	por	xmm6, xmm4
-	movdqa	xmm4, [esp+640-432]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm3, 3
-	pand	xmm3, xmm2
-	pandn	xmm2, xmm5
-	por	xmm3, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm3
-	movdqa	xmm3, [esp+640-64]
-	por	xmm3, [esp+640-336]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm5
-	por	xmm7, xmm2
-
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-480]
-	por	xmm2, xmm3
-	packuswb xmm6, xmm7
-	movdqa	 [esp+640-336], xmm2
-	movdqa	 [esp+656-272], xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	xmm2, xmm5
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm3, xmm1
-	movdqa	xmm7, [esp+640-496]
-	paddw	xmm7, xmm4
-	paddw	xmm2, xmm7
-	paddw	xmm2, [esp+640-624]
-	movdqa	xmm7, [esp+640-544]
-	psraw	xmm2, 2
-	pand	xmm6, xmm2
-	movdqa	xmm2, [esp+640-448]
-	pandn	xmm7, xmm2
-	por	xmm6, xmm7
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm1
-	pandn	xmm6, xmm2
-	paddw	xmm2, [esp+640-496]
-	paddw	xmm2, xmm4
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-336]
-	packuswb xmm6, xmm3
-	psllw	xmm2, 1
-	movdqa	 [esp+672-272], xmm6
-	movdqa	xmm6, [esp+640-96]
-	por	xmm6, [esp+640-352]
-
-	movdqa	xmm3, xmm0
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm0
-	pandn	xmm6, [esp+640-144]
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-352], xmm3
-	movdqa	xmm3, [esp+640-464]
-	paddw	xmm3, [esp+640-592]
-	paddw	xmm2, xmm3
-	movdqa	xmm3, [esp+640-448]
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-496]
-	psraw	xmm5, 3
-	pand	xmm6, xmm5
-	movdqa	xmm5, [esp+640-464]
-	paddw	xmm2, xmm5
-	paddw	xmm5, [esp+640-432]
-	movdqa	xmm4, xmm3
-	paddw	xmm4, xmm3
-	paddw	xmm4, xmm2
-	paddw	xmm4, [esp+640-624]
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm4, 2
-	pandn	xmm2, xmm4
-	por	xmm6, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-496]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm6
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-352]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+688-272], xmm2
-	movdqa	xmm2, [esp+640-128]
-	por	xmm2, [esp+640-288]
-
-	movdqa	xmm4, xmm0
-	pand	xmm4, xmm2
-	paddw	xmm5, xmm6
-	movdqa	xmm2, xmm0
-	pandn	xmm2, [esp+640-400]
-	por	xmm4, xmm2
-	movdqa	xmm2, [esp+640-528]
-	psllw	xmm5, 1
-	paddw	xmm5, xmm3
-	movdqa	xmm3, [esp+640-560]
-	paddw	xmm2, xmm5
-	psraw	xmm2, 3
-	movdqa	 [esp+640-288], xmm4
-	movdqa	xmm4, [esp+640-560]
-	pand	xmm4, xmm2
-	movdqa	xmm2, [esp+640-464]
-	movdqa	xmm5, xmm2
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-432]
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm7, xmm1
-	paddw	xmm5, xmm2
-	paddw	xmm5, [esp+640-624]
-	movdqa	xmm6, [esp+640-560]
-	psraw	xmm5, 2
-	pandn	xmm3, xmm5
-	por	xmm4, xmm3
-	movdqa	xmm3, [esp+640-32]
-	por	xmm3, [esp+640-304]
-	pand	xmm7, xmm4
-	movdqa	xmm4, [esp+640-432]
-	movdqa	xmm5, [esp+640-464]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm4
-	paddw	xmm4, [esp+640-496]
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-288]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+704-272], xmm2
-
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-384]
-	por	xmm2, xmm3
-	movdqa	 [esp+640-304], xmm2
-	movdqa	xmm2, [esp+640-528]
-	movdqa	xmm3, xmm2
-	paddw	xmm3, [esp+640-464]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-624]
-	psraw	xmm3, 2
-	pand	xmm6, xmm3
-	movdqa	xmm3, [esp+640-560]
-	movdqa	xmm4, xmm3
-	pandn	xmm4, xmm5
-	por	xmm6, xmm4
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-304]
-	movdqa	xmm4, xmm1
-	pandn	xmm4, xmm5
-	por	xmm7, xmm4
-
-	movdqa	xmm4, xmm0
-	pandn	xmm0, [esp+640-416]
-	packuswb xmm6, xmm7
-	movdqa	xmm7, [esp+640-112]
-	por	xmm7, [esp+640-80]
-	pand	xmm4, xmm7
-	por	xmm4, xmm0
-	movdqa	xmm0, [esp+752-272]
-	punpckhbw xmm0, [esp+640-48]
-	psllw	xmm0, 1
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm5
-	paddw	xmm0, [esp+640-432]
-	paddw	xmm0, [esp+640-496]
-	paddw	xmm0, [esp+640-592]
-	psraw	xmm0, 3
-	pand	xmm0, xmm3
-	movdqa	xmm7, xmm1
-	pandn	xmm3, xmm2
-	por	xmm0, xmm3
-	pand	xmm7, xmm0
-
-	movdqa	xmm0, [esp+656-272]
-	movdqa	 [edx], xmm0
-
-	movdqa	xmm0, [esp+672-272]
-
-	mov	edx, dword [esp+640-596]
-	movdqa	 [esi], xmm0
-	movdqa	xmm0, [esp+688-272]
-	movdqa	 [edi], xmm0
-	movdqa	xmm0, [esp+704-272]
-
-	pop	edi
-	pandn	xmm1, xmm2
-	movdqa	 [eax], xmm0
-	por	xmm7, xmm1
-	pop	esi
-	packuswb xmm4, xmm7
-	movdqa	 [edx], xmm6
-	movdqa	 [ecx], xmm4
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
-  
-    
-;********************************************************************************
-;
-;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);     
-;
-;********************************************************************************
-
-WELS_EXTERN  DeblockLumaTransposeH2V_sse2
-
-ALIGN  16
-
-DeblockLumaTransposeH2V_sse2:
-    push    ebp
-    push    ebx
-    mov     ebp,   esp
-    and     esp,0FFFFFFF0h
-    sub     esp,   10h    
-    
-    mov     eax,   [ebp + 0Ch]  
-    mov     ecx,   [ebp + 10h]
-    lea     edx,   [eax + ecx * 8]
-    lea     ebx,   [ecx*3]
-    
-    movq    xmm0,  [eax] 
-    movq    xmm7,  [edx]
-    punpcklqdq   xmm0,  xmm7  
-    movq    xmm1,  [eax + ecx]
-    movq    xmm7,  [edx + ecx]
-    punpcklqdq   xmm1,  xmm7
-    movq    xmm2,  [eax + ecx*2] 
-    movq    xmm7,  [edx + ecx*2]
-    punpcklqdq   xmm2,  xmm7
-    movq    xmm3,  [eax + ebx]
-    movq    xmm7,  [edx + ebx]
-    punpcklqdq   xmm3,  xmm7
-    
-    lea     eax,   [eax + ecx * 4]
-    lea     edx,   [edx + ecx * 4]
-    movq    xmm4,  [eax] 
-    movq    xmm7,  [edx]
-    punpcklqdq   xmm4,  xmm7  
-    movq    xmm5,  [eax + ecx]
-    movq    xmm7,  [edx + ecx]
-    punpcklqdq   xmm5,  xmm7
-    movq    xmm6,  [eax + ecx*2] 
-    movq    xmm7,  [edx + ecx*2]
-    punpcklqdq   xmm6,  xmm7
-    
-    movdqa  [esp],   xmm0
-    movq    xmm7,  [eax + ebx]
-    movq    xmm0,  [edx + ebx]
-    punpcklqdq   xmm7,  xmm0
-    movdqa  xmm0,   [esp]
-    
-    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
-    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-    
-    mov    eax,   [ebp + 14h]
-    movdqa  [eax],    xmm4 
-    movdqa  [eax + 10h],  xmm2
-    movdqa  [eax + 20h],  xmm3
-    movdqa  [eax + 30h],  xmm7
-    movdqa  [eax + 40h],  xmm5
-    movdqa  [eax + 50h],  xmm1
-    movdqa  [eax + 60h],  xmm6
-    movdqa  [eax + 70h],  xmm0   
-    
-    mov     esp,   ebp
-    pop     ebx
-    pop     ebp
-    ret
-    
-    
-    
-;*******************************************************************************************
-;
-;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
-;
-;*******************************************************************************************
-
-WELS_EXTERN   DeblockLumaTransposeV2H_sse2
-
-ALIGN  16
-
-DeblockLumaTransposeV2H_sse2:
-    push     ebp
-    mov      ebp,   esp
-    
-    and     esp,  0FFFFFFF0h
-    sub     esp,   10h  
-    
-    mov      eax,   [ebp + 10h]  
-    mov      ecx,   [ebp + 0Ch]
-    mov      edx,   [ebp + 08h]
-      
-    movdqa   xmm0,  [eax]
-    movdqa   xmm1,  [eax + 10h]
-    movdqa   xmm2,  [eax + 20h]
-    movdqa   xmm3,	[eax + 30h]
-    movdqa   xmm4,	[eax + 40h]
-    movdqa   xmm5,	[eax + 50h]
-    movdqa   xmm6,	[eax + 60h]
-    movdqa   xmm7,	[eax + 70h]
-    
-    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
-    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-    
-    lea      eax,   [ecx * 3]
-    
-    movq     [edx],  xmm4 
-    movq     [edx + ecx],  xmm2
-    movq     [edx + ecx*2],  xmm3
-    movq     [edx + eax],  xmm7
-    
-    lea      edx,   [edx + ecx*4]
-    movq     [edx],  xmm5 
-    movq     [edx + ecx],  xmm1
-    movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0    
-    
-    psrldq    xmm4,   8
-    psrldq    xmm2,   8
-    psrldq    xmm3,   8
-    psrldq    xmm7,   8
-    psrldq    xmm5,   8
-    psrldq    xmm1,   8
-    psrldq    xmm6,   8
-    psrldq    xmm0,   8
-    
-    lea       edx,  [edx + ecx*4]
-    movq     [edx],  xmm4 
-    movq     [edx + ecx],  xmm2
-    movq     [edx + ecx*2],  xmm3
-    movq     [edx + eax],  xmm7
-    
-    lea      edx,   [edx + ecx*4]
-    movq     [edx],  xmm5 
-    movq     [edx + ecx],  xmm1
-    movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0   
-    
-    
-    mov      esp,   ebp
-    pop      ebp
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  deblock.asm
+;*
+;*  Abstract
+;*      edge loop
+;*
+;*  History
+;*      08/07/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+
+%ifdef FORMAT_COFF              ; FORMAT_COFF is not defined in this chunk; presumably set by the build for COFF output - TODO confirm
+SECTION .rodata pData
+%else
+SECTION .rodata align=16
+%endif
+; Nothing is emitted into .rodata between the directive above and the switch to .text below.
+SECTION .text
+
+;********************************************************************************
+;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                             int32_t iAlpha, int32_t iBeta)
+;********************************************************************************
+WELS_EXTERN   DeblockChromaEq4V_sse2    ; WELS_EXTERN is a macro defined outside this chunk; presumably exports the symbol
+; Filters the rows above/below pPixCb and pPixCr (8 pixels each): p0' = (2*p1+p0+q1+2)>>2, q0' = (2*q1+q0+p1+2)>>2 where |p0-q0|<alpha, |p1-p0|<beta, |q1-q0|<beta. NOTE: pabsw is an SSSE3 instruction although the name says sse2.
+ALIGN  16
+DeblockChromaEq4V_sse2:
+  push        ebp  
+  mov         ebp,esp 
+  and         esp,0FFFFFFF0h     ; 16-byte align esp so the movdqa spill slots below are legal
+  sub         esp,68h            ; 68h + the two 4-byte pushes below = 70h, so esp is 16-aligned again after them
+  mov         edx,[ebp+10h]      ;  iStride
+  mov         eax,[ebp+8]        ;  pPixCb
+  mov         ecx,[ebp+0Ch]      ;  pPixCr
+  movq        xmm4,[ecx]         ; Cr q0 row (8 bytes at pPixCr)
+  movq        xmm5,[edx+ecx]     ; Cr q1 row (pPixCr + iStride)
+  push        esi  
+  push        edi  
+  lea         esi,[edx+edx]      ; esi = 2*iStride
+  mov         edi,eax 
+  sub         edi,esi 
+  movq        xmm1,[edi]         ; Cb p1 row (pPixCb - 2*iStride)
+  mov         edi,ecx 
+  sub         edi,esi 
+  movq        xmm2,[edi]         ; Cr p1 row
+  punpcklqdq  xmm1,xmm2          ; xmm1 = p1: Cb in the low 8 bytes, Cr in the high 8 bytes
+  mov         esi,eax 
+  sub         esi,edx            ; esi = pPixCb - iStride, kept until the p0 store
+  movq        xmm2,[esi]         ; Cb p0 row
+  mov         edi,ecx 
+  sub         edi,edx            ; edi = pPixCr - iStride, kept until the p0 store
+  movq        xmm3,[edi]         ; Cr p0 row
+  punpcklqdq  xmm2,xmm3          ; xmm2 = p0 (Cb | Cr)
+  movq        xmm3,[eax]         ; Cb q0 row
+  punpcklqdq  xmm3,xmm4          ; xmm3 = q0 (Cb | Cr)
+  movq        xmm4,[edx+eax]     ; Cb q1 row
+  mov       edx, [ebp + 14h]     ; iAlpha (the stride in edx is no longer needed)
+  punpcklqdq  xmm4,xmm5          ; xmm4 = q1 (Cb | Cr)
+  movd        xmm5,edx 
+  mov       edx, [ebp + 18h]     ; iBeta
+  pxor        xmm0,xmm0          ; zero, used to widen bytes to words
+  movdqa      xmm6,xmm5 
+  punpcklwd   xmm6,xmm5 
+  pshufd      xmm5,xmm6,0        ; xmm5 = low 16 bits of iAlpha in all 8 words
+  movd        xmm6,edx 
+  movdqa      xmm7,xmm6 
+  punpcklwd   xmm7,xmm6 
+  pshufd      xmm6,xmm7,0        ; xmm6 = low 16 bits of iBeta in all 8 words
+  movdqa      xmm7,xmm1 
+  punpckhbw   xmm1,xmm0 
+  punpcklbw   xmm7,xmm0 
+  movdqa      [esp+40h],xmm1     ; [esp+40h] = p1 high half (Cr) as words
+  movdqa      [esp+60h],xmm7     ; [esp+60h] = p1 low half (Cb) as words
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm7,xmm0 
+  movdqa      [esp+10h],xmm7     ; [esp+10h] = p0 low half (slot is reused for the rounding constant later)
+  movdqa      xmm7,xmm3 
+  punpcklbw   xmm7,xmm0 
+  punpckhbw   xmm3,xmm0 
+  movdqa      [esp+50h],xmm7     ; [esp+50h] = q0 low half
+  movdqa      xmm7,xmm4 
+  punpckhbw   xmm4,xmm0          ; xmm4 = q1 high half
+  punpckhbw   xmm2,xmm0          ; xmm2 = p0 high half (stays in xmm2)
+  punpcklbw   xmm7,xmm0          ; xmm7 = q1 low half (stays in xmm7)
+  movdqa      [esp+30h],xmm3     ; [esp+30h] = q0 high half
+  movdqa      xmm3,[esp+10h]     ; xmm3 = p0 low half
+  movdqa      xmm1,xmm3 
+  psubw       xmm1,[esp+50h] 
+  pabsw       xmm1,xmm1          ; |p0 - q0| (low half)
+  movdqa      [esp+20h],xmm4     ; [esp+20h] = q1 high half
+  movdqa      xmm0,xmm5 
+  pcmpgtw     xmm0,xmm1          ; alpha > |p0 - q0|
+  movdqa      xmm1,[esp+60h] 
+  psubw       xmm1,xmm3 
+  pabsw       xmm1,xmm1          ; |p1 - p0|
+  movdqa      xmm4,xmm6 
+  pcmpgtw     xmm4,xmm1          ; beta > |p1 - p0|
+  pand        xmm0,xmm4 
+  movdqa      xmm1,xmm7 
+  psubw       xmm1,[esp+50h] 
+  pabsw       xmm1,xmm1          ; |q1 - q0|
+  movdqa      xmm4,xmm6 
+  pcmpgtw     xmm4,xmm1          ; beta > |q1 - q0| (ANDed into xmm0 a few lines below)
+  movdqa      xmm1,xmm2 
+  psubw       xmm1,[esp+30h] 
+  pabsw       xmm1,xmm1          ; same three tests for the high half start here
+  pcmpgtw     xmm5,xmm1 
+  movdqa      xmm1,[esp+40h] 
+  pand        xmm0,xmm4          ; xmm0 = filter mask for the low half (Cb)
+  psubw       xmm1,xmm2 
+  pabsw       xmm1,xmm1 
+  movdqa      xmm4,xmm6 
+  pcmpgtw     xmm4,xmm1 
+  movdqa      xmm1,[esp+20h] 
+  psubw       xmm1,[esp+30h] 
+  pand        xmm5,xmm4 
+  pabsw       xmm1,xmm1 
+  pcmpgtw     xmm6,xmm1 
+  pand        xmm5,xmm6          ; xmm5 = filter mask for the high half (Cr)
+  mov         edx,2 
+  movsx       edx,dx 
+  movd        xmm1,edx 
+  movdqa      xmm4,xmm1 
+  punpcklwd   xmm4,xmm1 
+  pshufd      xmm1,xmm4,0        ; xmm1 = rounding constant 2 in all 8 words
+  movdqa      xmm4,[esp+60h] 
+  movdqa      xmm6,xmm4 
+  paddw       xmm6,xmm4          ; 2*p1
+  paddw       xmm6,xmm3          ; + p0
+  paddw       xmm6,xmm7          ; + q1
+  movdqa      [esp+10h],xmm1     ; [esp+10h] now holds the constant 2 (p0 low half lives on in xmm3)
+  paddw       xmm6,[esp+10h] 
+  psraw       xmm6,2             ; (2*p1 + p0 + q1 + 2) >> 2
+  movdqa      xmm4,xmm0 
+  pandn       xmm4,xmm3          ; keep the original p0 where the mask is clear
+  movdqa      xmm3,[esp+40h] 
+  movdqa      xmm1,xmm0 
+  pand        xmm1,xmm6 
+  por         xmm1,xmm4          ; xmm1 = new p0, low half
+  movdqa      xmm6,xmm3 
+  paddw       xmm6,xmm3 
+  movdqa      xmm3,[esp+10h]     ; xmm3 = constant 2 from here on
+  paddw       xmm6,xmm2 
+  paddw       xmm6,[esp+20h] 
+  paddw       xmm6,xmm3 
+  psraw       xmm6,2             ; same p0 formula, high half
+  movdqa      xmm4,xmm5 
+  pand        xmm4,xmm6 
+  movdqa      xmm6,xmm5 
+  pandn       xmm6,xmm2 
+  por         xmm4,xmm6 
+  packuswb    xmm1,xmm4          ; xmm1 = new p0 bytes (Cb low 8, Cr high 8)
+  movdqa      xmm4,[esp+50h] 
+  movdqa      xmm6,xmm7 
+  paddw       xmm6,xmm7          ; 2*q1
+  paddw       xmm6,xmm4          ; + q0
+  paddw       xmm6,[esp+60h]     ; + p1
+  paddw       xmm6,xmm3 
+  psraw       xmm6,2             ; (2*q1 + q0 + p1 + 2) >> 2
+  movdqa      xmm2,xmm0 
+  pand        xmm2,xmm6 
+  pandn       xmm0,xmm4 
+  por         xmm2,xmm0          ; xmm2 = new q0, low half
+  movdqa      xmm0,[esp+20h] 
+  movdqa      xmm6,xmm0 
+  paddw       xmm6,xmm0 
+  movdqa      xmm0,[esp+30h] 
+  paddw       xmm6,xmm0 
+  paddw       xmm6,[esp+40h] 
+  movdqa      xmm4,xmm5 
+  paddw       xmm6,xmm3 
+  movq        [esi],xmm1         ; store new Cb p0 row
+  psraw       xmm6,2 
+  pand        xmm4,xmm6 
+  pandn       xmm5,xmm0 
+  por         xmm4,xmm5 
+  packuswb    xmm2,xmm4          ; xmm2 = new q0 bytes (Cb low 8, Cr high 8)
+  movq        [eax],xmm2         ; store new Cb q0 row
+  psrldq      xmm1,8 
+  movq        [edi],xmm1         ; store new Cr p0 row
+  pop         edi  
+  psrldq      xmm2,8 
+  movq        [ecx],xmm2         ; store new Cr q0 row
+  pop         esi  
+  mov         esp,ebp 
+  pop         ebp  
+  ret              
+
+;******************************************************************************
+; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
+;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
+; Filters the rows above/below pPixCb and pPixCr (8 pixels each): delta = clip(-tc, tc, (4*(q0-p0) + (p1-q1) + 4) >> 3); p0 += delta, q0 -= delta where the alpha/beta tests pass and tc > 0.
+WELS_EXTERN  DeblockChromaLt4V_sse2
+; pTC supplies 4 signed bytes, each applied to two adjacent pixels. NOTE: pabsw is SSSE3 although the name says sse2; unlike the other routines in view there is no ALIGN 16 before this label.
+DeblockChromaLt4V_sse2:
+  push        ebp  
+  mov         ebp,esp 
+  and         esp,0FFFFFFF0h     ; 16-byte align esp for the movdqa spill slots
+  sub         esp,0E4h           ; 0E4h + the three 4-byte pushes below = 0F0h, so esp is 16-aligned again after them
+  push        ebx  
+  push        esi  
+  mov         esi, [ebp+1Ch]      ;  pTC
+  movsx       ebx, byte [esi+2]   ; tc[2]
+  push        edi  
+  movsx       di,byte [esi+3]     ; tc[3]
+  mov         word [esp+0Ch],bx   ; park tc[2] on the stack
+  movsx       bx,byte  [esi+1]    ; tc[1]
+  movsx       esi,byte  [esi]     ; tc[0]
+  mov         word  [esp+0Eh],si  ; park tc[0] on the stack
+  movzx       esi,di 
+  movd        xmm1,esi            ; xmm1 = tc[3]
+  movzx       esi,di 
+  movd        xmm2,esi            ; xmm2 = tc[3]
+  mov         si,word  [esp+0Ch] 
+  mov         edx, [ebp + 10h]    ; iStride
+  mov         eax, [ebp + 08h]    ; pPixCb
+  movzx       edi,si 
+  movzx       esi,si 
+  mov         ecx, [ebp + 0Ch]    ; pPixCr
+  movd        xmm4,esi            ; xmm4 = tc[2]
+  movzx       esi,bx 
+  movd        xmm5,esi            ; xmm5 = tc[1]
+  movd        xmm3,edi            ; xmm3 = tc[2]
+  movzx       esi,bx 
+  movd        xmm6,esi            ; xmm6 = tc[1]
+  mov         si,word [esp+0Eh] 
+  movzx       edi,si 
+  movzx       esi,si 
+  punpcklwd   xmm6,xmm2           ; xmm6 = words [tc1, tc3, 0...]
+  pxor        xmm0,xmm0 
+  movdqa      [esp+40h],xmm0      ; [esp+40h] = zero for now (later reused for the tc > 0 mask)
+  movd        xmm7,edi            ; xmm7 = tc[0]
+  movd        xmm0,esi            ; xmm0 = tc[0]
+  lea         esi,[edx+edx]       ; esi = 2*iStride
+  mov         edi,eax 
+  sub         edi,esi             ; edi = pPixCb - 2*iStride
+  punpcklwd   xmm5,xmm1           ; xmm5 = [tc1, tc3, 0...]
+  movdqa      xmm1,[esp+40h]      ; xmm1 = zero
+  punpcklwd   xmm0,xmm4           ; xmm0 = [tc0, tc2, 0...]
+  movq        xmm4,[edx+ecx]      ; Cr q1 row
+  punpcklwd   xmm7,xmm3           ; xmm7 = [tc0, tc2, 0...]
+  movq        xmm3,[eax]          ; Cb q0 row
+  punpcklwd   xmm0,xmm6           ; xmm0 = [tc0, tc1, tc2, tc3, 0...]
+  movq        xmm6,[edi]          ; Cb p1 row
+  punpcklwd   xmm7,xmm5           ; xmm7 = [tc0, tc1, tc2, tc3, 0...]
+  punpcklwd   xmm0,xmm7           ; xmm0 = [tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3], used for both halves
+  mov         edi,ecx 
+  sub         edi,esi             ; edi = pPixCr - 2*iStride
+  movdqa      xmm2,xmm1 
+  psubw       xmm2,xmm0 
+  movdqa      [esp+60h],xmm2      ; [esp+60h] = -tc
+  movq        xmm2, [edi]         ; Cr p1 row
+  punpcklqdq  xmm6,xmm2           ; xmm6 = p1: Cb in the low 8 bytes, Cr in the high 8 bytes
+  mov         esi,eax 
+  sub         esi,edx             ; esi = pPixCb - iStride, kept until the p0 store
+  movq        xmm7,[esi]          ; Cb p0 row
+  mov         edi,ecx 
+  sub         edi,edx             ; edi = pPixCr - iStride, kept until the p0 store
+  movq        xmm2,[edi]          ; Cr p0 row
+  punpcklqdq  xmm7,xmm2           ; xmm7 = p0 (Cb | Cr)
+  movq        xmm2,[ecx]          ; Cr q0 row
+  punpcklqdq  xmm3,xmm2           ; xmm3 = q0 (Cb | Cr)
+  movq        xmm2,[edx+eax]      ; Cb q1 row
+  movsx       edx,word [ebp + 14h]   ; low 16 bits of iAlpha (stride no longer needed)
+  punpcklqdq  xmm2,xmm4           ; xmm2 = q1 (Cb | Cr)
+  movdqa      [esp+0E0h],xmm2     ; [esp+0E0h] = q1 bytes
+  movd        xmm2,edx 
+  movsx       edx,word [ebp + 18h]   ; low 16 bits of iBeta
+  movdqa      xmm4,xmm2 
+  punpcklwd   xmm4,xmm2 
+  movd        xmm2,edx 
+  movdqa      xmm5,xmm2 
+  punpcklwd   xmm5,xmm2 
+  pshufd      xmm2,xmm5,0 
+  movdqa      [esp+50h],xmm2      ; [esp+50h] = beta in all 8 words
+  movdqa      xmm2,xmm6 
+  punpcklbw   xmm2,xmm1 
+  movdqa      [esp+0D0h],xmm3     ; [esp+0D0h] = q0 bytes
+  pshufd      xmm4,xmm4,0         ; xmm4 = alpha in all 8 words
+  movdqa      [esp+30h],xmm2      ; [esp+30h] = p1 low half as words (later reused for the low-half delta)
+  punpckhbw   xmm6,xmm1 
+  movdqa      [esp+80h],xmm6      ; [esp+80h] = p1 high half
+  movdqa      xmm6,[esp+0D0h] 
+  punpckhbw   xmm6,xmm1 
+  movdqa      [esp+70h],xmm6      ; [esp+70h] = q0 high half
+  movdqa      xmm6, [esp+0E0h] 
+  punpckhbw   xmm6,xmm1 
+  movdqa     [esp+90h],xmm6       ; [esp+90h] = q1 high half
+  movdqa      xmm5, [esp+0E0h] 
+  movdqa      xmm2,xmm7 
+  punpckhbw   xmm7,xmm1 
+  punpcklbw   xmm5,xmm1           ; xmm5 = q1 low half
+  movdqa       [esp+0A0h],xmm7    ; [esp+0A0h] = p0 high half
+  punpcklbw   xmm3,xmm1           ; xmm3 = q0 low half
+  mov         edx,4 
+  punpcklbw   xmm2,xmm1           ; xmm2 = p0 low half
+  movsx       edx,dx 
+  movd        xmm6,edx 
+  movdqa      xmm7,xmm6 
+  punpcklwd   xmm7,xmm6 
+  pshufd      xmm6,xmm7,0 
+  movdqa      xmm7,[esp+30h] 
+  movdqa      [esp+20h],xmm6      ; [esp+20h] = rounding constant 4 in all 8 words
+  psubw       xmm7,xmm5           ; p1 - q1
+  movdqa      xmm6,xmm0 
+  pcmpgtw     xmm6,xmm1           ; tc > 0
+  movdqa      xmm1,[esp+60h]      ; xmm1 = -tc
+  movdqa      [esp+40h],xmm6      ; [esp+40h] = tc > 0 mask
+  movdqa      xmm6,xmm3 
+  psubw       xmm6,xmm2           ; q0 - p0
+  psllw       xmm6,2 
+  paddw       xmm6,xmm7 
+  paddw       xmm6, [esp+20h] 
+  movdqa      xmm7, [esp+50h] 
+  psraw       xmm6,3              ; (4*(q0-p0) + (p1-q1) + 4) >> 3
+  pmaxsw      xmm1,xmm6           ; clamp from below at -tc
+  movdqa      [esp+10h],xmm0 
+  movdqa      xmm6, [esp+10h] 
+  pminsw      xmm6,xmm1           ; clamp from above at tc
+  movdqa      [esp+10h],xmm6      ; [esp+10h] = clipped delta, low half
+  movdqa      xmm1,xmm2 
+  psubw       xmm1,xmm3 
+  pabsw       xmm1,xmm1           ; |p0 - q0|
+  movdqa      xmm6,xmm4 
+  pcmpgtw     xmm6,xmm1           ; alpha > |p0 - q0|
+  movdqa      xmm1, [esp+30h] 
+  psubw       xmm1,xmm2 
+  pabsw       xmm1,xmm1           ; |p1 - p0|
+  pcmpgtw     xmm7,xmm1           ; beta > |p1 - p0|
+  movdqa      xmm1,[esp+50h] 
+  pand        xmm6,xmm7 
+  movdqa      xmm7,[esp+50h] 
+  psubw       xmm5,xmm3 
+  pabsw       xmm5,xmm5           ; |q1 - q0|
+  pcmpgtw     xmm1,xmm5           ; beta > |q1 - q0|
+  movdqa      xmm5,[esp+80h] 
+  psubw       xmm5,[esp+90h]      ; high half: p1 - q1
+  pand        xmm6,xmm1 
+  pand        xmm6,[esp+40h]      ; also require tc > 0
+  movdqa      xmm1,[esp+10h] 
+  pand        xmm1,xmm6           ; zero the delta where the mask is clear
+  movdqa      xmm6,[esp+70h] 
+  movdqa      [esp+30h],xmm1      ; [esp+30h] = masked delta, low half
+  movdqa      xmm1,[esp+0A0h]     ; xmm1 = p0 high half
+  psubw       xmm6,xmm1 
+  psllw       xmm6,2 
+  paddw       xmm6,xmm5 
+  paddw       xmm6,[esp+20h] 
+  movdqa      xmm5,[esp+60h] 
+  psraw       xmm6,3 
+  pmaxsw      xmm5,xmm6 
+  pminsw      xmm0,xmm5           ; xmm0 = clipped delta, high half
+  movdqa      xmm5,[esp+70h]      ; xmm5 = q0 high half
+  movdqa      xmm6,xmm1 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6 
+  pcmpgtw     xmm4,xmm6           ; alpha test, high half
+  movdqa      xmm6,[esp+80h] 
+  psubw       xmm6,xmm1 
+  pabsw       xmm6,xmm6 
+  pcmpgtw     xmm7,xmm6           ; beta test on |p1 - p0|, high half
+  movdqa      xmm6,[esp+90h] 
+  pand        xmm4,xmm7 
+  movdqa      xmm7,[esp+50h] 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6 
+  pcmpgtw     xmm7,xmm6           ; beta test on |q1 - q0|, high half
+  pand        xmm4,xmm7 
+  pand        xmm4,[esp+40h] 
+  pand        xmm0,xmm4           ; masked delta, high half
+  movdqa      xmm4,[esp+30h] 
+  paddw       xmm2,xmm4           ; p0 + delta (low)
+  paddw       xmm1,xmm0           ; p0 + delta (high)
+  packuswb    xmm2,xmm1           ; saturate to bytes: new p0 (Cb low 8, Cr high 8)
+  movq        [esi],xmm2          ; store new Cb p0 row
+  psubw       xmm3,xmm4           ; q0 - delta (low)
+  psubw       xmm5,xmm0           ; q0 - delta (high)
+  packuswb    xmm3,xmm5           ; new q0 bytes
+  movq        [eax],xmm3          ; store new Cb q0 row
+  psrldq      xmm2,8 
+  movq        [edi],xmm2          ; store new Cr p0 row
+  pop         edi  
+  pop         esi  
+  psrldq      xmm3,8 
+  movq        [ecx],xmm3          ; store new Cr q0 row
+  pop         ebx  
+  mov         esp,ebp 
+  pop         ebp  
+  ret    
+  
+;***************************************************************************
+;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
+;          int32_t iAlpha, int32_t iBeta)
+;***************************************************************************
+; Reads 4 bytes (p1 p0 q0 q1) at pPix-2 on 8 rows of Cb and 8 rows of Cr, transposes them, applies p0' = (2*p1+p0+q1+2)>>2 and q0' = (2*q1+q0+p1+2)>>2 under the alpha/beta tests, transposes back and rewrites all 4 bytes of each row.
+WELS_EXTERN     DeblockChromaEq4H_sse2
+; NOTE: pabsw is an SSSE3 instruction although the name says sse2.
+ALIGN  16
+  ; esp-relative slot offsets below shift by 4 at each push; comments give the offsets valid after both pushes.
+DeblockChromaEq4H_sse2:
+  push        ebp  
+  mov         ebp,esp 
+  and         esp,0FFFFFFF0h     ; 16-byte align esp for the movdqa slots
+  sub         esp,0C8h           ; 0C8h + the two 4-byte pushes below = 0D0h, so esp is 16-aligned again after them
+  mov         ecx,dword [ebp+8]      ; pPixCb
+  mov         edx,dword [ebp+0Ch]    ; pPixCr
+  mov         eax,dword [ebp+10h]    ; iStride
+  sub         ecx,2              ; start two pixels left of the edge (at p1)
+  sub         edx,2 
+  push        esi  
+  lea         esi,[eax+eax*2]    ; esi = 3*iStride
+  mov         dword [esp+18h],ecx    ; pPixCb-2 (at [esp+1Ch] after push edi)
+  mov         dword [esp+4],edx      ; pPixCr-2 (at [esp+8] after push edi)
+  lea         ecx,[ecx+eax*4]    ; Cb pointer for rows 4..7
+  lea         edx,[edx+eax*4]    ; Cr pointer for rows 4..7
+  lea         eax,[esp+7Ch]      ; address of the 64-byte scratch area (= esp+80h after push edi)
+  push        edi  
+  mov         dword [esp+14h],esi    ; [esp+14h] = 3*iStride
+  mov         dword [esp+18h],ecx    ; [esp+18h] = pPixCb-2 + 4*iStride
+  mov         dword [esp+0Ch],edx    ; [esp+0Ch] = pPixCr-2 + 4*iStride
+  mov         dword [esp+10h],eax    ; [esp+10h] = scratch pointer
+  mov         esi,dword [esp+1Ch]    ; pPixCb-2
+  mov         ecx,dword [ebp+10h]    ; iStride
+  mov         edx,dword [esp+14h]    ; 3*iStride
+  movd        xmm0,dword [esi]           ; Cb rows 0..3
+  movd        xmm1,dword [esi+ecx] 
+  movd        xmm2,dword [esi+ecx*2] 
+  movd        xmm3,dword [esi+edx] 
+  mov         esi,dword  [esp+8]     ; pPixCr-2
+  movd        xmm4,dword [esi]           ; Cr rows 0..3
+  movd        xmm5,dword [esi+ecx] 
+  movd        xmm6,dword [esi+ecx*2] 
+  movd        xmm7,dword [esi+edx] 
+  punpckldq   xmm0,xmm4          ; xmmN = [Cb row N | Cr row N] in the low 8 bytes
+  punpckldq   xmm1,xmm5 
+  punpckldq   xmm2,xmm6 
+  punpckldq   xmm3,xmm7 
+  mov         esi,dword [esp+18h]    ; Cb rows 4..7
+  mov         edi,dword [esp+0Ch]    ; Cr rows 4..7
+  movd        xmm4,dword [esi] 
+  movd        xmm5,dword [edi] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm0,xmm4          ; xmm0 = [Cb r0, Cr r0, Cb r4, Cr r4]
+  movd        xmm4,dword [esi+ecx] 
+  movd        xmm5,dword [edi+ecx] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm1,xmm4          ; xmm1 = rows 1 and 5
+  movd        xmm4,dword [esi+ecx*2] 
+  movd        xmm5,dword [edi+ecx*2] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm2,xmm4          ; xmm2 = rows 2 and 6
+  movd        xmm4,dword [esi+edx] 
+  movd        xmm5,dword [edi+edx] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm3,xmm4          ; xmm3 = rows 3 and 7
+  movdqa      xmm6,xmm0          ; byte transpose: interleave bytes, then words, dwords, qwords
+  punpcklbw   xmm0,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm2,xmm3 
+  punpckhbw   xmm7,xmm3 
+  movdqa      xmm4,xmm0 
+  movdqa      xmm5,xmm6 
+  punpcklwd   xmm0,xmm2 
+  punpckhwd   xmm4,xmm2 
+  punpcklwd   xmm6,xmm7 
+  punpckhwd   xmm5,xmm7 
+  movdqa      xmm1,xmm0 
+  movdqa      xmm2,xmm4 
+  punpckldq   xmm0,xmm6 
+  punpckhdq   xmm1,xmm6 
+  punpckldq   xmm4,xmm5 
+  punpckhdq   xmm2,xmm5 
+  movdqa      xmm5,xmm0 
+  movdqa      xmm6,xmm1 
+  punpcklqdq  xmm0,xmm4          ; xmm0 = p1 column: Cb rows 0..7 in the low 8 bytes, Cr rows 0..7 in the high 8
+  punpckhqdq  xmm5,xmm4          ; xmm5 = p0 column
+  punpcklqdq  xmm1,xmm2          ; xmm1 = q0 column
+  punpckhqdq  xmm6,xmm2          ; xmm6 = q1 column
+  mov         edi,dword [esp+10h]    ; scratch area (= esp+80h)
+  movdqa      [edi],xmm0             ; [esp+80h]  = p1
+  movdqa      [edi+10h],xmm5         ; [esp+90h]  = p0
+  movdqa      [edi+20h],xmm1         ; [esp+0A0h] = q0
+  movdqa      [edi+30h],xmm6         ; [esp+0B0h] = q1
+  movsx       ecx,word [ebp+14h]     ; low 16 bits of iAlpha
+  movsx       edx,word [ebp+18h]     ; low 16 bits of iBeta
+  movdqa      xmm6,[esp+80h] 
+  movdqa      xmm4,[esp+90h] 
+  movdqa      xmm5,[esp+0A0h] 
+  movdqa      xmm7,[esp+0B0h] 
+  pxor        xmm0,xmm0          ; zero, used to widen bytes to words
+  movd        xmm1,ecx 
+  movdqa      xmm2,xmm1 
+  punpcklwd   xmm2,xmm1 
+  pshufd      xmm1,xmm2,0        ; xmm1 = alpha in all 8 words
+  movd        xmm2,edx 
+  movdqa      xmm3,xmm2 
+  punpcklwd   xmm3,xmm2 
+  pshufd      xmm2,xmm3,0        ; xmm2 = beta in all 8 words
+  movdqa      xmm3,xmm6 
+  punpckhbw   xmm6,xmm0 
+  movdqa      [esp+60h],xmm6     ; [esp+60h] = p1 high half (Cr) as words
+  movdqa      xmm6,[esp+90h] 
+  punpckhbw   xmm6,xmm0 
+  movdqa      [esp+30h],xmm6     ; [esp+30h] = p0 high half
+  movdqa      xmm6,[esp+0A0h] 
+  punpckhbw   xmm6,xmm0 
+  movdqa      [esp+40h],xmm6     ; [esp+40h] = q0 high half
+  movdqa      xmm6,[esp+0B0h] 
+  punpckhbw   xmm6,xmm0 
+  movdqa      [esp+70h],xmm6     ; [esp+70h] = q1 high half
+  punpcklbw   xmm7,xmm0          ; xmm7 = q1 low half (Cb)
+  punpcklbw   xmm4,xmm0          ; xmm4 = p0 low half
+  punpcklbw   xmm5,xmm0          ; xmm5 = q0 low half
+  punpcklbw   xmm3,xmm0          ; xmm3 = p1 low half
+  movdqa      [esp+50h],xmm7     ; [esp+50h] = q1 low half
+  movdqa      xmm6,xmm4 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6          ; |p0 - q0|
+  movdqa      xmm0,xmm1 
+  pcmpgtw     xmm0,xmm6          ; alpha > |p0 - q0|
+  movdqa      xmm6,xmm3 
+  psubw       xmm6,xmm4 
+  pabsw       xmm6,xmm6          ; |p1 - p0|
+  movdqa      xmm7,xmm2 
+  pcmpgtw     xmm7,xmm6          ; beta > |p1 - p0|
+  movdqa      xmm6,[esp+50h] 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6          ; |q1 - q0|
+  pand        xmm0,xmm7 
+  movdqa      xmm7,xmm2 
+  pcmpgtw     xmm7,xmm6          ; beta > |q1 - q0| (ANDed into xmm0 below)
+  movdqa      xmm6,[esp+30h] 
+  psubw       xmm6,[esp+40h] 
+  pabsw       xmm6,xmm6          ; same three tests for the high half start here
+  pcmpgtw     xmm1,xmm6 
+  movdqa      xmm6,[esp+60h] 
+  psubw       xmm6,[esp+30h] 
+  pabsw       xmm6,xmm6 
+  pand        xmm0,xmm7          ; xmm0 = filter mask, low half
+  movdqa      xmm7,xmm2 
+  pcmpgtw     xmm7,xmm6 
+  movdqa      xmm6,[esp+70h] 
+  psubw       xmm6,[esp+40h] 
+  pabsw       xmm6,xmm6 
+  pand        xmm1,xmm7 
+  pcmpgtw     xmm2,xmm6 
+  pand        xmm1,xmm2          ; xmm1 = filter mask, high half
+  mov         eax,2 
+  movsx       ecx,ax 
+  movd        xmm2,ecx 
+  movdqa      xmm6,xmm2 
+  punpcklwd   xmm6,xmm2 
+  pshufd      xmm2,xmm6,0 
+  movdqa      [esp+20h],xmm2     ; [esp+20h] = rounding constant 2 in all 8 words
+  movdqa      xmm2,xmm3 
+  paddw       xmm2,xmm3          ; 2*p1
+  paddw       xmm2,xmm4          ; + p0
+  paddw       xmm2,[esp+50h]     ; + q1
+  paddw       xmm2,[esp+20h] 
+  psraw       xmm2,2             ; (2*p1 + p0 + q1 + 2) >> 2
+  movdqa      xmm6,xmm0 
+  pand        xmm6,xmm2 
+  movdqa      xmm2,xmm0 
+  pandn       xmm2,xmm4          ; keep the original p0 where the mask is clear
+  por         xmm6,xmm2          ; xmm6 = new p0, low half
+  movdqa      xmm2,[esp+60h] 
+  movdqa      xmm7,xmm2 
+  paddw       xmm7,xmm2 
+  paddw       xmm7,[esp+30h] 
+  paddw       xmm7,[esp+70h] 
+  paddw       xmm7,[esp+20h] 
+  movdqa      xmm4,xmm1 
+  movdqa      xmm2,xmm1 
+  pandn       xmm2,[esp+30h] 
+  psraw       xmm7,2             ; same p0 formula, high half
+  pand        xmm4,xmm7 
+  por         xmm4,xmm2 
+  movdqa      xmm2,[esp+50h] 
+  packuswb    xmm6,xmm4 
+  movdqa      [esp+90h],xmm6     ; overwrite the p0 column in the scratch area
+  movdqa      xmm6,xmm2 
+  paddw       xmm6,xmm2          ; 2*q1
+  movdqa      xmm2,[esp+20h] 
+  paddw       xmm6,xmm5          ; + q0
+  paddw       xmm6,xmm3          ; + p1
+  movdqa      xmm4,xmm0 
+  pandn       xmm0,xmm5 
+  paddw       xmm6,xmm2 
+  psraw       xmm6,2             ; (2*q1 + q0 + p1 + 2) >> 2
+  pand        xmm4,xmm6 
+  por         xmm4,xmm0          ; xmm4 = new q0, low half
+  movdqa      xmm0,[esp+70h] 
+  movdqa      xmm5,xmm0 
+  paddw       xmm5,xmm0 
+  movdqa      xmm0,[esp+40h] 
+  paddw       xmm5,xmm0 
+  paddw       xmm5,[esp+60h] 
+  movdqa      xmm3,xmm1 
+  paddw       xmm5,xmm2 
+  psraw       xmm5,2             ; same q0 formula, high half
+  pand        xmm3,xmm5 
+  pandn       xmm1,xmm0 
+  por         xmm3,xmm1 
+  packuswb    xmm4,xmm3 
+  movdqa      [esp+0A0h],xmm4    ; overwrite the q0 column in the scratch area
+  mov         esi,dword [esp+10h]    ; scratch pointer
+  movdqa      xmm0,[esi]             ; p1, new p0, new q0, q1 columns
+  movdqa      xmm1,[esi+10h] 
+  movdqa      xmm2,[esi+20h] 
+  movdqa      xmm3,[esi+30h] 
+  movdqa      xmm6,xmm0          ; transpose back to rows (same shuffle sequence as above)
+  punpcklbw   xmm0,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm2,xmm3 
+  punpckhbw   xmm7,xmm3 
+  movdqa      xmm4,xmm0 
+  movdqa      xmm5,xmm6 
+  punpcklwd   xmm0,xmm2 
+  punpckhwd   xmm4,xmm2 
+  punpcklwd   xmm6,xmm7 
+  punpckhwd   xmm5,xmm7 
+  movdqa      xmm1,xmm0 
+  movdqa      xmm2,xmm4 
+  punpckldq   xmm0,xmm6 
+  punpckhdq   xmm1,xmm6 
+  punpckldq   xmm4,xmm5 
+  punpckhdq   xmm2,xmm5 
+  movdqa      xmm5,xmm0 
+  movdqa      xmm6,xmm1 
+  punpcklqdq  xmm0,xmm4          ; xmm0 = [Cb r0, Cr r0, Cb r4, Cr r4]
+  punpckhqdq  xmm5,xmm4          ; xmm5 = rows 1 and 5
+  punpcklqdq  xmm1,xmm2          ; xmm1 = rows 2 and 6
+  punpckhqdq  xmm6,xmm2          ; xmm6 = rows 3 and 7
+  mov         esi,dword [esp+1Ch]    ; pPixCb-2
+  mov         ecx,dword [ebp+10h]    ; iStride
+  mov         edx,dword [esp+14h]    ; 3*iStride
+  mov         edi,dword [esp+8]      ; pPixCr-2
+  movd        dword [esi],xmm0           ; store Cb rows 0..3
+  movd        dword [esi+ecx],xmm5 
+  movd        dword [esi+ecx*2],xmm1 
+  movd        dword [esi+edx],xmm6 
+  psrldq      xmm0,4             ; shift the next dword (Cr rows 0..3) into the low lane
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  mov         esi,dword [esp+18h]    ; Cb pointer for rows 4..7
+  movd        dword [edi],xmm0           ; store Cr rows 0..3
+  movd        dword [edi+ecx],xmm5 
+  movd        dword [edi+ecx*2],xmm1 
+  movd        dword [edi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  movd        dword [esi],xmm0           ; store Cb rows 4..7
+  movd        dword [esi+ecx],xmm5 
+  movd        dword [esi+ecx*2],xmm1 
+  movd        dword [esi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  mov         edi,dword [esp+0Ch]    ; Cr pointer for rows 4..7
+  movd        dword [edi],xmm0           ; store Cr rows 4..7
+  movd        dword [edi+ecx],xmm5 
+  movd        dword [edi+ecx*2],xmm1 
+  movd        dword [edi+edx],xmm6 
+  pop         edi  
+  pop         esi  
+  mov         esp,ebp 
+  pop         ebp  
+  ret              
+  
+;*******************************************************************************
+;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
+;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
+  ; Reads 4 bytes (p1 p0 q0 q1) at pPix-2 on 8 rows of Cb and 8 rows of Cr, transposes, applies delta = clip(-tc, tc, (4*(q0-p0) + (p1-q1) + 4) >> 3) to p0/q0 under the alpha/beta/tc>0 tests, transposes back and rewrites all 4 bytes of each row.
+WELS_EXTERN  DeblockChromaLt4H_sse2
+  ; pTC supplies 4 signed bytes, each applied to two adjacent rows. NOTE: pabsw is SSSE3 although the name says sse2.
+ALIGN  16
+; esp-relative slot offsets below shift by 4 at each push; comments give the offsets valid after both pushes.
+DeblockChromaLt4H_sse2:
+  push        ebp  
+  mov         ebp,esp 
+  and         esp,0FFFFFFF0h     ; 16-byte align esp for the movdqa slots
+  sub         esp,108h           ; 108h + the two 4-byte pushes below = 110h, so esp is 16-aligned again after them
+  mov         ecx,dword [ebp+8]      ; pPixCb
+  mov         edx,dword [ebp+0Ch]    ; pPixCr
+  mov         eax,dword [ebp+10h]    ; iStride
+  sub         ecx,2              ; start two pixels left of the edge (at p1)
+  sub         edx,2 
+  push        esi  
+  lea         esi,[eax+eax*2]    ; esi = 3*iStride
+  mov         dword [esp+10h],ecx    ; pPixCb-2 (at [esp+14h] after push edi)
+  mov         dword [esp+4],edx      ; pPixCr-2 (at [esp+8] after push edi)
+  lea         ecx,[ecx+eax*4]    ; Cb pointer for rows 4..7
+  lea         edx,[edx+eax*4]    ; Cr pointer for rows 4..7
+  lea         eax,[esp+6Ch]      ; address of the 64-byte scratch area (= esp+70h after push edi)
+  push        edi  
+  mov         dword [esp+0Ch],esi    ; [esp+0Ch] = 3*iStride
+  mov         dword [esp+18h],ecx    ; [esp+18h] = pPixCb-2 + 4*iStride
+  mov         dword [esp+10h],edx    ; [esp+10h] = pPixCr-2 + 4*iStride
+  mov         dword [esp+1Ch],eax    ; [esp+1Ch] = scratch pointer
+  mov         esi,dword [esp+14h]    ; pPixCb-2
+  mov         ecx,dword [ebp+10h]    ; iStride
+  mov         edx,dword [esp+0Ch]    ; 3*iStride
+  movd        xmm0,dword [esi]           ; Cb rows 0..3
+  movd        xmm1,dword [esi+ecx] 
+  movd        xmm2,dword [esi+ecx*2] 
+  movd        xmm3,dword [esi+edx] 
+  mov         esi,dword [esp+8]      ; pPixCr-2
+  movd        xmm4,dword [esi]           ; Cr rows 0..3
+  movd        xmm5,dword [esi+ecx] 
+  movd        xmm6,dword [esi+ecx*2] 
+  movd        xmm7,dword [esi+edx] 
+  punpckldq   xmm0,xmm4          ; xmmN = [Cb row N | Cr row N] in the low 8 bytes
+  punpckldq   xmm1,xmm5 
+  punpckldq   xmm2,xmm6 
+  punpckldq   xmm3,xmm7 
+  mov         esi,dword [esp+18h]    ; Cb rows 4..7
+  mov         edi,dword [esp+10h]    ; Cr rows 4..7
+  movd        xmm4,dword [esi] 
+  movd        xmm5,dword [edi] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm0,xmm4          ; xmm0 = [Cb r0, Cr r0, Cb r4, Cr r4]
+  movd        xmm4,dword [esi+ecx] 
+  movd        xmm5,dword [edi+ecx] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm1,xmm4          ; xmm1 = rows 1 and 5
+  movd        xmm4,dword [esi+ecx*2] 
+  movd        xmm5,dword [edi+ecx*2] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm2,xmm4          ; xmm2 = rows 2 and 6
+  movd        xmm4,dword [esi+edx] 
+  movd        xmm5,dword [edi+edx] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm3,xmm4          ; xmm3 = rows 3 and 7
+  movdqa      xmm6,xmm0          ; byte transpose: interleave bytes, then words, dwords, qwords
+  punpcklbw   xmm0,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm2,xmm3 
+  punpckhbw   xmm7,xmm3 
+  movdqa      xmm4,xmm0 
+  movdqa      xmm5,xmm6 
+  punpcklwd   xmm0,xmm2 
+  punpckhwd   xmm4,xmm2 
+  punpcklwd   xmm6,xmm7 
+  punpckhwd   xmm5,xmm7 
+  movdqa      xmm1,xmm0 
+  movdqa      xmm2,xmm4 
+  punpckldq   xmm0,xmm6 
+  punpckhdq   xmm1,xmm6 
+  punpckldq   xmm4,xmm5 
+  punpckhdq   xmm2,xmm5 
+  movdqa      xmm5,xmm0 
+  movdqa      xmm6,xmm1 
+  punpcklqdq  xmm0,xmm4          ; xmm0 = p1 column: Cb rows 0..7 in the low 8 bytes, Cr rows 0..7 in the high 8
+  punpckhqdq  xmm5,xmm4          ; xmm5 = p0 column
+  punpcklqdq  xmm1,xmm2          ; xmm1 = q0 column
+  punpckhqdq  xmm6,xmm2          ; xmm6 = q1 column
+  mov         edi,dword [esp+1Ch]    ; scratch area (= esp+70h)
+  movdqa      [edi],xmm0             ; [esp+70h]  = p1
+  movdqa      [edi+10h],xmm5         ; [esp+80h]  = p0
+  movdqa      [edi+20h],xmm1         ; [esp+90h]  = q0
+  movdqa      [edi+30h],xmm6         ; [esp+0A0h] = q1
+  mov         eax,dword [ebp+1Ch]    ; pTC
+  movsx       cx,byte [eax+3]        ; tc[3]
+  movsx       dx,byte [eax+2]        ; tc[2]
+  movsx       si,byte [eax+1]        ; tc[1]
+  movsx       ax,byte [eax]          ; tc[0]
+  movzx       edi,cx 
+  movzx       ecx,cx 
+  movd        xmm2,ecx           ; xmm2 = tc[3]
+  movzx       ecx,dx 
+  movzx       edx,dx 
+  movd        xmm3,ecx           ; xmm3 = tc[2]
+  movd        xmm4,edx           ; xmm4 = tc[2]
+  movzx       ecx,si 
+  movzx       edx,si 
+  movd        xmm5,ecx           ; xmm5 = tc[1]
+  pxor        xmm0,xmm0 
+  movd        xmm6,edx           ; xmm6 = tc[1]
+  movzx       ecx,ax 
+  movdqa      [esp+60h],xmm0     ; [esp+60h] = zero for now (later reused for the tc > 0 mask)
+  movzx       edx,ax 
+  movsx       eax,word [ebp+14h]     ; low 16 bits of iAlpha
+  punpcklwd   xmm6,xmm2          ; xmm6 = words [tc1, tc3, 0...]
+  movd        xmm1,edi           ; xmm1 = tc[3]
+  movd        xmm7,ecx           ; xmm7 = tc[0]
+  movsx       ecx,word [ebp+18h]     ; low 16 bits of iBeta
+  movd        xmm0,edx           ; xmm0 = tc[0]
+  punpcklwd   xmm7,xmm3          ; xmm7 = [tc0, tc2, 0...]
+  punpcklwd   xmm5,xmm1          ; xmm5 = [tc1, tc3, 0...]
+  movdqa      xmm1,[esp+60h]     ; xmm1 = zero
+  punpcklwd   xmm7,xmm5          ; xmm7 = [tc0, tc1, tc2, tc3, 0...]
+  movdqa      xmm5,[esp+0A0h]    ; xmm5 = q1 bytes
+  punpcklwd   xmm0,xmm4          ; xmm0 = [tc0, tc2, 0...]
+  punpcklwd   xmm0,xmm6          ; xmm0 = [tc0, tc1, tc2, tc3, 0...]
+  movdqa      xmm6, [esp+70h]    ; xmm6 = p1 bytes
+  punpcklwd   xmm0,xmm7          ; xmm0 = [tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3], used for both halves
+  movdqa      xmm7,[esp+80h]     ; xmm7 = p0 bytes
+  movdqa      xmm2,xmm1 
+  psubw       xmm2,xmm0 
+  movdqa      [esp+0D0h],xmm2    ; [esp+0D0h] = -tc
+  movd        xmm2,eax 
+  movdqa      xmm3,xmm2 
+  punpcklwd   xmm3,xmm2 
+  pshufd      xmm4,xmm3,0        ; xmm4 = alpha in all 8 words
+  movd        xmm2,ecx 
+  movdqa      xmm3,xmm2 
+  punpcklwd   xmm3,xmm2 
+  pshufd      xmm2,xmm3,0 
+  movdqa      xmm3, [esp+90h]    ; xmm3 = q0 bytes
+  movdqa      [esp+50h],xmm2     ; [esp+50h] = beta in all 8 words
+  movdqa      xmm2,xmm6 
+  punpcklbw   xmm2,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      [esp+40h],xmm2     ; [esp+40h] = p1 low half (Cb) as words (later reused for the low-half delta)
+  movdqa      [esp+0B0h],xmm6    ; [esp+0B0h] = p1 high half (Cr)
+  movdqa      xmm6,[esp+90h] 
+  movdqa      xmm2,xmm7 
+  punpckhbw   xmm7,xmm1 
+  punpckhbw   xmm6,xmm1 
+  punpcklbw   xmm2,xmm1          ; xmm2 = p0 low half
+  punpcklbw   xmm3,xmm1          ; xmm3 = q0 low half
+  punpcklbw   xmm5,xmm1          ; xmm5 = q1 low half
+  movdqa      [esp+0F0h],xmm7    ; [esp+0F0h] = p0 high half
+  movdqa      [esp+0C0h],xmm6    ; [esp+0C0h] = q0 high half
+  movdqa      xmm6, [esp+0A0h] 
+  punpckhbw   xmm6,xmm1 
+  movdqa      [esp+0E0h],xmm6    ; [esp+0E0h] = q1 high half
+  mov         edx,4 
+  movsx       eax,dx 
+  movd        xmm6,eax 
+  movdqa      xmm7,xmm6 
+  punpcklwd   xmm7,xmm6 
+  pshufd      xmm6,xmm7,0 
+  movdqa      [esp+30h],xmm6     ; [esp+30h] = rounding constant 4 in all 8 words
+  movdqa      xmm7, [esp+40h] 
+  psubw       xmm7,xmm5          ; p1 - q1
+  movdqa      xmm6,xmm0 
+  pcmpgtw     xmm6,xmm1          ; tc > 0
+  movdqa      [esp+60h],xmm6     ; [esp+60h] = tc > 0 mask
+  movdqa      xmm1, [esp+0D0h]   ; xmm1 = -tc
+  movdqa      xmm6,xmm3 
+  psubw       xmm6,xmm2          ; q0 - p0
+  psllw       xmm6,2 
+  paddw       xmm6,xmm7 
+  paddw       xmm6,[esp+30h] 
+  psraw       xmm6,3             ; (4*(q0-p0) + (p1-q1) + 4) >> 3
+  pmaxsw      xmm1,xmm6          ; clamp from below at -tc
+  movdqa      xmm7,[esp+50h] 
+  movdqa      [esp+20h],xmm0 
+  movdqa      xmm6, [esp+20h] 
+  pminsw      xmm6,xmm1          ; clamp from above at tc
+  movdqa      [esp+20h],xmm6     ; [esp+20h] = clipped delta, low half
+  movdqa      xmm6,xmm4 
+  movdqa      xmm1,xmm2 
+  psubw       xmm1,xmm3 
+  pabsw       xmm1,xmm1          ; |p0 - q0|
+  pcmpgtw     xmm6,xmm1          ; alpha > |p0 - q0|
+  movdqa      xmm1, [esp+40h] 
+  psubw       xmm1,xmm2 
+  pabsw       xmm1,xmm1          ; |p1 - p0|
+  pcmpgtw     xmm7,xmm1          ; beta > |p1 - p0|
+  movdqa      xmm1, [esp+50h] 
+  pand        xmm6,xmm7 
+  movdqa      xmm7, [esp+50h] 
+  psubw       xmm5,xmm3 
+  pabsw       xmm5,xmm5          ; |q1 - q0|
+  pcmpgtw     xmm1,xmm5          ; beta > |q1 - q0|
+  movdqa      xmm5, [esp+0B0h] 
+  psubw       xmm5,[esp+0E0h]    ; high half: p1 - q1
+  pand        xmm6,xmm1 
+  pand        xmm6, [esp+60h]    ; also require tc > 0
+  movdqa      xmm1, [esp+20h] 
+  pand        xmm1,xmm6          ; zero the delta where the mask is clear
+  movdqa      xmm6, [esp+0C0h] 
+  movdqa      [esp+40h],xmm1     ; [esp+40h] = masked delta, low half
+  movdqa      xmm1, [esp+0F0h]   ; xmm1 = p0 high half
+  psubw       xmm6,xmm1 
+  psllw       xmm6,2 
+  paddw       xmm6,xmm5 
+  paddw       xmm6, [esp+30h] 
+  movdqa      xmm5, [esp+0D0h] 
+  psraw       xmm6,3 
+  pmaxsw      xmm5,xmm6 
+  pminsw      xmm0,xmm5          ; xmm0 = clipped delta, high half
+  movdqa      xmm5,[esp+0C0h]    ; xmm5 = q0 high half
+  movdqa      xmm6,xmm1 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6 
+  pcmpgtw     xmm4,xmm6          ; alpha test, high half
+  movdqa      xmm6,[esp+0B0h] 
+  psubw       xmm6,xmm1 
+  pabsw       xmm6,xmm6 
+  pcmpgtw     xmm7,xmm6          ; beta test on |p1 - p0|, high half
+  movdqa      xmm6, [esp+0E0h] 
+  pand        xmm4,xmm7 
+  movdqa      xmm7, [esp+50h] 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6 
+  pcmpgtw     xmm7,xmm6          ; beta test on |q1 - q0|, high half
+  pand        xmm4,xmm7 
+  pand        xmm4,[esp+60h] 
+  pand        xmm0,xmm4          ; masked delta, high half
+  movdqa      xmm4, [esp+40h] 
+  paddw       xmm2,xmm4          ; p0 + delta (low)
+  paddw       xmm1,xmm0          ; p0 + delta (high)
+  psubw       xmm3,xmm4          ; q0 - delta (low)
+  psubw       xmm5,xmm0          ; q0 - delta (high)
+  packuswb    xmm2,xmm1          ; saturate to bytes: new p0 column
+  packuswb    xmm3,xmm5          ; new q0 column
+  movdqa      [esp+80h],xmm2     ; overwrite the p0 column in the scratch area
+  movdqa      [esp+90h],xmm3     ; overwrite the q0 column in the scratch area
+  mov         esi,dword [esp+1Ch]    ; scratch pointer
+  movdqa      xmm0, [esi]            ; p1, new p0, new q0, q1 columns
+  movdqa      xmm1, [esi+10h] 
+  movdqa      xmm2, [esi+20h] 
+  movdqa      xmm3, [esi+30h] 
+  movdqa      xmm6,xmm0          ; transpose back to rows (same shuffle sequence as above)
+  punpcklbw   xmm0,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm2,xmm3 
+  punpckhbw   xmm7,xmm3 
+  movdqa      xmm4,xmm0 
+  movdqa      xmm5,xmm6 
+  punpcklwd   xmm0,xmm2 
+  punpckhwd   xmm4,xmm2 
+  punpcklwd   xmm6,xmm7 
+  punpckhwd   xmm5,xmm7 
+  movdqa      xmm1,xmm0 
+  movdqa      xmm2,xmm4 
+  punpckldq   xmm0,xmm6 
+  punpckhdq   xmm1,xmm6 
+  punpckldq   xmm4,xmm5 
+  punpckhdq   xmm2,xmm5 
+  movdqa      xmm5,xmm0 
+  movdqa      xmm6,xmm1 
+  punpcklqdq  xmm0,xmm4          ; xmm0 = [Cb r0, Cr r0, Cb r4, Cr r4]
+  punpckhqdq  xmm5,xmm4          ; xmm5 = rows 1 and 5
+  punpcklqdq  xmm1,xmm2          ; xmm1 = rows 2 and 6
+  punpckhqdq  xmm6,xmm2          ; xmm6 = rows 3 and 7
+  mov         esi,dword [esp+14h]    ; pPixCb-2
+  mov         ecx,dword [ebp+10h]    ; iStride
+  mov         edx,dword [esp+0Ch]    ; 3*iStride
+  mov         edi,dword [esp+8]      ; pPixCr-2
+  movd        dword [esi],xmm0           ; store Cb rows 0..3
+  movd        dword [esi+ecx],xmm5 
+  movd        dword [esi+ecx*2],xmm1 
+  movd        dword [esi+edx],xmm6 
+  psrldq      xmm0,4             ; shift the next dword (Cr rows 0..3) into the low lane
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  mov         esi,dword [esp+18h]    ; Cb pointer for rows 4..7
+  movd        dword [edi],xmm0           ; store Cr rows 0..3
+  movd        dword [edi+ecx],xmm5 
+  movd        dword [edi+ecx*2],xmm1 
+  movd        dword [edi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  movd        dword [esi],xmm0           ; store Cb rows 4..7
+  movd        dword [esi+ecx],xmm5 
+  movd        dword [esi+ecx*2],xmm1 
+  movd        dword [esi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  mov         edi,dword [esp+10h]    ; Cr pointer for rows 4..7
+  movd        dword [edi],xmm0           ; store Cr rows 4..7
+  movd        dword [edi+ecx],xmm5 
+  movd        dword [edi+ecx*2],xmm1 
+  movd        dword [edi+edx],xmm6  
+  pop         edi  
+  pop         esi   
+  mov         esp,ebp 
+  pop         ebp  
+  ret     
+  
+  
+  
+;*******************************************************************************
+;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
+;                                 int32_t iBeta, int8_t * pTC)
+;*******************************************************************************
+  
+
+WELS_EXTERN  DeblockLumaLt4V_sse2
+  
+ALIGN  16
+
+DeblockLumaLt4V_sse2:
+    push	ebp
+	mov	ebp, esp
+	and	esp, -16				; fffffff0H
+	sub	esp, 420				; 000001a4H
+	mov	eax, dword [ebp+8]
+	mov	ecx, dword [ebp+12]
+
+	pxor	xmm0, xmm0
+	push	ebx
+	mov	edx, dword [ebp+24]
+	movdqa	[esp+424-384], xmm0
+	push	esi
+
+	lea	esi, [ecx+ecx*2]
+	push	edi
+	mov	edi, eax
+	sub	edi, esi
+	movdqa	xmm0, [edi]
+
+	lea	esi, [ecx+ecx]
+	movdqa	[esp+432-208], xmm0
+	mov	edi, eax
+	sub	edi, esi
+	movdqa	xmm0, [edi]
+	movdqa	[esp+448-208], xmm0
+
+	mov	ebx, eax
+	sub	ebx, ecx
+	movdqa	xmm0, [ebx]
+	movdqa	[esp+464-208], xmm0
+
+	movdqa	xmm0, [eax]
+
+	add	ecx, eax
+	movdqa	[esp+480-208], xmm0
+	movdqa	xmm0, [ecx]
+	mov	dword [esp+432-404], ecx
+
+	movsx	ecx, word [ebp+16]
+	movdqa	[esp+496-208], xmm0
+	movdqa	xmm0, [esi+eax]
+
+	movsx	si, byte [edx]
+	movdqa	[esp+512-208], xmm0
+	movd	xmm0, ecx
+	movsx	ecx, word [ebp+20]
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	pshufd	xmm0, xmm1, 0
+	movdqa	[esp+432-112], xmm0
+	movd	xmm0, ecx
+	movsx	cx, byte [edx+1]
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	mov	dword [esp+432-408], ebx
+	movzx	ebx, cx
+	pshufd	xmm0, xmm1, 0
+	movd	xmm1, ebx
+	movzx	ebx, cx
+	movd	xmm2, ebx
+	movzx	ebx, cx
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, si
+	movd	xmm5, ecx
+	movzx	ecx, si
+	movd	xmm6, ecx
+	movzx	ecx, si
+	movd	xmm7, ecx
+	movzx	ecx, si
+	movdqa	[esp+432-336], xmm0
+	movd	xmm0, ecx
+
+	movsx	cx, byte [edx+3]
+	movsx	dx, byte [edx+2]
+	movd	xmm3, ebx
+	punpcklwd xmm0, xmm4
+	movzx	esi, cx
+	punpcklwd xmm6, xmm2
+	punpcklwd xmm5, xmm1
+	punpcklwd xmm0, xmm6
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	punpcklwd xmm0, xmm7
+	movdqa	[esp+432-400], xmm0
+	movd	xmm0, esi
+	movzx	esi, cx
+	movd	xmm2, esi
+	movzx	esi, cx
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, dx
+	movd	xmm3, esi
+	movd	xmm5, ecx
+	punpcklwd xmm5, xmm0
+
+	movdqa	xmm0, [esp+432-384]
+	movzx	ecx, dx
+	movd	xmm6, ecx
+	movzx	ecx, dx
+	movzx	edx, dx
+	punpcklwd xmm6, xmm2
+	movd	xmm7, ecx
+	movd	xmm1, edx
+
+	movdqa	xmm2, [esp+448-208]
+	punpcklbw xmm2, xmm0
+
+	mov	ecx, 4
+	movsx	edx, cx
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	movdqa	xmm5, [esp+496-208]
+	movdqa	xmm3, [esp+464-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-240], xmm5
+	movdqa	xmm5, [esp+512-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-352], xmm5
+	punpcklwd xmm1, xmm4
+	movdqa	xmm4, [esp+432-208]
+	punpcklwd xmm1, xmm6
+	movdqa	xmm6, [esp+480-208]
+	punpcklwd xmm1, xmm7
+	punpcklbw xmm6, xmm0
+	punpcklbw xmm3, xmm0
+	punpcklbw xmm4, xmm0
+	movdqa	xmm7, xmm3
+	psubw	xmm7, xmm4
+	pabsw	xmm7, xmm7
+	movdqa	[esp+432-272], xmm4
+	movdqa	xmm4, [esp+432-336]
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-352]
+	pabsw	xmm7, xmm7
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5
+	movdqa	xmm5, xmm3
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5
+	movdqa	xmm5, [esp+432-400]
+	psubw	xmm5, [esp+432-288]
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5
+	movdqa	xmm5, xmm6
+	psubw	xmm5, xmm3
+	movdqa	[esp+432-32], xmm6
+	psubw	xmm6, [esp+432-240]
+	movdqa	xmm7, xmm5
+	movdqa	[esp+432-384], xmm5
+	movdqa	xmm5, [esp+432-112]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm5, xmm7
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6
+
+	pand	xmm5, xmm7
+	movdqa	xmm6, xmm3
+	psubw	xmm6, xmm2
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-400]
+	pand	xmm5, xmm7
+	movdqa	xmm7, xmm6
+	pcmpeqw	xmm6, xmm0
+	pcmpgtw	xmm7, xmm0
+	por	xmm7, xmm6
+	pand	xmm5, xmm7
+	movdqa	[esp+432-320], xmm5
+	movd	xmm5, edx
+	movdqa	xmm6, xmm5
+	punpcklwd xmm6, xmm5
+	pshufd	xmm5, xmm6, 0
+	movdqa	[esp+432-336], xmm5
+	movdqa	xmm5, [esp+432-224]
+	movdqa	[esp+432-368], xmm5
+	movdqa	xmm6, xmm0
+	psubw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-384]
+	psllw	xmm5, 2
+	movdqa	xmm7, xmm2
+	psubw	xmm7, [esp+432-240]
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+432-336]
+	movdqa	xmm5, [esp+432-368]
+	psraw	xmm7, 3
+	pmaxsw	xmm6, xmm7
+	pminsw	xmm5, xmm6
+
+	pand	xmm5, [esp+432-320]
+	movdqa	xmm6, [esp+432-400]
+	movdqa	[esp+432-64], xmm5
+	movdqa	[esp+432-384], xmm6
+	movdqa	xmm5, xmm0
+	psubw	xmm5, xmm6
+	movdqa	[esp+432-368], xmm5
+	movdqa	xmm6, xmm5
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm5, [esp+432-304]
+	movdqa	xmm7, xmm2
+	paddw	xmm7, xmm2
+	psubw	xmm5, xmm7
+	psraw	xmm5, 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-384]
+	pminsw	xmm5, xmm6
+
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-288]
+	movdqa	xmm6, [esp+432-240]
+	movdqa	[esp+432-96], xmm5
+	movdqa	xmm5, [esp+432-352]
+	paddw	xmm5, [esp+432-304]
+	movdqa	xmm7, xmm6
+	paddw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-368]
+	psubw	xmm5, xmm7
+
+	movdqa	xmm7, [esp+496-208]
+	psraw	xmm5, 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-400]
+	pminsw	xmm5, xmm6
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-256]
+	movdqa	xmm6, [esp+448-208]
+	punpckhbw xmm7, xmm0
+	movdqa	[esp+432-352], xmm7
+
+	movdqa	xmm7, [esp+512-208]
+	punpckhbw xmm6, xmm0
+	movdqa	[esp+432-48], xmm5
+	movdqa	xmm5, [esp+432-208]
+	movdqa	[esp+432-368], xmm6
+	movdqa	xmm6, [esp+464-208]
+	punpckhbw xmm7, xmm0
+	punpckhbw xmm5, xmm0
+	movdqa	[esp+432-384], xmm7
+	punpckhbw xmm6, xmm0
+	movdqa	[esp+432-400], xmm6
+
+	movdqa	xmm7, [esp+432-400]
+	movdqa	xmm6, [esp+480-208]
+	psubw	xmm7, xmm5
+	movdqa	[esp+432-16], xmm5
+	pabsw	xmm7, xmm7
+	punpckhbw xmm6, xmm0
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5
+
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-384]
+	pabsw	xmm7, xmm7
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5
+
+	movdqa	xmm5, [esp+432-400]
+	movdqa	[esp+432-80], xmm6
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5
+
+	movdqa	xmm5, xmm1
+	psubw	xmm5, [esp+432-288]
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5
+	movdqa	xmm5, xmm6
+	psubw	xmm5, [esp+432-400]
+	psubw	xmm6, [esp+432-352]
+	movdqa	[esp+432-272], xmm5
+	movdqa	xmm7, xmm5
+	movdqa	xmm5, [esp+432-112]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm5, xmm7
+	movdqa	xmm7, xmm4
+	pabsw	xmm6, xmm6
+	pcmpgtw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-368]
+
+	pand	xmm5, xmm7
+	movdqa	xmm7, [esp+432-400]
+	psubw	xmm7, xmm6
+	psubw	xmm6, [esp+432-352]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7
+	pand	xmm5, xmm4
+
+	paddw	xmm2, [esp+432-96]
+	movdqa	xmm4, xmm1
+	pcmpgtw	xmm4, xmm0
+	movdqa	xmm7, xmm1
+	pcmpeqw	xmm7, xmm0
+	por	xmm4, xmm7
+	pand	xmm5, xmm4
+	movdqa	xmm4, [esp+432-224]
+	movdqa	[esp+432-320], xmm5
+	movdqa	xmm5, [esp+432-272]
+	movdqa	xmm7, xmm0
+	psubw	xmm7, xmm4
+	psubw	xmm0, xmm1
+	psllw	xmm5, 2
+	paddw	xmm6, xmm5
+	paddw	xmm6, [esp+432-336]
+	movdqa	xmm5, [esp+432-368]
+	movdqa	[esp+432-336], xmm0
+	psraw	xmm6, 3
+	pmaxsw	xmm7, xmm6
+	pminsw	xmm4, xmm7
+	pand	xmm4, [esp+432-320]
+	movdqa	xmm6, xmm0
+	movdqa	xmm0, [esp+432-16]
+	paddw	xmm0, [esp+432-304]
+	movdqa	[esp+432-272], xmm4
+	movdqa	xmm4, [esp+432-368]
+	paddw	xmm4, xmm4
+	psubw	xmm0, xmm4
+
+	movdqa	xmm4, [esp+432-64]
+	psraw	xmm0, 1
+	pmaxsw	xmm6, xmm0
+	movdqa	xmm0, [esp+432-400]
+	movdqa	xmm7, xmm1
+	pminsw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-320]
+	pand	xmm7, xmm6
+	pand	xmm7, [esp+432-288]
+	paddw	xmm5, xmm7
+	packuswb xmm2, xmm5
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm0, xmm5
+	paddw	xmm3, xmm4
+	packuswb xmm3, xmm0
+
+	movdqa	xmm0, [esp+432-32]
+	psubw	xmm0, xmm4
+	movdqa	xmm4, [esp+432-80]
+	psubw	xmm4, xmm5
+
+	movdqa	xmm5, [esp+432-240]
+	paddw	xmm5, [esp+432-48]
+	packuswb xmm0, xmm4
+	movdqa	xmm4, [esp+432-384]
+	paddw	xmm4, [esp+432-304]
+	movdqa	[esp+480-208], xmm0
+	movdqa	xmm0, [esp+432-352]
+	movdqa	xmm7, xmm0
+	paddw	xmm0, xmm0
+
+	mov	ecx, dword [esp+432-408]
+
+	mov	edx, dword [esp+432-404]
+	psubw	xmm4, xmm0
+	movdqa	xmm0, [esp+432-336]
+	movdqa	[edi], xmm2
+	psraw	xmm4, 1
+	pmaxsw	xmm0, xmm4
+	pminsw	xmm1, xmm0
+	movdqa	xmm0, [esp+480-208]
+
+	pop	edi
+	pand	xmm1, xmm6
+	pand	xmm1, [esp+428-256]
+	movdqa	[ecx], xmm3
+	paddw	xmm7, xmm1
+	pop	esi
+	packuswb xmm5, xmm7
+	movdqa	[eax], xmm0
+	movdqa	[edx], xmm5
+	pop	ebx
+	mov	esp, ebp
+	pop	ebp
+	ret
+
+
+;*******************************************************************************
+;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
+;                                 int32_t iBeta)
+;*******************************************************************************
+; Filters 16 pixel columns across the edge between rows pPix-iStride and pPix: reads rows pPix-4*iStride..pPix+3*iStride, rewrites rows pPix-3*iStride..pPix+2*iStride.
+WELS_EXTERN  DeblockLumaEq4V_sse2
+  ; NOTE(review): pabsw used below is an SSSE3 instruction, so this routine needs SSSE3 despite the "_sse2" suffix.
+ALIGN  16
+; All row loads/stores use movdqa, so pPix and iStride must keep every row address 16-byte aligned.
+DeblockLumaEq4V_sse2:
+; Stack args are read through ebp: [ebp+8]=pPix, [ebp+12]=iStride, [ebp+16]=iAlpha, [ebp+20]=iBeta.
+	push	ebp
+	mov	ebp, esp
+	and	esp, -16				; align esp down to 16 bytes so movdqa spills are legal
+	sub	esp, 628				; 628 + three pushes (12) => esp = aligned top - 640
+	mov	eax, dword [ebp+8]	; eax = pPix
+	mov	ecx, dword [ebp+12]	; ecx = iStride
+	push	ebx
+	push	esi
+
+	lea	edx, [ecx*4]	; edx = 4*iStride
+	pxor	xmm0, xmm0
+	movdqa	xmm2, xmm0	; xmm2 = 0, used to zero-extend bytes to words
+
+	movdqa	xmm0, [ecx+eax]	; row pPix + iStride
+	mov	esi, eax
+	sub	esi, edx
+	movdqa	xmm3, [esi]	; row pPix - 4*iStride
+	movdqa	xmm5, [eax]	; row pPix
+	push	edi
+	lea	edi, [ecx+ecx]	; edi = 2*iStride
+	lea	ebx, [ecx+ecx*2]	; ebx = 3*iStride
+	mov	dword [esp+640-600], edi	; save 2*iStride
+	mov	esi, eax
+	sub	esi, edi	; esi = pPix - 2*iStride (kept until the final stores)
+	movdqa	xmm1, [esi]	; row pPix - 2*iStride
+	movdqa	 [esp+720-272], xmm0	; spill row pPix + iStride
+	mov	edi, eax
+	sub	edi, ecx	; edi = pPix - iStride (kept until the final stores)
+	movdqa	xmm4, [edi]	; row pPix - iStride
+	add	ecx, eax
+	mov	dword [esp+640-596], ecx	; save address pPix + iStride
+
+	mov	ecx, dword [esp+640-600]
+	movdqa	xmm0, [ecx+eax]	; row pPix + 2*iStride
+	movdqa	 [esp+736-272], xmm0	; spill row pPix + 2*iStride
+
+	movdqa	xmm0, [eax+ebx]	; row pPix + 3*iStride
+	mov	edx, eax
+	sub	edx, ebx	; edx = pPix - 3*iStride (kept until the final stores)
+
+	movsx	ebx, word [ebp+16]	; low 16 bits of iAlpha
+	movdqa	xmm6, [edx]	; row pPix - 3*iStride
+	add	ecx, eax	; ecx = pPix + 2*iStride (kept until the final stores)
+	movdqa	 [esp+752-272], xmm0	; spill row pPix + 3*iStride
+	movd	xmm0, ebx
+
+	movsx	ebx, word [ebp+20]	; low 16 bits of iBeta
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0	; broadcast alpha to all 8 words
+	movdqa	 [esp+640-320], xmm0	; [640-320] = alpha x8
+	movd	xmm0, ebx
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0	; broadcast beta to all 8 words
+
+	movdqa	xmm7, [esp+736-272]
+	punpcklbw xmm7, xmm2
+	movdqa	 [esp+640-416], xmm7	; [640-416] = row +2, low 8 pixels as words
+	movdqa	 [esp+640-512], xmm0	; [640-512] = beta x8
+	movdqa	xmm0, xmm1
+	movdqa	 [esp+672-272], xmm1	; spill row pPix - 2*iStride (bytes)
+	movdqa	xmm1, xmm4
+	movdqa	 [esp+704-272], xmm5	; spill row pPix (bytes)
+	punpcklbw xmm5, xmm2	; xmm5 = row 0, low 8 pixels as words
+	punpcklbw xmm1, xmm2	; xmm1 = row -1, low 8 pixels as words
+
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm1
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-560], xmm7	; [640-560] = |row0 - row-1|
+	punpcklbw xmm0, xmm2
+	movdqa	 [esp+688-272], xmm4	; spill row pPix - iStride (bytes)
+	movdqa	xmm4, [esp+720-272]
+	movdqa	 [esp+640-480], xmm0	; [640-480] = row -2, low 8 pixels as words
+
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm0
+
+	movdqa	xmm0, [esp+640-512]
+	pabsw	xmm7, xmm7
+	punpcklbw xmm4, xmm2
+	pcmpgtw	xmm0, xmm7	; xmm0 = beta > |row-1 - row-2|
+	movdqa	 [esp+640-384], xmm4	; [640-384] = row +1, low 8 pixels as words
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm4
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+656-272], xmm6	; spill row pPix - 3*iStride (bytes)
+	punpcklbw xmm6, xmm2
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-48], xmm2	; [640-48] = zero vector
+	movdqa	 [esp+640-368], xmm6	; [640-368] = row -3, low 8 pixels as words
+	movdqa	 [esp+640-144], xmm1	; [640-144] = row -1, low 8 pixels as words
+	movdqa	 [esp+640-400], xmm5	; [640-400] = row 0, low 8 pixels as words
+	pcmpgtw	xmm4, xmm7	; beta > |row0 - row+1|
+	pand	xmm0, xmm4
+	movdqa	xmm4, [esp+640-320]
+	pcmpgtw	xmm4, [esp+640-560]	; alpha > |row0 - row-1|
+	pand	xmm0, xmm4	; xmm0 = per-pixel filter-enable mask, low 8 pixels
+
+	mov	ebx, 2
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, [esp+640-320]
+	psraw	xmm4, 2
+	pshufd	xmm7, xmm7, 0	; xmm7 = 2 in all 8 words
+	paddw	xmm4, xmm7
+	movdqa	 [esp+640-576], xmm4	; [640-576] = (alpha >> 2) + 2
+	pcmpgtw	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4	; [640-560] = ((alpha>>2)+2) > |row0 - row-1|
+
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+640-624], xmm7	; [640-624] = rounding constant 2
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm6
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7	; beta > |row-1 - row-3|
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-544], xmm4	; [640-544] = mask selecting the wider filter for rows above the edge
+	movdqa	xmm4, [esp+640-512]
+	movdqa	xmm7, xmm5
+	psubw	xmm7, [esp+640-416]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7	; beta > |row0 - row+2|
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4	; [640-560] = mask selecting the wider filter for rows at/below the edge
+
+	movdqa	xmm4, [esp+640-544]
+	pandn	xmm4, xmm6
+	movdqa	 [esp+640-16], xmm4	; [640-16] = original row -3 where the [640-544] mask is clear
+	mov	ebx, 4
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, xmm3
+	punpcklbw xmm4, xmm2
+	psllw	xmm4, 1	; 2*row-4
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6	; + 3*row-3
+	paddw	xmm4, [esp+640-480]	; + row-2
+
+	movdqa	xmm6, [esp+640-560]
+	pshufd	xmm7, xmm7, 0	; xmm7 = 4 in all 8 words
+	paddw	xmm4, xmm1	; + row-1
+	movdqa	 [esp+640-592], xmm7	; [640-592] = rounding constant 4
+	paddw	xmm4, xmm5	; + row0
+	paddw	xmm4, xmm7	; + 4 (shifted right by 3 and masked further below)
+	movdqa	xmm7, [esp+640-416]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-80], xmm6	; [640-80] = original row +2 where the [640-560] mask is clear
+	movdqa	xmm6, [esp+752-272]
+	punpcklbw xmm6, xmm2
+	psllw	xmm6, 1	; 2*row+3
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7	; + 3*row+2
+	paddw	xmm6, [esp+640-384]	; + row+1
+
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm6, xmm5
+	paddw	xmm6, xmm1
+	paddw	xmm6, [esp+640-592]
+	psraw	xmm6, 3	; (2*row+3 + 3*row+2 + row+1 + row0 + row-1 + 4) >> 3
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-112], xmm6	; [640-112] = filtered row +2 where mask set
+	movdqa	xmm6, [esp+640-544]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-336], xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-528], xmm6
+	movdqa	xmm6, [esp+640-368]
+	paddw	xmm6, xmm7
+	movdqa	xmm7, xmm1
+	psraw	xmm4, 3	; xmm4 = (2*row-4 + 3*row-3 + row-2 + row-1 + row0 + 4) >> 3
+	pand	xmm4, [esp+640-544]
+	paddw	xmm7, xmm5
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+
+	paddw	xmm5, xmm1
+	psraw	xmm6, 2	; (row-3 + row-2 + row-1 + row0 + 2) >> 2
+	pand	xmm7, xmm6
+
+	movdqa	xmm6, [esp+640-384]
+	movdqa	 [esp+640-64], xmm7
+	movdqa	xmm7, [esp+640-560]
+	pandn	xmm7, xmm6
+	movdqa	 [esp+640-304], xmm7
+	movdqa	xmm7, [esp+640-560]
+	movdqa	 [esp+640-528], xmm7
+	movdqa	xmm7, [esp+640-416]
+	paddw	xmm7, xmm6
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2	; (row+2 + row+1 + row0 + row-1 + 2) >> 2
+	pand	xmm5, xmm7
+	movdqa	 [esp+640-32], xmm5
+
+	movdqa	xmm5, [esp+640-544]
+	movdqa	 [esp+640-528], xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	xmm7, xmm5
+	paddw	xmm7, xmm5
+	movdqa	xmm5, xmm1
+	paddw	xmm5, xmm6
+	paddw	xmm6, [esp+640-592]
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2	; (2*row-2 + row-1 + row+1 + 2) >> 2
+	pandn	xmm5, xmm7	; kept only where the [640-544] mask is clear
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm7, xmm1
+	paddw	xmm7, [esp+640-400]
+	movdqa	xmm1, [esp+640-544]
+	movdqa	 [esp+640-352], xmm5
+	movdqa	xmm5, [esp+640-368]
+	psllw	xmm7, 1
+	paddw	xmm7, xmm6
+	paddw	xmm5, xmm7
+
+	movdqa	xmm7, [esp+640-400]
+	psraw	xmm5, 3
+	pand	xmm1, xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	 [esp+640-96], xmm1
+	movdqa	xmm1, [esp+640-560]
+	movdqa	 [esp+640-528], xmm1
+	movdqa	xmm1, [esp+640-384]
+	movdqa	xmm6, xmm1
+	paddw	xmm6, xmm1
+	paddw	xmm1, [esp+640-400]
+	paddw	xmm1, [esp+640-144]
+	paddw	xmm7, xmm5
+	paddw	xmm5, [esp+640-592]
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+	psraw	xmm6, 2
+	psllw	xmm1, 1
+	paddw	xmm1, xmm5
+
+	movdqa	xmm5, [esp+656-272]
+	pandn	xmm7, xmm6
+	movdqa	xmm6, [esp+640-416]
+	paddw	xmm6, xmm1
+	movdqa	xmm1, [esp+640-560]
+	psraw	xmm6, 3
+	pand	xmm1, xmm6
+	; ---- upper 8 pixels: unpack the high bytes of each spilled row to words ----
+	movdqa	xmm6, [esp+704-272]
+	movdqa	 [esp+640-128], xmm1
+	movdqa	xmm1, [esp+672-272]
+	punpckhbw xmm1, xmm2
+	movdqa	 [esp+640-448], xmm1	; [640-448] = row -2, high 8 pixels
+	movdqa	xmm1, [esp+688-272]
+	punpckhbw xmm1, xmm2
+	punpckhbw xmm6, xmm2
+	movdqa	 [esp+640-288], xmm7
+	punpckhbw xmm5, xmm2	; xmm5 = row -3, high 8 pixels
+	movdqa	 [esp+640-496], xmm1	; [640-496] = row -1, high 8 pixels
+	movdqa	 [esp+640-432], xmm6	; [640-432] = row 0, high 8 pixels
+
+	movdqa	xmm7, [esp+720-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-464], xmm7	; [640-464] = row +1, high 8 pixels
+
+	movdqa	xmm7, [esp+736-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-528], xmm7	; [640-528] = row +2, high 8 pixels
+
+	movdqa	xmm7, xmm6
+
+	psubw	xmm6, [esp+640-464]
+	psubw	xmm7, xmm1
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-560], xmm7	; [640-560] = |row0 - row-1|, high 8 pixels
+	por	xmm4, [esp+640-16]	; xmm4 = new row -3, low half (filtered | original per [640-544] mask)
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm1
+	psubw	xmm7, [esp+640-448]
+
+	movdqa	xmm1, [esp+640-512]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm1, xmm7	; beta > |row-1 - row-2|
+	movdqa	xmm7, [esp+640-512]
+	pcmpgtw	xmm7, xmm6	; beta > |row0 - row+1|
+	movdqa	xmm6, [esp+640-320]
+	pand	xmm1, xmm7
+	movdqa	xmm7, [esp+640-560]
+	pcmpgtw	xmm6, xmm7	; alpha > |row0 - row-1|
+	pand	xmm1, xmm6	; xmm1 = per-pixel filter-enable mask, high 8 pixels
+
+	movdqa	xmm6, [esp+640-576]
+	pcmpgtw	xmm6, xmm7
+
+	movdqa	xmm7, [esp+640-496]
+	punpckhbw xmm3, xmm2	; xmm3 = row -4, high 8 pixels
+	movdqa	 [esp+640-560], xmm6
+	movdqa	xmm6, [esp+640-512]
+	psubw	xmm7, xmm5
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm6, xmm7
+
+	pand	xmm6, [esp+640-560]
+	movdqa	xmm7, [esp+640-432]
+	psubw	xmm7, [esp+640-528]
+
+	psllw	xmm3, 1
+	movdqa	 [esp+640-544], xmm6	; [640-544] now holds the high-half mask for rows above the edge
+	movdqa	xmm6, [esp+640-512]
+
+	movdqa	xmm2, [esp+640-544]	; xmm2 no longer zero from here on; [640-48] keeps the zero vector
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, [esp+640-448]
+	paddw	xmm3, [esp+640-496]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm6, xmm7
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-560], xmm6	; [640-560] now holds the high-half mask for rows at/below the edge
+	; ---- select filtered vs. original per enable mask, then pack low/high halves back to bytes ----
+	movdqa	xmm6, xmm0
+	pand	xmm6, xmm4
+	movdqa	xmm4, xmm0
+	pandn	xmm4, [esp+640-368]
+	por	xmm6, xmm4	; xmm6 = final row -3, low half
+	movdqa	xmm4, [esp+640-432]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-592]
+	psraw	xmm3, 3
+	pand	xmm3, xmm2
+	pandn	xmm2, xmm5
+	por	xmm3, xmm2
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm3
+	movdqa	xmm3, [esp+640-64]
+	por	xmm3, [esp+640-336]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm5
+	por	xmm7, xmm2	; xmm7 = final row -3, high half
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-480]
+	por	xmm2, xmm3
+	packuswb xmm6, xmm7
+	movdqa	 [esp+640-336], xmm2
+	movdqa	 [esp+656-272], xmm6	; result row pPix - 3*iStride
+	movdqa	xmm6, [esp+640-544]
+	movdqa	xmm2, xmm5
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm3, xmm1
+	movdqa	xmm7, [esp+640-496]
+	paddw	xmm7, xmm4
+	paddw	xmm2, xmm7
+	paddw	xmm2, [esp+640-624]
+	movdqa	xmm7, [esp+640-544]
+	psraw	xmm2, 2
+	pand	xmm6, xmm2
+	movdqa	xmm2, [esp+640-448]
+	pandn	xmm7, xmm2
+	por	xmm6, xmm7
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm1
+	pandn	xmm6, xmm2
+	paddw	xmm2, [esp+640-496]
+	paddw	xmm2, xmm4
+	por	xmm3, xmm6
+	movdqa	xmm6, [esp+640-336]
+	packuswb xmm6, xmm3
+	psllw	xmm2, 1
+	movdqa	 [esp+672-272], xmm6	; result row pPix - 2*iStride
+	movdqa	xmm6, [esp+640-96]
+	por	xmm6, [esp+640-352]
+
+	movdqa	xmm3, xmm0
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm0
+	pandn	xmm6, [esp+640-144]
+	por	xmm3, xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-352], xmm3
+	movdqa	xmm3, [esp+640-464]
+	paddw	xmm3, [esp+640-592]
+	paddw	xmm2, xmm3
+	movdqa	xmm3, [esp+640-448]
+	paddw	xmm5, xmm2
+	movdqa	xmm2, [esp+640-496]
+	psraw	xmm5, 3
+	pand	xmm6, xmm5
+	movdqa	xmm5, [esp+640-464]
+	paddw	xmm2, xmm5
+	paddw	xmm5, [esp+640-432]
+	movdqa	xmm4, xmm3
+	paddw	xmm4, xmm3
+	paddw	xmm4, xmm2
+	paddw	xmm4, [esp+640-624]
+	movdqa	xmm2, [esp+640-544]
+	paddw	xmm3, [esp+640-592]
+	psraw	xmm4, 2
+	pandn	xmm2, xmm4
+	por	xmm6, xmm2
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-496]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm6
+	por	xmm7, xmm2
+	movdqa	xmm2, [esp+640-352]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+688-272], xmm2	; result row pPix - iStride
+	movdqa	xmm2, [esp+640-128]
+	por	xmm2, [esp+640-288]
+
+	movdqa	xmm4, xmm0
+	pand	xmm4, xmm2
+	paddw	xmm5, xmm6
+	movdqa	xmm2, xmm0
+	pandn	xmm2, [esp+640-400]
+	por	xmm4, xmm2
+	movdqa	xmm2, [esp+640-528]
+	psllw	xmm5, 1
+	paddw	xmm5, xmm3
+	movdqa	xmm3, [esp+640-560]
+	paddw	xmm2, xmm5
+	psraw	xmm2, 3
+	movdqa	 [esp+640-288], xmm4
+	movdqa	xmm4, [esp+640-560]
+	pand	xmm4, xmm2
+	movdqa	xmm2, [esp+640-464]
+	movdqa	xmm5, xmm2
+	paddw	xmm5, xmm2
+	movdqa	xmm2, [esp+640-432]
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm7, xmm1
+	paddw	xmm5, xmm2
+	paddw	xmm5, [esp+640-624]
+	movdqa	xmm6, [esp+640-560]
+	psraw	xmm5, 2
+	pandn	xmm3, xmm5
+	por	xmm4, xmm3
+	movdqa	xmm3, [esp+640-32]
+	por	xmm3, [esp+640-304]
+	pand	xmm7, xmm4
+	movdqa	xmm4, [esp+640-432]
+	movdqa	xmm5, [esp+640-464]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm4
+	paddw	xmm4, [esp+640-496]
+	por	xmm7, xmm2
+	movdqa	xmm2, [esp+640-288]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+704-272], xmm2	; result row pPix
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-384]
+	por	xmm2, xmm3
+	movdqa	 [esp+640-304], xmm2
+	movdqa	xmm2, [esp+640-528]
+	movdqa	xmm3, xmm2
+	paddw	xmm3, [esp+640-464]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-624]
+	psraw	xmm3, 2
+	pand	xmm6, xmm3
+	movdqa	xmm3, [esp+640-560]
+	movdqa	xmm4, xmm3
+	pandn	xmm4, xmm5
+	por	xmm6, xmm4
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-304]
+	movdqa	xmm4, xmm1
+	pandn	xmm4, xmm5
+	por	xmm7, xmm4
+
+	movdqa	xmm4, xmm0
+	pandn	xmm0, [esp+640-416]
+	packuswb xmm6, xmm7	; xmm6 = result row pPix + iStride
+	movdqa	xmm7, [esp+640-112]
+	por	xmm7, [esp+640-80]
+	pand	xmm4, xmm7
+	por	xmm4, xmm0
+	movdqa	xmm0, [esp+752-272]
+	punpckhbw xmm0, [esp+640-48]	; row +3, high 8 pixels (zero vector comes from the stack copy)
+	psllw	xmm0, 1
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm5
+	paddw	xmm0, [esp+640-432]
+	paddw	xmm0, [esp+640-496]
+	paddw	xmm0, [esp+640-592]
+	psraw	xmm0, 3
+	pand	xmm0, xmm3
+	movdqa	xmm7, xmm1
+	pandn	xmm3, xmm2
+	por	xmm0, xmm3
+	pand	xmm7, xmm0
+	; ---- write the six result rows back ----
+	movdqa	xmm0, [esp+656-272]
+	movdqa	 [edx], xmm0	; store row pPix - 3*iStride
+
+	movdqa	xmm0, [esp+672-272]
+
+	mov	edx, dword [esp+640-596]	; edx = pPix + iStride (last esp-relative access before the pops)
+	movdqa	 [esi], xmm0	; store row pPix - 2*iStride
+	movdqa	xmm0, [esp+688-272]
+	movdqa	 [edi], xmm0	; store row pPix - iStride
+	movdqa	xmm0, [esp+704-272]
+
+	pop	edi
+	pandn	xmm1, xmm2
+	movdqa	 [eax], xmm0	; store row pPix
+	por	xmm7, xmm1
+	pop	esi
+	packuswb xmm4, xmm7	; xmm4 = result row pPix + 2*iStride
+	movdqa	 [edx], xmm6	; store row pPix + iStride
+	movdqa	 [ecx], xmm4	; store row pPix + 2*iStride
+	pop	ebx
+	mov	esp, ebp	; discard the aligned frame
+	pop	ebp
+	ret
+  
+    
+;********************************************************************************
+;   Gathers 16 rows of 8 bytes from pPixY (row pitch iStride) and writes 8 x 16 bytes to pDst.
+;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
+;   pDst is written with movdqa, so it must be 16-byte aligned.
+;********************************************************************************
+
+WELS_EXTERN  DeblockLumaTransposeH2V_sse2
+
+ALIGN  16
+
+DeblockLumaTransposeH2V_sse2:
+    push    ebp
+    push    ebx
+    mov     ebp,   esp              ; two pushes precede this, so the first argument is at [ebp+0Ch]
+    and     esp,0FFFFFFF0h          ; align esp for the 16-byte scratch slot
+    sub     esp,   10h              ; reserve the 16-byte scratch slot at [esp]
+    
+    mov     eax,   [ebp + 0Ch]      ; eax = pPixY
+    mov     ecx,   [ebp + 10h]      ; ecx = iStride
+    lea     edx,   [eax + ecx * 8]  ; edx = pPixY + 8*iStride (row 8)
+    lea     ebx,   [ecx*3]          ; ebx = 3*iStride
+    
+    movq    xmm0,  [eax]            ; row 0 (8 bytes)
+    movq    xmm7,  [edx]            ; row 8
+    punpcklqdq   xmm0,  xmm7        ; xmm0 = row 0 | row 8
+    movq    xmm1,  [eax + ecx]
+    movq    xmm7,  [edx + ecx]
+    punpcklqdq   xmm1,  xmm7        ; xmm1 = row 1 | row 9
+    movq    xmm2,  [eax + ecx*2]    ; row 2
+    movq    xmm7,  [edx + ecx*2]
+    punpcklqdq   xmm2,  xmm7        ; xmm2 = row 2 | row 10
+    movq    xmm3,  [eax + ebx]
+    movq    xmm7,  [edx + ebx]
+    punpcklqdq   xmm3,  xmm7        ; xmm3 = row 3 | row 11
+    
+    lea     eax,   [eax + ecx * 4]  ; advance to row 4
+    lea     edx,   [edx + ecx * 4]  ; advance to row 12
+    movq    xmm4,  [eax]            ; row 4
+    movq    xmm7,  [edx]
+    punpcklqdq   xmm4,  xmm7        ; xmm4 = row 4 | row 12
+    movq    xmm5,  [eax + ecx]
+    movq    xmm7,  [edx + ecx]
+    punpcklqdq   xmm5,  xmm7        ; xmm5 = row 5 | row 13
+    movq    xmm6,  [eax + ecx*2]    ; row 6
+    movq    xmm7,  [edx + ecx*2]
+    punpcklqdq   xmm6,  xmm7        ; xmm6 = row 6 | row 14
+    
+    movdqa  [esp],   xmm0           ; park xmm0 so it can serve as a temporary for row 15
+    movq    xmm7,  [eax + ebx]
+    movq    xmm0,  [edx + ebx]
+    punpcklqdq   xmm7,  xmm0        ; xmm7 = row 7 | row 15
+    movdqa  xmm0,   [esp]           ; restore row 0 | row 8
+    
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1  (macro is defined outside this file; presumably a byte transpose of the two 8x8 halves, [esp] as scratch - confirm in asm_inc.asm)
+    
+    mov    eax,   [ebp + 14h]       ; eax = pDst
+    movdqa  [eax],    xmm4          ; stores follow the pOut register order listed above
+    movdqa  [eax + 10h],  xmm2
+    movdqa  [eax + 20h],  xmm3
+    movdqa  [eax + 30h],  xmm7
+    movdqa  [eax + 40h],  xmm5
+    movdqa  [eax + 50h],  xmm1
+    movdqa  [eax + 60h],  xmm6
+    movdqa  [eax + 70h],  xmm0
+    
+    mov     esp,   ebp              ; drop the aligned scratch area
+    pop     ebx
+    pop     ebp
+    ret
+    
+    
+    
+;*******************************************************************************************
+;   Reads 8 x 16 bytes from pSrc (movdqa => 16-byte aligned) and scatters them as 16 rows of 8 bytes to pPixY.
+;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
+;   Rows are written at pPixY + k*iStride for k = 0..15.
+;*******************************************************************************************
+
+WELS_EXTERN   DeblockLumaTransposeV2H_sse2
+
+ALIGN  16
+
+DeblockLumaTransposeV2H_sse2:
+    push     ebp
+    mov      ebp,   esp
+    
+    and     esp,  0FFFFFFF0h        ; align esp for the 16-byte scratch slot
+    sub     esp,   10h              ; reserve the scratch slot passed to the macro as [esp]
+    
+    mov      eax,   [ebp + 10h]     ; eax = pSrc
+    mov      ecx,   [ebp + 0Ch]     ; ecx = iStride
+    mov      edx,   [ebp + 08h]     ; edx = pPixY
+      
+    movdqa   xmm0,  [eax]
+    movdqa   xmm1,  [eax + 10h]
+    movdqa   xmm2,  [eax + 20h]
+    movdqa   xmm3,	[eax + 30h]
+    movdqa   xmm4,	[eax + 40h]
+    movdqa   xmm5,	[eax + 50h]
+    movdqa   xmm6,	[eax + 60h]
+    movdqa   xmm7,	[eax + 70h]
+    
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1  (macro defined outside this file; output register order as listed - confirm in asm_inc.asm)
+    
+    lea      eax,   [ecx * 3]       ; eax = 3*iStride
+    
+    movq     [edx],  xmm4           ; rows 0..3: low 8 bytes of the first four outputs
+    movq     [edx + ecx],  xmm2
+    movq     [edx + ecx*2],  xmm3
+    movq     [edx + eax],  xmm7
+    
+    lea      edx,   [edx + ecx*4]
+    movq     [edx],  xmm5           ; rows 4..7: low 8 bytes of the last four outputs
+    movq     [edx + ecx],  xmm1
+    movq     [edx + ecx*2],  xmm6
+    movq     [edx + eax],  xmm0
+    
+    psrldq    xmm4,   8             ; move the high 8 bytes of every output down for rows 8..15
+    psrldq    xmm2,   8
+    psrldq    xmm3,   8
+    psrldq    xmm7,   8
+    psrldq    xmm5,   8
+    psrldq    xmm1,   8
+    psrldq    xmm6,   8
+    psrldq    xmm0,   8
+    
+    lea       edx,  [edx + ecx*4]
+    movq     [edx],  xmm4           ; rows 8..11
+    movq     [edx + ecx],  xmm2
+    movq     [edx + ecx*2],  xmm3
+    movq     [edx + eax],  xmm7
+    
+    lea      edx,   [edx + ecx*4]
+    movq     [edx],  xmm5           ; rows 12..15
+    movq     [edx + ecx],  xmm1
+    movq     [edx + ecx*2],  xmm6
+    movq     [edx + eax],  xmm0
+    
+    
+    mov      esp,   ebp             ; drop the aligned scratch area
+    pop      ebp
     ret
\ No newline at end of file
--- a/codec/decoder/core/asm/mc_chroma.asm
+++ b/codec/decoder/core/asm/mc_chroma.asm
@@ -1,317 +1,317 @@
-;*!
-;* \copy
-;*     Copyright (c)  2004-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mc_chroma.asm
-;*
-;*  Abstract
-;*      mmx motion compensation for chroma
-;*
-;*  History
-;*      10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:
-	dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:
-	dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src, 
-;							int32_t iSrcStride, 
-;							uint8_t *pDst, 
-;							int32_t iDstStride, 
-;							uint8_t *pABCD, 
-;							int32_t iHeigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:
-	push esi
-	push edi
-	push ebx
-	
-	mov eax, [esp +12 + 20]
-	movd mm3, [eax]
-	WELS_Zero mm7
-	punpcklbw mm3, mm3
-	movq      mm4, mm3
-	punpcklwd mm3, mm3       
-	punpckhwd mm4, mm4		 
-	
-	movq	  mm5, mm3
-	punpcklbw mm3, mm7
-	punpckhbw mm5, mm7
-	
-	movq	  mm6, mm4
-	punpcklbw mm4, mm7
-	punpckhbw mm6, mm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
-	lea ebx, [esi + eax]
-	movd mm0, [esi]
-	movd mm1, [esi+1]
-	punpcklbw mm0, mm7
-	punpcklbw mm1, mm7
-.xloop:
-	
-	pmullw mm0, mm3
-	pmullw mm1, mm5
-	paddw  mm0, mm1
-	
-	movd  mm1, [ebx]
-	punpcklbw mm1, mm7
-	movq mm2, mm1
-	pmullw mm1, mm4
-	paddw mm0, mm1
-	
-	movd mm1, [ebx+1]
-	punpcklbw mm1, mm7
-	movq mm7, mm1
-	pmullw mm1,mm6
-	paddw mm0, mm1
-	movq mm1,mm7
-
-	paddw mm0, [h264_d0x20_mmx]
-	psrlw mm0, 6
-	
-	WELS_Zero mm7
-	packuswb mm0, mm7
-	movd [edi], mm0	
-
-	movq mm0, mm2
-	
-	lea edi, [edi +edx  ]
-	lea ebx, [ebx + eax]
-
-	dec ecx
-	jnz near .xloop
-	WELSEMMS
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc, 
-;						int32_t iSrcStride, 
-;						uint8_t *pDst, 
-;						int32_t iDstStride, 
-;						uint8_t *pABCD, 
-;						int32_t iheigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:
-	push esi
-	push edi
-	push ebx
-	
-	mov eax, [esp +12 + 20]
-	movd xmm3, [eax]
-	WELS_Zero xmm7
-	punpcklbw  xmm3, xmm3
-	punpcklwd  xmm3, xmm3
-	
-	movdqa	   xmm4, xmm3
-	punpckldq  xmm3, xmm3
-	punpckhdq  xmm4, xmm4
-	movdqa     xmm5, xmm3
-	movdqa	   xmm6, xmm4
-	
-	punpcklbw  xmm3, xmm7
-	punpckhbw  xmm5, xmm7
-	punpcklbw  xmm4, xmm7
-	punpckhbw  xmm6, xmm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
-	lea ebx, [esi + eax]
-	movq xmm0, [esi]
-	movq xmm1, [esi+1]
-	punpcklbw xmm0, xmm7
-	punpcklbw xmm1, xmm7
-.xloop:
-	
-	pmullw xmm0, xmm3
-	pmullw xmm1, xmm5
-	paddw  xmm0, xmm1
-	
-	movq  xmm1, [ebx]
-	punpcklbw xmm1, xmm7
-	movdqa xmm2, xmm1
-	pmullw xmm1, xmm4
-	paddw xmm0, xmm1
-	
-	movq xmm1, [ebx+1]
-	punpcklbw xmm1, xmm7
-	movdqa xmm7, xmm1
-	pmullw xmm1, xmm6
-	paddw xmm0, xmm1
-	movdqa xmm1,xmm7
-
-	paddw xmm0, [h264_d0x20_sse2]
-	psrlw xmm0, 6
-	
-	WELS_Zero xmm7
-	packuswb xmm0, xmm7
-	movq [edi], xmm0	
-
-	movdqa xmm0, xmm2
-	
-	lea edi, [edi +edx  ]
-	lea ebx, [ebx + eax]
-
-	dec ecx
-	jnz near .xloop
-	
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-
-
-ALIGN 16
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-;						 int32_t iSrcStride, 
-;                        uint8_t *pDst,  
-;                        int32_t iDstStride,
-;                        uint8_t *pABCD,
-;					     int32_t iHeigh);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:
-	push ebx
-	push esi
-	push edi
-		
-	mov eax, [esp + 12 + 20]
-
-    pxor      xmm7, xmm7
-    movd   xmm5, [eax]   
-    punpcklwd xmm5, xmm5  
-    punpckldq xmm5, xmm5 
-    movdqa    xmm6, xmm5
-    punpcklqdq xmm5, xmm5
-    punpckhqdq xmm6, xmm6    
-    
-	mov eax, [esp + 12 + 4]   
-	mov edx, [esp + 12 + 8]   
-	mov esi, [esp + 12 + 12]  
-	mov edi, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-    
-    sub esi, edi
-    sub esi, edi
-	movdqa xmm7, [h264_d0x20_sse2]
-
-	movdqu xmm0, [eax]
-	movdqa xmm1, xmm0
-	psrldq xmm1, 1
-	punpcklbw xmm0, xmm1
-	
-.hloop_chroma:	
-	lea	esi, [esi+2*edi]
-	
-	movdqu xmm2, [eax+edx]
-	movdqa xmm3, xmm2
-	psrldq xmm3, 1
-	punpcklbw xmm2, xmm3
-	movdqa      xmm4, xmm2
-	
-    pmaddubsw  xmm0, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm0, xmm2
-    paddw      xmm0, xmm7
-	psrlw      xmm0, 6
-    packuswb   xmm0, xmm0
-    movq       [esi],xmm0	
-    
-    lea eax, [eax+2*edx]
-    movdqu xmm2, [eax]
-    movdqa xmm3, xmm2
-    psrldq xmm3, 1
-    punpcklbw xmm2, xmm3
-    movdqa      xmm0, xmm2
-    
-    pmaddubsw  xmm4, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm4, xmm2
-    paddw      xmm4, xmm7
-	psrlw      xmm4, 6
-    packuswb   xmm4, xmm4
-    movq       [esi+edi],xmm4	
-	
-	sub ecx, 2
-	jnz .hloop_chroma
-	pop edi
-	pop esi
-	pop ebx
-
-	ret
-
-
+;*!
+;* \copy
+;*     Copyright (c)  2004-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mc_chroma.asm
+;*
+;*  Abstract
+;*      mmx motion compensation for chroma
+;*
+;*  History
+;*      10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+ALIGN 16
+h264_d0x20_sse2:
+	dw 32,32,32,32,32,32,32,32
+ALIGN 16
+h264_d0x20_mmx:
+	dw 32,32,32,32
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( uint8_t *src, 
+;							int32_t iSrcStride, 
+;							uint8_t *pDst, 
+;							int32_t iDstStride, 
+;							uint8_t *pABCD, 
+;							int32_t iHeigh );
+; Per output row, for x = 0..3:
+;   dst[x] = (A*s0[x] + B*s0[x+1] + C*s1[x] + D*s1[x+1] + 32) >> 6
+; where s0/s1 are consecutive source rows and A,B,C,D are the four bytes at pABCD.
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq4_mmx
+McChromaWidthEq4_mmx:
+	push esi
+	push edi
+	push ebx
+	; three pushes => arguments start at [esp + 12 + 4]
+	mov eax, [esp +12 + 20]	; eax = pABCD
+	movd mm3, [eax]	; mm3 = bytes A,B,C,D
+	WELS_Zero mm7	; mm7 = 0 (presumably a pxor; macro defined in asm_inc.asm)
+	punpcklbw mm3, mm3	; A A B B C C D D
+	movq      mm4, mm3
+	punpcklwd mm3, mm3       ; A A A A B B B B
+	punpckhwd mm4, mm4		 ; C C C C D D D D
+	
+	movq	  mm5, mm3
+	punpcklbw mm3, mm7	; mm3 = A as 4 words
+	punpckhbw mm5, mm7	; mm5 = B as 4 words
+	
+	movq	  mm6, mm4
+	punpcklbw mm4, mm7	; mm4 = C as 4 words
+	punpckhbw mm6, mm7	; mm6 = D as 4 words
+	
+	mov esi, [esp +12+ 4]   ; esi = src
+	mov eax, [esp + 12 + 8]   ; eax = iSrcStride
+	mov edi, [esp + 12 + 12]  ; edi = pDst
+	mov edx, [esp + 12 + 16]  ; edx = iDstStride
+    mov ecx, [esp + 12 + 24]   ; ecx = iHeigh (row counter)
+		
+	lea ebx, [esi + eax]	; ebx = next source row
+	movd mm0, [esi]
+	movd mm1, [esi+1]
+	punpcklbw mm0, mm7	; mm0 = s0[x] as words
+	punpcklbw mm1, mm7	; mm1 = s0[x+1] as words
+.xloop:
+	; invariant on entry: mm0/mm1 hold the current upper row, mm7 = 0
+	pmullw mm0, mm3
+	pmullw mm1, mm5
+	paddw  mm0, mm1	; A*s0[x] + B*s0[x+1]
+	
+	movd  mm1, [ebx]
+	punpcklbw mm1, mm7
+	movq mm2, mm1	; keep s1[x]; it becomes s0[x] of the next row
+	pmullw mm1, mm4
+	paddw mm0, mm1	; + C*s1[x]
+	
+	movd mm1, [ebx+1]
+	punpcklbw mm1, mm7
+	movq mm7, mm1	; mm7 temporarily holds s1[x+1] (no longer zero)
+	pmullw mm1,mm6
+	paddw mm0, mm1	; + D*s1[x+1]
+	movq mm1,mm7	; mm1 = s1[x+1] for the next row
+
+	paddw mm0, [h264_d0x20_mmx]	; + 32 rounding
+	psrlw mm0, 6
+	
+	WELS_Zero mm7	; restore mm7 = 0
+	packuswb mm0, mm7
+	movd [edi], mm0	; write 4 output pixels
+
+	movq mm0, mm2	; lower row becomes the upper row
+	
+	lea edi, [edi +edx  ]
+	lea ebx, [ebx + eax]
+
+	dec ecx
+	jnz near .xloop
+	WELSEMMS	; presumably emms to leave MMX state; macro defined in asm_inc.asm
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( uint8_t *pSrc, 
+;						int32_t iSrcStride, 
+;						uint8_t *pDst, 
+;						int32_t iDstStride, 
+;						uint8_t *pABCD, 
+;						int32_t iheigh );
+; Per output row, for x = 0..7:
+;   dst[x] = (A*s0[x] + B*s0[x+1] + C*s1[x] + D*s1[x+1] + 32) >> 6
+; where s0/s1 are consecutive source rows and A,B,C,D are the four bytes at pABCD.
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq8_sse2
+McChromaWidthEq8_sse2:
+	push esi
+	push edi
+	push ebx
+	; three pushes => arguments start at [esp + 12 + 4]
+	mov eax, [esp +12 + 20]	; eax = pABCD
+	movd xmm3, [eax]	; bytes A,B,C,D
+	WELS_Zero xmm7	; xmm7 = 0 (presumably a pxor; macro defined in asm_inc.asm)
+	punpcklbw  xmm3, xmm3	; A A B B C C D D
+	punpcklwd  xmm3, xmm3	; Ax4 Bx4 Cx4 Dx4 (bytes)
+	
+	movdqa	   xmm4, xmm3
+	punpckldq  xmm3, xmm3	; Ax8 Bx8
+	punpckhdq  xmm4, xmm4	; Cx8 Dx8
+	movdqa     xmm5, xmm3
+	movdqa	   xmm6, xmm4
+	
+	punpcklbw  xmm3, xmm7	; xmm3 = A as 8 words
+	punpckhbw  xmm5, xmm7	; xmm5 = B as 8 words
+	punpcklbw  xmm4, xmm7	; xmm4 = C as 8 words
+	punpckhbw  xmm6, xmm7	; xmm6 = D as 8 words
+	
+	mov esi, [esp +12+ 4]   ; esi = pSrc
+	mov eax, [esp + 12 + 8]   ; eax = iSrcStride
+	mov edi, [esp + 12 + 12]  ; edi = pDst
+	mov edx, [esp + 12 + 16]  ; edx = iDstStride
+    mov ecx, [esp + 12 + 24]   ; ecx = iheigh (row counter)
+		
+	lea ebx, [esi + eax]	; ebx = next source row
+	movq xmm0, [esi]
+	movq xmm1, [esi+1]
+	punpcklbw xmm0, xmm7	; xmm0 = s0[x] as words
+	punpcklbw xmm1, xmm7	; xmm1 = s0[x+1] as words
+.xloop:
+	; invariant on entry: xmm0/xmm1 hold the current upper row, xmm7 = 0
+	pmullw xmm0, xmm3
+	pmullw xmm1, xmm5
+	paddw  xmm0, xmm1	; A*s0[x] + B*s0[x+1]
+	
+	movq  xmm1, [ebx]
+	punpcklbw xmm1, xmm7
+	movdqa xmm2, xmm1	; keep s1[x]; it becomes s0[x] of the next row
+	pmullw xmm1, xmm4
+	paddw xmm0, xmm1	; + C*s1[x]
+	
+	movq xmm1, [ebx+1]
+	punpcklbw xmm1, xmm7
+	movdqa xmm7, xmm1	; xmm7 temporarily holds s1[x+1] (no longer zero)
+	pmullw xmm1, xmm6
+	paddw xmm0, xmm1	; + D*s1[x+1]
+	movdqa xmm1,xmm7	; xmm1 = s1[x+1] for the next row
+
+	paddw xmm0, [h264_d0x20_sse2]	; + 32 rounding
+	psrlw xmm0, 6
+	
+	WELS_Zero xmm7	; restore xmm7 = 0
+	packuswb xmm0, xmm7
+	movq [edi], xmm0	; write 8 output pixels
+
+	movdqa xmm0, xmm2	; lower row becomes the upper row
+	
+	lea edi, [edi +edx  ]
+	lea ebx, [ebx + eax]
+
+	dec ecx
+	jnz near .xloop
+	
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+
+
+ALIGN 16
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
+;						 int32_t iSrcStride, 
+;                        uint8_t *pDst,  
+;                        int32_t iDstStride,
+;                        uint8_t *pABCD,
+;					     int32_t iHeigh);
+; Two output rows per loop iteration; the loop exits only when iHeigh reaches exactly 0, so iHeigh must be a positive even number.
+; Source rows are fetched with 16-byte movdqu loads although only 9 bytes are consumed - NOTE(review): confirm callers tolerate the over-read.
+;***********************************************************************
+WELS_EXTERN McChromaWidthEq8_ssse3
+McChromaWidthEq8_ssse3:
+	push ebx
+	push esi
+	push edi
+		; three pushes => arguments start at [esp + 12 + 4]
+	mov eax, [esp + 12 + 20]	; eax = pABCD
+	; NOTE(review): the pxor below looks dead - xmm7 is reloaded with the rounding constant before it is read.
+    pxor      xmm7, xmm7
+    movd   xmm5, [eax]   ; bytes A,B,C,D
+    punpcklwd xmm5, xmm5  ; AB AB CD CD
+    punpckldq xmm5, xmm5 ; ABx4 | CDx4
+    movdqa    xmm6, xmm5
+    punpcklqdq xmm5, xmm5	; xmm5 = (A,B) byte pair x8
+    punpckhqdq xmm6, xmm6    ; xmm6 = (C,D) byte pair x8
+    
+	mov eax, [esp + 12 + 4]   ; eax = pSrc
+	mov edx, [esp + 12 + 8]   ; edx = iSrcStride
+	mov esi, [esp + 12 + 12]  ; esi = pDst
+	mov edi, [esp + 12 + 16]  ; edi = iDstStride
+    mov ecx, [esp + 12 + 24]   ; ecx = iHeigh
+    
+    sub esi, edi
+    sub esi, edi	; pre-bias pDst by -2*iDstStride; the loop adds it back first
+	movdqa xmm7, [h264_d0x20_sse2]	; xmm7 = 32 in every word (rounding)
+
+	movdqu xmm0, [eax]
+	movdqa xmm1, xmm0
+	psrldq xmm1, 1
+	punpcklbw xmm0, xmm1	; xmm0 = interleaved (s[x], s[x+1]) for x = 0..7
+	
+.hloop_chroma:	
+	lea	esi, [esi+2*edi]	; advance destination by two rows
+	
+	movdqu xmm2, [eax+edx]	; next source row
+	movdqa xmm3, xmm2
+	psrldq xmm3, 1
+	punpcklbw xmm2, xmm3
+	movdqa      xmm4, xmm2	; keep interleaved row for the second output row
+	
+    pmaddubsw  xmm0, xmm5	; A*s0[x] + B*s0[x+1] (pixels unsigned, coefficients treated as signed bytes)
+    pmaddubsw  xmm2, xmm6	; C*s1[x] + D*s1[x+1]
+    paddw      xmm0, xmm2
+    paddw      xmm0, xmm7
+	psrlw      xmm0, 6
+    packuswb   xmm0, xmm0
+    movq       [esi],xmm0	; first output row of this iteration
+    
+    lea eax, [eax+2*edx]	; source pointer advances two rows
+    movdqu xmm2, [eax]
+    movdqa xmm3, xmm2
+    psrldq xmm3, 1
+    punpcklbw xmm2, xmm3
+    movdqa      xmm0, xmm2	; becomes the upper row of the next iteration
+    
+    pmaddubsw  xmm4, xmm5
+    pmaddubsw  xmm2, xmm6
+    paddw      xmm4, xmm2
+    paddw      xmm4, xmm7
+	psrlw      xmm4, 6
+    packuswb   xmm4, xmm4
+    movq       [esi+edi],xmm4	; second output row of this iteration
+	
+	sub ecx, 2
+	jnz .hloop_chroma
+	pop edi
+	pop esi
+	pop ebx
+
+	ret
+
+
--- a/codec/encoder/core/asm/deblock.asm
+++ b/codec/encoder/core/asm/deblock.asm
@@ -1,2113 +1,2113 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  deblock.asm
-;*
-;*  Abstract
-;*      edge loop
-;*
-;*  History
-;*      08/07/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;*******************************************************************************
-; Macros and other preprocessor constants
-;*******************************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata pData
-%else
-SECTION .rodata align=16
-%endif
-
-SECTION .text
-
-;********************************************************************************
-;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
-;                             int32_t iAlpha, int32_t iBeta)
-;********************************************************************************
-WELS_EXTERN   DeblockChromaEq4V_sse2
-; Rows p1,p0,q0,q1 = pPix-2*iStride, pPix-iStride, pPix, pPix+iStride (8 bytes each, Cb and Cr); rewrites p0 and q0 where the alpha/beta tests pass.
-ALIGN  16
-DeblockChromaEq4V_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h          ; 16-byte align the frame
-  sub         esp,68h                 ; 68h plus the two pushes below is 70h, so esp stays 16-aligned for movdqa
-  mov         edx,[ebp+10h]      ;  edx = iStride
-  mov         eax,[ebp+8]        ;  eax = pPixCb
-  mov         ecx,[ebp+0Ch]      ;  ecx = pPixCr
-  movq        xmm4,[ecx]              ; Cr q0
-  movq        xmm5,[edx+ecx]          ; Cr q1
-  push        esi  
-  push        edi  
-  lea         esi,[edx+edx]           ; 2*iStride
-  mov         edi,eax 
-  sub         edi,esi 
-  movq        xmm1,[edi]              ; Cb p1
-  mov         edi,ecx 
-  sub         edi,esi 
-  movq        xmm2,[edi]              ; Cr p1
-  punpcklqdq  xmm1,xmm2               ; xmm1 = Cb p1 | Cr p1
-  mov         esi,eax 
-  sub         esi,edx                 ; esi = address of Cb p0 (kept for the final store)
-  movq        xmm2,[esi] 
-  mov         edi,ecx 
-  sub         edi,edx                 ; edi = address of Cr p0 (kept for the final store)
-  movq        xmm3,[edi] 
-  punpcklqdq  xmm2,xmm3               ; xmm2 = Cb p0 | Cr p0
-  movq        xmm3,[eax] 
-  punpcklqdq  xmm3,xmm4               ; xmm3 = Cb q0 | Cr q0
-  movq        xmm4,[edx+eax] 
-  mov       edx, [ebp + 14h]          ; edx = iAlpha
-  punpcklqdq  xmm4,xmm5               ; xmm4 = Cb q1 | Cr q1
-  movd        xmm5,edx 
-  mov       edx, [ebp + 18h]          ; edx = iBeta
-  pxor        xmm0,xmm0               ; zero, used to widen bytes to words
-  movdqa      xmm6,xmm5 
-  punpcklwd   xmm6,xmm5 
-  pshufd      xmm5,xmm6,0             ; xmm5 = low 16 bits of iAlpha in all 8 words
-  movd        xmm6,edx 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0             ; xmm6 = low 16 bits of iBeta in all 8 words
-  movdqa      xmm7,xmm1 
-  punpckhbw   xmm1,xmm0               ; high halves widen to the Cr words, low halves to the Cb words
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+40h],xmm1          ; [esp+40h] = Cr p1
-  movdqa      [esp+60h],xmm7          ; [esp+60h] = Cb p1
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+10h],xmm7          ; [esp+10h] = Cb p0 (slot is reused for the rounding constant later)
-  movdqa      xmm7,xmm3 
-  punpcklbw   xmm7,xmm0 
-  punpckhbw   xmm3,xmm0 
-  movdqa      [esp+50h],xmm7          ; [esp+50h] = Cb q0
-  movdqa      xmm7,xmm4 
-  punpckhbw   xmm4,xmm0               ; xmm4 = Cr q1
-  punpckhbw   xmm2,xmm0               ; xmm2 = Cr p0
-  punpcklbw   xmm7,xmm0               ; xmm7 = Cb q1
-  movdqa      [esp+30h],xmm3          ; [esp+30h] = Cr q0
-  movdqa      xmm3,[esp+10h]          ; xmm3 = Cb p0
-  movdqa      xmm1,xmm3 
-  psubw       xmm1,[esp+50h] 
-  pabsw       xmm1,xmm1               ; |p0 - q0| (Cb). NOTE(review): pabsw is an SSSE3 instruction although the routine is named _sse2
-  movdqa      [esp+20h],xmm4          ; [esp+20h] = Cr q1
-  movdqa      xmm0,xmm5 
-  pcmpgtw     xmm0,xmm1               ; alpha > |p0 - q0|
-  movdqa      xmm1,[esp+60h] 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1               ; |p1 - p0|
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1               ; beta > |p1 - p0|
-  pand        xmm0,xmm4 
-  movdqa      xmm1,xmm7 
-  psubw       xmm1,[esp+50h] 
-  pabsw       xmm1,xmm1               ; |q1 - q0|
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1               ; beta > |q1 - q0| (and-ed into xmm0 three lines below)
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,[esp+30h] 
-  pabsw       xmm1,xmm1               ; Cr |p0 - q0|
-  pcmpgtw     xmm5,xmm1 
-  movdqa      xmm1,[esp+40h] 
-  pand        xmm0,xmm4               ; xmm0 = Cb filter mask (all three tests)
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1               ; Cr |p1 - p0|
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  movdqa      xmm1,[esp+20h] 
-  psubw       xmm1,[esp+30h] 
-  pand        xmm5,xmm4 
-  pabsw       xmm1,xmm1               ; Cr |q1 - q0|
-  pcmpgtw     xmm6,xmm1 
-  pand        xmm5,xmm6               ; xmm5 = Cr filter mask
-  mov         edx,2 
-  movsx       edx,dx 
-  movd        xmm1,edx 
-  movdqa      xmm4,xmm1 
-  punpcklwd   xmm4,xmm1 
-  pshufd      xmm1,xmm4,0             ; xmm1 = rounding constant 2 in all 8 words
-  movdqa      xmm4,[esp+60h] 
-  movdqa      xmm6,xmm4 
-  paddw       xmm6,xmm4               ; 2*p1 (Cb)
-  paddw       xmm6,xmm3               ; + p0
-  paddw       xmm6,xmm7               ; + q1
-  movdqa      [esp+10h],xmm1          ; [esp+10h] now holds the constant 2 (Cb p0 is still live in xmm3)
-  paddw       xmm6,[esp+10h] 
-  psraw       xmm6,2                  ; filtered Cb p0 = (2*p1 + p0 + q1 + 2) >> 2
-  movdqa      xmm4,xmm0 
-  pandn       xmm4,xmm3               ; original p0 where the mask is clear
-  movdqa      xmm3,[esp+40h] 
-  movdqa      xmm1,xmm0 
-  pand        xmm1,xmm6 
-  por         xmm1,xmm4               ; xmm1 = selected Cb p0
-  movdqa      xmm6,xmm3 
-  paddw       xmm6,xmm3               ; 2*p1 (Cr)
-  movdqa      xmm3,[esp+10h]          ; xmm3 = constant 2 from here on
-  paddw       xmm6,xmm2 
-  paddw       xmm6,[esp+20h] 
-  paddw       xmm6,xmm3 
-  psraw       xmm6,2                  ; filtered Cr p0
-  movdqa      xmm4,xmm5 
-  pand        xmm4,xmm6 
-  movdqa      xmm6,xmm5 
-  pandn       xmm6,xmm2 
-  por         xmm4,xmm6               ; xmm4 = selected Cr p0
-  packuswb    xmm1,xmm4               ; xmm1 = Cb p0 | Cr p0 as bytes (unsigned saturation)
-  movdqa      xmm4,[esp+50h] 
-  movdqa      xmm6,xmm7 
-  paddw       xmm6,xmm7               ; 2*q1 (Cb)
-  paddw       xmm6,xmm4               ; + q0
-  paddw       xmm6,[esp+60h]          ; + p1
-  paddw       xmm6,xmm3 
-  psraw       xmm6,2                  ; filtered Cb q0 = (2*q1 + q0 + p1 + 2) >> 2
-  movdqa      xmm2,xmm0 
-  pand        xmm2,xmm6 
-  pandn       xmm0,xmm4 
-  por         xmm2,xmm0               ; xmm2 = selected Cb q0
-  movdqa      xmm0,[esp+20h] 
-  movdqa      xmm6,xmm0 
-  paddw       xmm6,xmm0               ; 2*q1 (Cr)
-  movdqa      xmm0,[esp+30h] 
-  paddw       xmm6,xmm0 
-  paddw       xmm6,[esp+40h] 
-  movdqa      xmm4,xmm5 
-  paddw       xmm6,xmm3 
-  movq        [esi],xmm1              ; store Cb p0 row
-  psraw       xmm6,2                  ; filtered Cr q0
-  pand        xmm4,xmm6 
-  pandn       xmm5,xmm0 
-  por         xmm4,xmm5               ; xmm4 = selected Cr q0
-  packuswb    xmm2,xmm4               ; xmm2 = Cb q0 | Cr q0 as bytes
-  movq        [eax],xmm2              ; store Cb q0 row
-  psrldq      xmm1,8 
-  movq        [edi],xmm1              ; store Cr p0 row
-  pop         edi  
-  psrldq      xmm2,8 
-  movq        [ecx],xmm2              ; store Cr q0 row
-  pop         esi  
-  mov         esp,ebp 
-  pop         ebp  
-  ret              
-
-;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
-;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-; Rows p1,p0,q0,q1 = pPix-2*iStride, pPix-iStride, pPix, pPix+iStride (8 bytes each, Cb and Cr). Result: p0 += d, q0 -= d (saturated to bytes).
-WELS_EXTERN  DeblockChromaLt4V_sse2
-; d = clamp((4*(q0-p0) + (p1-q1) + 4) >> 3, -tc, tc) where alpha>|p0-q0|, beta>|p1-p0|, beta>|q1-q0| and tc>0, else 0; pTC[k] serves pixels 2k and 2k+1.
-DeblockChromaLt4V_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h          ; 16-byte align the frame
-  sub         esp,0E4h                ; 0E4h plus the three pushes is 0F0h, so esp stays 16-aligned for movdqa
-  push        ebx  
-  push        esi  
-  mov         esi, [ebp+1Ch]      ;  pTC
-  movsx       ebx, byte [esi+2]       ; pTC[2]
-  push        edi  
-  movsx       di,byte [esi+3]         ; pTC[3]
-  mov         word [esp+0Ch],bx       ; spill pTC[2]
-  movsx       bx,byte  [esi+1]        ; pTC[1]
-  movsx       esi,byte  [esi]         ; pTC[0]
-  mov         word  [esp+0Eh],si      ; spill pTC[0]
-  movzx       esi,di 
-  movd        xmm1,esi                ; tc3
-  movzx       esi,di 
-  movd        xmm2,esi                ; tc3
-  mov         si,word  [esp+0Ch] 
-  mov         edx, [ebp + 10h]        ; edx = iStride
-  mov         eax, [ebp + 08h]        ; eax = pPixCb
-  movzx       edi,si 
-  movzx       esi,si 
-  mov         ecx, [ebp + 0Ch]        ; ecx = pPixCr
-  movd        xmm4,esi                ; tc2
-  movzx       esi,bx 
-  movd        xmm5,esi                ; tc1
-  movd        xmm3,edi                ; tc2
-  movzx       esi,bx 
-  movd        xmm6,esi                ; tc1
-  mov         si,word [esp+0Eh] 
-  movzx       edi,si 
-  movzx       esi,si 
-  punpcklwd   xmm6,xmm2               ; words: tc1 tc3
-  pxor        xmm0,xmm0 
-  movdqa      [esp+40h],xmm0          ; [esp+40h] = zero (reused later for the tc>0 mask)
-  movd        xmm7,edi                ; tc0
-  movd        xmm0,esi                ; tc0
-  lea         esi,[edx+edx]           ; 2*iStride
-  mov         edi,eax 
-  sub         edi,esi                 ; address of Cb p1
-  punpcklwd   xmm5,xmm1               ; words: tc1 tc3
-  movdqa      xmm1,[esp+40h]          ; xmm1 = zero
-  punpcklwd   xmm0,xmm4               ; words: tc0 tc2
-  movq        xmm4,[edx+ecx]          ; Cr q1
-  punpcklwd   xmm7,xmm3               ; words: tc0 tc2
-  movq        xmm3,[eax]              ; Cb q0
-  punpcklwd   xmm0,xmm6               ; words: tc0 tc1 tc2 tc3
-  movq        xmm6,[edi]              ; Cb p1
-  punpcklwd   xmm7,xmm5               ; words: tc0 tc1 tc2 tc3
-  punpcklwd   xmm0,xmm7               ; xmm0 = tc0 tc0 tc1 tc1 tc2 tc2 tc3 tc3
-  mov         edi,ecx 
-  sub         edi,esi                 ; address of Cr p1
-  movdqa      xmm2,xmm1 
-  psubw       xmm2,xmm0 
-  movdqa      [esp+60h],xmm2          ; [esp+60h] = -tc
-  movq        xmm2, [edi] 
-  punpcklqdq  xmm6,xmm2               ; xmm6 = Cb p1 | Cr p1
-  mov         esi,eax 
-  sub         esi,edx                 ; esi = address of Cb p0 (kept for the final store)
-  movq        xmm7,[esi] 
-  mov         edi,ecx 
-  sub         edi,edx                 ; edi = address of Cr p0 (kept for the final store)
-  movq        xmm2,[edi] 
-  punpcklqdq  xmm7,xmm2               ; xmm7 = Cb p0 | Cr p0
-  movq        xmm2,[ecx] 
-  punpcklqdq  xmm3,xmm2               ; xmm3 = Cb q0 | Cr q0
-  movq        xmm2,[edx+eax] 
-  movsx       edx,word [ebp + 14h]    ; low 16 bits of iAlpha
-  punpcklqdq  xmm2,xmm4               ; xmm2 = Cb q1 | Cr q1
-  movdqa      [esp+0E0h],xmm2         ; [esp+0E0h] = q1 bytes
-  movd        xmm2,edx 
-  movsx       edx,word [ebp + 18h]    ; low 16 bits of iBeta
-  movdqa      xmm4,xmm2 
-  punpcklwd   xmm4,xmm2 
-  movd        xmm2,edx 
-  movdqa      xmm5,xmm2 
-  punpcklwd   xmm5,xmm2 
-  pshufd      xmm2,xmm5,0 
-  movdqa      [esp+50h],xmm2          ; [esp+50h] = beta in all 8 words
-  movdqa      xmm2,xmm6 
-  punpcklbw   xmm2,xmm1 
-  movdqa      [esp+0D0h],xmm3         ; [esp+0D0h] = q0 bytes
-  pshufd      xmm4,xmm4,0             ; xmm4 = alpha in all 8 words
-  movdqa      [esp+30h],xmm2          ; [esp+30h] = Cb p1 (reused later for the Cb delta)
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+80h],xmm6          ; [esp+80h] = Cr p1
-  movdqa      xmm6,[esp+0D0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+70h],xmm6          ; [esp+70h] = Cr q0
-  movdqa      xmm6, [esp+0E0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa     [esp+90h],xmm6           ; [esp+90h] = Cr q1
-  movdqa      xmm5, [esp+0E0h] 
-  movdqa      xmm2,xmm7 
-  punpckhbw   xmm7,xmm1 
-  punpcklbw   xmm5,xmm1               ; xmm5 = Cb q1
-  movdqa       [esp+0A0h],xmm7        ; [esp+0A0h] = Cr p0
-  punpcklbw   xmm3,xmm1               ; xmm3 = Cb q0
-  mov         edx,4 
-  punpcklbw   xmm2,xmm1               ; xmm2 = Cb p0
-  movsx       edx,dx 
-  movd        xmm6,edx 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      xmm7,[esp+30h] 
-  movdqa      [esp+20h],xmm6          ; [esp+20h] = rounding constant 4 in all 8 words
-  psubw       xmm7,xmm5               ; p1 - q1 (Cb)
-  movdqa      xmm6,xmm0 
-  pcmpgtw     xmm6,xmm1               ; tc > 0
-  movdqa      xmm1,[esp+60h]          ; -tc
-  movdqa      [esp+40h],xmm6          ; [esp+40h] = tc>0 mask
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm2 
-  psllw       xmm6,2                  ; 4*(q0 - p0)
-  paddw       xmm6,xmm7 
-  paddw       xmm6, [esp+20h] 
-  movdqa      xmm7, [esp+50h] 
-  psraw       xmm6,3                  ; (4*(q0-p0) + (p1-q1) + 4) >> 3
-  pmaxsw      xmm1,xmm6               ; lower clamp at -tc
-  movdqa      [esp+10h],xmm0 
-  movdqa      xmm6, [esp+10h] 
-  pminsw      xmm6,xmm1               ; upper clamp at tc
-  movdqa      [esp+10h],xmm6          ; [esp+10h] = clamped Cb delta
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1               ; |p0 - q0|. NOTE(review): pabsw is an SSSE3 instruction although the routine is named _sse2
-  movdqa      xmm6,xmm4 
-  pcmpgtw     xmm6,xmm1               ; alpha > |p0 - q0|
-  movdqa      xmm1, [esp+30h] 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1               ; |p1 - p0|
-  pcmpgtw     xmm7,xmm1               ; beta > |p1 - p0|
-  movdqa      xmm1,[esp+50h] 
-  pand        xmm6,xmm7 
-  movdqa      xmm7,[esp+50h]          ; fresh copy of beta for the Cr tests
-  psubw       xmm5,xmm3 
-  pabsw       xmm5,xmm5               ; |q1 - q0|
-  pcmpgtw     xmm1,xmm5               ; beta > |q1 - q0|
-  movdqa      xmm5,[esp+80h] 
-  psubw       xmm5,[esp+90h]          ; p1 - q1 (Cr)
-  pand        xmm6,xmm1 
-  pand        xmm6,[esp+40h]          ; xmm6 = complete Cb mask
-  movdqa      xmm1,[esp+10h] 
-  pand        xmm1,xmm6 
-  movdqa      xmm6,[esp+70h] 
-  movdqa      [esp+30h],xmm1          ; [esp+30h] = masked Cb delta
-  movdqa      xmm1,[esp+0A0h]         ; xmm1 = Cr p0
-  psubw       xmm6,xmm1 
-  psllw       xmm6,2                  ; 4*(q0 - p0) (Cr)
-  paddw       xmm6,xmm5 
-  paddw       xmm6,[esp+20h] 
-  movdqa      xmm5,[esp+60h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm5,xmm6 
-  pminsw      xmm0,xmm5               ; xmm0 = clamped Cr delta
-  movdqa      xmm5,[esp+70h]          ; xmm5 = Cr q0
-  movdqa      xmm6,xmm1 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm4,xmm6               ; alpha > |p0 - q0| (Cr)
-  movdqa      xmm6,[esp+80h] 
-  psubw       xmm6,xmm1 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6               ; beta > |p1 - p0| (Cr)
-  movdqa      xmm6,[esp+90h] 
-  pand        xmm4,xmm7 
-  movdqa      xmm7,[esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6               ; beta > |q1 - q0| (Cr)
-  pand        xmm4,xmm7 
-  pand        xmm4,[esp+40h] 
-  pand        xmm0,xmm4               ; xmm0 = masked Cr delta
-  movdqa      xmm4,[esp+30h] 
-  paddw       xmm2,xmm4               ; Cb p0 + d
-  paddw       xmm1,xmm0               ; Cr p0 + d
-  packuswb    xmm2,xmm1               ; xmm2 = Cb p0 | Cr p0 as bytes (unsigned saturation)
-  movq        [esi],xmm2              ; store Cb p0 row
-  psubw       xmm3,xmm4               ; Cb q0 - d
-  psubw       xmm5,xmm0               ; Cr q0 - d
-  packuswb    xmm3,xmm5 
-  movq        [eax],xmm3              ; store Cb q0 row
-  psrldq      xmm2,8 
-  movq        [edi],xmm2              ; store Cr p0 row
-  pop         edi  
-  pop         esi  
-  psrldq      xmm3,8 
-  movq        [ecx],xmm3              ; store Cr q0 row
-  pop         ebx  
-  mov         esp,ebp 
-  pop         ebp  
-  ret    
-  
-;***************************************************************************
-;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
-;          int32_t iAlpha, int32_t iBeta)
-;***************************************************************************
-; Columns p1,p0,q0,q1 = bytes pPix-2 .. pPix+1 of 8 consecutive rows (Cb and Cr); rewrites p0 and q0 where the alpha/beta tests pass.
-WELS_EXTERN     DeblockChromaEq4H_sse2
-; Method: gather 4 bytes from each of the 16 rows, transpose into four 16-byte column vectors, filter, transpose back, scatter.
-ALIGN  16
-  
-DeblockChromaEq4H_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h          ; 16-byte align the frame
-  sub         esp,0C8h                ; 0C8h plus the two pushes is 0D0h, so esp ends up 16-aligned for movdqa
-  mov         ecx,dword [ebp+8]       ; pPixCb
-  mov         edx,dword [ebp+0Ch]     ; pPixCr
-  mov         eax,dword [ebp+10h]     ; iStride
-  sub         ecx,2                   ; start two bytes to the left of pPixCb
-  sub         edx,2                   ; same for Cr
-  push        esi  
-  lea         esi,[eax+eax*2]         ; 3*iStride
-  mov         dword [esp+18h],ecx     ; Cb base; becomes [esp+1Ch] after the push of edi
-  mov         dword [esp+4],edx       ; Cr base; becomes [esp+8] after the push of edi
-  lea         ecx,[ecx+eax*4]         ; Cb base + 4 rows
-  lea         edx,[edx+eax*4]         ; Cr base + 4 rows
-  lea         eax,[esp+7Ch]           ; 64-byte transpose buffer; equals [esp+80h] after the push of edi
-  push        edi  
-  mov         dword [esp+14h],esi     ; [esp+14h] = 3*iStride
-  mov         dword [esp+18h],ecx     ; [esp+18h] = Cb row 4
-  mov         dword [esp+0Ch],edx     ; [esp+0Ch] = Cr row 4
-  mov         dword [esp+10h],eax     ; [esp+10h] = transpose buffer pointer
-  mov         esi,dword [esp+1Ch]     ; Cb base
-  mov         ecx,dword [ebp+10h]     ; iStride
-  mov         edx,dword [esp+14h]     ; 3*iStride
-  movd        xmm0,dword [esi]        ; Cb rows 0..3, 4 bytes each
-  movd        xmm1,dword [esi+ecx] 
-  movd        xmm2,dword [esi+ecx*2] 
-  movd        xmm3,dword [esi+edx] 
-  mov         esi,dword  [esp+8]      ; Cr base
-  movd        xmm4,dword [esi]        ; Cr rows 0..3
-  movd        xmm5,dword [esi+ecx] 
-  movd        xmm6,dword [esi+ecx*2] 
-  movd        xmm7,dword [esi+edx] 
-  punpckldq   xmm0,xmm4               ; xmm0..3 = Cb row k | Cr row k, k = 0..3
-  punpckldq   xmm1,xmm5 
-  punpckldq   xmm2,xmm6 
-  punpckldq   xmm3,xmm7 
-  mov         esi,dword [esp+18h]     ; Cb row 4
-  mov         edi,dword [esp+0Ch]     ; Cr row 4
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [edi] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm0,xmm4               ; xmm0 = Cb r0 | Cr r0 | Cb r4 | Cr r4
-  movd        xmm4,dword [esi+ecx] 
-  movd        xmm5,dword [edi+ecx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm1,xmm4               ; xmm1 = rows 1 and 5
-  movd        xmm4,dword [esi+ecx*2] 
-  movd        xmm5,dword [edi+ecx*2] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm2,xmm4               ; xmm2 = rows 2 and 6
-  movd        xmm4,dword [esi+edx] 
-  movd        xmm5,dword [edi+edx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm3,xmm4               ; xmm3 = rows 3 and 7
-  movdqa      xmm6,xmm0               ; transpose: byte, word, dword, qword interleave stages
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4               ; xmm0 = column p1: Cb rows 0..7 | Cr rows 0..7
-  punpckhqdq  xmm5,xmm4               ; xmm5 = column p0
-  punpcklqdq  xmm1,xmm2               ; xmm1 = column q0
-  punpckhqdq  xmm6,xmm2               ; xmm6 = column q1
-  mov         edi,dword [esp+10h]     ; transpose buffer (= esp+80h)
-  movdqa      [edi],xmm0              ; [esp+80h]  = p1 bytes
-  movdqa      [edi+10h],xmm5          ; [esp+90h]  = p0 bytes
-  movdqa      [edi+20h],xmm1          ; [esp+0A0h] = q0 bytes
-  movdqa      [edi+30h],xmm6          ; [esp+0B0h] = q1 bytes
-  movsx       ecx,word [ebp+14h]      ; low 16 bits of iAlpha
-  movsx       edx,word [ebp+18h]      ; low 16 bits of iBeta
-  movdqa      xmm6,[esp+80h] 
-  movdqa      xmm4,[esp+90h] 
-  movdqa      xmm5,[esp+0A0h] 
-  movdqa      xmm7,[esp+0B0h] 
-  pxor        xmm0,xmm0               ; zero, used to widen bytes to words
-  movd        xmm1,ecx 
-  movdqa      xmm2,xmm1 
-  punpcklwd   xmm2,xmm1 
-  pshufd      xmm1,xmm2,0             ; xmm1 = alpha in all 8 words
-  movd        xmm2,edx 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm2,xmm3,0             ; xmm2 = beta in all 8 words
-  movdqa      xmm3,xmm6 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+60h],xmm6          ; [esp+60h] = Cr p1
-  movdqa      xmm6,[esp+90h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+30h],xmm6          ; [esp+30h] = Cr p0
-  movdqa      xmm6,[esp+0A0h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+40h],xmm6          ; [esp+40h] = Cr q0
-  movdqa      xmm6,[esp+0B0h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+70h],xmm6          ; [esp+70h] = Cr q1
-  punpcklbw   xmm7,xmm0               ; xmm7 = Cb q1
-  punpcklbw   xmm4,xmm0               ; xmm4 = Cb p0
-  punpcklbw   xmm5,xmm0               ; xmm5 = Cb q0
-  punpcklbw   xmm3,xmm0               ; xmm3 = Cb p1
-  movdqa      [esp+50h],xmm7          ; [esp+50h] = Cb q1
-  movdqa      xmm6,xmm4 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6               ; |p0 - q0| (Cb). NOTE(review): pabsw is an SSSE3 instruction although the routine is named _sse2
-  movdqa      xmm0,xmm1 
-  pcmpgtw     xmm0,xmm6               ; alpha > |p0 - q0|
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm4 
-  pabsw       xmm6,xmm6               ; |p1 - p0|
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6               ; beta > |p1 - p0|
-  movdqa      xmm6,[esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6               ; |q1 - q0|
-  pand        xmm0,xmm7 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6               ; beta > |q1 - q0|
-  movdqa      xmm6,[esp+30h] 
-  psubw       xmm6,[esp+40h] 
-  pabsw       xmm6,xmm6               ; Cr |p0 - q0|
-  pcmpgtw     xmm1,xmm6 
-  movdqa      xmm6,[esp+60h] 
-  psubw       xmm6,[esp+30h] 
-  pabsw       xmm6,xmm6               ; Cr |p1 - p0|
-  pand        xmm0,xmm7               ; xmm0 = Cb filter mask (all three tests)
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+70h] 
-  psubw       xmm6,[esp+40h] 
-  pabsw       xmm6,xmm6               ; Cr |q1 - q0|
-  pand        xmm1,xmm7 
-  pcmpgtw     xmm2,xmm6 
-  pand        xmm1,xmm2               ; xmm1 = Cr filter mask
-  mov         eax,2 
-  movsx       ecx,ax 
-  movd        xmm2,ecx 
-  movdqa      xmm6,xmm2 
-  punpcklwd   xmm6,xmm2 
-  pshufd      xmm2,xmm6,0 
-  movdqa      [esp+20h],xmm2          ; [esp+20h] = rounding constant 2 in all 8 words
-  movdqa      xmm2,xmm3 
-  paddw       xmm2,xmm3               ; 2*p1 (Cb)
-  paddw       xmm2,xmm4               ; + p0
-  paddw       xmm2,[esp+50h]          ; + q1
-  paddw       xmm2,[esp+20h] 
-  psraw       xmm2,2                  ; filtered Cb p0 = (2*p1 + p0 + q1 + 2) >> 2
-  movdqa      xmm6,xmm0 
-  pand        xmm6,xmm2 
-  movdqa      xmm2,xmm0 
-  pandn       xmm2,xmm4               ; original p0 where the mask is clear
-  por         xmm6,xmm2               ; xmm6 = selected Cb p0
-  movdqa      xmm2,[esp+60h] 
-  movdqa      xmm7,xmm2 
-  paddw       xmm7,xmm2               ; 2*p1 (Cr)
-  paddw       xmm7,[esp+30h] 
-  paddw       xmm7,[esp+70h] 
-  paddw       xmm7,[esp+20h] 
-  movdqa      xmm4,xmm1 
-  movdqa      xmm2,xmm1 
-  pandn       xmm2,[esp+30h] 
-  psraw       xmm7,2                  ; filtered Cr p0
-  pand        xmm4,xmm7 
-  por         xmm4,xmm2               ; xmm4 = selected Cr p0
-  movdqa      xmm2,[esp+50h] 
-  packuswb    xmm6,xmm4 
-  movdqa      [esp+90h],xmm6          ; new p0 column written back into the transpose buffer
-  movdqa      xmm6,xmm2 
-  paddw       xmm6,xmm2               ; 2*q1 (Cb)
-  movdqa      xmm2,[esp+20h]          ; xmm2 = constant 2
-  paddw       xmm6,xmm5               ; + q0
-  paddw       xmm6,xmm3               ; + p1
-  movdqa      xmm4,xmm0 
-  pandn       xmm0,xmm5 
-  paddw       xmm6,xmm2 
-  psraw       xmm6,2                  ; filtered Cb q0 = (2*q1 + q0 + p1 + 2) >> 2
-  pand        xmm4,xmm6 
-  por         xmm4,xmm0               ; xmm4 = selected Cb q0
-  movdqa      xmm0,[esp+70h] 
-  movdqa      xmm5,xmm0 
-  paddw       xmm5,xmm0               ; 2*q1 (Cr)
-  movdqa      xmm0,[esp+40h] 
-  paddw       xmm5,xmm0 
-  paddw       xmm5,[esp+60h] 
-  movdqa      xmm3,xmm1 
-  paddw       xmm5,xmm2 
-  psraw       xmm5,2                  ; filtered Cr q0
-  pand        xmm3,xmm5 
-  pandn       xmm1,xmm0 
-  por         xmm3,xmm1               ; xmm3 = selected Cr q0
-  packuswb    xmm4,xmm3 
-  movdqa      [esp+0A0h],xmm4         ; new q0 column written back into the transpose buffer
-  mov         esi,dword [esp+10h]     ; transpose buffer
-  movdqa      xmm0,[esi]              ; p1 (unchanged)
-  movdqa      xmm1,[esi+10h]          ; p0 (filtered)
-  movdqa      xmm2,[esi+20h]          ; q0 (filtered)
-  movdqa      xmm3,[esi+30h]          ; q1 (unchanged)
-  movdqa      xmm6,xmm0               ; same interleave network as above turns columns back into rows
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4               ; xmm0 = Cb r0 | Cr r0 | Cb r4 | Cr r4
-  punpckhqdq  xmm5,xmm4               ; xmm5 = rows 1 and 5
-  punpcklqdq  xmm1,xmm2               ; xmm1 = rows 2 and 6
-  punpckhqdq  xmm6,xmm2               ; xmm6 = rows 3 and 7
-  mov         esi,dword [esp+1Ch]     ; Cb base
-  mov         ecx,dword [ebp+10h]     ; iStride
-  mov         edx,dword [esp+14h]     ; 3*iStride
-  mov         edi,dword [esp+8]       ; Cr base
-  movd        dword [esi],xmm0        ; Cb rows 0..3
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         esi,dword [esp+18h]     ; Cb row 4
-  movd        dword [edi],xmm0        ; Cr rows 0..3
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  movd        dword [esi],xmm0        ; Cb rows 4..7
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         edi,dword [esp+0Ch]     ; Cr row 4
-  movd        dword [edi],xmm0        ; Cr rows 4..7
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  pop         edi  
-  pop         esi  
-  mov         esp,ebp 
-  pop         ebp  
-  ret              
-  
-;*******************************************************************************
-;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
-;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
-;*******************************************************************************
-; Columns p1,p0,q0,q1 = bytes pPix-2 .. pPix+1 of 8 consecutive rows (Cb and Cr). Result: p0 += d, q0 -= d (saturated to bytes).
-WELS_EXTERN  DeblockChromaLt4H_sse2
-; d = clamp((4*(q0-p0) + (p1-q1) + 4) >> 3, -tc, tc) where alpha>|p0-q0|, beta>|p1-p0|, beta>|q1-q0| and tc>0, else 0; pTC[k] serves rows 2k and 2k+1.
-ALIGN  16
-; Method: gather 4 bytes from each of the 16 rows, transpose into four 16-byte column vectors, filter, transpose back, scatter.
-DeblockChromaLt4H_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h          ; 16-byte align the frame
-  sub         esp,108h                ; 108h plus the two pushes is 110h, so esp ends up 16-aligned for movdqa
-  mov         ecx,dword [ebp+8]       ; pPixCb
-  mov         edx,dword [ebp+0Ch]     ; pPixCr
-  mov         eax,dword [ebp+10h]     ; iStride
-  sub         ecx,2                   ; start two bytes to the left of pPixCb
-  sub         edx,2                   ; same for Cr
-  push        esi  
-  lea         esi,[eax+eax*2]         ; 3*iStride
-  mov         dword [esp+10h],ecx     ; Cb base; becomes [esp+14h] after the push of edi
-  mov         dword [esp+4],edx       ; Cr base; becomes [esp+8] after the push of edi
-  lea         ecx,[ecx+eax*4]         ; Cb base + 4 rows
-  lea         edx,[edx+eax*4]         ; Cr base + 4 rows
-  lea         eax,[esp+6Ch]           ; 64-byte transpose buffer; equals [esp+70h] after the push of edi
-  push        edi  
-  mov         dword [esp+0Ch],esi     ; [esp+0Ch] = 3*iStride
-  mov         dword [esp+18h],ecx     ; [esp+18h] = Cb row 4
-  mov         dword [esp+10h],edx     ; [esp+10h] = Cr row 4
-  mov         dword [esp+1Ch],eax     ; [esp+1Ch] = transpose buffer pointer
-  mov         esi,dword [esp+14h]     ; Cb base
-  mov         ecx,dword [ebp+10h]     ; iStride
-  mov         edx,dword [esp+0Ch]     ; 3*iStride
-  movd        xmm0,dword [esi]        ; Cb rows 0..3, 4 bytes each
-  movd        xmm1,dword [esi+ecx] 
-  movd        xmm2,dword [esi+ecx*2] 
-  movd        xmm3,dword [esi+edx] 
-  mov         esi,dword [esp+8]       ; Cr base
-  movd        xmm4,dword [esi]        ; Cr rows 0..3
-  movd        xmm5,dword [esi+ecx] 
-  movd        xmm6,dword [esi+ecx*2] 
-  movd        xmm7,dword [esi+edx] 
-  punpckldq   xmm0,xmm4               ; xmm0..3 = Cb row k | Cr row k, k = 0..3
-  punpckldq   xmm1,xmm5 
-  punpckldq   xmm2,xmm6 
-  punpckldq   xmm3,xmm7 
-  mov         esi,dword [esp+18h]     ; Cb row 4
-  mov         edi,dword [esp+10h]     ; Cr row 4
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [edi] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm0,xmm4               ; xmm0 = Cb r0 | Cr r0 | Cb r4 | Cr r4
-  movd        xmm4,dword [esi+ecx] 
-  movd        xmm5,dword [edi+ecx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm1,xmm4               ; xmm1 = rows 1 and 5
-  movd        xmm4,dword [esi+ecx*2] 
-  movd        xmm5,dword [edi+ecx*2] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm2,xmm4               ; xmm2 = rows 2 and 6
-  movd        xmm4,dword [esi+edx] 
-  movd        xmm5,dword [edi+edx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm3,xmm4               ; xmm3 = rows 3 and 7
-  movdqa      xmm6,xmm0               ; transpose: byte, word, dword, qword interleave stages
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4               ; xmm0 = column p1: Cb rows 0..7 | Cr rows 0..7
-  punpckhqdq  xmm5,xmm4               ; xmm5 = column p0
-  punpcklqdq  xmm1,xmm2               ; xmm1 = column q0
-  punpckhqdq  xmm6,xmm2               ; xmm6 = column q1
-  mov         edi,dword [esp+1Ch]     ; transpose buffer (= esp+70h)
-  movdqa      [edi],xmm0              ; [esp+70h]  = p1 bytes
-  movdqa      [edi+10h],xmm5          ; [esp+80h]  = p0 bytes
-  movdqa      [edi+20h],xmm1          ; [esp+90h]  = q0 bytes
-  movdqa      [edi+30h],xmm6          ; [esp+0A0h] = q1 bytes
-  mov         eax,dword [ebp+1Ch]     ; pTC
-  movsx       cx,byte [eax+3]         ; tc3
-  movsx       dx,byte [eax+2]         ; tc2
-  movsx       si,byte [eax+1]         ; tc1
-  movsx       ax,byte [eax]           ; tc0
-  movzx       edi,cx 
-  movzx       ecx,cx 
-  movd        xmm2,ecx                ; tc3
-  movzx       ecx,dx 
-  movzx       edx,dx 
-  movd        xmm3,ecx                ; tc2
-  movd        xmm4,edx                ; tc2
-  movzx       ecx,si 
-  movzx       edx,si 
-  movd        xmm5,ecx                ; tc1
-  pxor        xmm0,xmm0 
-  movd        xmm6,edx                ; tc1
-  movzx       ecx,ax 
-  movdqa      [esp+60h],xmm0          ; [esp+60h] = zero (reused later for the tc>0 mask)
-  movzx       edx,ax 
-  movsx       eax,word [ebp+14h]      ; low 16 bits of iAlpha
-  punpcklwd   xmm6,xmm2               ; words: tc1 tc3
-  movd        xmm1,edi                ; tc3
-  movd        xmm7,ecx                ; tc0
-  movsx       ecx,word [ebp+18h]      ; low 16 bits of iBeta
-  movd        xmm0,edx                ; tc0
-  punpcklwd   xmm7,xmm3               ; words: tc0 tc2
-  punpcklwd   xmm5,xmm1               ; words: tc1 tc3
-  movdqa      xmm1,[esp+60h]          ; xmm1 = zero
-  punpcklwd   xmm7,xmm5               ; words: tc0 tc1 tc2 tc3
-  movdqa      xmm5,[esp+0A0h]         ; q1 bytes
-  punpcklwd   xmm0,xmm4               ; words: tc0 tc2
-  punpcklwd   xmm0,xmm6               ; words: tc0 tc1 tc2 tc3
-  movdqa      xmm6, [esp+70h]         ; p1 bytes
-  punpcklwd   xmm0,xmm7               ; xmm0 = tc0 tc0 tc1 tc1 tc2 tc2 tc3 tc3
-  movdqa      xmm7,[esp+80h]          ; p0 bytes
-  movdqa      xmm2,xmm1 
-  psubw       xmm2,xmm0 
-  movdqa      [esp+0D0h],xmm2         ; [esp+0D0h] = -tc
-  movd        xmm2,eax 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm4,xmm3,0             ; xmm4 = alpha in all 8 words
-  movd        xmm2,ecx 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm2,xmm3,0 
-  movdqa      xmm3, [esp+90h]         ; q0 bytes
-  movdqa      [esp+50h],xmm2          ; [esp+50h] = beta in all 8 words
-  movdqa      xmm2,xmm6 
-  punpcklbw   xmm2,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+40h],xmm2          ; [esp+40h] = Cb p1 (reused later for the Cb delta)
-  movdqa      [esp+0B0h],xmm6         ; [esp+0B0h] = Cr p1
-  movdqa      xmm6,[esp+90h] 
-  movdqa      xmm2,xmm7 
-  punpckhbw   xmm7,xmm1 
-  punpckhbw   xmm6,xmm1 
-  punpcklbw   xmm2,xmm1               ; xmm2 = Cb p0
-  punpcklbw   xmm3,xmm1               ; xmm3 = Cb q0
-  punpcklbw   xmm5,xmm1               ; xmm5 = Cb q1
-  movdqa      [esp+0F0h],xmm7         ; [esp+0F0h] = Cr p0
-  movdqa      [esp+0C0h],xmm6         ; [esp+0C0h] = Cr q0
-  movdqa      xmm6, [esp+0A0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+0E0h],xmm6         ; [esp+0E0h] = Cr q1
-  mov         edx,4 
-  movsx       eax,dx 
-  movd        xmm6,eax 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      [esp+30h],xmm6          ; [esp+30h] = rounding constant 4 in all 8 words
-  movdqa      xmm7, [esp+40h] 
-  psubw       xmm7,xmm5               ; p1 - q1 (Cb)
-  movdqa      xmm6,xmm0 
-  pcmpgtw     xmm6,xmm1               ; tc > 0
-  movdqa      [esp+60h],xmm6          ; [esp+60h] = tc>0 mask
-  movdqa      xmm1, [esp+0D0h]        ; -tc
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm2 
-  psllw       xmm6,2                  ; 4*(q0 - p0)
-  paddw       xmm6,xmm7 
-  paddw       xmm6,[esp+30h] 
-  psraw       xmm6,3                  ; (4*(q0-p0) + (p1-q1) + 4) >> 3
-  pmaxsw      xmm1,xmm6               ; lower clamp at -tc
-  movdqa      xmm7,[esp+50h] 
-  movdqa      [esp+20h],xmm0 
-  movdqa      xmm6, [esp+20h] 
-  pminsw      xmm6,xmm1               ; upper clamp at tc
-  movdqa      [esp+20h],xmm6          ; [esp+20h] = clamped Cb delta
-  movdqa      xmm6,xmm4 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1               ; |p0 - q0|. NOTE(review): pabsw is an SSSE3 instruction although the routine is named _sse2
-  pcmpgtw     xmm6,xmm1               ; alpha > |p0 - q0|
-  movdqa      xmm1, [esp+40h] 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1               ; |p1 - p0|
-  pcmpgtw     xmm7,xmm1               ; beta > |p1 - p0|
-  movdqa      xmm1, [esp+50h] 
-  pand        xmm6,xmm7 
-  movdqa      xmm7, [esp+50h]         ; fresh copy of beta for the Cr tests
-  psubw       xmm5,xmm3 
-  pabsw       xmm5,xmm5               ; |q1 - q0|
-  pcmpgtw     xmm1,xmm5               ; beta > |q1 - q0|
-  movdqa      xmm5, [esp+0B0h] 
-  psubw       xmm5,[esp+0E0h]         ; p1 - q1 (Cr)
-  pand        xmm6,xmm1 
-  pand        xmm6, [esp+60h]         ; xmm6 = complete Cb mask
-  movdqa      xmm1, [esp+20h] 
-  pand        xmm1,xmm6 
-  movdqa      xmm6, [esp+0C0h] 
-  movdqa      [esp+40h],xmm1          ; [esp+40h] = masked Cb delta
-  movdqa      xmm1, [esp+0F0h]        ; xmm1 = Cr p0
-  psubw       xmm6,xmm1 
-  psllw       xmm6,2                  ; 4*(q0 - p0) (Cr)
-  paddw       xmm6,xmm5 
-  paddw       xmm6, [esp+30h] 
-  movdqa      xmm5, [esp+0D0h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm5,xmm6 
-  pminsw      xmm0,xmm5               ; xmm0 = clamped Cr delta
-  movdqa      xmm5,[esp+0C0h]         ; xmm5 = Cr q0
-  movdqa      xmm6,xmm1 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm4,xmm6               ; alpha > |p0 - q0| (Cr)
-  movdqa      xmm6,[esp+0B0h] 
-  psubw       xmm6,xmm1 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6               ; beta > |p1 - p0| (Cr)
-  movdqa      xmm6, [esp+0E0h] 
-  pand        xmm4,xmm7 
-  movdqa      xmm7, [esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6               ; beta > |q1 - q0| (Cr)
-  pand        xmm4,xmm7 
-  pand        xmm4,[esp+60h] 
-  pand        xmm0,xmm4               ; xmm0 = masked Cr delta
-  movdqa      xmm4, [esp+40h] 
-  paddw       xmm2,xmm4               ; Cb p0 + d
-  paddw       xmm1,xmm0               ; Cr p0 + d
-  psubw       xmm3,xmm4               ; Cb q0 - d
-  psubw       xmm5,xmm0               ; Cr q0 - d
-  packuswb    xmm2,xmm1               ; new p0 column as bytes (unsigned saturation)
-  packuswb    xmm3,xmm5               ; new q0 column as bytes
-  movdqa      [esp+80h],xmm2          ; write p0 back into the transpose buffer
-  movdqa      [esp+90h],xmm3          ; write q0 back into the transpose buffer
-  mov         esi,dword [esp+1Ch]     ; transpose buffer
-  movdqa      xmm0, [esi]             ; p1 (unchanged)
-  movdqa      xmm1, [esi+10h]         ; p0 (filtered)
-  movdqa      xmm2, [esi+20h]         ; q0 (filtered)
-  movdqa      xmm3, [esi+30h]         ; q1 (unchanged)
-  movdqa      xmm6,xmm0               ; same interleave network as above turns columns back into rows
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4               ; xmm0 = Cb r0 | Cr r0 | Cb r4 | Cr r4
-  punpckhqdq  xmm5,xmm4               ; xmm5 = rows 1 and 5
-  punpcklqdq  xmm1,xmm2               ; xmm1 = rows 2 and 6
-  punpckhqdq  xmm6,xmm2               ; xmm6 = rows 3 and 7
-  mov         esi,dword [esp+14h]     ; Cb base
-  mov         ecx,dword [ebp+10h]     ; iStride
-  mov         edx,dword [esp+0Ch]     ; 3*iStride
-  mov         edi,dword [esp+8]       ; Cr base
-  movd        dword [esi],xmm0        ; Cb rows 0..3
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         esi,dword [esp+18h]     ; Cb row 4
-  movd        dword [edi],xmm0        ; Cr rows 0..3
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  movd        dword [esi],xmm0        ; Cb rows 4..7
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         edi,dword [esp+10h]     ; Cr row 4
-  movd        dword [edi],xmm0        ; Cr rows 4..7
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6  
-  pop         edi  
-  pop         esi   
-  mov         esp,ebp 
-  pop         ebp  
-  ret     
-  
-  
-  
-;*******************************************************************************
-;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
-;                                 int32_t iBeta, int8_t * pTC)
-;*******************************************************************************
-  
-
-WELS_EXTERN  DeblockLumaLt4V_sse2
-  ; Clipped deblocking filter over the edge between the 16-byte rows at pix-stride and pix; rewrites the four rows pix-2*stride .. pix+stride.
-ALIGN  16
-; Stack args: [ebp+8] row pointer, [ebp+12] stride, [ebp+16]/[ebp+20] 16-bit thresholds, [ebp+24] -> 4 signed clip bytes. NOTE(review): presumably pPix, iStride, iAlpha, iBeta, pTC -- confirm against the prototype.
-DeblockLumaLt4V_sse2:
-    push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H -- align the scratch frame to 16 bytes (movdqa is used on it)
-	sub	esp, 420				; 000001a4H -- with the 12 bytes pushed below, [esp+432] is the aligned frame top
-	mov	eax, dword [ebp+8]		; eax = pix (row just below the edge)
-	mov	ecx, dword [ebp+12]		; ecx = stride
-; Row names used in the comments: p2,p1,p0 = pix-3*stride .. pix-stride; q0,q1,q2 = pix .. pix+2*stride. All rows are accessed with movdqa (16-byte alignment required).
-	pxor	xmm0, xmm0
-	push	ebx
-	mov	edx, dword [ebp+24]		; edx -> the four signed clip bytes
-	movdqa	[esp+424-384], xmm0		; zero vector; only ebx is pushed so far, so 424 here is the slot read later as [esp+432-384]
-	push	esi
-; NOTE(review): pabsw (used throughout) is an SSSE3 instruction even though the symbol is suffixed _sse2.
-	lea	esi, [ecx+ecx*2]		; 3*stride
-	push	edi
-	mov	edi, eax
-	sub	edi, esi
-	movdqa	xmm0, [edi]			; p2 row
-
-	lea	esi, [ecx+ecx]			; 2*stride
-	movdqa	[esp+432-208], xmm0		; save p2 bytes
-	mov	edi, eax
-	sub	edi, esi			; edi = &p1, kept until the final store
-	movdqa	xmm0, [edi]
-	movdqa	[esp+448-208], xmm0		; save p1 bytes
-
-	mov	ebx, eax
-	sub	ebx, ecx			; ebx = &p0
-	movdqa	xmm0, [ebx]
-	movdqa	[esp+464-208], xmm0		; save p0 bytes
-
-	movdqa	xmm0, [eax]			; q0 row
-
-	add	ecx, eax			; ecx = &q1
-	movdqa	[esp+480-208], xmm0		; save q0 bytes
-	movdqa	xmm0, [ecx]
-	mov	dword [esp+432-404], ecx	; remember &q1 for the final store
-
-	movsx	ecx, word [ebp+16]		; low 16 bits of the first threshold (presumably alpha)
-	movdqa	[esp+496-208], xmm0		; save q1 bytes
-	movdqa	xmm0, [esi+eax]			; q2 row (pix + 2*stride)
-
-	movsx	si, byte [edx]			; clip byte 0
-	movdqa	[esp+512-208], xmm0		; save q2 bytes
-	movd	xmm0, ecx
-	movsx	ecx, word [ebp+20]		; low 16 bits of the second threshold (presumably beta)
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	pshufd	xmm0, xmm1, 0
-	movdqa	[esp+432-112], xmm0		; first threshold replicated to 8 words
-	movd	xmm0, ecx
-	movsx	cx, byte [edx+1]		; clip byte 1
-	movdqa	xmm1, xmm0
-	punpcklwd xmm1, xmm0
-	mov	dword [esp+432-408], ebx	; remember &p0 for the final store
-	movzx	ebx, cx
-	pshufd	xmm0, xmm1, 0
-	movd	xmm1, ebx
-	movzx	ebx, cx
-	movd	xmm2, ebx
-	movzx	ebx, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, si
-	movd	xmm5, ecx
-	movzx	ecx, si
-	movd	xmm6, ecx
-	movzx	ecx, si
-	movd	xmm7, ecx
-	movzx	ecx, si
-	movdqa	[esp+432-336], xmm0		; second threshold replicated to 8 words
-	movd	xmm0, ecx
-
-	movsx	cx, byte [edx+3]		; clip byte 3
-	movsx	dx, byte [edx+2]		; clip byte 2 (edx is no longer a pointer after this)
-	movd	xmm3, ebx
-	punpcklwd xmm0, xmm4
-	movzx	esi, cx
-	punpcklwd xmm6, xmm2
-	punpcklwd xmm5, xmm1
-	punpcklwd xmm0, xmm6
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	punpcklwd xmm0, xmm7
-	movdqa	[esp+432-400], xmm0		; words {clip0 x4, clip1 x4}: clip values for the low 8 pixels
-	movd	xmm0, esi
-	movzx	esi, cx
-	movd	xmm2, esi
-	movzx	esi, cx
-	movzx	ecx, cx
-	movd	xmm4, ecx
-	movzx	ecx, dx
-	movd	xmm3, esi
-	movd	xmm5, ecx
-	punpcklwd xmm5, xmm0
-
-	movdqa	xmm0, [esp+432-384]		; xmm0 = zero again (used for byte->word unpacking)
-	movzx	ecx, dx
-	movd	xmm6, ecx
-	movzx	ecx, dx
-	movzx	edx, dx
-	punpcklwd xmm6, xmm2
-	movd	xmm7, ecx
-	movd	xmm1, edx
-
-	movdqa	xmm2, [esp+448-208]
-	punpcklbw xmm2, xmm0			; xmm2 = p1, low 8 pixels as words
-
-	mov	ecx, 4
-	movsx	edx, cx				; edx = 4, rounding constant broadcast further down
-	punpcklwd xmm7, xmm3
-	punpcklwd xmm7, xmm5
-	movdqa	xmm5, [esp+496-208]
-	movdqa	xmm3, [esp+464-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-240], xmm5		; q1 low words
-	movdqa	xmm5, [esp+512-208]
-	punpcklbw xmm5, xmm0
-	movdqa	[esp+432-352], xmm5		; q2 low words
-	punpcklwd xmm1, xmm4
-	movdqa	xmm4, [esp+432-208]
-	punpcklwd xmm1, xmm6
-	movdqa	xmm6, [esp+480-208]
-	punpcklwd xmm1, xmm7			; xmm1 = words {clip2 x4, clip3 x4}: clip values for the high 8 pixels
-	punpcklbw xmm6, xmm0			; q0 low words
-	punpcklbw xmm3, xmm0			; p0 low words
-	punpcklbw xmm4, xmm0			; p2 low words
-	movdqa	xmm7, xmm3
-	psubw	xmm7, xmm4
-	pabsw	xmm7, xmm7			; |p0 - p2|
-	movdqa	[esp+432-272], xmm4
-	movdqa	xmm4, [esp+432-336]		; xmm4 = second threshold vector, kept in a register through both halves
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5		; mask: threshold2 > |p0 - p2|
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-352]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5		; mask: threshold2 > |q0 - q2|
-	movdqa	xmm5, xmm3
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5		; (p0 + q0 + 1) >> 1
-	movdqa	xmm5, [esp+432-400]
-	psubw	xmm5, [esp+432-288]
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5		; clip + 1 for each of the two masks that is set (subtracting an all-ones mask adds 1)
-	movdqa	xmm5, xmm6
-	psubw	xmm5, xmm3
-	movdqa	[esp+432-32], xmm6
-	psubw	xmm6, [esp+432-240]
-	movdqa	xmm7, xmm5
-	movdqa	[esp+432-384], xmm5		; q0 - p0 (the zero slot is reused; xmm0 still holds zero)
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7			; threshold1 > |q0 - p0|
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6			; threshold2 > |q0 - q1|
-
-	pand	xmm5, xmm7
-	movdqa	xmm6, xmm3
-	psubw	xmm6, xmm2
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm4
-	pcmpgtw	xmm7, xmm6			; threshold2 > |p0 - p1|
-	movdqa	xmm6, [esp+432-400]
-	pand	xmm5, xmm7
-	movdqa	xmm7, xmm6
-	pcmpeqw	xmm6, xmm0
-	pcmpgtw	xmm7, xmm0
-	por	xmm7, xmm6			; clip >= 0
-	pand	xmm5, xmm7
-	movdqa	[esp+432-320], xmm5		; per-pixel "filter this column" mask, low half
-	movd	xmm5, edx
-	movdqa	xmm6, xmm5
-	punpcklwd xmm6, xmm5
-	pshufd	xmm5, xmm6, 0
-	movdqa	[esp+432-336], xmm5		; constant 4 in 8 words (overwrites the threshold2 slot; xmm4 keeps the threshold)
-	movdqa	xmm5, [esp+432-224]
-	movdqa	[esp+432-368], xmm5
-	movdqa	xmm6, xmm0
-	psubw	xmm6, xmm5			; negated adjusted clip
-	movdqa	xmm5, [esp+432-384]
-	psllw	xmm5, 2
-	movdqa	xmm7, xmm2
-	psubw	xmm7, [esp+432-240]
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	psraw	xmm7, 3				; ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3
-	pmaxsw	xmm6, xmm7
-	pminsw	xmm5, xmm6			; clamped to +/- adjusted clip
-
-	pand	xmm5, [esp+432-320]
-	movdqa	xmm6, [esp+432-400]
-	movdqa	[esp+432-64], xmm5		; delta applied to p0 (+) and q0 (-), low half
-	movdqa	[esp+432-384], xmm6
-	movdqa	xmm5, xmm0
-	psubw	xmm5, xmm6
-	movdqa	[esp+432-368], xmm5		; negated (unadjusted) clip
-	movdqa	xmm6, xmm5
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm2
-	paddw	xmm7, xmm2
-	psubw	xmm5, xmm7
-	psraw	xmm5, 1				; (p2 + avg(p0,q0) - 2*p1) >> 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-384]
-	pminsw	xmm5, xmm6			; clamped to +/- clip
-
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-288]
-	movdqa	xmm6, [esp+432-240]
-	movdqa	[esp+432-96], xmm5		; delta for p1, low half
-	movdqa	xmm5, [esp+432-352]
-	paddw	xmm5, [esp+432-304]
-	movdqa	xmm7, xmm6
-	paddw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
-	psubw	xmm5, xmm7
-
-	movdqa	xmm7, [esp+496-208]
-	psraw	xmm5, 1				; (q2 + avg(p0,q0) - 2*q1) >> 1
-	pmaxsw	xmm6, xmm5
-	movdqa	xmm5, [esp+432-400]
-	pminsw	xmm5, xmm6
-	pand	xmm5, [esp+432-320]
-	pand	xmm5, [esp+432-256]
-	movdqa	xmm6, [esp+448-208]
-	punpckhbw xmm7, xmm0
-	movdqa	[esp+432-352], xmm7		; from here the word slots are refilled with the high 8 pixels: q1 high
-
-	movdqa	xmm7, [esp+512-208]
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-48], xmm5		; delta for q1, low half
-	movdqa	xmm5, [esp+432-208]
-	movdqa	[esp+432-368], xmm6		; p1 high
-	movdqa	xmm6, [esp+464-208]
-	punpckhbw xmm7, xmm0
-	punpckhbw xmm5, xmm0
-	movdqa	[esp+432-384], xmm7		; q2 high
-	punpckhbw xmm6, xmm0
-	movdqa	[esp+432-400], xmm6		; p0 high
-
-	movdqa	xmm7, [esp+432-400]
-	movdqa	xmm6, [esp+480-208]
-	psubw	xmm7, xmm5
-	movdqa	[esp+432-16], xmm5		; p2 high
-	pabsw	xmm7, xmm7
-	punpckhbw xmm6, xmm0			; q0 high
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-288], xmm5		; high-half mask: threshold2 > |p0 - p2|
-
-	movdqa	xmm7, xmm6
-	psubw	xmm7, [esp+432-384]
-	pabsw	xmm7, xmm7
-	movdqa	xmm5, xmm4
-	pcmpgtw	xmm5, xmm7
-	movdqa	[esp+432-256], xmm5		; high-half mask: threshold2 > |q0 - q2|
-
-	movdqa	xmm5, [esp+432-400]
-	movdqa	[esp+432-80], xmm6
-	pavgw	xmm5, xmm6
-	movdqa	[esp+432-304], xmm5		; (p0 + q0 + 1) >> 1, high half
-
-	movdqa	xmm5, xmm1
-	psubw	xmm5, [esp+432-288]
-	psubw	xmm5, [esp+432-256]
-	movdqa	[esp+432-224], xmm5		; adjusted clip, high half
-	movdqa	xmm5, xmm6
-	psubw	xmm5, [esp+432-400]
-	psubw	xmm6, [esp+432-352]
-	movdqa	[esp+432-272], xmm5		; q0 - p0, high half
-	movdqa	xmm7, xmm5
-	movdqa	xmm5, [esp+432-112]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm5, xmm7
-	movdqa	xmm7, xmm4
-	pabsw	xmm6, xmm6
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-368]
-
-	pand	xmm5, xmm7
-	movdqa	xmm7, [esp+432-400]
-	psubw	xmm7, xmm6
-	psubw	xmm6, [esp+432-352]		; p1 - q1, high half
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7
-	pand	xmm5, xmm4
-
-	paddw	xmm2, [esp+432-96]		; new p1, low half
-	movdqa	xmm4, xmm1
-	pcmpgtw	xmm4, xmm0
-	movdqa	xmm7, xmm1
-	pcmpeqw	xmm7, xmm0
-	por	xmm4, xmm7
-	pand	xmm5, xmm4
-	movdqa	xmm4, [esp+432-224]
-	movdqa	[esp+432-320], xmm5		; "filter this column" mask, high half
-	movdqa	xmm5, [esp+432-272]
-	movdqa	xmm7, xmm0
-	psubw	xmm7, xmm4
-	psubw	xmm0, xmm1			; xmm0 = negated clip for the high half (no longer zero)
-	psllw	xmm5, 2
-	paddw	xmm6, xmm5
-	paddw	xmm6, [esp+432-336]
-	movdqa	xmm5, [esp+432-368]
-	movdqa	[esp+432-336], xmm0		; the rounding-constant slot now holds the negated clip
-	psraw	xmm6, 3
-	pmaxsw	xmm7, xmm6
-	pminsw	xmm4, xmm7
-	pand	xmm4, [esp+432-320]
-	movdqa	xmm6, xmm0
-	movdqa	xmm0, [esp+432-16]
-	paddw	xmm0, [esp+432-304]
-	movdqa	[esp+432-272], xmm4		; delta for p0/q0, high half
-	movdqa	xmm4, [esp+432-368]
-	paddw	xmm4, xmm4
-	psubw	xmm0, xmm4
-
-	movdqa	xmm4, [esp+432-64]
-	psraw	xmm0, 1
-	pmaxsw	xmm6, xmm0
-	movdqa	xmm0, [esp+432-400]
-	movdqa	xmm7, xmm1
-	pminsw	xmm7, xmm6
-	movdqa	xmm6, [esp+432-320]
-	pand	xmm7, xmm6
-	pand	xmm7, [esp+432-288]
-	paddw	xmm5, xmm7
-	packuswb xmm2, xmm5			; xmm2 = new p1 row (unsigned-saturated bytes)
-	movdqa	xmm5, [esp+432-272]
-	paddw	xmm0, xmm5
-	paddw	xmm3, xmm4
-	packuswb xmm3, xmm0			; xmm3 = new p0 row
-
-	movdqa	xmm0, [esp+432-32]
-	psubw	xmm0, xmm4
-	movdqa	xmm4, [esp+432-80]
-	psubw	xmm4, xmm5
-
-	movdqa	xmm5, [esp+432-240]
-	paddw	xmm5, [esp+432-48]		; new q1, low half
-	packuswb xmm0, xmm4			; new q0 row
-	movdqa	xmm4, [esp+432-384]
-	paddw	xmm4, [esp+432-304]
-	movdqa	[esp+480-208], xmm0
-	movdqa	xmm0, [esp+432-352]
-	movdqa	xmm7, xmm0
-	paddw	xmm0, xmm0
-
-	mov	ecx, dword [esp+432-408]	; ecx = &p0
-
-	mov	edx, dword [esp+432-404]	; edx = &q1
-	psubw	xmm4, xmm0
-	movdqa	xmm0, [esp+432-336]
-	movdqa	[edi], xmm2			; store new p1
-	psraw	xmm4, 1
-	pmaxsw	xmm0, xmm4
-	pminsw	xmm1, xmm0
-	movdqa	xmm0, [esp+480-208]
-
-	pop	edi
-	pand	xmm1, xmm6
-	pand	xmm1, [esp+428-256]		; esp moved up 4 with the pop above, so 428 addresses the same slot as [esp+432-256] before it
-	movdqa	[ecx], xmm3			; store new p0
-	paddw	xmm7, xmm1
-	pop	esi
-	packuswb xmm5, xmm7			; new q1 row
-	movdqa	[eax], xmm0			; store new q0
-	movdqa	[edx], xmm5			; store new q1
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
-
-
-;*******************************************************************************
-;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
-;                                 int32_t iBeta)
-;*******************************************************************************
-; Unclipped smoothing filter over the edge between rows pPix-iStride and pPix: reads the 8 rows pPix-4*iStride .. pPix+3*iStride and rewrites the 6 rows pPix-3*iStride .. pPix+2*iStride.
-WELS_EXTERN  DeblockLumaEq4V_sse2
-  ; Stack args: [ebp+8] pPix, [ebp+12] iStride, [ebp+16] iAlpha, [ebp+20] iBeta; only the low 16 bits of iAlpha/iBeta are read (movsx ..., word).
-ALIGN  16
-; Row names used in the comments: p3..p0 = pPix-4*iStride .. pPix-iStride, q0..q3 = pPix .. pPix+3*iStride. Rows are accessed with movdqa, so each must be 16-byte aligned.
-DeblockLumaEq4V_sse2:
-; NOTE(review): pabsw (used throughout) is an SSSE3 instruction even though the symbol is suffixed _sse2.
-	push	ebp
-	mov	ebp, esp
-	and	esp, -16				; fffffff0H -- align the scratch frame to 16 bytes
-	sub	esp, 628				; 00000274H -- with the three pushes (12 bytes), [esp+640] is the aligned frame top
-	mov	eax, dword [ebp+8]		; eax = pPix (q0 row)
-	mov	ecx, dword [ebp+12]		; ecx = iStride
-	push	ebx
-	push	esi
-
-	lea	edx, [ecx*4]			; 4*stride
-	pxor	xmm0, xmm0
-	movdqa	xmm2, xmm0			; xmm2 = zero, used for byte->word unpacking
-
-	movdqa	xmm0, [ecx+eax]			; q1 row
-	mov	esi, eax
-	sub	esi, edx
-	movdqa	xmm3, [esi]			; p3 row
-	movdqa	xmm5, [eax]			; q0 row
-	push	edi
-	lea	edi, [ecx+ecx]			; 2*stride
-	lea	ebx, [ecx+ecx*2]		; 3*stride
-	mov	dword [esp+640-600], edi
-	mov	esi, eax
-	sub	esi, edi			; esi = &p1, kept until the final store
-	movdqa	xmm1, [esi]
-	movdqa	 [esp+720-272], xmm0		; save q1 bytes
-	mov	edi, eax
-	sub	edi, ecx			; edi = &p0, kept until the final store
-	movdqa	xmm4, [edi]
-	add	ecx, eax
-	mov	dword [esp+640-596], ecx	; remember &q1
-
-	mov	ecx, dword [esp+640-600]
-	movdqa	xmm0, [ecx+eax]			; q2 row
-	movdqa	 [esp+736-272], xmm0
-
-	movdqa	xmm0, [eax+ebx]			; q3 row
-	mov	edx, eax
-	sub	edx, ebx			; edx = &p2
-
-	movsx	ebx, word [ebp+16]		; alpha (low 16 bits)
-	movdqa	xmm6, [edx]			; p2 row
-	add	ecx, eax			; ecx = &q2, kept until the final store
-	movdqa	 [esp+752-272], xmm0		; save q3 bytes
-	movd	xmm0, ebx
-
-	movsx	ebx, word [ebp+20]		; beta (low 16 bits)
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
-	movdqa	 [esp+640-320], xmm0		; alpha replicated to 8 words
-	movd	xmm0, ebx
-	movdqa	xmm7, xmm0
-	punpcklwd xmm7, xmm0
-	pshufd	xmm0, xmm7, 0
-
-	movdqa	xmm7, [esp+736-272]
-	punpcklbw xmm7, xmm2
-	movdqa	 [esp+640-416], xmm7		; q2 low words
-	movdqa	 [esp+640-512], xmm0		; beta replicated to 8 words
-	movdqa	xmm0, xmm1
-	movdqa	 [esp+672-272], xmm1		; save p1 bytes
-	movdqa	xmm1, xmm4
-	movdqa	 [esp+704-272], xmm5		; save q0 bytes
-	punpcklbw xmm5, xmm2			; q0 low words
-	punpcklbw xmm1, xmm2			; p0 low words
-
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7		; |q0 - p0|
-	punpcklbw xmm0, xmm2
-	movdqa	 [esp+688-272], xmm4		; save p0 bytes
-	movdqa	xmm4, [esp+720-272]
-	movdqa	 [esp+640-480], xmm0		; p1 low words
-
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm0
-
-	movdqa	xmm0, [esp+640-512]
-	pabsw	xmm7, xmm7
-	punpcklbw xmm4, xmm2
-	pcmpgtw	xmm0, xmm7			; beta > |p0 - p1|
-	movdqa	 [esp+640-384], xmm4		; q1 low words
-	movdqa	xmm7, xmm5
-	psubw	xmm7, xmm4
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+656-272], xmm6		; save p2 bytes
-	punpcklbw xmm6, xmm2
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-48], xmm2		; keep a zero vector in memory
-	movdqa	 [esp+640-368], xmm6		; p2 low words
-	movdqa	 [esp+640-144], xmm1		; p0 low words
-	movdqa	 [esp+640-400], xmm5		; q0 low words
-	pcmpgtw	xmm4, xmm7			; beta > |q0 - q1|
-	pand	xmm0, xmm4
-	movdqa	xmm4, [esp+640-320]
-	pcmpgtw	xmm4, [esp+640-560]		; alpha > |q0 - p0|
-	pand	xmm0, xmm4			; xmm0 = edge mask for the low 8 pixels; live until the low half is blended
-
-	mov	ebx, 2
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, [esp+640-320]
-	psraw	xmm4, 2
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm7
-	movdqa	 [esp+640-576], xmm4		; (alpha >> 2) + 2
-	pcmpgtw	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4		; mask: (alpha >> 2) + 2 > |q0 - p0|
-
-	movdqa	xmm4, [esp+640-512]
-	movdqa	 [esp+640-624], xmm7		; constant 2 in 8 words
-	movdqa	xmm7, xmm1
-	psubw	xmm7, xmm6
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7			; beta > |p0 - p2|
-
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-544], xmm4		; p-side mask selecting the long filter (called maskP below)
-	movdqa	xmm4, [esp+640-512]
-	movdqa	xmm7, xmm5
-	psubw	xmm7, [esp+640-416]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm4, xmm7			; beta > |q0 - q2|
-
-	pand	xmm4, [esp+640-560]
-	movdqa	 [esp+640-560], xmm4		; q-side mask selecting the long filter (called maskQ below)
-
-	movdqa	xmm4, [esp+640-544]
-	pandn	xmm4, xmm6
-	movdqa	 [esp+640-16], xmm4		; p2 kept where maskP is clear
-	mov	ebx, 4
-	movsx	ebx, bx
-	movd	xmm4, ebx
-	movdqa	xmm7, xmm4
-	punpcklwd xmm7, xmm4
-	movdqa	xmm4, xmm3
-	punpcklbw xmm4, xmm2
-	psllw	xmm4, 1
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, xmm6
-	paddw	xmm4, [esp+640-480]
-
-	movdqa	xmm6, [esp+640-560]
-	pshufd	xmm7, xmm7, 0
-	paddw	xmm4, xmm1
-	movdqa	 [esp+640-592], xmm7		; constant 4 in 8 words
-	paddw	xmm4, xmm5
-	paddw	xmm4, xmm7			; 2*p3 + 3*p2 + p1 + p0 + q0 + 4 (shifted and masked further down)
-	movdqa	xmm7, [esp+640-416]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-80], xmm6		; q2 kept where maskQ is clear
-	movdqa	xmm6, [esp+752-272]
-	punpcklbw xmm6, xmm2
-	psllw	xmm6, 1
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-384]
-
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm6, xmm5
-	paddw	xmm6, xmm1
-	paddw	xmm6, [esp+640-592]
-	psraw	xmm6, 3
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-112], xmm6		; maskQ & ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3)
-	movdqa	xmm6, [esp+640-544]
-	pandn	xmm6, xmm7
-	movdqa	 [esp+640-336], xmm6		; p1 kept where maskP is clear
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-528], xmm6
-	movdqa	xmm6, [esp+640-368]
-	paddw	xmm6, xmm7
-	movdqa	xmm7, xmm1
-	psraw	xmm4, 3
-	pand	xmm4, [esp+640-544]		; xmm4 = maskP & new p2
-	paddw	xmm7, xmm5
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-
-	paddw	xmm5, xmm1			; xmm5 = q0 + p0
-	psraw	xmm6, 2
-	pand	xmm7, xmm6
-
-	movdqa	xmm6, [esp+640-384]
-	movdqa	 [esp+640-64], xmm7		; maskP & ((p2 + p1 + p0 + q0 + 2) >> 2)
-	movdqa	xmm7, [esp+640-560]
-	pandn	xmm7, xmm6
-	movdqa	 [esp+640-304], xmm7		; q1 kept where maskQ is clear
-	movdqa	xmm7, [esp+640-560]
-	movdqa	 [esp+640-528], xmm7
-	movdqa	xmm7, [esp+640-416]
-	paddw	xmm7, xmm6
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pand	xmm5, xmm7
-	movdqa	 [esp+640-32], xmm5		; maskQ & ((q2 + q1 + q0 + p0 + 2) >> 2)
-
-	movdqa	xmm5, [esp+640-544]
-	movdqa	 [esp+640-528], xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	xmm7, xmm5
-	paddw	xmm7, xmm5
-	movdqa	xmm5, xmm1
-	paddw	xmm5, xmm6
-	paddw	xmm6, [esp+640-592]		; xmm6 = q1 + 4
-	paddw	xmm7, xmm5
-	paddw	xmm7, [esp+640-624]
-	movdqa	xmm5, [esp+640-528]
-	psraw	xmm7, 2
-	pandn	xmm5, xmm7
-	movdqa	xmm7, [esp+640-480]
-	paddw	xmm7, xmm1
-	paddw	xmm7, [esp+640-400]
-	movdqa	xmm1, [esp+640-544]
-	movdqa	 [esp+640-352], xmm5		; ~maskP & ((2*p1 + p0 + q1 + 2) >> 2)
-	movdqa	xmm5, [esp+640-368]
-	psllw	xmm7, 1
-	paddw	xmm7, xmm6
-	paddw	xmm5, xmm7
-
-	movdqa	xmm7, [esp+640-400]
-	psraw	xmm5, 3
-	pand	xmm1, xmm5
-	movdqa	xmm5, [esp+640-480]
-	movdqa	 [esp+640-96], xmm1		; maskP & ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
-	movdqa	xmm1, [esp+640-560]
-	movdqa	 [esp+640-528], xmm1
-	movdqa	xmm1, [esp+640-384]
-	movdqa	xmm6, xmm1
-	paddw	xmm6, xmm1
-	paddw	xmm1, [esp+640-400]
-	paddw	xmm1, [esp+640-144]
-	paddw	xmm7, xmm5
-	paddw	xmm5, [esp+640-592]		; xmm5 = p1 + 4
-	paddw	xmm6, xmm7
-	paddw	xmm6, [esp+640-624]
-	movdqa	xmm7, [esp+640-528]
-	psraw	xmm6, 2
-	psllw	xmm1, 1
-	paddw	xmm1, xmm5
-
-	movdqa	xmm5, [esp+656-272]
-	pandn	xmm7, xmm6			; ~maskQ & ((2*q1 + q0 + p1 + 2) >> 2)
-	movdqa	xmm6, [esp+640-416]
-	paddw	xmm6, xmm1
-	movdqa	xmm1, [esp+640-560]
-	psraw	xmm6, 3
-	pand	xmm1, xmm6
-
-	movdqa	xmm6, [esp+704-272]
-	movdqa	 [esp+640-128], xmm1		; maskQ & ((q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3)
-	movdqa	xmm1, [esp+672-272]
-	punpckhbw xmm1, xmm2
-	movdqa	 [esp+640-448], xmm1		; from here: high 8 pixels. p1 high words
-	movdqa	xmm1, [esp+688-272]
-	punpckhbw xmm1, xmm2
-	punpckhbw xmm6, xmm2
-	movdqa	 [esp+640-288], xmm7
-	punpckhbw xmm5, xmm2			; xmm5 = p2 high words
-	movdqa	 [esp+640-496], xmm1		; p0 high words
-	movdqa	 [esp+640-432], xmm6		; q0 high words
-
-	movdqa	xmm7, [esp+720-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-464], xmm7		; q1 high words
-
-	movdqa	xmm7, [esp+736-272]
-	punpckhbw xmm7, xmm2
-	movdqa	 [esp+640-528], xmm7		; q2 high words
-
-	movdqa	xmm7, xmm6
-
-	psubw	xmm6, [esp+640-464]
-	psubw	xmm7, xmm1
-	pabsw	xmm7, xmm7
-	movdqa	 [esp+640-560], xmm7		; |q0 - p0|, high half (the low-half maskQ slot is recycled)
-	por	xmm4, [esp+640-16]		; low half: p2 candidate = maskP ? new p2 : p2
-	pabsw	xmm6, xmm6
-	movdqa	xmm7, xmm1
-	psubw	xmm7, [esp+640-448]
-
-	movdqa	xmm1, [esp+640-512]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm1, xmm7
-	movdqa	xmm7, [esp+640-512]
-	pcmpgtw	xmm7, xmm6
-	movdqa	xmm6, [esp+640-320]
-	pand	xmm1, xmm7
-	movdqa	xmm7, [esp+640-560]
-	pcmpgtw	xmm6, xmm7
-	pand	xmm1, xmm6			; xmm1 = edge mask for the high 8 pixels
-
-	movdqa	xmm6, [esp+640-576]
-	pcmpgtw	xmm6, xmm7
-
-	movdqa	xmm7, [esp+640-496]
-	punpckhbw xmm3, xmm2			; p3 high words
-	movdqa	 [esp+640-560], xmm6
-	movdqa	xmm6, [esp+640-512]
-	psubw	xmm7, xmm5
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
-
-	pand	xmm6, [esp+640-560]
-	movdqa	xmm7, [esp+640-432]
-	psubw	xmm7, [esp+640-528]
-
-	psllw	xmm3, 1
-	movdqa	 [esp+640-544], xmm6		; maskP, high half
-	movdqa	xmm6, [esp+640-512]
-
-	movdqa	xmm2, [esp+640-544]		; xmm2 stops being the zero vector here
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, xmm5
-	paddw	xmm3, [esp+640-448]
-	paddw	xmm3, [esp+640-496]
-	pabsw	xmm7, xmm7
-	pcmpgtw	xmm6, xmm7
-	pand	xmm6, [esp+640-560]
-	movdqa	 [esp+640-560], xmm6		; maskQ, high half
-
-	movdqa	xmm6, xmm0
-	pand	xmm6, xmm4
-	movdqa	xmm4, xmm0
-	pandn	xmm4, [esp+640-368]
-	por	xmm6, xmm4			; final p2, low half: filtered where the edge mask is set, original elsewhere
-	movdqa	xmm4, [esp+640-432]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm3, 3
-	pand	xmm3, xmm2
-	pandn	xmm2, xmm5
-	por	xmm3, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm3
-	movdqa	xmm3, [esp+640-64]
-	por	xmm3, [esp+640-336]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm5
-	por	xmm7, xmm2			; final p2, high half
-
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-480]
-	por	xmm2, xmm3
-	packuswb xmm6, xmm7
-	movdqa	 [esp+640-336], xmm2		; final p1, low half
-	movdqa	 [esp+656-272], xmm6		; new p2 row (bytes), replacing the saved input row
-	movdqa	xmm6, [esp+640-544]
-	movdqa	xmm2, xmm5
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm3, xmm1
-	movdqa	xmm7, [esp+640-496]
-	paddw	xmm7, xmm4
-	paddw	xmm2, xmm7
-	paddw	xmm2, [esp+640-624]
-	movdqa	xmm7, [esp+640-544]
-	psraw	xmm2, 2
-	pand	xmm6, xmm2
-	movdqa	xmm2, [esp+640-448]
-	pandn	xmm7, xmm2
-	por	xmm6, xmm7
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm1
-	pandn	xmm6, xmm2
-	paddw	xmm2, [esp+640-496]
-	paddw	xmm2, xmm4
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-336]
-	packuswb xmm6, xmm3
-	psllw	xmm2, 1
-	movdqa	 [esp+672-272], xmm6		; new p1 row
-	movdqa	xmm6, [esp+640-96]
-	por	xmm6, [esp+640-352]
-
-	movdqa	xmm3, xmm0
-	pand	xmm3, xmm6
-	movdqa	xmm6, xmm0
-	pandn	xmm6, [esp+640-144]
-	por	xmm3, xmm6
-	movdqa	xmm6, [esp+640-544]
-	movdqa	 [esp+640-352], xmm3		; final p0, low half
-	movdqa	xmm3, [esp+640-464]
-	paddw	xmm3, [esp+640-592]
-	paddw	xmm2, xmm3
-	movdqa	xmm3, [esp+640-448]
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-496]
-	psraw	xmm5, 3
-	pand	xmm6, xmm5
-	movdqa	xmm5, [esp+640-464]
-	paddw	xmm2, xmm5
-	paddw	xmm5, [esp+640-432]
-	movdqa	xmm4, xmm3
-	paddw	xmm4, xmm3
-	paddw	xmm4, xmm2
-	paddw	xmm4, [esp+640-624]
-	movdqa	xmm2, [esp+640-544]
-	paddw	xmm3, [esp+640-592]
-	psraw	xmm4, 2
-	pandn	xmm2, xmm4
-	por	xmm6, xmm2
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-496]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm6
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-352]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+688-272], xmm2		; new p0 row
-	movdqa	xmm2, [esp+640-128]
-	por	xmm2, [esp+640-288]
-
-	movdqa	xmm4, xmm0
-	pand	xmm4, xmm2
-	paddw	xmm5, xmm6
-	movdqa	xmm2, xmm0
-	pandn	xmm2, [esp+640-400]
-	por	xmm4, xmm2
-	movdqa	xmm2, [esp+640-528]
-	psllw	xmm5, 1
-	paddw	xmm5, xmm3
-	movdqa	xmm3, [esp+640-560]
-	paddw	xmm2, xmm5
-	psraw	xmm2, 3
-	movdqa	 [esp+640-288], xmm4		; final q0, low half
-	movdqa	xmm4, [esp+640-560]
-	pand	xmm4, xmm2
-	movdqa	xmm2, [esp+640-464]
-	movdqa	xmm5, xmm2
-	paddw	xmm5, xmm2
-	movdqa	xmm2, [esp+640-432]
-	paddw	xmm2, [esp+640-448]
-	movdqa	xmm7, xmm1
-	paddw	xmm5, xmm2
-	paddw	xmm5, [esp+640-624]
-	movdqa	xmm6, [esp+640-560]
-	psraw	xmm5, 2
-	pandn	xmm3, xmm5
-	por	xmm4, xmm3
-	movdqa	xmm3, [esp+640-32]
-	por	xmm3, [esp+640-304]
-	pand	xmm7, xmm4
-	movdqa	xmm4, [esp+640-432]
-	movdqa	xmm5, [esp+640-464]
-	movdqa	xmm2, xmm1
-	pandn	xmm2, xmm4
-	paddw	xmm4, [esp+640-496]
-	por	xmm7, xmm2
-	movdqa	xmm2, [esp+640-288]
-	packuswb xmm2, xmm7
-	movdqa	 [esp+704-272], xmm2		; new q0 row
-
-	movdqa	xmm2, xmm0
-	pand	xmm2, xmm3
-	movdqa	xmm3, xmm0
-	pandn	xmm3, [esp+640-384]
-	por	xmm2, xmm3
-	movdqa	 [esp+640-304], xmm2		; final q1, low half
-	movdqa	xmm2, [esp+640-528]
-	movdqa	xmm3, xmm2
-	paddw	xmm3, [esp+640-464]
-	paddw	xmm3, xmm4
-	paddw	xmm3, [esp+640-624]
-	psraw	xmm3, 2
-	pand	xmm6, xmm3
-	movdqa	xmm3, [esp+640-560]
-	movdqa	xmm4, xmm3
-	pandn	xmm4, xmm5
-	por	xmm6, xmm4
-	movdqa	xmm7, xmm1
-	pand	xmm7, xmm6
-	movdqa	xmm6, [esp+640-304]
-	movdqa	xmm4, xmm1
-	pandn	xmm4, xmm5
-	por	xmm7, xmm4
-
-	movdqa	xmm4, xmm0
-	pandn	xmm0, [esp+640-416]
-	packuswb xmm6, xmm7			; xmm6 = new q1 row
-	movdqa	xmm7, [esp+640-112]
-	por	xmm7, [esp+640-80]
-	pand	xmm4, xmm7
-	por	xmm4, xmm0			; final q2, low half
-	movdqa	xmm0, [esp+752-272]
-	punpckhbw xmm0, [esp+640-48]		; q3 high words (zero vector comes from memory; xmm2 was reused)
-	psllw	xmm0, 1
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm2
-	paddw	xmm0, xmm5
-	paddw	xmm0, [esp+640-432]
-	paddw	xmm0, [esp+640-496]
-	paddw	xmm0, [esp+640-592]
-	psraw	xmm0, 3
-	pand	xmm0, xmm3
-	movdqa	xmm7, xmm1
-	pandn	xmm3, xmm2
-	por	xmm0, xmm3
-	pand	xmm7, xmm0
-
-	movdqa	xmm0, [esp+656-272]
-	movdqa	 [edx], xmm0			; store new p2
-
-	movdqa	xmm0, [esp+672-272]
-
-	mov	edx, dword [esp+640-596]	; edx = &q1
-	movdqa	 [esi], xmm0			; store new p1
-	movdqa	xmm0, [esp+688-272]
-	movdqa	 [edi], xmm0			; store new p0
-	movdqa	xmm0, [esp+704-272]
-
-	pop	edi
-	pandn	xmm1, xmm2
-	movdqa	 [eax], xmm0			; store new q0
-	por	xmm7, xmm1			; final q2, high half
-	pop	esi
-	packuswb xmm4, xmm7
-	movdqa	 [edx], xmm6			; store new q1
-	movdqa	 [ecx], xmm4			; store new q2
-	pop	ebx
-	mov	esp, ebp
-	pop	ebp
-	ret
-  
-    
-;********************************************************************************
-;   Reads 8 bytes from each of 16 consecutive rows of pPixY, runs them through SSE2_TransTwo8x8B, and writes eight 16-byte vectors (128 bytes) to pDst.
-;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);     
-;   pDst is written with movdqa and therefore must be 16-byte aligned; the source rows are read with movq (no alignment requirement).
-;********************************************************************************
-; NOTE(review): SSE2_TransTwo8x8B is not defined in this part of the file; presumably it transposes two 8x8 byte blocks at once -- confirm in asm_inc.asm.
-WELS_EXTERN  DeblockLumaTransposeH2V_sse2
-
-ALIGN  16
-
-DeblockLumaTransposeH2V_sse2:
-    push    ebp
-    push    ebx
-    mov     ebp,   esp                  ; frame set after TWO pushes, so the arguments start at [ebp+0Ch] instead of [ebp+8]
-    and     esp,0FFFFFFF0h
-    sub     esp,   10h                  ; one aligned 16-byte scratch slot at [esp]
-    
-    mov     eax,   [ebp + 0Ch]          ; eax = pPixY (row 0)
-    mov     ecx,   [ebp + 10h]          ; ecx = iStride
-    lea     edx,   [eax + ecx * 8]      ; edx = row 8
-    lea     ebx,   [ecx*3]              ; ebx = 3 * iStride
-    ; xmm_k ends up holding row k in its low qword and row k+8 in its high qword (k = 0..7)
-    movq    xmm0,  [eax] 
-    movq    xmm7,  [edx]
-    punpcklqdq   xmm0,  xmm7  
-    movq    xmm1,  [eax + ecx]
-    movq    xmm7,  [edx + ecx]
-    punpcklqdq   xmm1,  xmm7
-    movq    xmm2,  [eax + ecx*2] 
-    movq    xmm7,  [edx + ecx*2]
-    punpcklqdq   xmm2,  xmm7
-    movq    xmm3,  [eax + ebx]
-    movq    xmm7,  [edx + ebx]
-    punpcklqdq   xmm3,  xmm7
-    
-    lea     eax,   [eax + ecx * 4]      ; advance both halves by four rows
-    lea     edx,   [edx + ecx * 4]
-    movq    xmm4,  [eax] 
-    movq    xmm7,  [edx]
-    punpcklqdq   xmm4,  xmm7  
-    movq    xmm5,  [eax + ecx]
-    movq    xmm7,  [edx + ecx]
-    punpcklqdq   xmm5,  xmm7
-    movq    xmm6,  [eax + ecx*2] 
-    movq    xmm7,  [edx + ecx*2]
-    punpcklqdq   xmm6,  xmm7
-    
-    movdqa  [esp],   xmm0               ; all eight xmm registers are in use: park xmm0 so it can serve as the temporary for row 15
-    movq    xmm7,  [eax + ebx]
-    movq    xmm0,  [edx + ebx]
-    punpcklqdq   xmm7,  xmm0
-    movdqa  xmm0,   [esp]
-    
-    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
-    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1 (1-based operand positions, i.e. xmm4, xmm2, xmm3, xmm7, xmm5, xmm1, xmm6, xmm0 -- the store order below)
-    
-    mov    eax,   [ebp + 14h]           ; eax = pDst
-    movdqa  [eax],    xmm4 
-    movdqa  [eax + 10h],  xmm2
-    movdqa  [eax + 20h],  xmm3
-    movdqa  [eax + 30h],  xmm7
-    movdqa  [eax + 40h],  xmm5
-    movdqa  [eax + 50h],  xmm1
-    movdqa  [eax + 60h],  xmm6
-    movdqa  [eax + 70h],  xmm0   
-    
-    mov     esp,   ebp
-    pop     ebx
-    pop     ebp
-    ret
-    
-    
-    
-;*******************************************************************************************
-;   Loads eight 16-byte vectors (128 bytes) from pSrc, runs them through SSE2_TransTwo8x8B, and writes 8 bytes to each of 16 consecutive rows of pPixY.
-;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
-;   pSrc is read with movdqa and therefore must be 16-byte aligned; the destination rows are written with movq (no alignment requirement).
-;*******************************************************************************************
-; NOTE(review): SSE2_TransTwo8x8B is not defined in this part of the file; presumably it transposes two 8x8 byte blocks at once -- confirm in asm_inc.asm.
-WELS_EXTERN   DeblockLumaTransposeV2H_sse2
-
-ALIGN  16
-
-DeblockLumaTransposeV2H_sse2:
-    push     ebp
-    mov      ebp,   esp
-    
-    and     esp,  0FFFFFFF0h
-    sub     esp,   10h                  ; aligned 16-byte scratch slot handed to the macro as [esp]
-    
-    mov      eax,   [ebp + 10h]         ; eax = pSrc
-    mov      ecx,   [ebp + 0Ch]         ; ecx = iStride
-    mov      edx,   [ebp + 08h]         ; edx = pPixY (row 0)
-      
-    movdqa   xmm0,  [eax]
-    movdqa   xmm1,  [eax + 10h]
-    movdqa   xmm2,  [eax + 20h]
-    movdqa   xmm3,	[eax + 30h]
-    movdqa   xmm4,	[eax + 40h]
-    movdqa   xmm5,	[eax + 50h]
-    movdqa   xmm6,	[eax + 60h]
-    movdqa   xmm7,	[eax + 70h]
-    
-    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
-    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1 (1-based operand positions, i.e. xmm4, xmm2, xmm3, xmm7, xmm5, xmm1, xmm6, xmm0 -- the store order below)
-    
-    lea      eax,   [ecx * 3]           ; eax = 3 * iStride (pSrc is no longer needed)
-    
-    movq     [edx],  xmm4               ; rows 0..3: low 8 bytes of each result vector
-    movq     [edx + ecx],  xmm2
-    movq     [edx + ecx*2],  xmm3
-    movq     [edx + eax],  xmm7
-    
-    lea      edx,   [edx + ecx*4]       ; rows 4..7
-    movq     [edx],  xmm5 
-    movq     [edx + ecx],  xmm1
-    movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0    
-    
-    psrldq    xmm4,   8                 ; move the high 8 bytes of every vector down for rows 8..15
-    psrldq    xmm2,   8
-    psrldq    xmm3,   8
-    psrldq    xmm7,   8
-    psrldq    xmm5,   8
-    psrldq    xmm1,   8
-    psrldq    xmm6,   8
-    psrldq    xmm0,   8
-    
-    lea       edx,  [edx + ecx*4]       ; rows 8..11
-    movq     [edx],  xmm4 
-    movq     [edx + ecx],  xmm2
-    movq     [edx + ecx*2],  xmm3
-    movq     [edx + eax],  xmm7
-    
-    lea      edx,   [edx + ecx*4]       ; rows 12..15
-    movq     [edx],  xmm5 
-    movq     [edx + ecx],  xmm1
-    movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0   
-    
-    
-    mov      esp,   ebp
-    pop      ebp
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  deblock.asm
+;*
+;*  Abstract
+;*      edge loop
+;*
+;*  History
+;*      08/07/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+; BITS 32 makes NASM emit 32-bit code; every routine in the visible part of this file reads its arguments from the stack via ebp. WELS_EXTERN is not defined here -- presumably it comes from asm_inc.asm (confirm).
+;*******************************************************************************
+; Macros and other preprocessor constants
+;*******************************************************************************
+; NOTE(review): no macros or constants are actually defined under this heading.
+%ifdef FORMAT_COFF
+SECTION .rodata pData
+%else
+SECTION .rodata align=16
+%endif
+; Nothing is emitted into .rodata between the declaration above and SECTION .text below; only the section attributes differ per object format.
+SECTION .text
+
+;********************************************************************************
+;  void DeblockChromaEq4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
+;                             int32_t iAlpha, int32_t iBeta)
+;  Filters 8 pixels of each plane across the edge between rows pPix-iStride and pPix; the Cb data occupies the low and the Cr data the high 8 bytes of every vector. Rewrites rows pPix-iStride and pPix of both planes.
+;********************************************************************************
+WELS_EXTERN   DeblockChromaEq4V_sse2
+
+ALIGN  16
+DeblockChromaEq4V_sse2:
+  push        ebp  
+  mov         ebp,esp 
+  and         esp,0FFFFFFF0h 
+  sub         esp,68h                 ; 68h + the two pushes below (8 bytes) = 70h, so esp is 16-byte aligned again when the first stack movdqa runs
+  mov         edx,[ebp+10h]      ;  iStride
+  mov         eax,[ebp+8]        ;  pPixCb
+  mov         ecx,[ebp+0Ch]      ;  pPixCr
+  movq        xmm4,[ecx]              ; Cr q0 (row at pPixCr)
+  movq        xmm5,[edx+ecx]          ; Cr q1 (row at pPixCr + iStride)
+  push        esi  
+  push        edi  
+  lea         esi,[edx+edx]           ; 2 * iStride
+  mov         edi,eax 
+  sub         edi,esi 
+  movq        xmm1,[edi]              ; Cb p1 (row at pPixCb - 2*iStride)
+  mov         edi,ecx 
+  sub         edi,esi 
+  movq        xmm2,[edi]              ; Cr p1
+  punpcklqdq  xmm1,xmm2               ; xmm1 = p1 {Cb | Cr}
+  mov         esi,eax 
+  sub         esi,edx                 ; esi = &Cb p0, kept for the final store
+  movq        xmm2,[esi] 
+  mov         edi,ecx 
+  sub         edi,edx                 ; edi = &Cr p0, kept for the final store
+  movq        xmm3,[edi] 
+  punpcklqdq  xmm2,xmm3               ; xmm2 = p0 {Cb | Cr}
+  movq        xmm3,[eax] 
+  punpcklqdq  xmm3,xmm4               ; xmm3 = q0 {Cb | Cr}
+  movq        xmm4,[edx+eax] 
+  mov       edx, [ebp + 14h]          ; iAlpha
+  punpcklqdq  xmm4,xmm5               ; xmm4 = q1 {Cb | Cr}
+  movd        xmm5,edx 
+  mov       edx, [ebp + 18h]          ; iBeta
+  pxor        xmm0,xmm0 
+  movdqa      xmm6,xmm5 
+  punpcklwd   xmm6,xmm5 
+  pshufd      xmm5,xmm6,0             ; xmm5 = low 16 bits of iAlpha replicated to 8 words
+  movd        xmm6,edx 
+  movdqa      xmm7,xmm6 
+  punpcklwd   xmm7,xmm6 
+  pshufd      xmm6,xmm7,0             ; xmm6 = low 16 bits of iBeta replicated to 8 words
+  movdqa      xmm7,xmm1 
+  punpckhbw   xmm1,xmm0 
+  punpcklbw   xmm7,xmm0 
+  movdqa      [esp+40h],xmm1          ; p1 words, Cr half
+  movdqa      [esp+60h],xmm7          ; p1 words, Cb half
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm7,xmm0 
+  movdqa      [esp+10h],xmm7          ; p0 words, Cb half
+  movdqa      xmm7,xmm3 
+  punpcklbw   xmm7,xmm0 
+  punpckhbw   xmm3,xmm0 
+  movdqa      [esp+50h],xmm7          ; q0 words, Cb half
+  movdqa      xmm7,xmm4 
+  punpckhbw   xmm4,xmm0 
+  punpckhbw   xmm2,xmm0               ; xmm2 = p0 words, Cr half
+  punpcklbw   xmm7,xmm0               ; xmm7 = q1 words, Cb half
+  movdqa      [esp+30h],xmm3          ; q0 words, Cr half
+  movdqa      xmm3,[esp+10h]          ; xmm3 = p0 words, Cb half
+  movdqa      xmm1,xmm3 
+  psubw       xmm1,[esp+50h] 
+  pabsw       xmm1,xmm1               ; |p0 - q0|. NOTE(review): pabsw is SSSE3 despite the _sse2 suffix
+  movdqa      [esp+20h],xmm4          ; q1 words, Cr half
+  movdqa      xmm0,xmm5 
+  pcmpgtw     xmm0,xmm1               ; alpha > |p0 - q0|
+  movdqa      xmm1,[esp+60h] 
+  psubw       xmm1,xmm3 
+  pabsw       xmm1,xmm1 
+  movdqa      xmm4,xmm6 
+  pcmpgtw     xmm4,xmm1               ; beta > |p1 - p0|
+  pand        xmm0,xmm4 
+  movdqa      xmm1,xmm7 
+  psubw       xmm1,[esp+50h] 
+  pabsw       xmm1,xmm1 
+  movdqa      xmm4,xmm6 
+  pcmpgtw     xmm4,xmm1               ; beta > |q1 - q0|
+  movdqa      xmm1,xmm2 
+  psubw       xmm1,[esp+30h] 
+  pabsw       xmm1,xmm1 
+  pcmpgtw     xmm5,xmm1               ; Cr half: alpha > |p0 - q0|
+  movdqa      xmm1,[esp+40h] 
+  pand        xmm0,xmm4               ; xmm0 = Cb filter mask
+  psubw       xmm1,xmm2 
+  pabsw       xmm1,xmm1 
+  movdqa      xmm4,xmm6 
+  pcmpgtw     xmm4,xmm1 
+  movdqa      xmm1,[esp+20h] 
+  psubw       xmm1,[esp+30h] 
+  pand        xmm5,xmm4 
+  pabsw       xmm1,xmm1 
+  pcmpgtw     xmm6,xmm1 
+  pand        xmm5,xmm6               ; xmm5 = Cr filter mask
+  mov         edx,2 
+  movsx       edx,dx 
+  movd        xmm1,edx 
+  movdqa      xmm4,xmm1 
+  punpcklwd   xmm4,xmm1 
+  pshufd      xmm1,xmm4,0             ; rounding constant 2 in 8 words
+  movdqa      xmm4,[esp+60h] 
+  movdqa      xmm6,xmm4 
+  paddw       xmm6,xmm4 
+  paddw       xmm6,xmm3 
+  paddw       xmm6,xmm7 
+  movdqa      [esp+10h],xmm1          ; the p0 slot is recycled for the constant (p0 Cb stays in xmm3)
+  paddw       xmm6,[esp+10h] 
+  psraw       xmm6,2                  ; Cb: (2*p1 + p0 + q1 + 2) >> 2
+  movdqa      xmm4,xmm0 
+  pandn       xmm4,xmm3 
+  movdqa      xmm3,[esp+40h] 
+  movdqa      xmm1,xmm0 
+  pand        xmm1,xmm6 
+  por         xmm1,xmm4               ; Cb p0 result: filtered where the mask is set, original elsewhere
+  movdqa      xmm6,xmm3 
+  paddw       xmm6,xmm3 
+  movdqa      xmm3,[esp+10h]          ; xmm3 = constant 2 from here on
+  paddw       xmm6,xmm2 
+  paddw       xmm6,[esp+20h] 
+  paddw       xmm6,xmm3 
+  psraw       xmm6,2                  ; Cr: (2*p1 + p0 + q1 + 2) >> 2
+  movdqa      xmm4,xmm5 
+  pand        xmm4,xmm6 
+  movdqa      xmm6,xmm5 
+  pandn       xmm6,xmm2 
+  por         xmm4,xmm6 
+  packuswb    xmm1,xmm4               ; xmm1 = new p0 bytes {Cb | Cr}
+  movdqa      xmm4,[esp+50h] 
+  movdqa      xmm6,xmm7 
+  paddw       xmm6,xmm7 
+  paddw       xmm6,xmm4 
+  paddw       xmm6,[esp+60h] 
+  paddw       xmm6,xmm3 
+  psraw       xmm6,2                  ; Cb: (2*q1 + q0 + p1 + 2) >> 2
+  movdqa      xmm2,xmm0 
+  pand        xmm2,xmm6 
+  pandn       xmm0,xmm4 
+  por         xmm2,xmm0 
+  movdqa      xmm0,[esp+20h] 
+  movdqa      xmm6,xmm0 
+  paddw       xmm6,xmm0 
+  movdqa      xmm0,[esp+30h] 
+  paddw       xmm6,xmm0 
+  paddw       xmm6,[esp+40h] 
+  movdqa      xmm4,xmm5 
+  paddw       xmm6,xmm3 
+  movq        [esi],xmm1              ; store new Cb p0
+  psraw       xmm6,2                  ; Cr: (2*q1 + q0 + p1 + 2) >> 2
+  pand        xmm4,xmm6 
+  pandn       xmm5,xmm0 
+  por         xmm4,xmm5 
+  packuswb    xmm2,xmm4               ; xmm2 = new q0 bytes {Cb | Cr}
+  movq        [eax],xmm2              ; store new Cb q0
+  psrldq      xmm1,8 
+  movq        [edi],xmm1              ; store new Cr p0
+  pop         edi  
+  psrldq      xmm2,8 
+  movq        [ecx],xmm2              ; store new Cr q0
+  pop         esi  
+  mov         esp,ebp 
+  pop         ebp  
+  ret              
+
+;******************************************************************************
+; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
+;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
+
+WELS_EXTERN  DeblockChromaLt4V_sse2
+
+DeblockChromaLt4V_sse2:
+  push        ebp  
+  mov         ebp,esp 
+  and         esp,0FFFFFFF0h 
+  sub         esp,0E4h 
+  push        ebx  
+  push        esi  
+  mov         esi, [ebp+1Ch]      ;  pTC
+  movsx       ebx, byte [esi+2] 
+  push        edi  
+  movsx       di,byte [esi+3] 
+  mov         word [esp+0Ch],bx 
+  movsx       bx,byte  [esi+1] 
+  movsx       esi,byte  [esi] 
+  mov         word  [esp+0Eh],si 
+  movzx       esi,di 
+  movd        xmm1,esi 
+  movzx       esi,di 
+  movd        xmm2,esi 
+  mov         si,word  [esp+0Ch] 
+  mov         edx, [ebp + 10h] 
+  mov         eax, [ebp + 08h] 
+  movzx       edi,si 
+  movzx       esi,si 
+  mov         ecx, [ebp + 0Ch] 
+  movd        xmm4,esi 
+  movzx       esi,bx 
+  movd        xmm5,esi 
+  movd        xmm3,edi 
+  movzx       esi,bx 
+  movd        xmm6,esi 
+  mov         si,word [esp+0Eh] 
+  movzx       edi,si 
+  movzx       esi,si 
+  punpcklwd   xmm6,xmm2 
+  pxor        xmm0,xmm0 
+  movdqa      [esp+40h],xmm0 
+  movd        xmm7,edi 
+  movd        xmm0,esi 
+  lea         esi,[edx+edx] 
+  mov         edi,eax 
+  sub         edi,esi 
+  punpcklwd   xmm5,xmm1 
+  movdqa      xmm1,[esp+40h] 
+  punpcklwd   xmm0,xmm4 
+  movq        xmm4,[edx+ecx] 
+  punpcklwd   xmm7,xmm3 
+  movq        xmm3,[eax] 
+  punpcklwd   xmm0,xmm6 
+  movq        xmm6,[edi] 
+  punpcklwd   xmm7,xmm5 
+  punpcklwd   xmm0,xmm7 
+  mov         edi,ecx 
+  sub         edi,esi 
+  movdqa      xmm2,xmm1 
+  psubw       xmm2,xmm0 
+  movdqa      [esp+60h],xmm2 
+  movq        xmm2, [edi] 
+  punpcklqdq  xmm6,xmm2 
+  mov         esi,eax 
+  sub         esi,edx 
+  movq        xmm7,[esi] 
+  mov         edi,ecx 
+  sub         edi,edx 
+  movq        xmm2,[edi] 
+  punpcklqdq  xmm7,xmm2 
+  movq        xmm2,[ecx] 
+  punpcklqdq  xmm3,xmm2 
+  movq        xmm2,[edx+eax] 
+  movsx       edx,word [ebp + 14h] 
+  punpcklqdq  xmm2,xmm4 
+  movdqa      [esp+0E0h],xmm2 
+  movd        xmm2,edx 
+  movsx       edx,word [ebp + 18h] 
+  movdqa      xmm4,xmm2 
+  punpcklwd   xmm4,xmm2 
+  movd        xmm2,edx 
+  movdqa      xmm5,xmm2 
+  punpcklwd   xmm5,xmm2 
+  pshufd      xmm2,xmm5,0 
+  movdqa      [esp+50h],xmm2 
+  movdqa      xmm2,xmm6 
+  punpcklbw   xmm2,xmm1 
+  movdqa      [esp+0D0h],xmm3 
+  pshufd      xmm4,xmm4,0 
+  movdqa      [esp+30h],xmm2 
+  punpckhbw   xmm6,xmm1 
+  movdqa      [esp+80h],xmm6 
+  movdqa      xmm6,[esp+0D0h] 
+  punpckhbw   xmm6,xmm1 
+  movdqa      [esp+70h],xmm6 
+  movdqa      xmm6, [esp+0E0h] 
+  punpckhbw   xmm6,xmm1 
+  movdqa     [esp+90h],xmm6 
+  movdqa      xmm5, [esp+0E0h] 
+  movdqa      xmm2,xmm7 
+  punpckhbw   xmm7,xmm1 
+  punpcklbw   xmm5,xmm1 
+  movdqa       [esp+0A0h],xmm7 
+  punpcklbw   xmm3,xmm1 
+  mov         edx,4 
+  punpcklbw   xmm2,xmm1 
+  movsx       edx,dx 
+  movd        xmm6,edx 
+  movdqa      xmm7,xmm6 
+  punpcklwd   xmm7,xmm6 
+  pshufd      xmm6,xmm7,0 
+  movdqa      xmm7,[esp+30h] 
+  movdqa      [esp+20h],xmm6 
+  psubw       xmm7,xmm5 
+  movdqa      xmm6,xmm0 
+  pcmpgtw     xmm6,xmm1 
+  movdqa      xmm1,[esp+60h] 
+  movdqa      [esp+40h],xmm6 
+  movdqa      xmm6,xmm3 
+  psubw       xmm6,xmm2 
+  psllw       xmm6,2 
+  paddw       xmm6,xmm7 
+  paddw       xmm6, [esp+20h] 
+  movdqa      xmm7, [esp+50h] 
+  psraw       xmm6,3 
+  pmaxsw      xmm1,xmm6 
+  movdqa      [esp+10h],xmm0 
+  movdqa      xmm6, [esp+10h] 
+  pminsw      xmm6,xmm1 
+  movdqa      [esp+10h],xmm6 
+  movdqa      xmm1,xmm2 
+  psubw       xmm1,xmm3 
+  pabsw       xmm1,xmm1 
+  movdqa      xmm6,xmm4 
+  pcmpgtw     xmm6,xmm1 
+  movdqa      xmm1, [esp+30h] 
+  psubw       xmm1,xmm2 
+  pabsw       xmm1,xmm1 
+  pcmpgtw     xmm7,xmm1 
+  movdqa      xmm1,[esp+50h] 
+  pand        xmm6,xmm7 
+  movdqa      xmm7,[esp+50h] 
+  psubw       xmm5,xmm3 
+  pabsw       xmm5,xmm5 
+  pcmpgtw     xmm1,xmm5 
+  movdqa      xmm5,[esp+80h] 
+  psubw       xmm5,[esp+90h] 
+  pand        xmm6,xmm1 
+  pand        xmm6,[esp+40h] 
+  movdqa      xmm1,[esp+10h] 
+  pand        xmm1,xmm6 
+  movdqa      xmm6,[esp+70h] 
+  movdqa      [esp+30h],xmm1 
+  movdqa      xmm1,[esp+0A0h] 
+  psubw       xmm6,xmm1 
+  psllw       xmm6,2 
+  paddw       xmm6,xmm5 
+  paddw       xmm6,[esp+20h] 
+  movdqa      xmm5,[esp+60h] 
+  psraw       xmm6,3 
+  pmaxsw      xmm5,xmm6 
+  pminsw      xmm0,xmm5 
+  movdqa      xmm5,[esp+70h] 
+  movdqa      xmm6,xmm1 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6 
+  pcmpgtw     xmm4,xmm6 
+  movdqa      xmm6,[esp+80h] 
+  psubw       xmm6,xmm1 
+  pabsw       xmm6,xmm6 
+  pcmpgtw     xmm7,xmm6 
+  movdqa      xmm6,[esp+90h] 
+  pand        xmm4,xmm7 
+  movdqa      xmm7,[esp+50h] 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6 
+  pcmpgtw     xmm7,xmm6 
+  pand        xmm4,xmm7 
+  pand        xmm4,[esp+40h] 
+  pand        xmm0,xmm4 
+  movdqa      xmm4,[esp+30h] 
+  paddw       xmm2,xmm4 
+  paddw       xmm1,xmm0 
+  packuswb    xmm2,xmm1 
+  movq        [esi],xmm2 
+  psubw       xmm3,xmm4 
+  psubw       xmm5,xmm0 
+  packuswb    xmm3,xmm5 
+  movq        [eax],xmm3 
+  psrldq      xmm2,8 
+  movq        [edi],xmm2 
+  pop         edi  
+  pop         esi  
+  psrldq      xmm3,8 
+  movq        [ecx],xmm3 
+  pop         ebx  
+  mov         esp,ebp 
+  pop         ebp  
+  ret    
+  
+;***************************************************************************
+;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
+;          int32_t iAlpha, int32_t iBeta)
+;***************************************************************************
+
+WELS_EXTERN     DeblockChromaEq4H_sse2
+; Filters the 4 bytes starting at (pointer-2) on 8 consecutive rows of Cb and of Cr. Naming those bytes p1,p0,q0,q1: where alpha>|p0-q0|, beta>|p1-p0| and beta>|q1-q0| it writes p0=(2*p1+p0+q1+2)>>2 and q0=(2*q1+q0+p1+2)>>2; p1 and q1 are written back unchanged.
+ALIGN  16
+  ; Stack args: [ebp+8]=pPixCb [ebp+0Ch]=pPixCr [ebp+10h]=iStride [ebp+14h]=iAlpha [ebp+18h]=iBeta. Saves/restores esi,edi; overwrites eax,ecx,edx,xmm0-xmm7. NOTE(review): pabsw is an SSSE3 instruction although the symbol is suffixed _sse2 -- confirm the caller's CPU dispatch.
+DeblockChromaEq4H_sse2:
+  push        ebp  
+  mov         ebp,esp 
+  and         esp,0FFFFFFF0h  ; 16-byte align esp so the movdqa spill slots below are aligned
+  sub         esp,0C8h  ; 0C8h plus the two 4-byte pushes below = 0D0h, so esp stays 16-byte aligned afterwards
+  mov         ecx,dword [ebp+8]  ; ecx = pPixCb
+  mov         edx,dword [ebp+0Ch]  ; edx = pPixCr
+  mov         eax,dword [ebp+10h]  ; eax = iStride
+  sub         ecx,2  ; each 4-byte row access starts 2 bytes before the given pointer
+  sub         edx,2 
+  push        esi  
+  lea         esi,[eax+eax*2]  ; esi = 3*iStride
+  mov         dword [esp+18h],ecx  ; save pPixCb-2 (addressed as [esp+1Ch] once edi is pushed)
+  mov         dword [esp+4],edx  ; save pPixCr-2 (addressed as [esp+8] once edi is pushed)
+  lea         ecx,[ecx+eax*4]  ; ecx = pPixCb-2 + 4*iStride (rows 4..7)
+  lea         edx,[edx+eax*4]  ; edx = pPixCr-2 + 4*iStride (rows 4..7)
+  lea         eax,[esp+7Ch]  ; eax = 64-byte scratch buffer; equals esp+80h after the push below
+  push        edi  
+  mov         dword [esp+14h],esi  ; [esp+14h] = 3*iStride
+  mov         dword [esp+18h],ecx  ; [esp+18h] = Cb rows 4..7 base
+  mov         dword [esp+0Ch],edx  ; [esp+0Ch] = Cr rows 4..7 base
+  mov         dword [esp+10h],eax  ; [esp+10h] = scratch buffer address
+  mov         esi,dword [esp+1Ch]  ; esi = pPixCb-2
+  mov         ecx,dword [ebp+10h]  ; ecx = iStride
+  mov         edx,dword [esp+14h]  ; edx = 3*iStride
+  movd        xmm0,dword [esi]  ; Cb rows 0..3, 4 bytes each
+  movd        xmm1,dword [esi+ecx] 
+  movd        xmm2,dword [esi+ecx*2] 
+  movd        xmm3,dword [esi+edx] 
+  mov         esi,dword  [esp+8]  ; esi = pPixCr-2
+  movd        xmm4,dword [esi]  ; Cr rows 0..3
+  movd        xmm5,dword [esi+ecx] 
+  movd        xmm6,dword [esi+ecx*2] 
+  movd        xmm7,dword [esi+edx] 
+  punpckldq   xmm0,xmm4  ; xmm0..xmm3 low qword = {Cb row k, Cr row k}, k = 0..3
+  punpckldq   xmm1,xmm5 
+  punpckldq   xmm2,xmm6 
+  punpckldq   xmm3,xmm7 
+  mov         esi,dword [esp+18h]  ; esi = Cb rows 4..7 base
+  mov         edi,dword [esp+0Ch]  ; edi = Cr rows 4..7 base
+  movd        xmm4,dword [esi] 
+  movd        xmm5,dword [edi] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm0,xmm4  ; xmm0 = {Cb r0, Cr r0, Cb r4, Cr r4}
+  movd        xmm4,dword [esi+ecx] 
+  movd        xmm5,dword [edi+ecx] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm1,xmm4  ; xmm1 = {Cb r1, Cr r1, Cb r5, Cr r5}
+  movd        xmm4,dword [esi+ecx*2] 
+  movd        xmm5,dword [edi+ecx*2] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm2,xmm4  ; xmm2 = {Cb r2, Cr r2, Cb r6, Cr r6}
+  movd        xmm4,dword [esi+edx] 
+  movd        xmm5,dword [edi+edx] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm3,xmm4  ; xmm3 = {Cb r3, Cr r3, Cb r7, Cr r7}
+  movdqa      xmm6,xmm0  ; --- byte transpose: 16 rows x 4 bytes -> 4 columns x 16 bytes ---
+  punpcklbw   xmm0,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm2,xmm3 
+  punpckhbw   xmm7,xmm3 
+  movdqa      xmm4,xmm0 
+  movdqa      xmm5,xmm6 
+  punpcklwd   xmm0,xmm2 
+  punpckhwd   xmm4,xmm2 
+  punpcklwd   xmm6,xmm7 
+  punpckhwd   xmm5,xmm7 
+  movdqa      xmm1,xmm0 
+  movdqa      xmm2,xmm4 
+  punpckldq   xmm0,xmm6 
+  punpckhdq   xmm1,xmm6 
+  punpckldq   xmm4,xmm5 
+  punpckhdq   xmm2,xmm5 
+  movdqa      xmm5,xmm0 
+  movdqa      xmm6,xmm1 
+  punpcklqdq  xmm0,xmm4  ; xmm0 = byte column 0 (p1): Cb rows 0..7 in bytes 0..7, Cr rows 0..7 in bytes 8..15
+  punpckhqdq  xmm5,xmm4  ; xmm5 = byte column 1 (p0)
+  punpcklqdq  xmm1,xmm2  ; xmm1 = byte column 2 (q0)
+  punpckhqdq  xmm6,xmm2  ; xmm6 = byte column 3 (q1)
+  mov         edi,dword [esp+10h]  ; edi = scratch buffer (= esp+80h)
+  movdqa      [edi],xmm0  ; [esp+80h]  = p1 column
+  movdqa      [edi+10h],xmm5  ; [esp+90h]  = p0 column
+  movdqa      [edi+20h],xmm1  ; [esp+0A0h] = q0 column
+  movdqa      [edi+30h],xmm6  ; [esp+0B0h] = q1 column
+  movsx       ecx,word [ebp+14h]  ; ecx = low 16 bits of iAlpha, sign-extended
+  movsx       edx,word [ebp+18h]  ; edx = low 16 bits of iBeta, sign-extended
+  movdqa      xmm6,[esp+80h]  ; xmm6 = p1 bytes
+  movdqa      xmm4,[esp+90h]  ; xmm4 = p0 bytes
+  movdqa      xmm5,[esp+0A0h]  ; xmm5 = q0 bytes
+  movdqa      xmm7,[esp+0B0h]  ; xmm7 = q1 bytes
+  pxor        xmm0,xmm0  ; zero, used to widen bytes to words
+  movd        xmm1,ecx 
+  movdqa      xmm2,xmm1 
+  punpcklwd   xmm2,xmm1 
+  pshufd      xmm1,xmm2,0  ; xmm1 = alpha in all 8 words
+  movd        xmm2,edx 
+  movdqa      xmm3,xmm2 
+  punpcklwd   xmm3,xmm2 
+  pshufd      xmm2,xmm3,0  ; xmm2 = beta in all 8 words
+  movdqa      xmm3,xmm6 
+  punpckhbw   xmm6,xmm0 
+  movdqa      [esp+60h],xmm6  ; [esp+60h] = p1, high 8 bytes (Cr) as words
+  movdqa      xmm6,[esp+90h] 
+  punpckhbw   xmm6,xmm0 
+  movdqa      [esp+30h],xmm6  ; [esp+30h] = p0 high half as words
+  movdqa      xmm6,[esp+0A0h] 
+  punpckhbw   xmm6,xmm0 
+  movdqa      [esp+40h],xmm6  ; [esp+40h] = q0 high half as words
+  movdqa      xmm6,[esp+0B0h] 
+  punpckhbw   xmm6,xmm0 
+  movdqa      [esp+70h],xmm6  ; [esp+70h] = q1 high half as words
+  punpcklbw   xmm7,xmm0  ; xmm7 = q1 low half (Cb) as words
+  punpcklbw   xmm4,xmm0  ; xmm4 = p0 low half
+  punpcklbw   xmm5,xmm0  ; xmm5 = q0 low half
+  punpcklbw   xmm3,xmm0  ; xmm3 = p1 low half
+  movdqa      [esp+50h],xmm7  ; [esp+50h] = q1 low half
+  movdqa      xmm6,xmm4 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6 
+  movdqa      xmm0,xmm1 
+  pcmpgtw     xmm0,xmm6  ; low half: alpha > |p0-q0|
+  movdqa      xmm6,xmm3 
+  psubw       xmm6,xmm4 
+  pabsw       xmm6,xmm6 
+  movdqa      xmm7,xmm2 
+  pcmpgtw     xmm7,xmm6  ; low half: beta > |p1-p0|
+  movdqa      xmm6,[esp+50h] 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6 
+  pand        xmm0,xmm7 
+  movdqa      xmm7,xmm2 
+  pcmpgtw     xmm7,xmm6  ; low half: beta > |q1-q0|
+  movdqa      xmm6,[esp+30h] 
+  psubw       xmm6,[esp+40h] 
+  pabsw       xmm6,xmm6 
+  pcmpgtw     xmm1,xmm6  ; high half: alpha > |p0-q0|
+  movdqa      xmm6,[esp+60h] 
+  psubw       xmm6,[esp+30h] 
+  pabsw       xmm6,xmm6 
+  pand        xmm0,xmm7  ; xmm0 = complete low-half filter mask
+  movdqa      xmm7,xmm2 
+  pcmpgtw     xmm7,xmm6  ; high half: beta > |p1-p0|
+  movdqa      xmm6,[esp+70h] 
+  psubw       xmm6,[esp+40h] 
+  pabsw       xmm6,xmm6 
+  pand        xmm1,xmm7 
+  pcmpgtw     xmm2,xmm6  ; high half: beta > |q1-q0|
+  pand        xmm1,xmm2  ; xmm1 = complete high-half filter mask
+  mov         eax,2 
+  movsx       ecx,ax 
+  movd        xmm2,ecx 
+  movdqa      xmm6,xmm2 
+  punpcklwd   xmm6,xmm2 
+  pshufd      xmm2,xmm6,0 
+  movdqa      [esp+20h],xmm2  ; [esp+20h] = 2 in every word (rounding term)
+  movdqa      xmm2,xmm3 
+  paddw       xmm2,xmm3 
+  paddw       xmm2,xmm4 
+  paddw       xmm2,[esp+50h] 
+  paddw       xmm2,[esp+20h] 
+  psraw       xmm2,2  ; low half: (2*p1 + p0 + q1 + 2) >> 2
+  movdqa      xmm6,xmm0 
+  pand        xmm6,xmm2 
+  movdqa      xmm2,xmm0 
+  pandn       xmm2,xmm4 
+  por         xmm6,xmm2  ; xmm6 = mask ? filtered : original p0 (low half)
+  movdqa      xmm2,[esp+60h] 
+  movdqa      xmm7,xmm2 
+  paddw       xmm7,xmm2 
+  paddw       xmm7,[esp+30h] 
+  paddw       xmm7,[esp+70h] 
+  paddw       xmm7,[esp+20h] 
+  movdqa      xmm4,xmm1 
+  movdqa      xmm2,xmm1 
+  pandn       xmm2,[esp+30h] 
+  psraw       xmm7,2  ; high half: (2*p1 + p0 + q1 + 2) >> 2
+  pand        xmm4,xmm7 
+  por         xmm4,xmm2  ; xmm4 = mask ? filtered : original p0 (high half)
+  movdqa      xmm2,[esp+50h] 
+  packuswb    xmm6,xmm4 
+  movdqa      [esp+90h],xmm6  ; overwrite the p0 column in the scratch buffer
+  movdqa      xmm6,xmm2 
+  paddw       xmm6,xmm2 
+  movdqa      xmm2,[esp+20h] 
+  paddw       xmm6,xmm5 
+  paddw       xmm6,xmm3 
+  movdqa      xmm4,xmm0 
+  pandn       xmm0,xmm5 
+  paddw       xmm6,xmm2 
+  psraw       xmm6,2  ; low half: (2*q1 + q0 + p1 + 2) >> 2
+  pand        xmm4,xmm6 
+  por         xmm4,xmm0  ; xmm4 = mask ? filtered : original q0 (low half)
+  movdqa      xmm0,[esp+70h] 
+  movdqa      xmm5,xmm0 
+  paddw       xmm5,xmm0 
+  movdqa      xmm0,[esp+40h] 
+  paddw       xmm5,xmm0 
+  paddw       xmm5,[esp+60h] 
+  movdqa      xmm3,xmm1 
+  paddw       xmm5,xmm2 
+  psraw       xmm5,2  ; high half: (2*q1 + q0 + p1 + 2) >> 2
+  pand        xmm3,xmm5 
+  pandn       xmm1,xmm0 
+  por         xmm3,xmm1  ; xmm3 = mask ? filtered : original q0 (high half)
+  packuswb    xmm4,xmm3 
+  movdqa      [esp+0A0h],xmm4  ; overwrite the q0 column in the scratch buffer
+  mov         esi,dword [esp+10h]  ; esi = scratch buffer
+  movdqa      xmm0,[esi]  ; reload the four columns (p1, new p0, new q0, q1)
+  movdqa      xmm1,[esi+10h] 
+  movdqa      xmm2,[esi+20h] 
+  movdqa      xmm3,[esi+30h] 
+  movdqa      xmm6,xmm0  ; --- same transpose network, now columns -> 4-byte rows ---
+  punpcklbw   xmm0,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm2,xmm3 
+  punpckhbw   xmm7,xmm3 
+  movdqa      xmm4,xmm0 
+  movdqa      xmm5,xmm6 
+  punpcklwd   xmm0,xmm2 
+  punpckhwd   xmm4,xmm2 
+  punpcklwd   xmm6,xmm7 
+  punpckhwd   xmm5,xmm7 
+  movdqa      xmm1,xmm0 
+  movdqa      xmm2,xmm4 
+  punpckldq   xmm0,xmm6 
+  punpckhdq   xmm1,xmm6 
+  punpckldq   xmm4,xmm5 
+  punpckhdq   xmm2,xmm5 
+  movdqa      xmm5,xmm0 
+  movdqa      xmm6,xmm1 
+  punpcklqdq  xmm0,xmm4  ; xmm0 = {Cb r0, Cr r0, Cb r4, Cr r4}
+  punpckhqdq  xmm5,xmm4  ; xmm5 = {Cb r1, Cr r1, Cb r5, Cr r5}
+  punpcklqdq  xmm1,xmm2  ; xmm1 = {Cb r2, Cr r2, Cb r6, Cr r6}
+  punpckhqdq  xmm6,xmm2  ; xmm6 = {Cb r3, Cr r3, Cb r7, Cr r7}
+  mov         esi,dword [esp+1Ch]  ; esi = pPixCb-2
+  mov         ecx,dword [ebp+10h]  ; ecx = iStride
+  mov         edx,dword [esp+14h]  ; edx = 3*iStride
+  mov         edi,dword [esp+8]  ; edi = pPixCr-2
+  movd        dword [esi],xmm0  ; write Cb rows 0..3
+  movd        dword [esi+ecx],xmm5 
+  movd        dword [esi+ecx*2],xmm1 
+  movd        dword [esi+edx],xmm6 
+  psrldq      xmm0,4  ; shift the next dword into the low lane
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  mov         esi,dword [esp+18h]  ; esi = Cb rows 4..7 base
+  movd        dword [edi],xmm0  ; write Cr rows 0..3
+  movd        dword [edi+ecx],xmm5 
+  movd        dword [edi+ecx*2],xmm1 
+  movd        dword [edi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  movd        dword [esi],xmm0  ; write Cb rows 4..7
+  movd        dword [esi+ecx],xmm5 
+  movd        dword [esi+ecx*2],xmm1 
+  movd        dword [esi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  mov         edi,dword [esp+0Ch]  ; edi = Cr rows 4..7 base
+  movd        dword [edi],xmm0  ; write Cr rows 4..7
+  movd        dword [edi+ecx],xmm5 
+  movd        dword [edi+ecx*2],xmm1 
+  movd        dword [edi+edx],xmm6 
+  pop         edi  
+  pop         esi  
+  mov         esp,ebp  ; discard the aligned frame
+  pop         ebp  
+  ret              
+  
+;*******************************************************************************
+;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
+;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
+;*******************************************************************************
+  
+WELS_EXTERN  DeblockChromaLt4H_sse2
+  ; Filters the 4 bytes starting at (pointer-2) on 8 rows of Cb and of Cr. Naming those bytes p1,p0,q0,q1: delta = clamp(((q0-p0)*4 + (p1-q1) + 4)>>3, -tc, tc) is kept only where alpha>|p0-q0|, beta>|p1-p0|, beta>|q1-q0| and tc>0; then p0+=delta, q0-=delta (saturated to bytes). pTC[k] applies to rows 2k and 2k+1 of both planes.
+ALIGN  16
+; Stack args: [ebp+8]=pPixCb [ebp+0Ch]=pPixCr [ebp+10h]=iStride [ebp+14h]=iAlpha [ebp+18h]=iBeta [ebp+1Ch]=pTC. Saves/restores esi,edi; overwrites eax,ecx,edx,xmm0-xmm7. NOTE(review): pabsw is an SSSE3 instruction although the symbol is suffixed _sse2 -- confirm the caller's CPU dispatch.
+DeblockChromaLt4H_sse2:
+  push        ebp  
+  mov         ebp,esp 
+  and         esp,0FFFFFFF0h  ; 16-byte align esp so the movdqa spill slots below are aligned
+  sub         esp,108h  ; 108h plus the two 4-byte pushes below = 110h, so esp stays 16-byte aligned afterwards
+  mov         ecx,dword [ebp+8]  ; ecx = pPixCb
+  mov         edx,dword [ebp+0Ch]  ; edx = pPixCr
+  mov         eax,dword [ebp+10h]  ; eax = iStride
+  sub         ecx,2  ; each 4-byte row access starts 2 bytes before the given pointer
+  sub         edx,2 
+  push        esi  
+  lea         esi,[eax+eax*2]  ; esi = 3*iStride
+  mov         dword [esp+10h],ecx  ; save pPixCb-2 (addressed as [esp+14h] once edi is pushed)
+  mov         dword [esp+4],edx  ; save pPixCr-2 (addressed as [esp+8] once edi is pushed)
+  lea         ecx,[ecx+eax*4]  ; ecx = pPixCb-2 + 4*iStride (rows 4..7)
+  lea         edx,[edx+eax*4]  ; edx = pPixCr-2 + 4*iStride (rows 4..7)
+  lea         eax,[esp+6Ch]  ; eax = 64-byte scratch buffer; equals esp+70h after the push below
+  push        edi  
+  mov         dword [esp+0Ch],esi  ; [esp+0Ch] = 3*iStride
+  mov         dword [esp+18h],ecx  ; [esp+18h] = Cb rows 4..7 base
+  mov         dword [esp+10h],edx  ; [esp+10h] = Cr rows 4..7 base
+  mov         dword [esp+1Ch],eax  ; [esp+1Ch] = scratch buffer address
+  mov         esi,dword [esp+14h]  ; esi = pPixCb-2
+  mov         ecx,dword [ebp+10h]  ; ecx = iStride
+  mov         edx,dword [esp+0Ch]  ; edx = 3*iStride
+  movd        xmm0,dword [esi]  ; Cb rows 0..3, 4 bytes each
+  movd        xmm1,dword [esi+ecx] 
+  movd        xmm2,dword [esi+ecx*2] 
+  movd        xmm3,dword [esi+edx] 
+  mov         esi,dword [esp+8]  ; esi = pPixCr-2
+  movd        xmm4,dword [esi]  ; Cr rows 0..3
+  movd        xmm5,dword [esi+ecx] 
+  movd        xmm6,dword [esi+ecx*2] 
+  movd        xmm7,dword [esi+edx] 
+  punpckldq   xmm0,xmm4  ; xmm0..xmm3 low qword = {Cb row k, Cr row k}, k = 0..3
+  punpckldq   xmm1,xmm5 
+  punpckldq   xmm2,xmm6 
+  punpckldq   xmm3,xmm7 
+  mov         esi,dword [esp+18h]  ; esi = Cb rows 4..7 base
+  mov         edi,dword [esp+10h]  ; edi = Cr rows 4..7 base
+  movd        xmm4,dword [esi] 
+  movd        xmm5,dword [edi] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm0,xmm4  ; xmm0 = {Cb r0, Cr r0, Cb r4, Cr r4}
+  movd        xmm4,dword [esi+ecx] 
+  movd        xmm5,dword [edi+ecx] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm1,xmm4  ; xmm1 = {Cb r1, Cr r1, Cb r5, Cr r5}
+  movd        xmm4,dword [esi+ecx*2] 
+  movd        xmm5,dword [edi+ecx*2] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm2,xmm4  ; xmm2 = {Cb r2, Cr r2, Cb r6, Cr r6}
+  movd        xmm4,dword [esi+edx] 
+  movd        xmm5,dword [edi+edx] 
+  punpckldq   xmm4,xmm5 
+  punpcklqdq  xmm3,xmm4  ; xmm3 = {Cb r3, Cr r3, Cb r7, Cr r7}
+  movdqa      xmm6,xmm0  ; --- byte transpose: 16 rows x 4 bytes -> 4 columns x 16 bytes ---
+  punpcklbw   xmm0,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm2,xmm3 
+  punpckhbw   xmm7,xmm3 
+  movdqa      xmm4,xmm0 
+  movdqa      xmm5,xmm6 
+  punpcklwd   xmm0,xmm2 
+  punpckhwd   xmm4,xmm2 
+  punpcklwd   xmm6,xmm7 
+  punpckhwd   xmm5,xmm7 
+  movdqa      xmm1,xmm0 
+  movdqa      xmm2,xmm4 
+  punpckldq   xmm0,xmm6 
+  punpckhdq   xmm1,xmm6 
+  punpckldq   xmm4,xmm5 
+  punpckhdq   xmm2,xmm5 
+  movdqa      xmm5,xmm0 
+  movdqa      xmm6,xmm1 
+  punpcklqdq  xmm0,xmm4  ; xmm0 = byte column 0 (p1): Cb rows 0..7 in bytes 0..7, Cr rows 0..7 in bytes 8..15
+  punpckhqdq  xmm5,xmm4  ; xmm5 = byte column 1 (p0)
+  punpcklqdq  xmm1,xmm2  ; xmm1 = byte column 2 (q0)
+  punpckhqdq  xmm6,xmm2  ; xmm6 = byte column 3 (q1)
+  mov         edi,dword [esp+1Ch]  ; edi = scratch buffer (= esp+70h)
+  movdqa      [edi],xmm0  ; [esp+70h]  = p1 column
+  movdqa      [edi+10h],xmm5  ; [esp+80h]  = p0 column
+  movdqa      [edi+20h],xmm1  ; [esp+90h]  = q0 column
+  movdqa      [edi+30h],xmm6  ; [esp+0A0h] = q1 column
+  mov         eax,dword [ebp+1Ch]  ; eax = pTC
+  movsx       cx,byte [eax+3]  ; cx = pTC[3] sign-extended to 16 bits
+  movsx       dx,byte [eax+2]  ; dx = pTC[2]
+  movsx       si,byte [eax+1]  ; si = pTC[1]
+  movsx       ax,byte [eax]  ; ax = pTC[0]
+  movzx       edi,cx  ; the copies below are interleaved into xmm0 = {tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3}
+  movzx       ecx,cx 
+  movd        xmm2,ecx 
+  movzx       ecx,dx 
+  movzx       edx,dx 
+  movd        xmm3,ecx 
+  movd        xmm4,edx 
+  movzx       ecx,si 
+  movzx       edx,si 
+  movd        xmm5,ecx 
+  pxor        xmm0,xmm0 
+  movd        xmm6,edx 
+  movzx       ecx,ax 
+  movdqa      [esp+60h],xmm0  ; [esp+60h] = zero vector
+  movzx       edx,ax 
+  movsx       eax,word [ebp+14h]  ; eax = low 16 bits of iAlpha, sign-extended
+  punpcklwd   xmm6,xmm2 
+  movd        xmm1,edi 
+  movd        xmm7,ecx 
+  movsx       ecx,word [ebp+18h]  ; ecx = low 16 bits of iBeta, sign-extended
+  movd        xmm0,edx 
+  punpcklwd   xmm7,xmm3 
+  punpcklwd   xmm5,xmm1 
+  movdqa      xmm1,[esp+60h]  ; xmm1 = zero
+  punpcklwd   xmm7,xmm5 
+  movdqa      xmm5,[esp+0A0h]  ; xmm5 = q1 bytes
+  punpcklwd   xmm0,xmm4 
+  punpcklwd   xmm0,xmm6 
+  movdqa      xmm6, [esp+70h]  ; xmm6 = p1 bytes
+  punpcklwd   xmm0,xmm7  ; xmm0 = tc words {tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3}
+  movdqa      xmm7,[esp+80h]  ; xmm7 = p0 bytes
+  movdqa      xmm2,xmm1 
+  psubw       xmm2,xmm0 
+  movdqa      [esp+0D0h],xmm2  ; [esp+0D0h] = -tc
+  movd        xmm2,eax 
+  movdqa      xmm3,xmm2 
+  punpcklwd   xmm3,xmm2 
+  pshufd      xmm4,xmm3,0  ; xmm4 = alpha in all 8 words
+  movd        xmm2,ecx 
+  movdqa      xmm3,xmm2 
+  punpcklwd   xmm3,xmm2 
+  pshufd      xmm2,xmm3,0 
+  movdqa      xmm3, [esp+90h]  ; xmm3 = q0 bytes
+  movdqa      [esp+50h],xmm2  ; [esp+50h] = beta in all 8 words
+  movdqa      xmm2,xmm6 
+  punpcklbw   xmm2,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      [esp+40h],xmm2  ; [esp+40h]  = p1 low half (Cb) as words
+  movdqa      [esp+0B0h],xmm6  ; [esp+0B0h] = p1 high half (Cr) as words
+  movdqa      xmm6,[esp+90h] 
+  movdqa      xmm2,xmm7 
+  punpckhbw   xmm7,xmm1 
+  punpckhbw   xmm6,xmm1 
+  punpcklbw   xmm2,xmm1  ; xmm2 = p0 low half
+  punpcklbw   xmm3,xmm1  ; xmm3 = q0 low half
+  punpcklbw   xmm5,xmm1  ; xmm5 = q1 low half
+  movdqa      [esp+0F0h],xmm7  ; [esp+0F0h] = p0 high half
+  movdqa      [esp+0C0h],xmm6  ; [esp+0C0h] = q0 high half
+  movdqa      xmm6, [esp+0A0h] 
+  punpckhbw   xmm6,xmm1 
+  movdqa      [esp+0E0h],xmm6  ; [esp+0E0h] = q1 high half
+  mov         edx,4 
+  movsx       eax,dx 
+  movd        xmm6,eax 
+  movdqa      xmm7,xmm6 
+  punpcklwd   xmm7,xmm6 
+  pshufd      xmm6,xmm7,0 
+  movdqa      [esp+30h],xmm6  ; [esp+30h] = 4 in every word (rounding term)
+  movdqa      xmm7, [esp+40h] 
+  psubw       xmm7,xmm5  ; xmm7 = p1 - q1 (low half)
+  movdqa      xmm6,xmm0 
+  pcmpgtw     xmm6,xmm1 
+  movdqa      [esp+60h],xmm6  ; [esp+60h] = mask tc > 0 (replaces the zero vector stored earlier)
+  movdqa      xmm1, [esp+0D0h]  ; xmm1 = -tc
+  movdqa      xmm6,xmm3 
+  psubw       xmm6,xmm2 
+  psllw       xmm6,2 
+  paddw       xmm6,xmm7 
+  paddw       xmm6,[esp+30h] 
+  psraw       xmm6,3  ; ((q0-p0)*4 + (p1-q1) + 4) >> 3
+  pmaxsw      xmm1,xmm6  ; lower clamp at -tc
+  movdqa      xmm7,[esp+50h]  ; xmm7 = beta
+  movdqa      [esp+20h],xmm0 
+  movdqa      xmm6, [esp+20h] 
+  pminsw      xmm6,xmm1  ; upper clamp at tc
+  movdqa      [esp+20h],xmm6  ; [esp+20h] = clamped delta (low half)
+  movdqa      xmm6,xmm4 
+  movdqa      xmm1,xmm2 
+  psubw       xmm1,xmm3 
+  pabsw       xmm1,xmm1 
+  pcmpgtw     xmm6,xmm1  ; alpha > |p0-q0|
+  movdqa      xmm1, [esp+40h] 
+  psubw       xmm1,xmm2 
+  pabsw       xmm1,xmm1 
+  pcmpgtw     xmm7,xmm1  ; beta > |p1-p0|
+  movdqa      xmm1, [esp+50h] 
+  pand        xmm6,xmm7 
+  movdqa      xmm7, [esp+50h] 
+  psubw       xmm5,xmm3 
+  pabsw       xmm5,xmm5 
+  pcmpgtw     xmm1,xmm5  ; beta > |q1-q0|
+  movdqa      xmm5, [esp+0B0h] 
+  psubw       xmm5,[esp+0E0h]  ; xmm5 = p1 - q1 (high half)
+  pand        xmm6,xmm1 
+  pand        xmm6, [esp+60h]  ; also require tc > 0
+  movdqa      xmm1, [esp+20h] 
+  pand        xmm1,xmm6 
+  movdqa      xmm6, [esp+0C0h] 
+  movdqa      [esp+40h],xmm1  ; [esp+40h] = masked delta (low half); the p1 low half is no longer needed
+  movdqa      xmm1, [esp+0F0h]  ; xmm1 = p0 high half
+  psubw       xmm6,xmm1 
+  psllw       xmm6,2 
+  paddw       xmm6,xmm5 
+  paddw       xmm6, [esp+30h] 
+  movdqa      xmm5, [esp+0D0h] 
+  psraw       xmm6,3  ; high half: ((q0-p0)*4 + (p1-q1) + 4) >> 3
+  pmaxsw      xmm5,xmm6 
+  pminsw      xmm0,xmm5  ; xmm0 = delta clamped to [-tc, tc] (high half)
+  movdqa      xmm5,[esp+0C0h]  ; xmm5 = q0 high half
+  movdqa      xmm6,xmm1 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6 
+  pcmpgtw     xmm4,xmm6  ; alpha > |p0-q0|
+  movdqa      xmm6,[esp+0B0h] 
+  psubw       xmm6,xmm1 
+  pabsw       xmm6,xmm6 
+  pcmpgtw     xmm7,xmm6  ; beta > |p1-p0|
+  movdqa      xmm6, [esp+0E0h] 
+  pand        xmm4,xmm7 
+  movdqa      xmm7, [esp+50h] 
+  psubw       xmm6,xmm5 
+  pabsw       xmm6,xmm6 
+  pcmpgtw     xmm7,xmm6  ; beta > |q1-q0|
+  pand        xmm4,xmm7 
+  pand        xmm4,[esp+60h]  ; also require tc > 0
+  pand        xmm0,xmm4  ; xmm0 = masked delta (high half)
+  movdqa      xmm4, [esp+40h]  ; xmm4 = masked delta (low half)
+  paddw       xmm2,xmm4  ; p0 += delta
+  paddw       xmm1,xmm0 
+  psubw       xmm3,xmm4  ; q0 -= delta
+  psubw       xmm5,xmm0 
+  packuswb    xmm2,xmm1  ; saturate back to bytes
+  packuswb    xmm3,xmm5 
+  movdqa      [esp+80h],xmm2  ; overwrite the p0 column in the scratch buffer
+  movdqa      [esp+90h],xmm3  ; overwrite the q0 column in the scratch buffer
+  mov         esi,dword [esp+1Ch]  ; esi = scratch buffer
+  movdqa      xmm0, [esi]  ; reload the four columns (p1, new p0, new q0, q1)
+  movdqa      xmm1, [esi+10h] 
+  movdqa      xmm2, [esi+20h] 
+  movdqa      xmm3, [esi+30h] 
+  movdqa      xmm6,xmm0  ; --- same transpose network, now columns -> 4-byte rows ---
+  punpcklbw   xmm0,xmm1 
+  punpckhbw   xmm6,xmm1 
+  movdqa      xmm7,xmm2 
+  punpcklbw   xmm2,xmm3 
+  punpckhbw   xmm7,xmm3 
+  movdqa      xmm4,xmm0 
+  movdqa      xmm5,xmm6 
+  punpcklwd   xmm0,xmm2 
+  punpckhwd   xmm4,xmm2 
+  punpcklwd   xmm6,xmm7 
+  punpckhwd   xmm5,xmm7 
+  movdqa      xmm1,xmm0 
+  movdqa      xmm2,xmm4 
+  punpckldq   xmm0,xmm6 
+  punpckhdq   xmm1,xmm6 
+  punpckldq   xmm4,xmm5 
+  punpckhdq   xmm2,xmm5 
+  movdqa      xmm5,xmm0 
+  movdqa      xmm6,xmm1 
+  punpcklqdq  xmm0,xmm4  ; xmm0 = {Cb r0, Cr r0, Cb r4, Cr r4}
+  punpckhqdq  xmm5,xmm4  ; xmm5 = {Cb r1, Cr r1, Cb r5, Cr r5}
+  punpcklqdq  xmm1,xmm2  ; xmm1 = {Cb r2, Cr r2, Cb r6, Cr r6}
+  punpckhqdq  xmm6,xmm2  ; xmm6 = {Cb r3, Cr r3, Cb r7, Cr r7}
+  mov         esi,dword [esp+14h]  ; esi = pPixCb-2
+  mov         ecx,dword [ebp+10h]  ; ecx = iStride
+  mov         edx,dword [esp+0Ch]  ; edx = 3*iStride
+  mov         edi,dword [esp+8]  ; edi = pPixCr-2
+  movd        dword [esi],xmm0  ; write Cb rows 0..3
+  movd        dword [esi+ecx],xmm5 
+  movd        dword [esi+ecx*2],xmm1 
+  movd        dword [esi+edx],xmm6 
+  psrldq      xmm0,4  ; shift the next dword into the low lane
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  mov         esi,dword [esp+18h]  ; esi = Cb rows 4..7 base
+  movd        dword [edi],xmm0  ; write Cr rows 0..3
+  movd        dword [edi+ecx],xmm5 
+  movd        dword [edi+ecx*2],xmm1 
+  movd        dword [edi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  movd        dword [esi],xmm0  ; write Cb rows 4..7
+  movd        dword [esi+ecx],xmm5 
+  movd        dword [esi+ecx*2],xmm1 
+  movd        dword [esi+edx],xmm6 
+  psrldq      xmm0,4 
+  psrldq      xmm5,4 
+  psrldq      xmm1,4 
+  psrldq      xmm6,4 
+  mov         edi,dword [esp+10h]  ; edi = Cr rows 4..7 base
+  movd        dword [edi],xmm0  ; write Cr rows 4..7
+  movd        dword [edi+ecx],xmm5 
+  movd        dword [edi+ecx*2],xmm1 
+  movd        dword [edi+edx],xmm6  
+  pop         edi  
+  pop         esi   
+  mov         esp,ebp  ; discard the aligned frame
+  pop         ebp  
+  ret     
+  
+  
+  
+;*******************************************************************************
+;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
+;                                 int32_t iBeta, int8_t * pTC)
+;*******************************************************************************
+  
+
+WELS_EXTERN  DeblockLumaLt4V_sse2
+  ; Filters 16 pixels across the boundary between rows pPix-iStride and pPix. Rows at pPix-3s..pPix+2s are named p2,p1,p0,q0,q1,q2 below. Writes rows p1,p0,q0,q1; every row access uses movdqa, so each row address must be 16-byte aligned.
+ALIGN  16
+; Stack args: [ebp+8]=pPix [ebp+12]=iStride [ebp+16]=iAlpha [ebp+20]=iBeta [ebp+24]=pTC. Saves/restores ebx,esi,edi; overwrites eax,ecx,edx,xmm0-xmm7. NOTE(review): pabsw is an SSSE3 instruction although the symbol is suffixed _sse2 -- confirm the caller's CPU dispatch.
+DeblockLumaLt4V_sse2:
+    push	ebp
+	mov	ebp, esp
+	and	esp, -16				; fffffff0H
+	sub	esp, 420				; 000001a4H -- 420 plus the three 4-byte pushes below = 432, so esp ends 16-byte aligned
+	mov	eax, dword [ebp+8]	; eax = pPix
+	mov	ecx, dword [ebp+12]	; ecx = iStride
+; Offsets are written as esp+<bytes pushed so far>-K: 424 after one push, 432 after all three, 428 after the first pop.
+	pxor	xmm0, xmm0
+	push	ebx
+	mov	edx, dword [ebp+24]	; edx = pTC
+	movdqa	[esp+424-384], xmm0	; park a zero vector; this is [esp+432-384] once esi and edi are pushed
+	push	esi
+; Load the six rows and spill them as bytes.
+	lea	esi, [ecx+ecx*2]	; esi = 3*iStride
+	push	edi
+	mov	edi, eax
+	sub	edi, esi
+	movdqa	xmm0, [edi]	; row p2 = pPix - 3*iStride
+
+	lea	esi, [ecx+ecx]	; esi = 2*iStride
+	movdqa	[esp+432-208], xmm0	; [432-208] = p2 bytes
+	mov	edi, eax
+	sub	edi, esi	; edi = pPix - 2*iStride; kept until the final p1 store
+	movdqa	xmm0, [edi]
+	movdqa	[esp+448-208], xmm0	; [448-208] = p1 bytes
+
+	mov	ebx, eax
+	sub	ebx, ecx	; ebx = pPix - iStride
+	movdqa	xmm0, [ebx]
+	movdqa	[esp+464-208], xmm0	; [464-208] = p0 bytes
+
+	movdqa	xmm0, [eax]
+
+	add	ecx, eax	; ecx = pPix + iStride
+	movdqa	[esp+480-208], xmm0	; [480-208] = q0 bytes
+	movdqa	xmm0, [ecx]
+	mov	dword [esp+432-404], ecx	; save pPix + iStride for the q1 store
+
+	movsx	ecx, word [ebp+16]	; ecx = low 16 bits of iAlpha, sign-extended
+	movdqa	[esp+496-208], xmm0	; [496-208] = q1 bytes
+	movdqa	xmm0, [esi+eax]	; row q2 = pPix + 2*iStride
+
+	movsx	si, byte [edx]	; si = pTC[0]
+	movdqa	[esp+512-208], xmm0	; [512-208] = q2 bytes
+	movd	xmm0, ecx
+	movsx	ecx, word [ebp+20]	; ecx = low 16 bits of iBeta, sign-extended
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	pshufd	xmm0, xmm1, 0
+	movdqa	[esp+432-112], xmm0	; [432-112] = alpha in all 8 words
+	movd	xmm0, ecx
+	movsx	cx, byte [edx+1]	; cx = pTC[1]
+	movdqa	xmm1, xmm0
+	punpcklwd xmm1, xmm0
+	mov	dword [esp+432-408], ebx	; save pPix - iStride for the p0 store
+	movzx	ebx, cx
+	pshufd	xmm0, xmm1, 0	; xmm0 = beta in all 8 words
+	movd	xmm1, ebx	; the copies of pTC[0]/pTC[1] below are interleaved into {tc0 x4, tc1 x4}
+	movzx	ebx, cx
+	movd	xmm2, ebx
+	movzx	ebx, cx
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, si
+	movd	xmm5, ecx
+	movzx	ecx, si
+	movd	xmm6, ecx
+	movzx	ecx, si
+	movd	xmm7, ecx
+	movzx	ecx, si
+	movdqa	[esp+432-336], xmm0	; [432-336] = beta
+	movd	xmm0, ecx
+
+	movsx	cx, byte [edx+3]	; cx = pTC[3]
+	movsx	dx, byte [edx+2]	; dx = pTC[2]
+	movd	xmm3, ebx
+	punpcklwd xmm0, xmm4
+	movzx	esi, cx
+	punpcklwd xmm6, xmm2
+	punpcklwd xmm5, xmm1
+	punpcklwd xmm0, xmm6
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	punpcklwd xmm0, xmm7
+	movdqa	[esp+432-400], xmm0	; [432-400] = tc words for pixels 0..7: {tc0 x4, tc1 x4}
+	movd	xmm0, esi
+	movzx	esi, cx
+	movd	xmm2, esi
+	movzx	esi, cx
+	movzx	ecx, cx
+	movd	xmm4, ecx
+	movzx	ecx, dx
+	movd	xmm3, esi
+	movd	xmm5, ecx
+	punpcklwd xmm5, xmm0
+
+	movdqa	xmm0, [esp+432-384]	; xmm0 = zero vector
+	movzx	ecx, dx
+	movd	xmm6, ecx
+	movzx	ecx, dx
+	movzx	edx, dx
+	punpcklwd xmm6, xmm2
+	movd	xmm7, ecx
+	movd	xmm1, edx
+
+	movdqa	xmm2, [esp+448-208]
+	punpcklbw xmm2, xmm0	; xmm2 = p1, pixels 0..7 as words
+
+	mov	ecx, 4
+	movsx	edx, cx	; edx = 4 (rounding term, broadcast later)
+	punpcklwd xmm7, xmm3
+	punpcklwd xmm7, xmm5
+	movdqa	xmm5, [esp+496-208]
+	movdqa	xmm3, [esp+464-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-240], xmm5	; [432-240] = q1 low half as words
+	movdqa	xmm5, [esp+512-208]
+	punpcklbw xmm5, xmm0
+	movdqa	[esp+432-352], xmm5	; [432-352] = q2 low half as words
+	punpcklwd xmm1, xmm4
+	movdqa	xmm4, [esp+432-208]
+	punpcklwd xmm1, xmm6
+	movdqa	xmm6, [esp+480-208]
+	punpcklwd xmm1, xmm7	; xmm1 = tc words for pixels 8..15: {tc2 x4, tc3 x4}
+	punpcklbw xmm6, xmm0	; xmm6 = q0 low half
+	punpcklbw xmm3, xmm0	; xmm3 = p0 low half
+	punpcklbw xmm4, xmm0	; xmm4 = p2 low half
+	movdqa	xmm7, xmm3
+	psubw	xmm7, xmm4
+	pabsw	xmm7, xmm7
+	movdqa	[esp+432-272], xmm4	; [432-272] = p2 low half
+	movdqa	xmm4, [esp+432-336]	; xmm4 = beta; stays in xmm4 while the stack slot is reused
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5	; [432-288] = mask beta > |p0-p2|
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-352]
+	pabsw	xmm7, xmm7
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5	; [432-256] = mask beta > |q0-q2|
+	movdqa	xmm5, xmm3
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5	; [432-304] = (p0 + q0 + 1) >> 1
+	movdqa	xmm5, [esp+432-400]
+	psubw	xmm5, [esp+432-288]
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5	; tc' = tc + 1 for each of the two masks that is set (a set mask word is -1)
+	movdqa	xmm5, xmm6
+	psubw	xmm5, xmm3
+	movdqa	[esp+432-32], xmm6	; [432-32] = q0 low half
+	psubw	xmm6, [esp+432-240]
+	movdqa	xmm7, xmm5
+	movdqa	[esp+432-384], xmm5	; [432-384] = q0 - p0 (zero vector now lives only in xmm0)
+	movdqa	xmm5, [esp+432-112]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm5, xmm7	; alpha > |q0-p0|
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6	; beta > |q0-q1|
+
+	pand	xmm5, xmm7
+	movdqa	xmm6, xmm3
+	psubw	xmm6, xmm2
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm4
+	pcmpgtw	xmm7, xmm6	; beta > |p0-p1|
+	movdqa	xmm6, [esp+432-400]
+	pand	xmm5, xmm7
+	movdqa	xmm7, xmm6
+	pcmpeqw	xmm6, xmm0
+	pcmpgtw	xmm7, xmm0
+	por	xmm7, xmm6	; tc >= 0
+	pand	xmm5, xmm7
+	movdqa	[esp+432-320], xmm5	; [432-320] = overall filter mask (low half)
+	movd	xmm5, edx
+	movdqa	xmm6, xmm5
+	punpcklwd xmm6, xmm5
+	pshufd	xmm5, xmm6, 0
+	movdqa	[esp+432-336], xmm5	; [432-336] = 4 in every word (replaces the beta spill)
+	movdqa	xmm5, [esp+432-224]
+	movdqa	[esp+432-368], xmm5	; [432-368] = tc'
+	movdqa	xmm6, xmm0
+	psubw	xmm6, xmm5	; xmm6 = -tc'
+	movdqa	xmm5, [esp+432-384]
+	psllw	xmm5, 2
+	movdqa	xmm7, xmm2
+	psubw	xmm7, [esp+432-240]
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+432-336]
+	movdqa	xmm5, [esp+432-368]
+	psraw	xmm7, 3	; ((q0-p0)*4 + (p1-q1) + 4) >> 3
+	pmaxsw	xmm6, xmm7
+	pminsw	xmm5, xmm6	; clamp to [-tc', tc']
+
+	pand	xmm5, [esp+432-320]
+	movdqa	xmm6, [esp+432-400]
+	movdqa	[esp+432-64], xmm5	; [432-64] = masked p0/q0 delta (low half)
+	movdqa	[esp+432-384], xmm6	; [432-384] = tc
+	movdqa	xmm5, xmm0
+	psubw	xmm5, xmm6
+	movdqa	[esp+432-368], xmm5	; [432-368] = -tc
+	movdqa	xmm6, xmm5
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm5, [esp+432-304]
+	movdqa	xmm7, xmm2
+	paddw	xmm7, xmm2
+	psubw	xmm5, xmm7
+	psraw	xmm5, 1	; (p2 + ((p0+q0+1)>>1) - 2*p1) >> 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-384]
+	pminsw	xmm5, xmm6	; clamp to [-tc, tc]
+
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-288]	; p1 only changes where beta > |p0-p2|
+	movdqa	xmm6, [esp+432-240]
+	movdqa	[esp+432-96], xmm5	; [432-96] = masked p1 adjustment (low half)
+	movdqa	xmm5, [esp+432-352]
+	paddw	xmm5, [esp+432-304]
+	movdqa	xmm7, xmm6
+	paddw	xmm7, xmm6
+	movdqa	xmm6, [esp+432-368]
+	psubw	xmm5, xmm7
+
+	movdqa	xmm7, [esp+496-208]
+	psraw	xmm5, 1	; (q2 + ((p0+q0+1)>>1) - 2*q1) >> 1
+	pmaxsw	xmm6, xmm5
+	movdqa	xmm5, [esp+432-400]
+	pminsw	xmm5, xmm6	; clamp to [-tc, tc]
+	pand	xmm5, [esp+432-320]
+	pand	xmm5, [esp+432-256]	; q1 only changes where beta > |q0-q2|
+	movdqa	xmm6, [esp+448-208]
+	punpckhbw xmm7, xmm0
+	movdqa	[esp+432-352], xmm7	; from here the slots hold pixels 8..15: [432-352] = q1 high half
+
+	movdqa	xmm7, [esp+512-208]
+	punpckhbw xmm6, xmm0
+	movdqa	[esp+432-48], xmm5	; [432-48] = masked q1 adjustment (low half)
+	movdqa	xmm5, [esp+432-208]
+	movdqa	[esp+432-368], xmm6	; [432-368] = p1 high half
+	movdqa	xmm6, [esp+464-208]
+	punpckhbw xmm7, xmm0
+	punpckhbw xmm5, xmm0
+	movdqa	[esp+432-384], xmm7	; [432-384] = q2 high half
+	punpckhbw xmm6, xmm0
+	movdqa	[esp+432-400], xmm6	; [432-400] = p0 high half
+
+	movdqa	xmm7, [esp+432-400]
+	movdqa	xmm6, [esp+480-208]
+	psubw	xmm7, xmm5
+	movdqa	[esp+432-16], xmm5	; [432-16] = p2 high half
+	pabsw	xmm7, xmm7
+	punpckhbw xmm6, xmm0	; xmm6 = q0 high half
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-288], xmm5	; [432-288] = mask beta > |p0-p2| (high half)
+
+	movdqa	xmm7, xmm6
+	psubw	xmm7, [esp+432-384]
+	pabsw	xmm7, xmm7
+	movdqa	xmm5, xmm4
+	pcmpgtw	xmm5, xmm7
+	movdqa	[esp+432-256], xmm5	; [432-256] = mask beta > |q0-q2| (high half)
+
+	movdqa	xmm5, [esp+432-400]
+	movdqa	[esp+432-80], xmm6	; [432-80] = q0 high half
+	pavgw	xmm5, xmm6
+	movdqa	[esp+432-304], xmm5	; [432-304] = (p0 + q0 + 1) >> 1 (high half)
+
+	movdqa	xmm5, xmm1
+	psubw	xmm5, [esp+432-288]
+	psubw	xmm5, [esp+432-256]
+	movdqa	[esp+432-224], xmm5	; [432-224] = tc' (high half)
+	movdqa	xmm5, xmm6
+	psubw	xmm5, [esp+432-400]
+	psubw	xmm6, [esp+432-352]
+	movdqa	[esp+432-272], xmm5	; [432-272] = q0 - p0 (high half)
+	movdqa	xmm7, xmm5
+	movdqa	xmm5, [esp+432-112]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm5, xmm7	; alpha > |q0-p0|
+	movdqa	xmm7, xmm4
+	pabsw	xmm6, xmm6
+	pcmpgtw	xmm7, xmm6	; beta > |q0-q1|
+	movdqa	xmm6, [esp+432-368]
+
+	pand	xmm5, xmm7
+	movdqa	xmm7, [esp+432-400]
+	psubw	xmm7, xmm6
+	psubw	xmm6, [esp+432-352]	; xmm6 = p1 - q1 (high half)
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7	; beta > |p0-p1|
+	pand	xmm5, xmm4
+
+	paddw	xmm2, [esp+432-96]	; p1 low half += its adjustment
+	movdqa	xmm4, xmm1
+	pcmpgtw	xmm4, xmm0
+	movdqa	xmm7, xmm1
+	pcmpeqw	xmm7, xmm0
+	por	xmm4, xmm7	; tc >= 0
+	pand	xmm5, xmm4
+	movdqa	xmm4, [esp+432-224]
+	movdqa	[esp+432-320], xmm5	; [432-320] = overall filter mask (high half)
+	movdqa	xmm5, [esp+432-272]
+	movdqa	xmm7, xmm0
+	psubw	xmm7, xmm4	; xmm7 = -tc'
+	psubw	xmm0, xmm1	; xmm0 = -tc; the zero vector is no longer needed
+	psllw	xmm5, 2
+	paddw	xmm6, xmm5
+	paddw	xmm6, [esp+432-336]
+	movdqa	xmm5, [esp+432-368]	; xmm5 = p1 high half
+	movdqa	[esp+432-336], xmm0	; [432-336] = -tc (the constant 4 was consumed on the previous paddw)
+	psraw	xmm6, 3	; ((q0-p0)*4 + (p1-q1) + 4) >> 3
+	pmaxsw	xmm7, xmm6
+	pminsw	xmm4, xmm7	; clamp to [-tc', tc']
+	pand	xmm4, [esp+432-320]
+	movdqa	xmm6, xmm0
+	movdqa	xmm0, [esp+432-16]
+	paddw	xmm0, [esp+432-304]
+	movdqa	[esp+432-272], xmm4	; [432-272] = masked p0/q0 delta (high half)
+	movdqa	xmm4, [esp+432-368]
+	paddw	xmm4, xmm4
+	psubw	xmm0, xmm4
+
+	movdqa	xmm4, [esp+432-64]	; xmm4 = masked p0/q0 delta (low half)
+	psraw	xmm0, 1	; (p2 + avg - 2*p1) >> 1 (high half)
+	pmaxsw	xmm6, xmm0
+	movdqa	xmm0, [esp+432-400]	; xmm0 = p0 high half
+	movdqa	xmm7, xmm1
+	pminsw	xmm7, xmm6	; clamp to [-tc, tc]
+	movdqa	xmm6, [esp+432-320]
+	pand	xmm7, xmm6
+	pand	xmm7, [esp+432-288]
+	paddw	xmm5, xmm7	; p1 high half += its adjustment
+	packuswb xmm2, xmm5	; xmm2 = new p1 row
+	movdqa	xmm5, [esp+432-272]
+	paddw	xmm0, xmm5	; p0 += delta
+	paddw	xmm3, xmm4
+	packuswb xmm3, xmm0	; xmm3 = new p0 row
+
+	movdqa	xmm0, [esp+432-32]
+	psubw	xmm0, xmm4	; q0 -= delta
+	movdqa	xmm4, [esp+432-80]
+	psubw	xmm4, xmm5
+
+	movdqa	xmm5, [esp+432-240]
+	paddw	xmm5, [esp+432-48]	; q1 low half += its adjustment
+	packuswb xmm0, xmm4
+	movdqa	xmm4, [esp+432-384]
+	paddw	xmm4, [esp+432-304]
+	movdqa	[esp+480-208], xmm0	; park the new q0 row in the old q0 slot
+	movdqa	xmm0, [esp+432-352]
+	movdqa	xmm7, xmm0	; xmm7 = q1 high half
+	paddw	xmm0, xmm0
+
+	mov	ecx, dword [esp+432-408]	; ecx = pPix - iStride
+
+	mov	edx, dword [esp+432-404]	; edx = pPix + iStride
+	psubw	xmm4, xmm0
+	movdqa	xmm0, [esp+432-336]	; xmm0 = -tc
+	movdqa	[edi], xmm2	; store the p1 row at pPix - 2*iStride
+	psraw	xmm4, 1	; (q2 + avg - 2*q1) >> 1 (high half)
+	pmaxsw	xmm0, xmm4
+	pminsw	xmm1, xmm0	; clamp to [-tc, tc]
+	movdqa	xmm0, [esp+480-208]
+
+	pop	edi
+	pand	xmm1, xmm6
+	pand	xmm1, [esp+428-256]	; same slot as [esp+432-256]; the offset accounts for the pop above
+	movdqa	[ecx], xmm3	; store the p0 row
+	paddw	xmm7, xmm1	; q1 high half += its adjustment
+	pop	esi
+	packuswb xmm5, xmm7	; xmm5 = new q1 row
+	movdqa	[eax], xmm0	; store the q0 row at pPix
+	movdqa	[edx], xmm5	; store the q1 row
+	pop	ebx
+	mov	esp, ebp	; discard the aligned frame
+	pop	ebp
+	ret
+
+
+;*******************************************************************************
+;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
+;                                 int32_t iBeta)
+;*******************************************************************************
+
+WELS_EXTERN  DeblockLumaEq4V_sse2
+  
+ALIGN  16
+
+DeblockLumaEq4V_sse2:
+
+	push	ebp
+	mov	ebp, esp
+	and	esp, -16				; fffffff0H
+	sub	esp, 628				; 00000274H
+	mov	eax, dword [ebp+8]
+	mov	ecx, dword [ebp+12]
+	push	ebx
+	push	esi
+
+	lea	edx, [ecx*4]
+	pxor	xmm0, xmm0
+	movdqa	xmm2, xmm0
+
+	movdqa	xmm0, [ecx+eax]
+	mov	esi, eax
+	sub	esi, edx
+	movdqa	xmm3, [esi]
+	movdqa	xmm5, [eax]
+	push	edi
+	lea	edi, [ecx+ecx]
+	lea	ebx, [ecx+ecx*2]
+	mov	dword [esp+640-600], edi
+	mov	esi, eax
+	sub	esi, edi
+	movdqa	xmm1, [esi]
+	movdqa	 [esp+720-272], xmm0
+	mov	edi, eax
+	sub	edi, ecx
+	movdqa	xmm4, [edi]
+	add	ecx, eax
+	mov	dword [esp+640-596], ecx
+
+	mov	ecx, dword [esp+640-600]
+	movdqa	xmm0, [ecx+eax]
+	movdqa	 [esp+736-272], xmm0
+
+	movdqa	xmm0, [eax+ebx]
+	mov	edx, eax
+	sub	edx, ebx
+
+	movsx	ebx, word [ebp+16]
+	movdqa	xmm6, [edx]
+	add	ecx, eax
+	movdqa	 [esp+752-272], xmm0
+	movd	xmm0, ebx
+
+	movsx	ebx, word [ebp+20]
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0
+	movdqa	 [esp+640-320], xmm0
+	movd	xmm0, ebx
+	movdqa	xmm7, xmm0
+	punpcklwd xmm7, xmm0
+	pshufd	xmm0, xmm7, 0
+
+	movdqa	xmm7, [esp+736-272]
+	punpcklbw xmm7, xmm2
+	movdqa	 [esp+640-416], xmm7
+	movdqa	 [esp+640-512], xmm0
+	movdqa	xmm0, xmm1
+	movdqa	 [esp+672-272], xmm1
+	movdqa	xmm1, xmm4
+	movdqa	 [esp+704-272], xmm5
+	punpcklbw xmm5, xmm2
+	punpcklbw xmm1, xmm2
+
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm1
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-560], xmm7
+	punpcklbw xmm0, xmm2
+	movdqa	 [esp+688-272], xmm4
+	movdqa	xmm4, [esp+720-272]
+	movdqa	 [esp+640-480], xmm0
+
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm0
+
+	movdqa	xmm0, [esp+640-512]
+	pabsw	xmm7, xmm7
+	punpcklbw xmm4, xmm2
+	pcmpgtw	xmm0, xmm7
+	movdqa	 [esp+640-384], xmm4
+	movdqa	xmm7, xmm5
+	psubw	xmm7, xmm4
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+656-272], xmm6
+	punpcklbw xmm6, xmm2
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-48], xmm2
+	movdqa	 [esp+640-368], xmm6
+	movdqa	 [esp+640-144], xmm1
+	movdqa	 [esp+640-400], xmm5
+	pcmpgtw	xmm4, xmm7
+	pand	xmm0, xmm4
+	movdqa	xmm4, [esp+640-320]
+	pcmpgtw	xmm4, [esp+640-560]
+	pand	xmm0, xmm4
+
+	mov	ebx, 2
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, [esp+640-320]
+	psraw	xmm4, 2
+	pshufd	xmm7, xmm7, 0
+	paddw	xmm4, xmm7
+	movdqa	 [esp+640-576], xmm4
+	pcmpgtw	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4
+
+	movdqa	xmm4, [esp+640-512]
+	movdqa	 [esp+640-624], xmm7
+	movdqa	xmm7, xmm1
+	psubw	xmm7, xmm6
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-544], xmm4
+	movdqa	xmm4, [esp+640-512]
+	movdqa	xmm7, xmm5
+	psubw	xmm7, [esp+640-416]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm4, xmm7
+
+	pand	xmm4, [esp+640-560]
+	movdqa	 [esp+640-560], xmm4
+
+	movdqa	xmm4, [esp+640-544]
+	pandn	xmm4, xmm6
+	movdqa	 [esp+640-16], xmm4
+	mov	ebx, 4
+	movsx	ebx, bx
+	movd	xmm4, ebx
+	movdqa	xmm7, xmm4
+	punpcklwd xmm7, xmm4
+	movdqa	xmm4, xmm3
+	punpcklbw xmm4, xmm2
+	psllw	xmm4, 1
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6
+	paddw	xmm4, xmm6
+	paddw	xmm4, [esp+640-480]
+
+	movdqa	xmm6, [esp+640-560]
+	pshufd	xmm7, xmm7, 0
+	paddw	xmm4, xmm1
+	movdqa	 [esp+640-592], xmm7
+	paddw	xmm4, xmm5
+	paddw	xmm4, xmm7
+	movdqa	xmm7, [esp+640-416]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-80], xmm6
+	movdqa	xmm6, [esp+752-272]
+	punpcklbw xmm6, xmm2
+	psllw	xmm6, 1
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-384]
+
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm6, xmm5
+	paddw	xmm6, xmm1
+	paddw	xmm6, [esp+640-592]
+	psraw	xmm6, 3
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-112], xmm6
+	movdqa	xmm6, [esp+640-544]
+	pandn	xmm6, xmm7
+	movdqa	 [esp+640-336], xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-528], xmm6
+	movdqa	xmm6, [esp+640-368]
+	paddw	xmm6, xmm7
+	movdqa	xmm7, xmm1
+	psraw	xmm4, 3
+	pand	xmm4, [esp+640-544]
+	paddw	xmm7, xmm5
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+
+	paddw	xmm5, xmm1
+	psraw	xmm6, 2
+	pand	xmm7, xmm6
+
+	movdqa	xmm6, [esp+640-384]
+	movdqa	 [esp+640-64], xmm7
+	movdqa	xmm7, [esp+640-560]
+	pandn	xmm7, xmm6
+	movdqa	 [esp+640-304], xmm7
+	movdqa	xmm7, [esp+640-560]
+	movdqa	 [esp+640-528], xmm7
+	movdqa	xmm7, [esp+640-416]
+	paddw	xmm7, xmm6
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2
+	pand	xmm5, xmm7
+	movdqa	 [esp+640-32], xmm5
+
+	movdqa	xmm5, [esp+640-544]
+	movdqa	 [esp+640-528], xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	xmm7, xmm5
+	paddw	xmm7, xmm5
+	movdqa	xmm5, xmm1
+	paddw	xmm5, xmm6
+	paddw	xmm6, [esp+640-592]
+	paddw	xmm7, xmm5
+	paddw	xmm7, [esp+640-624]
+	movdqa	xmm5, [esp+640-528]
+	psraw	xmm7, 2
+	pandn	xmm5, xmm7
+	movdqa	xmm7, [esp+640-480]
+	paddw	xmm7, xmm1
+	paddw	xmm7, [esp+640-400]
+	movdqa	xmm1, [esp+640-544]
+	movdqa	 [esp+640-352], xmm5
+	movdqa	xmm5, [esp+640-368]
+	psllw	xmm7, 1
+	paddw	xmm7, xmm6
+	paddw	xmm5, xmm7
+
+	movdqa	xmm7, [esp+640-400]
+	psraw	xmm5, 3
+	pand	xmm1, xmm5
+	movdqa	xmm5, [esp+640-480]
+	movdqa	 [esp+640-96], xmm1
+	movdqa	xmm1, [esp+640-560]
+	movdqa	 [esp+640-528], xmm1
+	movdqa	xmm1, [esp+640-384]
+	movdqa	xmm6, xmm1
+	paddw	xmm6, xmm1
+	paddw	xmm1, [esp+640-400]
+	paddw	xmm1, [esp+640-144]
+	paddw	xmm7, xmm5
+	paddw	xmm5, [esp+640-592]
+	paddw	xmm6, xmm7
+	paddw	xmm6, [esp+640-624]
+	movdqa	xmm7, [esp+640-528]
+	psraw	xmm6, 2
+	psllw	xmm1, 1
+	paddw	xmm1, xmm5
+
+	movdqa	xmm5, [esp+656-272]
+	pandn	xmm7, xmm6
+	movdqa	xmm6, [esp+640-416]
+	paddw	xmm6, xmm1
+	movdqa	xmm1, [esp+640-560]
+	psraw	xmm6, 3
+	pand	xmm1, xmm6
+
+	movdqa	xmm6, [esp+704-272]
+	movdqa	 [esp+640-128], xmm1
+	movdqa	xmm1, [esp+672-272]
+	punpckhbw xmm1, xmm2
+	movdqa	 [esp+640-448], xmm1
+	movdqa	xmm1, [esp+688-272]
+	punpckhbw xmm1, xmm2
+	punpckhbw xmm6, xmm2
+	movdqa	 [esp+640-288], xmm7
+	punpckhbw xmm5, xmm2
+	movdqa	 [esp+640-496], xmm1
+	movdqa	 [esp+640-432], xmm6
+
+	movdqa	xmm7, [esp+720-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-464], xmm7
+
+	movdqa	xmm7, [esp+736-272]
+	punpckhbw xmm7, xmm2
+	movdqa	 [esp+640-528], xmm7
+
+	movdqa	xmm7, xmm6
+
+	psubw	xmm6, [esp+640-464]
+	psubw	xmm7, xmm1
+	pabsw	xmm7, xmm7
+	movdqa	 [esp+640-560], xmm7
+	por	xmm4, [esp+640-16]
+	pabsw	xmm6, xmm6
+	movdqa	xmm7, xmm1
+	psubw	xmm7, [esp+640-448]
+
+	movdqa	xmm1, [esp+640-512]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm1, xmm7
+	movdqa	xmm7, [esp+640-512]
+	pcmpgtw	xmm7, xmm6
+	movdqa	xmm6, [esp+640-320]
+	pand	xmm1, xmm7
+	movdqa	xmm7, [esp+640-560]
+	pcmpgtw	xmm6, xmm7
+	pand	xmm1, xmm6
+
+	movdqa	xmm6, [esp+640-576]
+	pcmpgtw	xmm6, xmm7
+
+	movdqa	xmm7, [esp+640-496]
+	punpckhbw xmm3, xmm2
+	movdqa	 [esp+640-560], xmm6
+	movdqa	xmm6, [esp+640-512]
+	psubw	xmm7, xmm5
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm6, xmm7
+
+	pand	xmm6, [esp+640-560]
+	movdqa	xmm7, [esp+640-432]
+	psubw	xmm7, [esp+640-528]
+
+	psllw	xmm3, 1
+	movdqa	 [esp+640-544], xmm6
+	movdqa	xmm6, [esp+640-512]
+
+	movdqa	xmm2, [esp+640-544]
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, xmm5
+	paddw	xmm3, [esp+640-448]
+	paddw	xmm3, [esp+640-496]
+	pabsw	xmm7, xmm7
+	pcmpgtw	xmm6, xmm7
+	pand	xmm6, [esp+640-560]
+	movdqa	 [esp+640-560], xmm6
+
+	movdqa	xmm6, xmm0
+	pand	xmm6, xmm4
+	movdqa	xmm4, xmm0
+	pandn	xmm4, [esp+640-368]
+	por	xmm6, xmm4
+	movdqa	xmm4, [esp+640-432]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-592]
+	psraw	xmm3, 3
+	pand	xmm3, xmm2
+	pandn	xmm2, xmm5
+	por	xmm3, xmm2
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm3
+	movdqa	xmm3, [esp+640-64]
+	por	xmm3, [esp+640-336]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm5
+	por	xmm7, xmm2
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-480]
+	por	xmm2, xmm3
+	packuswb xmm6, xmm7
+	movdqa	 [esp+640-336], xmm2
+	movdqa	 [esp+656-272], xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	xmm2, xmm5
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm3, xmm1
+	movdqa	xmm7, [esp+640-496]
+	paddw	xmm7, xmm4
+	paddw	xmm2, xmm7
+	paddw	xmm2, [esp+640-624]
+	movdqa	xmm7, [esp+640-544]
+	psraw	xmm2, 2
+	pand	xmm6, xmm2
+	movdqa	xmm2, [esp+640-448]
+	pandn	xmm7, xmm2
+	por	xmm6, xmm7
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm1
+	pandn	xmm6, xmm2
+	paddw	xmm2, [esp+640-496]
+	paddw	xmm2, xmm4
+	por	xmm3, xmm6
+	movdqa	xmm6, [esp+640-336]
+	packuswb xmm6, xmm3
+	psllw	xmm2, 1
+	movdqa	 [esp+672-272], xmm6
+	movdqa	xmm6, [esp+640-96]
+	por	xmm6, [esp+640-352]
+
+	movdqa	xmm3, xmm0
+	pand	xmm3, xmm6
+	movdqa	xmm6, xmm0
+	pandn	xmm6, [esp+640-144]
+	por	xmm3, xmm6
+	movdqa	xmm6, [esp+640-544]
+	movdqa	 [esp+640-352], xmm3
+	movdqa	xmm3, [esp+640-464]
+	paddw	xmm3, [esp+640-592]
+	paddw	xmm2, xmm3
+	movdqa	xmm3, [esp+640-448]
+	paddw	xmm5, xmm2
+	movdqa	xmm2, [esp+640-496]
+	psraw	xmm5, 3
+	pand	xmm6, xmm5
+	movdqa	xmm5, [esp+640-464]
+	paddw	xmm2, xmm5
+	paddw	xmm5, [esp+640-432]
+	movdqa	xmm4, xmm3
+	paddw	xmm4, xmm3
+	paddw	xmm4, xmm2
+	paddw	xmm4, [esp+640-624]
+	movdqa	xmm2, [esp+640-544]
+	paddw	xmm3, [esp+640-592]
+	psraw	xmm4, 2
+	pandn	xmm2, xmm4
+	por	xmm6, xmm2
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-496]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm6
+	por	xmm7, xmm2
+	movdqa	xmm2, [esp+640-352]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+688-272], xmm2
+	movdqa	xmm2, [esp+640-128]
+	por	xmm2, [esp+640-288]
+
+	movdqa	xmm4, xmm0
+	pand	xmm4, xmm2
+	paddw	xmm5, xmm6
+	movdqa	xmm2, xmm0
+	pandn	xmm2, [esp+640-400]
+	por	xmm4, xmm2
+	movdqa	xmm2, [esp+640-528]
+	psllw	xmm5, 1
+	paddw	xmm5, xmm3
+	movdqa	xmm3, [esp+640-560]
+	paddw	xmm2, xmm5
+	psraw	xmm2, 3
+	movdqa	 [esp+640-288], xmm4
+	movdqa	xmm4, [esp+640-560]
+	pand	xmm4, xmm2
+	movdqa	xmm2, [esp+640-464]
+	movdqa	xmm5, xmm2
+	paddw	xmm5, xmm2
+	movdqa	xmm2, [esp+640-432]
+	paddw	xmm2, [esp+640-448]
+	movdqa	xmm7, xmm1
+	paddw	xmm5, xmm2
+	paddw	xmm5, [esp+640-624]
+	movdqa	xmm6, [esp+640-560]
+	psraw	xmm5, 2
+	pandn	xmm3, xmm5
+	por	xmm4, xmm3
+	movdqa	xmm3, [esp+640-32]
+	por	xmm3, [esp+640-304]
+	pand	xmm7, xmm4
+	movdqa	xmm4, [esp+640-432]
+	movdqa	xmm5, [esp+640-464]
+	movdqa	xmm2, xmm1
+	pandn	xmm2, xmm4
+	paddw	xmm4, [esp+640-496]
+	por	xmm7, xmm2
+	movdqa	xmm2, [esp+640-288]
+	packuswb xmm2, xmm7
+	movdqa	 [esp+704-272], xmm2
+
+	movdqa	xmm2, xmm0
+	pand	xmm2, xmm3
+	movdqa	xmm3, xmm0
+	pandn	xmm3, [esp+640-384]
+	por	xmm2, xmm3
+	movdqa	 [esp+640-304], xmm2
+	movdqa	xmm2, [esp+640-528]
+	movdqa	xmm3, xmm2
+	paddw	xmm3, [esp+640-464]
+	paddw	xmm3, xmm4
+	paddw	xmm3, [esp+640-624]
+	psraw	xmm3, 2
+	pand	xmm6, xmm3
+	movdqa	xmm3, [esp+640-560]
+	movdqa	xmm4, xmm3
+	pandn	xmm4, xmm5
+	por	xmm6, xmm4
+	movdqa	xmm7, xmm1
+	pand	xmm7, xmm6
+	movdqa	xmm6, [esp+640-304]
+	movdqa	xmm4, xmm1
+	pandn	xmm4, xmm5
+	por	xmm7, xmm4
+
+	movdqa	xmm4, xmm0
+	pandn	xmm0, [esp+640-416]
+	packuswb xmm6, xmm7
+	movdqa	xmm7, [esp+640-112]
+	por	xmm7, [esp+640-80]
+	pand	xmm4, xmm7
+	por	xmm4, xmm0
+	movdqa	xmm0, [esp+752-272]
+	punpckhbw xmm0, [esp+640-48]
+	psllw	xmm0, 1
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm2
+	paddw	xmm0, xmm5
+	paddw	xmm0, [esp+640-432]
+	paddw	xmm0, [esp+640-496]
+	paddw	xmm0, [esp+640-592]
+	psraw	xmm0, 3
+	pand	xmm0, xmm3
+	movdqa	xmm7, xmm1
+	pandn	xmm3, xmm2
+	por	xmm0, xmm3
+	pand	xmm7, xmm0
+
+	movdqa	xmm0, [esp+656-272]
+	movdqa	 [edx], xmm0
+
+	movdqa	xmm0, [esp+672-272]
+
+	mov	edx, dword [esp+640-596]
+	movdqa	 [esi], xmm0
+	movdqa	xmm0, [esp+688-272]
+	movdqa	 [edi], xmm0
+	movdqa	xmm0, [esp+704-272]
+
+	pop	edi
+	pandn	xmm1, xmm2
+	movdqa	 [eax], xmm0
+	por	xmm7, xmm1
+	pop	esi
+	packuswb xmm4, xmm7
+	movdqa	 [edx], xmm6
+	movdqa	 [ecx], xmm4
+	pop	ebx
+	mov	esp, ebp
+	pop	ebp
+	ret
+  
+    
+;********************************************************************************
+;   Reads 16 rows of 8 bytes from pPixY (iStride bytes apart), applies SSE2_TransTwo8x8B, writes 128 bytes to pDst.
+;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);     
+;   Arguments are read from the stack. pDst is stored with movdqa, so it must be 16-byte aligned.
+;********************************************************************************
+
+WELS_EXTERN  DeblockLumaTransposeH2V_sse2
+
+ALIGN  16
+
+DeblockLumaTransposeH2V_sse2:
+    push    ebp
+    push    ebx                         ; ebx is used below to hold 3*iStride
+    mov     ebp,   esp                  ; after two pushes and the return address, argument 1 is at [ebp + 0Ch]
+    and     esp,0FFFFFFF0h              ; round esp down to a 16-byte boundary so [esp] can be used with movdqa
+    sub     esp,   10h                  ; reserve one 16-byte scratch slot at [esp]
+    ; fetch the arguments and derive the addresses used for the row loads
+    mov     eax,   [ebp + 0Ch]          ; eax = pPixY (rows 0..7)
+    mov     ecx,   [ebp + 10h]          ; ecx = iStride
+    lea     edx,   [eax + ecx * 8]      ; edx = pPixY + 8*iStride (rows 8..15)
+    lea     ebx,   [ecx*3]              ; ebx = 3*iStride
+    ; rows 0..3: xmmN = row N in the low qword, row N+8 in the high qword (xmm7 is the temporary)
+    movq    xmm0,  [eax] 
+    movq    xmm7,  [edx]
+    punpcklqdq   xmm0,  xmm7  
+    movq    xmm1,  [eax + ecx]
+    movq    xmm7,  [edx + ecx]
+    punpcklqdq   xmm1,  xmm7
+    movq    xmm2,  [eax + ecx*2] 
+    movq    xmm7,  [edx + ecx*2]
+    punpcklqdq   xmm2,  xmm7
+    movq    xmm3,  [eax + ebx]
+    movq    xmm7,  [edx + ebx]
+    punpcklqdq   xmm3,  xmm7
+    ; rows 4..6 paired with rows 12..14, same layout, after advancing both pointers by four rows
+    lea     eax,   [eax + ecx * 4]
+    lea     edx,   [edx + ecx * 4]
+    movq    xmm4,  [eax] 
+    movq    xmm7,  [edx]
+    punpcklqdq   xmm4,  xmm7  
+    movq    xmm5,  [eax + ecx]
+    movq    xmm7,  [edx + ecx]
+    punpcklqdq   xmm5,  xmm7
+    movq    xmm6,  [eax + ecx*2] 
+    movq    xmm7,  [edx + ecx*2]
+    punpcklqdq   xmm6,  xmm7
+    ; row 7 / row 15: all eight registers are live, so xmm0 is spilled to the scratch slot and borrowed as the temporary
+    movdqa  [esp],   xmm0
+    movq    xmm7,  [eax + ebx]
+    movq    xmm0,  [edx + ebx]
+    punpcklqdq   xmm7,  xmm0
+    movdqa  xmm0,   [esp]               ; restore rows 0/8
+    
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1  (m1..m8 = xmm0..xmm7; the stores below follow this order; macro is defined outside this file)
+    
+    mov    eax,   [ebp + 14h]           ; eax = pDst
+    movdqa  [eax],    xmm4 
+    movdqa  [eax + 10h],  xmm2
+    movdqa  [eax + 20h],  xmm3
+    movdqa  [eax + 30h],  xmm7
+    movdqa  [eax + 40h],  xmm5
+    movdqa  [eax + 50h],  xmm1
+    movdqa  [eax + 60h],  xmm6
+    movdqa  [eax + 70h],  xmm0   
+    
+    mov     esp,   ebp                  ; discard the aligned scratch area
+    pop     ebx
+    pop     ebp
+    ret
+    
+    
+    
+;*******************************************************************************************
+;   Reads 128 bytes from pSrc, applies SSE2_TransTwo8x8B, writes 16 rows of 8 bytes to pPixY (iStride bytes apart).
+;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
+;   Arguments are read from the stack. pSrc is loaded with movdqa, so it must be 16-byte aligned.
+;*******************************************************************************************
+
+WELS_EXTERN   DeblockLumaTransposeV2H_sse2
+
+ALIGN  16
+
+DeblockLumaTransposeV2H_sse2:
+    push     ebp
+    mov      ebp,   esp                 ; after one push and the return address, argument 1 is at [ebp + 08h]
+    ; build a 16-byte aligned scratch slot at [esp] for the transpose macro
+    and     esp,  0FFFFFFF0h
+    sub     esp,   10h  
+    
+    mov      eax,   [ebp + 10h]         ; eax = pSrc
+    mov      ecx,   [ebp + 0Ch]         ; ecx = iStride
+    mov      edx,   [ebp + 08h]         ; edx = pPixY
+    ; load the eight 16-byte source vectors
+    movdqa   xmm0,  [eax]
+    movdqa   xmm1,  [eax + 10h]
+    movdqa   xmm2,  [eax + 20h]
+    movdqa   xmm3,	[eax + 30h]
+    movdqa   xmm4,	[eax + 40h]
+    movdqa   xmm5,	[eax + 50h]
+    movdqa   xmm6,	[eax + 60h]
+    movdqa   xmm7,	[eax + 70h]
+    
+    SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
+    ;pOut: m5, m3, m4, m8, m6, m2, m7, m1  (m1..m8 = xmm0..xmm7; the stores below follow this order; macro is defined outside this file)
+    
+    lea      eax,   [ecx * 3]           ; eax = 3*iStride
+    ; low qwords of the eight result registers -> rows 0..7
+    movq     [edx],  xmm4 
+    movq     [edx + ecx],  xmm2
+    movq     [edx + ecx*2],  xmm3
+    movq     [edx + eax],  xmm7
+    
+    lea      edx,   [edx + ecx*4]
+    movq     [edx],  xmm5 
+    movq     [edx + ecx],  xmm1
+    movq     [edx + ecx*2],  xmm6
+    movq     [edx + eax],  xmm0    
+    ; shift each register right by 8 bytes so its high qword becomes the low qword
+    psrldq    xmm4,   8
+    psrldq    xmm2,   8
+    psrldq    xmm3,   8
+    psrldq    xmm7,   8
+    psrldq    xmm5,   8
+    psrldq    xmm1,   8
+    psrldq    xmm6,   8
+    psrldq    xmm0,   8
+    ; former high qwords -> rows 8..15, same register order as above
+    lea       edx,  [edx + ecx*4]
+    movq     [edx],  xmm4 
+    movq     [edx + ecx],  xmm2
+    movq     [edx + ecx*2],  xmm3
+    movq     [edx + eax],  xmm7
+    
+    lea      edx,   [edx + ecx*4]
+    movq     [edx],  xmm5 
+    movq     [edx + ecx],  xmm1
+    movq     [edx + ecx*2],  xmm6
+    movq     [edx + eax],  xmm0   
+    
+    
+    mov      esp,   ebp                 ; discard the aligned scratch area
+    pop      ebp
     ret
\ No newline at end of file
--- a/codec/encoder/core/asm/mc_chroma.asm
+++ b/codec/encoder/core/asm/mc_chroma.asm
@@ -1,317 +1,317 @@
-;*!
-;* \copy
-;*     Copyright (c)  2004-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  mc_chroma.asm
-;*
-;*  Abstract
-;*      mmx motion compensation for chroma
-;*
-;*  History
-;*      10/13/2004 Created
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-h264_d0x20_sse2:
-	dw 32,32,32,32,32,32,32,32
-ALIGN 16
-h264_d0x20_mmx:
-	dw 32,32,32,32
-
-
-;=============================================================================
-; Code
-;=============================================================================
-
-SECTION .text
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src, 
-;							int32_t iSrcStride, 
-;							uint8_t *pDst, 
-;							int32_t iDstStride, 
-;							uint8_t *pABCD, 
-;							int32_t iHeigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq4_mmx
-McChromaWidthEq4_mmx:
-	push esi
-	push edi
-	push ebx
-	
-	mov eax, [esp +12 + 20]
-	movd mm3, [eax]
-	WELS_Zero mm7
-	punpcklbw mm3, mm3
-	movq      mm4, mm3
-	punpcklwd mm3, mm3       
-	punpckhwd mm4, mm4		 
-	
-	movq	  mm5, mm3
-	punpcklbw mm3, mm7
-	punpckhbw mm5, mm7
-	
-	movq	  mm6, mm4
-	punpcklbw mm4, mm7
-	punpckhbw mm6, mm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
-	lea ebx, [esi + eax]
-	movd mm0, [esi]
-	movd mm1, [esi+1]
-	punpcklbw mm0, mm7
-	punpcklbw mm1, mm7
-.xloop:
-	
-	pmullw mm0, mm3
-	pmullw mm1, mm5
-	paddw  mm0, mm1
-	
-	movd  mm1, [ebx]
-	punpcklbw mm1, mm7
-	movq mm2, mm1
-	pmullw mm1, mm4
-	paddw mm0, mm1
-	
-	movd mm1, [ebx+1]
-	punpcklbw mm1, mm7
-	movq mm7, mm1
-	pmullw mm1,mm6
-	paddw mm0, mm1
-	movq mm1,mm7
-
-	paddw mm0, [h264_d0x20_mmx]
-	psrlw mm0, 6
-	
-	WELS_Zero mm7
-	packuswb mm0, mm7
-	movd [edi], mm0	
-
-	movq mm0, mm2
-	
-	lea edi, [edi +edx  ]
-	lea ebx, [ebx + eax]
-
-	dec ecx
-	jnz near .xloop
-	WELSEMMS
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-ALIGN 16
-;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc, 
-;						int32_t iSrcStride, 
-;						uint8_t *pDst, 
-;						int32_t iDstStride, 
-;						uint8_t *pABCD, 
-;						int32_t iheigh );
-;*******************************************************************************
-WELS_EXTERN McChromaWidthEq8_sse2
-McChromaWidthEq8_sse2:
-	push esi
-	push edi
-	push ebx
-	
-	mov eax, [esp +12 + 20]
-	movd xmm3, [eax]
-	WELS_Zero xmm7
-	punpcklbw  xmm3, xmm3
-	punpcklwd  xmm3, xmm3
-	
-	movdqa	   xmm4, xmm3
-	punpckldq  xmm3, xmm3
-	punpckhdq  xmm4, xmm4
-	movdqa     xmm5, xmm3
-	movdqa	   xmm6, xmm4
-	
-	punpcklbw  xmm3, xmm7
-	punpckhbw  xmm5, xmm7
-	punpcklbw  xmm4, xmm7
-	punpckhbw  xmm6, xmm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
-	lea ebx, [esi + eax]
-	movq xmm0, [esi]
-	movq xmm1, [esi+1]
-	punpcklbw xmm0, xmm7
-	punpcklbw xmm1, xmm7
-.xloop:
-	
-	pmullw xmm0, xmm3
-	pmullw xmm1, xmm5
-	paddw  xmm0, xmm1
-	
-	movq  xmm1, [ebx]
-	punpcklbw xmm1, xmm7
-	movdqa xmm2, xmm1
-	pmullw xmm1, xmm4
-	paddw xmm0, xmm1
-	
-	movq xmm1, [ebx+1]
-	punpcklbw xmm1, xmm7
-	movdqa xmm7, xmm1
-	pmullw xmm1, xmm6
-	paddw xmm0, xmm1
-	movdqa xmm1,xmm7
-
-	paddw xmm0, [h264_d0x20_sse2]
-	psrlw xmm0, 6
-	
-	WELS_Zero xmm7
-	packuswb xmm0, xmm7
-	movq [edi], xmm0	
-
-	movdqa xmm0, xmm2
-	
-	lea edi, [edi +edx  ]
-	lea ebx, [ebx + eax]
-
-	dec ecx
-	jnz near .xloop
-	
-	pop ebx
-	pop edi
-	pop esi
-	ret
-
-
-
-
-ALIGN 16
-;***********************************************************************
-; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-;						 int32_t iSrcStride, 
-;                        uint8_t *pDst,  
-;                        int32_t iDstStride,
-;                        uint8_t *pABCD,
-;					     int32_t iHeigh);
-;***********************************************************************
-WELS_EXTERN McChromaWidthEq8_ssse3
-McChromaWidthEq8_ssse3:
-	push ebx
-	push esi
-	push edi
-		
-	mov eax, [esp + 12 + 20]
-
-    pxor      xmm7, xmm7
-    movd   xmm5, [eax]   
-    punpcklwd xmm5, xmm5  
-    punpckldq xmm5, xmm5 
-    movdqa    xmm6, xmm5
-    punpcklqdq xmm5, xmm5
-    punpckhqdq xmm6, xmm6    
-    
-	mov eax, [esp + 12 + 4]   
-	mov edx, [esp + 12 + 8]   
-	mov esi, [esp + 12 + 12]  
-	mov edi, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-    
-    sub esi, edi
-    sub esi, edi
-	movdqa xmm7, [h264_d0x20_sse2]
-
-	movdqu xmm0, [eax]
-	movdqa xmm1, xmm0
-	psrldq xmm1, 1
-	punpcklbw xmm0, xmm1
-	
-.hloop_chroma:	
-	lea	esi, [esi+2*edi]
-	
-	movdqu xmm2, [eax+edx]
-	movdqa xmm3, xmm2
-	psrldq xmm3, 1
-	punpcklbw xmm2, xmm3
-	movdqa      xmm4, xmm2
-	
-    pmaddubsw  xmm0, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm0, xmm2
-    paddw      xmm0, xmm7
-	psrlw      xmm0, 6
-    packuswb   xmm0, xmm0
-    movq       [esi],xmm0	
-    
-    lea eax, [eax+2*edx]
-    movdqu xmm2, [eax]
-    movdqa xmm3, xmm2
-    psrldq xmm3, 1
-    punpcklbw xmm2, xmm3
-    movdqa      xmm0, xmm2
-    
-    pmaddubsw  xmm4, xmm5
-    pmaddubsw  xmm2, xmm6
-    paddw      xmm4, xmm2
-    paddw      xmm4, xmm7
-	psrlw      xmm4, 6
-    packuswb   xmm4, xmm4
-    movq       [esi+edi],xmm4	
-	
-	sub ecx, 2
-	jnz .hloop_chroma
-	pop edi
-	pop esi
-	pop ebx
-
-	ret
-
-
+;*!
+;* \copy
+;*     Copyright (c)  2004-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  mc_chroma.asm
+;*
+;*  Abstract
+;*      mmx motion compensation for chroma
+;*
+;*  History
+;*      10/13/2004 Created
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (trigonometric values or rounding values)
+;***********************************************************************
+
+ALIGN 16
+h264_d0x20_sse2:
+	dw 32,32,32,32,32,32,32,32
+ALIGN 16
+h264_d0x20_mmx:
+	dw 32,32,32,32
+
+
+;=============================================================================
+; Code
+;=============================================================================
+
+SECTION .text
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq4_mmx( uint8_t *src, 
+;							int32_t iSrcStride, 
+;							uint8_t *pDst, 
+;							int32_t iDstStride, 
+;							uint8_t *pABCD, 
+;							int32_t iHeigh );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq4_mmx
+McChromaWidthEq4_mmx:		; per row, 4 pixels: dst[x] = (A*src[x] + B*src[x+1] + C*below[x] + D*below[x+1] + 32) >> 6, with A,B,C,D = pABCD[0..3]
+	push esi
+	push edi
+	push ebx
+	; three pushes (12 bytes) + return address: argument k (1-based) is at [esp + 12 + 4*k]
+	mov eax, [esp +12 + 20]		; eax = pABCD
+	movd mm3, [eax]			; mm3 = bytes A B C D
+	WELS_Zero mm7
+	punpcklbw mm3, mm3		; A A B B C C D D
+	movq      mm4, mm3
+	punpcklwd mm3, mm3       	; A A A A B B B B
+	punpckhwd mm4, mm4		 ; C C C C D D D D
+	; widen the weights to words against mm7 (assumes WELS_Zero cleared mm7 -- macro defined outside this file, TODO confirm)
+	movq	  mm5, mm3
+	punpcklbw mm3, mm7		; mm3 = four words of A
+	punpckhbw mm5, mm7		; mm5 = four words of B
+	
+	movq	  mm6, mm4
+	punpcklbw mm4, mm7		; mm4 = four words of C
+	punpckhbw mm6, mm7		; mm6 = four words of D
+	
+	mov esi, [esp +12+ 4]   	; esi = src
+	mov eax, [esp + 12 + 8]   	; eax = iSrcStride
+	mov edi, [esp + 12 + 12]  	; edi = pDst
+	mov edx, [esp + 12 + 16]  	; edx = iDstStride
+    mov ecx, [esp + 12 + 24]   	; ecx = iHeigh (row counter)
+		
+	lea ebx, [esi + eax]		; ebx = address of the row below the current one
+	movd mm0, [esi]
+	movd mm1, [esi+1]
+	punpcklbw mm0, mm7		; mm0 = current row, pixels x..x+3 as words
+	punpcklbw mm1, mm7		; mm1 = current row, pixels x+1..x+4 as words
+.xloop:
+	; loop invariant: mm0/mm1 hold the unscaled current row (at x and x+1), ebx points to the row below, mm7 is the unpack operand
+	pmullw mm0, mm3
+	pmullw mm1, mm5
+	paddw  mm0, mm1			; A*cur[x] + B*cur[x+1]
+	
+	movd  mm1, [ebx]
+	punpcklbw mm1, mm7
+	movq mm2, mm1			; keep unscaled below[x]; it becomes the current row next iteration
+	pmullw mm1, mm4
+	paddw mm0, mm1			; + C*below[x]
+	
+	movd mm1, [ebx+1]
+	punpcklbw mm1, mm7
+	movq mm7, mm1			; park unscaled below[x+1] in mm7 while mm1 is multiplied
+	pmullw mm1,mm6
+	paddw mm0, mm1			; + D*below[x+1]
+	movq mm1,mm7			; mm1 = below[x+1] for the next iteration
+
+	paddw mm0, [h264_d0x20_mmx]	; + 32 (rounding term for the shift by 6)
+	psrlw mm0, 6
+	; mm7 was used as a temporary above; WELS_Zero is invoked again before mm7 is used as the pack/unpack operand
+	WELS_Zero mm7
+	packuswb mm0, mm7		; words -> bytes with unsigned saturation
+	movd [edi], mm0			; store 4 output pixels
+
+	movq mm0, mm2			; next iteration's current row at x
+	
+	lea edi, [edi +edx  ]		; pDst += iDstStride
+	lea ebx, [ebx + eax]		; row below += iSrcStride
+
+	dec ecx
+	jnz near .xloop
+	WELSEMMS			; macro defined outside this file; presumably executes emms after the MMX code -- TODO confirm
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+ALIGN 16
+;*******************************************************************************
+; void McChromaWidthEq8_sse2( uint8_t *pSrc, 
+;						int32_t iSrcStride, 
+;						uint8_t *pDst, 
+;						int32_t iDstStride, 
+;						uint8_t *pABCD, 
+;						int32_t iheigh );
+;*******************************************************************************
+WELS_EXTERN McChromaWidthEq8_sse2
+McChromaWidthEq8_sse2:		; per row, 8 pixels: dst[x] = (A*src[x] + B*src[x+1] + C*below[x] + D*below[x+1] + 32) >> 6, with A,B,C,D = pABCD[0..3]
+	push esi
+	push edi
+	push ebx
+	; three pushes (12 bytes) + return address: argument k (1-based) is at [esp + 12 + 4*k]
+	mov eax, [esp +12 + 20]		; eax = pABCD
+	movd xmm3, [eax]		; low dword = bytes A B C D
+	WELS_Zero xmm7
+	punpcklbw  xmm3, xmm3		; A A B B C C D D
+	punpcklwd  xmm3, xmm3		; A x4, B x4, C x4, D x4
+	
+	movdqa	   xmm4, xmm3
+	punpckldq  xmm3, xmm3		; A x8, B x8
+	punpckhdq  xmm4, xmm4		; C x8, D x8
+	movdqa     xmm5, xmm3
+	movdqa	   xmm6, xmm4
+	; widen the weights to words against xmm7 (assumes WELS_Zero cleared xmm7 -- macro defined outside this file, TODO confirm)
+	punpcklbw  xmm3, xmm7		; xmm3 = eight words of A
+	punpckhbw  xmm5, xmm7		; xmm5 = eight words of B
+	punpcklbw  xmm4, xmm7		; xmm4 = eight words of C
+	punpckhbw  xmm6, xmm7		; xmm6 = eight words of D
+	
+	mov esi, [esp +12+ 4]   	; esi = pSrc
+	mov eax, [esp + 12 + 8]   	; eax = iSrcStride
+	mov edi, [esp + 12 + 12]  	; edi = pDst
+	mov edx, [esp + 12 + 16]  	; edx = iDstStride
+    mov ecx, [esp + 12 + 24]   	; ecx = iheigh (row counter)
+		
+	lea ebx, [esi + eax]		; ebx = address of the row below the current one
+	movq xmm0, [esi]
+	movq xmm1, [esi+1]
+	punpcklbw xmm0, xmm7		; xmm0 = current row, pixels x..x+7 as words
+	punpcklbw xmm1, xmm7		; xmm1 = current row, pixels x+1..x+8 as words
+.xloop:
+	; loop invariant: xmm0/xmm1 hold the unscaled current row (at x and x+1), ebx points to the row below, xmm7 is the unpack operand
+	pmullw xmm0, xmm3
+	pmullw xmm1, xmm5
+	paddw  xmm0, xmm1		; A*cur[x] + B*cur[x+1]
+	
+	movq  xmm1, [ebx]
+	punpcklbw xmm1, xmm7
+	movdqa xmm2, xmm1		; keep unscaled below[x]; it becomes the current row next iteration
+	pmullw xmm1, xmm4
+	paddw xmm0, xmm1		; + C*below[x]
+	
+	movq xmm1, [ebx+1]
+	punpcklbw xmm1, xmm7
+	movdqa xmm7, xmm1		; park unscaled below[x+1] in xmm7 while xmm1 is multiplied
+	pmullw xmm1, xmm6
+	paddw xmm0, xmm1		; + D*below[x+1]
+	movdqa xmm1,xmm7		; xmm1 = below[x+1] for the next iteration
+
+	paddw xmm0, [h264_d0x20_sse2]	; + 32 (rounding term for the shift by 6)
+	psrlw xmm0, 6
+	; xmm7 was used as a temporary above; WELS_Zero is invoked again before xmm7 is used as the pack/unpack operand
+	WELS_Zero xmm7
+	packuswb xmm0, xmm7		; words -> bytes with unsigned saturation
+	movq [edi], xmm0		; store 8 output pixels
+
+	movdqa xmm0, xmm2		; next iteration's current row at x
+	
+	lea edi, [edi +edx  ]		; pDst += iDstStride
+	lea ebx, [ebx + eax]		; row below += iSrcStride
+
+	dec ecx
+	jnz near .xloop
+	
+	pop ebx
+	pop edi
+	pop esi
+	ret
+
+
+
+
+ALIGN 16
+;***********************************************************************
+; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
+;						 int32_t iSrcStride, 
+;                        uint8_t *pDst,  
+;                        int32_t iDstStride,
+;                        uint8_t *pABCD,
+;					     int32_t iHeigh);
+;***********************************************************************
+WELS_EXTERN McChromaWidthEq8_ssse3
+McChromaWidthEq8_ssse3:		; 8 pixels wide, two rows per iteration: dst[x] = (A*src[x] + B*src[x+1] + C*below[x] + D*below[x+1] + 32) >> 6
+	push ebx
+	push esi
+	push edi
+		; three pushes (12 bytes) + return address: argument k (1-based) is at [esp + 12 + 4*k]
+	mov eax, [esp + 12 + 20]	; eax = pABCD
+
+    pxor      xmm7, xmm7		; NOTE(review): xmm7 is reloaded with the rounding constant below before it is ever read
+    movd   xmm5, [eax]   		; low dword = bytes A B C D
+    punpcklwd xmm5, xmm5  		; A B A B C D C D
+    punpckldq xmm5, xmm5 		; (A B) x4, (C D) x4
+    movdqa    xmm6, xmm5
+    punpcklqdq xmm5, xmm5		; xmm5 = (A B) x8, byte weights for the upper row
+    punpckhqdq xmm6, xmm6    		; xmm6 = (C D) x8, byte weights for the lower row
+    
+	mov eax, [esp + 12 + 4]   	; eax = pSrc
+	mov edx, [esp + 12 + 8]   	; edx = iSrcStride
+	mov esi, [esp + 12 + 12]  	; esi = pDst
+	mov edi, [esp + 12 + 16]  	; edi = iDstStride
+    mov ecx, [esp + 12 + 24]   	; ecx = iHeigh (rows remaining)
+    ; bias pDst back by two rows because the loop advances it by two rows before each pair of stores
+    sub esi, edi
+    sub esi, edi
+	movdqa xmm7, [h264_d0x20_sse2]	; xmm7 = eight words of 32 (rounding term)
+
+	movdqu xmm0, [eax]		; NOTE(review): loads 16 bytes although only bytes 0..8 are used below
+	movdqa xmm1, xmm0
+	psrldq xmm1, 1
+	punpcklbw xmm0, xmm1		; xmm0 = p0 p1 p1 p2 ... p7 p8 (each pixel paired with its right neighbour)
+	
+.hloop_chroma:			; on entry xmm0 = interleaved current row, eax = its address
+	lea	esi, [esi+2*edi]	; advance pDst to this iteration's first output row
+	
+	movdqu xmm2, [eax+edx]		; next source row
+	movdqa xmm3, xmm2
+	psrldq xmm3, 1
+	punpcklbw xmm2, xmm3		; interleave as above
+	movdqa      xmm4, xmm2		; keep a copy: it is the upper row of the second output row
+	
+    pmaddubsw  xmm0, xmm5		; A*cur[x] + B*cur[x+1] per word
+    pmaddubsw  xmm2, xmm6		; C*below[x] + D*below[x+1] per word
+    paddw      xmm0, xmm2
+    paddw      xmm0, xmm7		; + 32
+	psrlw      xmm0, 6
+    packuswb   xmm0, xmm0
+    movq       [esi],xmm0		; first output row of the pair
+    
+    lea eax, [eax+2*edx]		; pSrc += 2*iSrcStride
+    movdqu xmm2, [eax]
+    movdqa xmm3, xmm2
+    psrldq xmm3, 1
+    punpcklbw xmm2, xmm3
+    movdqa      xmm0, xmm2		; becomes the current row of the next iteration
+    
+    pmaddubsw  xmm4, xmm5
+    pmaddubsw  xmm2, xmm6
+    paddw      xmm4, xmm2
+    paddw      xmm4, xmm7
+	psrlw      xmm4, 6
+    packuswb   xmm4, xmm4
+    movq       [esi+edi],xmm4		; second output row of the pair
+	
+	sub ecx, 2			; the loop exits only when the count reaches exactly 0, so iHeigh must be a positive even number
+	jnz .hloop_chroma
+	pop edi
+	pop esi
+	pop ebx
+
+	ret
+
+
--- a/processing/build/linux/makefile
+++ b/processing/build/linux/makefile
@@ -1,94 +1,94 @@
-NASM = 1
-NAME      = libwelsvp
-
-OUTDIR    = ../../../bin/linux
-BINDIR    = ../../bin
-OBJDIR    = ../../obj  
-SRCDIRS   = ../../src/asm \
-            ../../src/common \
-            ../../src/adaptivequantization \
-            ../../src/backgounddetection \
-            ../../src/denoise \
-            ../../src/downsample \
-            ../../src/scenechangedetection \
-            ../../src/vaacalc \
-            ../../src/complexityanalysis 
-SRCDIRS  += ../../src/imagerotate
-
-
-TARGETLIB =  $(BINDIR)/$(NAME).so
-
-CC        = $(shell which gcc)
-AS        = $(shell which nasm)
-GCC       = gcc -m32
-
-CPPFLAGS  = -Wall -g -O3
-ifeq ($(NASM), 1)
-CPPFLAGS += -DX86_ASM
-endif
-ASMFLAGS  = -f elf -DNOPREFIX  -I ../../src/asm/
-LDFLAGS   = -lstdc++ -ldl
-          
-SRCEXTS  = .cpp
-ifeq ($(NASM), 1)
-SRCEXTS += .asm
-endif
-HDREXTS  = .h
-SOURCES  = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS))))
-HEADERS  = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(HDREXTS))))
-SRC_CPP  = $(filter %.cpp,$(SOURCES))
-SRC_ASM  = $(filter %.asm,$(SOURCES))
-OBJS     = $(addsuffix .o, $(basename $(SOURCES)))
-DEPS     = $(OBJS:.o=.d)
-
-DEP_OPT  = $(shell if `$(CC) --version | grep "GCC" >/dev/null`; then \
-                  echo "-MM -MP"; else echo "-M"; fi )
-DEPEND_cpp.d  = $(subst -g ,,$(CC) $(DEP_OPT) $(CPPFLAGS))
-DEPEND_asm.d  = $(subst -g ,,$(AS) $(DEP_OPT) $(ASMFLAGS))
-COMPILE.cpp   = $(GCC) $(CPPFLAGS) -c
-COMPILE.asm   = $(AS)  $(ASMFLAGS)
-LINK          = $(GCC) $(LDFLAGS)
-
-.PHONY: all objs tags ctags clean distclean
-
-.SUFFIXES:
-
-all: $(TARGETLIB)
-	
-%.d:%.cpp
-	@echo -n $(dir $<) > $@
-	@$(DEPEND_cpp.d) $< >> $@
-	
-%.d:%.asm
-	@echo -n $(dir $<) > $@
-	@$(DEPEND_asm.d) $< >> $@
-
-objs:$(OBJS)
-
-%.o:%.cpp
-	$(COMPILE.cpp) $< -o $@
-	
-%.o:%.asm
-	$(COMPILE.asm) $< -o $@	
-
-tags: $(HEADERS) $(SOURCES)
-	etags $(HEADERS) $(SOURCES)
-
-ctags: $(HEADERS) $(SOURCES)
-	ctags $(HEADERS) $(SOURCES)
-
-$(TARGETLIB):$(OBJS)
-	@if test ! -d $(BINDIR) ; then mkdir -p $(BINDIR) ; fi
-	$(LINK) $(OBJS) -shared -Wl,-Bsymbolic -o $@
-	@echo produce the lib to $(TARGETLIB).
-	@if test ! -d $(OUTDIR) ; then mkdir -p $(OUTDIR) ; fi
-	@cp -f $(TARGETLIB) $(OUTDIR)
-	@cp -f $(TARGETLIB) ../../../testbin
-	@echo copy the lib to $(OUTDIR).
-
-clean:
-	rm -f $(OBJS) $(TARGETLIB)
-
-distclean: clean
-	rm -f $(DEPS) TAGS
-
+NASM = 1
+NAME      = libwelsvp
+
+OUTDIR    = ../../../bin/linux
+BINDIR    = ../../bin
+OBJDIR    = ../../obj  
+SRCDIRS   = ../../src/asm \
+            ../../src/common \
+            ../../src/adaptivequantization \
+            ../../src/backgounddetection \
+            ../../src/denoise \
+            ../../src/downsample \
+            ../../src/scenechangedetection \
+            ../../src/vaacalc \
+            ../../src/complexityanalysis 
+SRCDIRS  += ../../src/imagerotate
+
+# Output library path, built from BINDIR and NAME.
+TARGETLIB =  $(BINDIR)/$(NAME).so
+# CC is used only by the C++ dependency command; AS assembles and generates asm dependencies; GCC (gcc -m32) compiles and links.
+CC        = $(shell which gcc)
+AS        = $(shell which nasm)
+GCC       = gcc -m32
+# Compiler, assembler and linker flags; X86_ASM is defined only when NASM is 1.
+CPPFLAGS  = -Wall -g -O3
+ifeq ($(NASM), 1)
+CPPFLAGS += -DX86_ASM
+endif
+ASMFLAGS  = -f elf -DNOPREFIX  -I ../../src/asm/
+LDFLAGS   = -lstdc++ -ldl
+          
+SRCEXTS  = .cpp
+ifeq ($(NASM), 1)
+SRCEXTS += .asm
+endif
+HDREXTS  = .h
+SOURCES  = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(SRCEXTS))))
+HEADERS  = $(foreach d,$(SRCDIRS),$(wildcard $(addprefix $(d)/*,$(HDREXTS))))
+SRC_CPP  = $(filter %.cpp,$(SOURCES))
+SRC_ASM  = $(filter %.asm,$(SOURCES))
+OBJS     = $(addsuffix .o, $(basename $(SOURCES)))
+DEPS     = $(OBJS:.o=.d)
+# Command lines for dependency generation, compiling and linking. DEP_OPT is "-MM -MP" when the CC version banner contains "GCC", otherwise "-M"; "-g " is removed from the dependency commands.
+DEP_OPT  = $(shell if `$(CC) --version | grep "GCC" >/dev/null`; then \
+                  echo "-MM -MP"; else echo "-M"; fi )
+DEPEND_cpp.d  = $(subst -g ,,$(CC) $(DEP_OPT) $(CPPFLAGS))
+DEPEND_asm.d  = $(subst -g ,,$(AS) $(DEP_OPT) $(ASMFLAGS))
+COMPILE.cpp   = $(GCC) $(CPPFLAGS) -c
+COMPILE.asm   = $(AS)  $(ASMFLAGS)
+LINK          = $(GCC) $(LDFLAGS)
+
+.PHONY: all objs tags ctags clean distclean
+
+.SUFFIXES:
+
+all: $(TARGETLIB)
+	
+%.d:%.cpp
+	@echo -n $(dir $<) > $@
+	@$(DEPEND_cpp.d) $< >> $@
+	
+%.d:%.asm
+	@echo -n $(dir $<) > $@
+	@$(DEPEND_asm.d) $< >> $@
+
+objs:$(OBJS)
+
+%.o:%.cpp
+	$(COMPILE.cpp) $< -o $@
+	
+%.o:%.asm
+	$(COMPILE.asm) $< -o $@	
+
+tags: $(HEADERS) $(SOURCES)
+	etags $(HEADERS) $(SOURCES)
+
+ctags: $(HEADERS) $(SOURCES)
+	ctags $(HEADERS) $(SOURCES)
+# Links all objects into the shared library, then copies it to OUTDIR and ../../../testbin. NOTE(review): LINK expands LDFLAGS (the -l options) before the object list; confirm this link order is intended.
+$(TARGETLIB):$(OBJS)
+	@if test ! -d $(BINDIR) ; then mkdir -p $(BINDIR) ; fi
+	$(LINK) $(OBJS) -shared -Wl,-Bsymbolic -o $@
+	@echo produce the lib to $(TARGETLIB).
+	@if test ! -d $(OUTDIR) ; then mkdir -p $(OUTDIR) ; fi
+	@cp -f $(TARGETLIB) $(OUTDIR)
+	@cp -f $(TARGETLIB) ../../../testbin
+	@echo copy the lib to $(OUTDIR).
+
+clean:
+	rm -f $(OBJS) $(TARGETLIB)
+
+distclean: clean
+	rm -f $(DEPS) TAGS
+
--- a/processing/src/asm/denoisefilter.asm
+++ b/processing/src/asm/denoisefilter.asm
@@ -1,263 +1,263 @@
-;*!
-;* \copy
-;*     Copyright (c)  2010-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  predenoise.asm
-;*
-;*  Abstract
-;*      denoise for SVC2.1
-;*  History
-;*      4/13/2010 Created
-;*      7/30/2010 Modified
-;*
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-
-;***********************************************************************
-; Constant
-;***********************************************************************
-SECTION .rodata align=16
-
-sse2_32 times 8 dw 32
-sse2_20 times 8 dw 20
-
-
-BITS 32
-;***********************************************************************
-; Code
-;***********************************************************************
-SECTION .text
-	
-%macro	WEIGHT_LINE	9
-		movq		%2,	%9
-		punpcklbw	%2,	%7
-		movdqa		%8,	%2
-		
-		movdqa		%1,	%6
-		psubusb		%1,	%8
-		psubusb		%8,	%6
-		por			%8,	%1		; ABS(curPixel - centerPixel);
-		
-		movdqa		%1,	%3
-		psubusb		%1,	%8
-
-		pmullw		%1,	%1
-		psrlw		%1,	5
-		pmullw		%2,	%1		
-		paddusw		%4,	%1
-		paddusw		%5,	%2	
-%endmacro
-
-%macro	WEIGHT_LINE1_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-%endmacro
-
-%macro	WEIGHT_LINE2_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		paddw		%3,	%2
-%endmacro
-
-%macro	WEIGHT_LINE3_UV	4
-		movdqa		%2,	%1
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	1
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
-
-		movdqa		%2,	%1
-		psrldq		%2,	2
-		punpcklbw	%2,	%4
-		pmullw		%2,	[sse2_20]
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	3
-		punpcklbw	%2,	%4
-		psllw		%2,	2
-		paddw		%3,	%2
-		
-		movdqa		%2,	%1
-		psrldq		%2,	4
-		punpcklbw	%2,	%4
-		psllw		%2,	1
-		paddw		%3,	%2
-%endmacro
-
-ALIGN 16
-WELS_EXTERN BilateralLumaFilter8_sse2
-;***********************************************************************
-;  BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
-;***********************************************************************
-;	1	2	3
-;	4	0	5
-;	6	7	8
-;	0:	the center point
-%define		pushsize	4
-%define		pixel		esp + pushsize + 4
-%define		stride		esp + pushsize + 8
-BilateralLumaFilter8_sse2:
-		push		ebx
-		
-		pxor		xmm7,	xmm7
-		mov			eax,	[pixel]
-		mov			ebx,	eax
-		movq		xmm6,	[eax]
-		punpcklbw	xmm6,	xmm7
-		movdqa		xmm3,	[sse2_32]
-		pxor		xmm4,	xmm4		; nTotWeight
-		pxor		xmm5,	xmm5		; nSum
-		
-		dec			eax
-		mov			ecx,	[stride]
-		
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 4
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 5
-		
-		sub			eax,	ecx
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 1
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 2
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 3
-		
-		lea			eax,	[eax + ecx * 2]
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 6
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 7
-		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 8
-		
-		pcmpeqw		xmm0,	xmm0
-		psrlw		xmm0,	15
-		psllw		xmm0,	8
-		psubusw		xmm0,	xmm4
-		pmullw		xmm0,	xmm6
-		paddusw		xmm5,	xmm0
-		psrlw		xmm5,	8
-		packuswb	xmm5,	xmm5
-		movq		[ebx],	xmm5		
-		
-		pop ebx
-		ret	
-
-WELS_EXTERN WaverageChromaFilter8_sse2
-;***********************************************************************
-; void		WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
-;***********************************************************************
-;5x5 filter:
-;1	1	2	1	1
-;1	2	4	2	1
-;2	4	20	4	2
-;1	2	4	2	1
-;1	1	2	1	1
-
-ALIGN 16
-WaverageChromaFilter8_sse2:
-		mov		edx,	[esp + 4]	; pixels
-		mov		ecx,	[esp + 8]	; stride
-		
-		mov		eax,	ecx
-		add		eax,	eax
-		sub		edx,	eax			; pixels - 2 * stride
-		sub		edx,	2
-			
-		pxor	xmm0,	xmm0	
-		pxor	xmm3,	xmm3
-	
-		movdqu		xmm1,	[edx]
-		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
-		
-		movdqu		xmm1,	[edx + ecx]
-		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0	
-		
-		add		edx,	eax	
-		movdqu		xmm1,	[edx]
-		WEIGHT_LINE3_UV	xmm1,	xmm2,	xmm3,	xmm0
-		
-		movdqu		xmm1,	[edx + ecx]
-		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0	
-		
-		movdqu		xmm1,	[edx + ecx * 2]
-		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0		
-	
-		psrlw		xmm3,		6
-		packuswb	xmm3,		xmm3
-		movq		[edx + 2],		xmm3			
-
+;*!
+;* \copy
+;*     Copyright (c)  2010-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  denoisefilter.asm
+;*
+;*  Abstract
+;*      denoise for SVC2.1
+;*  History
+;*      4/13/2010 Created
+;*      7/30/2010 Modified
+;*
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+
+;***********************************************************************
+; Constant
+;***********************************************************************
+SECTION .rodata align=16
+
+sse2_32 times 8 dw 32
+sse2_20 times 8 dw 20
+
+
+BITS 32
+;***********************************************************************
+; Code
+;***********************************************************************
+SECTION .text
+	; WEIGHT_LINE tmp, cur, limit, totWeight, sum, center, zero, tmp2, mem: for 8 pixels, add one neighbour's weight to %4 and its weighted value to %5.
+%macro	WEIGHT_LINE	9
+		movq		%2,	%9		; %2 = 8 neighbour bytes loaded from memory operand %9
+		punpcklbw	%2,	%7		; interleave with %7 (zero at the call sites in this file) -> 8 words
+		movdqa		%8,	%2		; %8 = scratch copy of the neighbour words
+		; |neighbour - center| built from two saturating subtractions OR-ed together (%6 = center words)
+		movdqa		%1,	%6
+		psubusb		%1,	%8		; %1 = center - neighbour, clamped at 0
+		psubusb		%8,	%6		; %8 = neighbour - center, clamped at 0
+		por			%8,	%1		; ABS(curPixel - centerPixel);
+		; %1 = %3 - |diff|, clamped at 0 (the caller in this file passes %3 = 8 words of 32)
+		movdqa		%1,	%3
+		psubusb		%1,	%8
+; weight = (%3 - |diff|)^2 >> 5
+		pmullw		%1,	%1
+		psrlw		%1,	5
+		pmullw		%2,	%1		; %2 = neighbour * weight (low 16 bits of each product)
+		paddusw		%4,	%1		; %4 += weight (unsigned saturating add)
+		paddusw		%5,	%2	; %5 += neighbour * weight (unsigned saturating add)
+%endmacro
+; WEIGHT_LINE1_UV src, tmp, acc, zero: acc (8 words) += 5-tap horizontal sum of the bytes in src with weights 1 1 2 1 1; tmp is clobbered.
+%macro	WEIGHT_LINE1_UV	4
+		movdqa		%2,	%1
+		punpcklbw	%2,	%4		; bytes 0..7 widened to words (%4 is zero at the call sites in this file)
+		paddw		%3,	%2		; tap 0, weight 1
+; tap 1: source shifted down by one byte, weight 1
+		movdqa		%2,	%1
+		psrldq		%2,	1
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+; tap 2: source shifted down by two bytes, weight 2 (words shifted left by 1)
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+		; tap 3: source shifted down by three bytes, weight 1
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+		; tap 4: source shifted down by four bytes, weight 1
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+%endmacro
+; WEIGHT_LINE2_UV src, tmp, acc, zero: acc (8 words) += 5-tap horizontal sum of the bytes in src with weights 1 2 4 2 1; tmp is clobbered.
+%macro	WEIGHT_LINE2_UV	4
+		movdqa		%2,	%1
+		punpcklbw	%2,	%4		; bytes 0..7 widened to words (%4 is zero at the call sites in this file)
+		paddw		%3,	%2		; tap 0, weight 1
+; tap 1: source shifted down by one byte, weight 2 (words shifted left by 1)
+		movdqa		%2,	%1
+		psrldq		%2,	1
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+; tap 2: source shifted down by two bytes, weight 4 (words shifted left by 2)
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		psllw		%2,	2
+		paddw		%3,	%2
+		; tap 3: source shifted down by three bytes, weight 2
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+		; tap 4: source shifted down by four bytes, weight 1
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		paddw		%3,	%2
+%endmacro
+; WEIGHT_LINE3_UV src, tmp, acc, zero: acc (8 words) += 5-tap horizontal sum of the bytes in src with weights 2 4 20 4 2; tmp is clobbered.
+%macro	WEIGHT_LINE3_UV	4
+		movdqa		%2,	%1
+		punpcklbw	%2,	%4		; bytes 0..7 widened to words (%4 is zero at the call sites in this file)
+		psllw		%2,	1		; tap 0, weight 2
+		paddw		%3,	%2
+; tap 1: source shifted down by one byte, weight 4 (words shifted left by 2)
+		movdqa		%2,	%1
+		psrldq		%2,	1
+		punpcklbw	%2,	%4
+		psllw		%2,	2
+		paddw		%3,	%2
+; tap 2: source shifted down by two bytes, weight 20 (multiply by the sse2_20 constant from .rodata)
+		movdqa		%2,	%1
+		psrldq		%2,	2
+		punpcklbw	%2,	%4
+		pmullw		%2,	[sse2_20]
+		paddw		%3,	%2
+		; tap 3: source shifted down by three bytes, weight 4
+		movdqa		%2,	%1
+		psrldq		%2,	3
+		punpcklbw	%2,	%4
+		psllw		%2,	2
+		paddw		%3,	%2
+		; tap 4: source shifted down by four bytes, weight 2
+		movdqa		%2,	%1
+		psrldq		%2,	4
+		punpcklbw	%2,	%4
+		psllw		%2,	1
+		paddw		%3,	%2
+%endmacro
+; Filters 8 horizontally adjacent pixels in place; both arguments are read from the stack (32-bit code).
+ALIGN 16
+WELS_EXTERN BilateralLumaFilter8_sse2
+;***********************************************************************
+;  BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+;	1	2	3
+;	4	0	5
+;	6	7	8
+;	0:	the center point; 1..8 are its neighbours, numbered as in the "pixel N" comments below
+%define		pushsize	4
+%define		pixel		esp + pushsize + 4
+%define		stride		esp + pushsize + 8
+BilateralLumaFilter8_sse2:
+		push		ebx		; ebx holds the output pointer below and is restored before ret
+		; xmm7 = 0 (unpack helper), xmm6 = the 8 center pixels widened to words, xmm3 = 8 words of 32
+		pxor		xmm7,	xmm7
+		mov			eax,	[pixel]		; first stack argument (pushsize accounts for the pushed ebx)
+		mov			ebx,	eax
+		movq		xmm6,	[eax]
+		punpcklbw	xmm6,	xmm7
+		movdqa		xmm3,	[sse2_32]
+		pxor		xmm4,	xmm4		; nTotWeight
+		pxor		xmm5,	xmm5		; nSum
+		; eax = pixels - 1: the left neighbour of the first pixel in the current row
+		dec			eax
+		mov			ecx,	[stride]
+		; current row: left and right neighbours
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 4
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 5
+		; row above: eax -= stride
+		sub			eax,	ecx
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 1
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 2
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 3
+		; row below: eax += 2 * stride
+		lea			eax,	[eax + ecx * 2]
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 6
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 7
+		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 8
+		; center weight = 256 - nTotWeight (clamped at 0); result = (nSum + center * centerWeight) >> 8
+		pcmpeqw		xmm0,	xmm0		; all bits set
+		psrlw		xmm0,	15		; 1 in every word
+		psllw		xmm0,	8		; 256 in every word
+		psubusw		xmm0,	xmm4		; 256 - nTotWeight, unsigned saturating
+		pmullw		xmm0,	xmm6		; * center pixel (low 16 bits)
+		paddusw		xmm5,	xmm0		; nSum += center contribution
+		psrlw		xmm5,	8		; divide by 256
+		packuswb	xmm5,	xmm5		; words -> bytes with unsigned saturation
+		movq		[ebx],	xmm5		; write the 8 results over the original pixels
+		
+		pop ebx
+		ret	
+; Replaces 8 horizontally adjacent pixels in place with a 5x5 weighted average; both arguments are read from the stack (32-bit code).
+WELS_EXTERN WaverageChromaFilter8_sse2
+;***********************************************************************
+; void		WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
+;***********************************************************************
+;5x5 filter (weights sum to 64; the accumulated sum is shifted right by 6 without a rounding term):
+;1	1	2	1	1
+;1	2	4	2	1
+;2	4	20	4	2
+;1	2	4	2	1
+;1	1	2	1	1
+; Each row is loaded as 16 unaligned bytes starting 2 bytes to the left of the first pixel.
+ALIGN 16
+WaverageChromaFilter8_sse2:
+		mov		edx,	[esp + 4]	; pixels
+		mov		ecx,	[esp + 8]	; stride
+		; eax = 2 * stride; edx = top-left corner of the 5x5 window of the first pixel
+		mov		eax,	ecx
+		add		eax,	eax
+		sub		edx,	eax			; pixels - 2 * stride
+		sub		edx,	2
+			; xmm0 = 0 (unpack helper), xmm3 = 8 word accumulators
+		pxor	xmm0,	xmm0	
+		pxor	xmm3,	xmm3
+	; row -2: weights 1 1 2 1 1
+		movdqu		xmm1,	[edx]
+		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
+		; row -1: weights 1 2 4 2 1
+		movdqu		xmm1,	[edx + ecx]
+		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0	
+		; row 0: edx advances by 2 * stride to pixels - 2; weights 2 4 20 4 2
+		add		edx,	eax	
+		movdqu		xmm1,	[edx]
+		WEIGHT_LINE3_UV	xmm1,	xmm2,	xmm3,	xmm0
+		; row +1: weights 1 2 4 2 1
+		movdqu		xmm1,	[edx + ecx]
+		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0	
+		; row +2: weights 1 1 2 1 1
+		movdqu		xmm1,	[edx + ecx * 2]
+		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0		
+	; normalise by 64, narrow to bytes and store at edx + 2 (= pixels)
+		psrlw		xmm3,		6
+		packuswb	xmm3,		xmm3
+		movq		[edx + 2],		xmm3			
+
 		ret	
\ No newline at end of file
--- a/processing/src/asm/downsample_bilinear.asm
+++ b/processing/src/asm/downsample_bilinear.asm
@@ -1,1225 +1,1225 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*	upsampling.asm
-;*
-;*  Abstract
-;*		SIMD for pixel domain down sampling
-;*
-;*  History
-;*		10/22/2009	Created
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-
-;***********************************************************************
-; Some constants
-;***********************************************************************
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-SECTION .rodata align=16
-
-;***********************************************************************
-; Various memory constants (trigonometric values or rounding values)
-;***********************************************************************
-
-ALIGN 16
-shufb_mask_low:
-	db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
-shufb_mask_high:
-	db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
-
-
-ALIGN 16
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_sse(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_sse:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1			; iSrcHeight >> 1	
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
-.xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movq mm0, [esi]			; 1st pSrc line
-	movq mm1, [esi+8]		; 1st pSrc line + 8
-	movq mm2, [esi+ecx]		; 2nd pSrc line
-	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
-
-	; to handle mm0, mm1, mm2, mm3
-	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm4, mm5		; d c D C b a B A
-	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
-
-	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm5, mm6		; h g H G f e F E
-	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
-
-	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm6, mm7		; l k L K j i J I
-	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
-
-	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm7, mm0 		; p o P O n m N M
-	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
-
-	; to handle mm4, mm5, mm6, mm7
-	movq mm0, mm4		; 
-	punpckldq mm0, mm5 	; H G F E D C B A
-	punpckhdq mm4, mm5 	; h g f e d c b a
-
-	movq mm1, mm6
-	punpckldq mm1, mm7 	; P O N M L K J I
-	punpckhdq mm6, mm7 	; p o n m l k j i
-
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
-	
-	; 2nd part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-	movq mm1, [esi+16]		; 1st pSrc line + 16
-	movq mm2, [esi+24]		; 1st pSrc line + 24
-	movq mm3, [esi+ecx+16]	; 2nd pSrc line + 16
-	movq mm4, [esi+ecx+24]	; 2nd pSrc line + 24
-
-	; to handle mm1, mm2, mm3, mm4
-	pshufw mm5, mm1, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm6, mm5, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm5, mm6		; d c D C b a B A
-	pshufw mm5, mm5, 0d8h  	; d c b a D C B A ; 11011000 B: mm5
-
-	pshufw mm6, mm2, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm7, mm6, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm6, mm7		; h g H G f e F E
-	pshufw mm6, mm6, 0d8h  	; h g f e H G F E ; 11011000 B: mm6
-
-	pshufw mm7, mm3, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm1, mm7, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm7, mm1		; l k L K j i J I
-	pshufw mm7, mm7, 0d8h  	; l k j i L K J I ; 11011000 B: mm7
-
-	pshufw mm1, mm4, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm2, mm1, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm1, mm2 		; p o P O n m N M
-	pshufw mm1, mm1, 0d8h  	; p o n m P O N M ; 11011000 B: mm1
-
-	; to handle mm5, mm6, mm7, mm1
-	movq mm2, mm5
-	punpckldq mm2, mm6 	; H G F E D C B A
-	punpckhdq mm5, mm6 	; h g f e d c b a
-
-	movq mm3, mm7
-	punpckldq mm3, mm1 	; P O N M L K J I
-	punpckhdq mm7, mm1 	; p o n m l k j i
-
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm2, mm5		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm3, mm7		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm2, mm3		; (temp_row1+temp_row2+1)>>1, done in another 2nd horizonal part
-
-	movq [edi  ], mm0
-	movq [edi+8], mm2
-
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	WELSEMMS
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_sse:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1		; iSrcHeight >> 1	
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
-.xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
-	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movq mm0, [esi]			; 1st pSrc line
-	movq mm1, [esi+8]		; 1st pSrc line + 8
-	movq mm2, [esi+ecx]		; 2nd pSrc line
-	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
-
-	; to handle mm0, mm1, mm2, mm3
-	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm4, mm5		; d c D C b a B A
-	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
-
-	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm5, mm6		; h g H G f e F E
-	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
-
-	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
-	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
-	punpcklbw mm6, mm7		; l k L K j i J I
-	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
-
-	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
-	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B
-	punpcklbw mm7, mm0 		; p o P O n m N M
-	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
-
-	; to handle mm4, mm5, mm6, mm7
-	movq mm0, mm4		; 
-	punpckldq mm0, mm5 	; H G F E D C B A
-	punpckhdq mm4, mm5 	; h g f e d c b a
-
-	movq mm1, mm6
-	punpckldq mm1, mm7 	; P O N M L K J I
-	punpckhdq mm6, mm7 	; p o n m l k j i
-
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
-	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
-
-	movq [edi  ], mm0	
-
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	WELSEMMS
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx8_sse:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1		; iSrcHeight >> 1	
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $2		; (iSrcWidth >> 1) / 4		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 8 bytes
-.xloops:
-	; 1st part horizonal loop: x8 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	mm0: d D c C b B a A
-	;2nd Line Src:	mm1: h H g G f F e E
-	;=> target:
-	;: H G F E D C B A
-	;: h g f e d c b a
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movq mm0, [esi]			; 1st pSrc line	
-	movq mm1, [esi+ecx]		; 2nd pSrc line	
-
-	; to handle mm0, mm1, mm2, mm3
-	pshufw mm2, mm0, 0d8h	; d D b B c C a A ; 11011000 B
-	pshufw mm3, mm2, 04eh	; c C a A d D b B ; 01001110 B
-	punpcklbw mm2, mm3		; d c D C b a B A
-	pshufw mm2, mm2, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
-
-	pshufw mm4, mm1, 0d8h	; h H f F g G e E ; 11011000 B
-	pshufw mm5, mm4, 04eh	; g G e E h H f F ; 01001110 B
-	punpcklbw mm4, mm5		; h g H G f e F E
-	pshufw mm4, mm4, 0d8h  	; h g f e H G F E ; 11011000 B: mm5	
-
-	; to handle mm2, mm4
-	movq mm0, mm2		; 
-	punpckldq mm0, mm4 	; H G F E D C B A
-	punpckhdq mm2, mm4 	; h g f e d c b a
-
-	; avg within MB horizon width (16 x 2 lines)
-	pavgb mm0, mm2		; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
-	pshufw mm1, mm0, 04eh	; 01001110 B	
-	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
-
-	movd [edi],	mm0	
-
-	; next unit
-	lea esi, [esi+8]
-	lea edi, [edi+4]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-
-	WELSEMMS
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-
-
-; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_ssse3(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_ssse3:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1			; iSrcHeight >> 1	
-
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
-.xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
-	;				xmm1: p P o O n N m M l L k K j J i I
-	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
-	;				xmm3: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: P O N M L K J I H G F E D C B A
-	;: p o n m l k j i h g f e d c b a
-	;: P ..                          A
-	;: p ..                          a
-	
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movdqa xmm0, [esi]			; 1st_src_line
-	movdqa xmm1, [esi+16]		; 1st_src_line + 16
-	movdqa xmm2, [esi+ecx]		; 2nd_src_line
-	movdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16	
-	
-	; packing & avg
-	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	; another implementation for xmm4 high bits
-;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm4
-
-	movdqa xmm5, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm1
-;	psrlw xmm5, 8	
-	pavgb xmm1, xmm5
-
-	movdqa xmm4, xmm2
-	pshufb xmm2, xmm7
-	pshufb xmm4, xmm6
-;	psubb xmm4, xmm2
-;	psrlw xmm4, 8	
-	pavgb xmm2, xmm4
-
-	movdqa xmm5, xmm3
-	pshufb xmm3, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm3
-;	psrlw xmm5, 8	
-	pavgb xmm3, xmm5
-	
-	packuswb xmm0, xmm1	
-	packuswb xmm2, xmm3	
-	pavgb xmm0, xmm2	
-
-	; write pDst
-	movdqa [edi], xmm0
-
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-	
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_ssse3:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1		; iSrcHeight >> 1	
-	movdqa xmm7, [shufb_mask_low]	; mask low	
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
-.xloops:
-	; horizonal loop: x16 bytes by source
-	;               mem  hi<-       ->lo
-	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
-	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movdqa xmm0, [esi]			; 1st_src_line	
-	movdqa xmm1, [esi+ecx]		; 2nd_src_line	
-	
-	; packing & avg
-	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	; another implementation for xmm2 high bits
-;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a	
-	pavgb xmm0, xmm2
-
-	movdqa xmm3, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm3, xmm6
-;	psubb xmm3, xmm1
-;	psrlw xmm3, 8	
-	pavgb xmm1, xmm3
-
-	pavgb xmm0, xmm1	
-	packuswb xmm0, xmm1	
-
-	; write pDst
-	movq [edi], xmm0	
-
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-	
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
-WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx32_sse4(	unsigned char* pDst, const int iDstStride,
-;					unsigned char* pSrc, const int iSrcStride,
-;					const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx32_sse4:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1			; iSrcHeight >> 1	
-
-	movdqa xmm7, [shufb_mask_low]	; mask low	
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1			; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $4			; (iSrcWidth >> 1) / 16		; loop count = num_of_mb
-	neg ebx				; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 32 bytes
-.xloops:
-	; 1st part horizonal loop: x16 bytes
-	;               mem  hi<-       ->lo
-	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
-	;				xmm1: p P o O n N m M l L k K j J i I
-	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
-	;				xmm3: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: P O N M L K J I H G F E D C B A
-	;: p o n m l k j i h g f e d c b a
-	;: P ..                          A
-	;: p ..                          a
-	
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movntdqa xmm0, [esi]			; 1st_src_line
-	movntdqa xmm1, [esi+16]		; 1st_src_line + 16
-	movntdqa xmm2, [esi+ecx]		; 2nd_src_line
-	movntdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16	
-	
-	; packing & avg
-	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm4
-
-	movdqa xmm5, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm1
-;	psrlw xmm5, 8
-	pavgb xmm1, xmm5
-
-	movdqa xmm4, xmm2
-	pshufb xmm2, xmm7
-	pshufb xmm4, xmm6
-;	psubb xmm4, xmm2
-;	psrlw xmm4, 8
-	pavgb xmm2, xmm4
-
-	movdqa xmm5, xmm3
-	pshufb xmm3, xmm7
-	pshufb xmm5, xmm6
-;	psubb xmm5, xmm3
-;	psrlw xmm5, 8
-	pavgb xmm3, xmm5
-	
-	packuswb xmm0, xmm1	
-	packuswb xmm2, xmm3	
-	pavgb xmm0, xmm2	
-
-	; write pDst
-	movdqa [edi], xmm0
-
-	; next SMB
-	lea esi, [esi+32]
-	lea edi, [edi+16]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-	
-	pop ebp
-	pop	edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
-;***********************************************************************
-;	void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
-;					  unsigned char* pSrc, const int iSrcStride,
-;					  const int iSrcWidth, const int iSrcHeight );
-;***********************************************************************
-ALIGN 16
-DyadicBilinearDownsamplerWidthx16_sse4:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov edi, [esp+24]	; pDst
-	mov edx, [esp+28]	; iDstStride
-	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
-	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1		; iSrcHeight >> 1	
-	movdqa xmm7, [shufb_mask_low]	; mask low
-	movdqa xmm6, [shufb_mask_high]	; mask high
-
-.yloops:
-	mov eax, [esp+40]	; iSrcWidth
-	sar eax, $1		; iSrcWidth >> 1
-	mov ebx, eax		; iDstWidth restored at ebx
-	sar eax, $3		; (iSrcWidth >> 1) / 8		; loop count = num_of_mb
-	neg ebx			; - (iSrcWidth >> 1)
-	; each loop = source bandwidth: 16 bytes
-.xloops:
-	; horizonal loop: x16 bytes by source
-	;               mem  hi<-       ->lo
-	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
-	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
-	;=> target:
-	;: H G F E D C B A, P O N M L K J I
-	;: h g f e d c b a, p o n m l k j i
-
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movntdqa xmm0, [esi]			; 1st_src_line	
-	movntdqa xmm1, [esi+ecx]		; 2nd_src_line	
-	
-	; packing & avg
-	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
-	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
-	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
-	pavgb xmm0, xmm2
-
-	movdqa xmm3, xmm1
-	pshufb xmm1, xmm7
-	pshufb xmm3, xmm6
-;	psubb xmm3, xmm1
-;	psrlw xmm3, 8
-	pavgb xmm1, xmm3
-
-	pavgb xmm0, xmm1	
-	packuswb xmm0, xmm1	
-
-	; write pDst
-	movq [edi], xmm0	
-
-	; next SMB
-	lea esi, [esi+16]
-	lea edi, [edi+8]
-
-	dec eax
-	jg near .xloops
-
-	; next line
-	lea esi, [esi+2*ecx]	; next end of lines
-	lea esi, [esi+2*ebx]	; reset to base 0 [- 2 * iDstWidth]
-	lea edi, [edi+edx]
-	lea edi, [edi+ebx]		; reset to base 0 [- iDstWidth]
-
-	dec ebp
-	jg near .yloops
-	
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-
-
-
-
-
-WELS_EXTERN	GeneralBilinearAccurateDownsampler_sse2
-;**************************************************************************************************************
-;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-;							unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
-;                           unsigned int uiScaleX, unsigned int uiScaleY );
-;{
-;**************************************************************************************************************
-
-ALIGN 16
-GeneralBilinearAccurateDownsampler_sse2:
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-%define		pushsize	16
-%define		localsize	28
-%define		pDstData		esp + pushsize + localsize + 4
-%define		dwDstStride		esp + pushsize + localsize + 8
-%define		dwDstWidth		esp + pushsize + localsize + 12
-%define		dwDstHeight		esp + pushsize + localsize + 16
-%define		pSrcData		esp + pushsize + localsize + 20
-%define		dwSrcStride		esp + pushsize + localsize + 24
-%define		dwSrcWidth		esp + pushsize + localsize + 28
-%define		dwSrcHeight		esp + pushsize + localsize + 32
-%define		scale			esp + 0
-%define		uiScaleX			esp + pushsize + localsize + 36
-%define		uiScaleY			esp + pushsize + localsize + 40
-%define		tmpHeight		esp + 12
-%define		yInverse		esp + 16
-%define		xInverse		esp + 20
-%define		dstStep			esp + 24
-	sub		esp,			localsize
-	
-	pxor	xmm0,	xmm0
-	mov		edx,	32767
-	mov		eax,	[uiScaleX]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm1,		eax						; uinc(uiScaleX mod 32767)
-	movd	xmm2,		ebx						; -uinc
-	psllq	xmm1,		32
-	por		xmm1,		xmm2					; 0 0  uinc  -uinc   (dword)
-	pshufd	xmm7,		xmm1,	01000100b		; xmm7: uinc -uinc uinc -uinc
-	
-	mov		eax,	[uiScaleY]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm6,		eax						; vinc(uiScaleY mod 32767)
-	movd	xmm2,		ebx						; -vinc
-	psllq	xmm6,		32
-	por		xmm6,		xmm2					; 0 0 vinc -vinc (dword)
-	pshufd	xmm6,		xmm6,	01010000b		; xmm6: vinc vinc -vinc -vinc
-	
-	mov		edx,		40003fffh
-	movd	xmm5,		edx
-	punpcklwd	xmm5,	xmm0					; 16384 16383
-	pshufd	xmm5,		xmm5,	01000100b		; xmm5: 16384 16383 16384 16383
-	
-
-DOWNSAMPLE:
-	
-	mov		eax,			[dwDstHeight]
-	mov		edi,			[pDstData]
-	mov		edx,			[dwDstStride]
-	mov		ecx,			[dwDstWidth]
-	sub		edx,			ecx
-	mov		[dstStep],	edx				; stride - width
-	dec		eax
-	mov		[tmpHeight],	eax
-	mov		eax,			16384
-	mov		[yInverse],		eax
-	
-	pshufd	xmm4,		xmm5,	01010000b	; initial v to 16384 16384 16383 16383
-	
-HEIGHT:	
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	mov		ebp,	esi
-	add		ebp,	[dwSrcStride]
-	
-	mov		eax,		16384
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	dec		ecx
-	
-	movdqa	xmm3,		xmm5			; initial u to 16384 16383 16384 16383
-	
-WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		15
-	
-	movd	xmm1,		[esi+eax]		; xxxxxxba
-	movd	xmm2,		[ebp+eax]		; xxxxxxdc
-	pxor	xmm0,		xmm0
-	punpcklwd	xmm1,	xmm2			; xxxxdcba
-	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
-	punpcklwd	xmm1,	xmm0			; 000d000c000b000a
-	
-	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
-	pmaddwd	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
-	movdqa	xmm0,	xmm2
-	pmuludq	xmm2,	xmm1
-	psrlq	xmm0,	32
-	psrlq	xmm1,	32
-	pmuludq	xmm0,	xmm1
-	paddq	xmm2,	xmm0
-	pshufd	xmm1,	xmm2,	00001110b
-	paddq	xmm2,	xmm1
-	psrlq	xmm2,	29
-	
-	movd	eax,	xmm2
-	inc		eax
-	shr		eax,	1
-	mov		[edi],	al
-	inc		edi
-	
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-	
-	paddw	xmm3,		xmm7			; inc u
-	psllw	xmm3,		1
-	psrlw	xmm3,		1
-	
-	loop	WIDTH
-
-WIDTH_END:
-	mov		eax,		[xInverse]
-	shr		eax,		15
-	mov		cl,			[esi+eax]
-	mov		[edi],		cl
-	inc		edi
-	
-	mov		eax,		[uiScaleY]
-	add		[yInverse],	eax
-	add		edi,		[dstStep]
-	
-	paddw	xmm4,	xmm6				; inc v
-	psllw	xmm4,	1
-	psrlw	xmm4,	1
-	
-	dec		dword [tmpHeight]
-	jg		HEIGHT
-
-
-LAST_ROW:	
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	
-	mov		eax,		16384
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	
-LAST_ROW_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		15
-	
-	mov		al,			[esi+eax]
-	mov		[edi],	al
-	inc		edi
-	
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-	
-	loop	LAST_ROW_WIDTH
-
-LAST_ROW_END:
-
-	add		esp,			localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		pushsize
-%undef		localsize
-%undef		pSrcData
-%undef		dwSrcWidth
-%undef		dwSrcHeight
-%undef		dwSrcStride
-%undef		pDstData
-%undef		dwDstWidth
-%undef		dwDstHeight
-%undef		dwDstStride
-%undef		scale
-%undef		uiScaleX
-%undef		uiScaleY
-%undef		tmpHeight
-%undef		yInverse
-%undef		xInverse
-%undef		dstStep
-	ret
-	
-	
-	
-	
-WELS_EXTERN	GeneralBilinearFastDownsampler_sse2
-;**************************************************************************************************************
-;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
-;				unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
-;               unsigned int uiScaleX, unsigned int uiScaleY );
-;{
-;**************************************************************************************************************
-
-ALIGN 16
-GeneralBilinearFastDownsampler_sse2:
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-%define		pushsize	16
-%define		localsize	28
-%define		pDstData		esp + pushsize + localsize + 4
-%define		dwDstStride		esp + pushsize + localsize + 8
-%define		dwDstWidth		esp + pushsize + localsize + 12
-%define		dwDstHeight		esp + pushsize + localsize + 16
-%define		pSrcData		esp + pushsize + localsize + 20
-%define		dwSrcStride		esp + pushsize + localsize + 24
-%define		dwSrcWidth		esp + pushsize + localsize + 28
-%define		dwSrcHeight		esp + pushsize + localsize + 32
-%define		scale			esp + 0
-%define		uiScaleX			esp + pushsize + localsize + 36
-%define		uiScaleY			esp + pushsize + localsize + 40
-%define		tmpHeight		esp + 12
-%define		yInverse		esp + 16
-%define		xInverse		esp + 20
-%define		dstStep			esp + 24
-	sub		esp,			localsize
-	
-	pxor	xmm0,	xmm0
-	mov		edx,	65535
-	mov		eax,	[uiScaleX]
-	and		eax,	edx
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	65535
-	movd	xmm1,		eax						; uinc(uiScaleX mod 65536)
-	movd	xmm2,		ebx						; -uinc
-	psllq	xmm1,		32
-	por		xmm1,		xmm2					; 0 uinc 0 -uinc
-	pshuflw	xmm7,		xmm1,	10001000b		; xmm7: uinc -uinc uinc -uinc
-	
-	mov		eax,	[uiScaleY]
-	and		eax,	32767
-	mov		ebx,	eax
-	neg		ebx
-	and		ebx,	32767
-	movd	xmm6,		eax						; vinc(uiScaleY mod 32767)
-	movd	xmm2,		ebx						; -vinc
-	psllq	xmm6,		32
-	por		xmm6,		xmm2					; 0 vinc 0 -vinc
-	pshuflw	xmm6,		xmm6,	10100000b		; xmm6: vinc vinc -vinc -vinc
-	
-	mov		edx,		80007fffh				; 32768 32767
-	movd	xmm5,		edx					
-	pshuflw	xmm5,		xmm5,		01000100b	; 32768 32767 32768 32767
-	mov		ebx,		16384
-	
-
-FAST_DOWNSAMPLE:
-	
-	mov		eax,			[dwDstHeight]
-	mov		edi,			[pDstData]
-	mov		edx,			[dwDstStride]
-	mov		ecx,			[dwDstWidth]
-	sub		edx,			ecx
-	mov		[dstStep],	edx				; stride - width
-	dec		eax
-	mov		[tmpHeight],	eax
-	mov		eax,		16384
-	mov		[yInverse],		eax
-	
-	pshuflw	xmm4,		xmm5,	01010000b
-	psrlw	xmm4,		1				; initial v to 16384 16384 16383 16383
-	
-FAST_HEIGHT:	
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	mov		ebp,	esi
-	add		ebp,	[dwSrcStride]
-	
-	mov		eax,		32768
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	dec		ecx
-	
-	movdqa	xmm3,		xmm5			; initial u to 32768 32767 32768 32767
-	
-FAST_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		16
-	
-	movd	xmm1,		[esi+eax]		; xxxxxxba
-	movd	xmm2,		[ebp+eax]		; xxxxxxdc
-	punpcklwd	xmm1,	xmm2			; xxxxdcba
-	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
-	
-	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
-	pmulhuw	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
-	pmaddwd		xmm2,	xmm1
-	pshufd	xmm1,	xmm2,	00000001b
-	paddd	xmm2,	xmm1
-	movd	xmm1,	ebx
-	paddd	xmm2,	xmm1
-	psrld	xmm2,	15
-	
-	packuswb	xmm2,	xmm0
-	movd	eax,	xmm2
-	mov		[edi],	al
-	inc		edi
-	
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-	
-	paddw	xmm3,		xmm7			; inc u
-	
-	loop	FAST_WIDTH
-
-FAST_WIDTH_END:
-	mov		eax,		[xInverse]
-	shr		eax,		16
-	mov		cl,			[esi+eax]
-	mov		[edi],		cl
-	inc		edi
-	
-	mov		eax,		[uiScaleY]
-	add		[yInverse],	eax
-	add		edi,		[dstStep]
-	
-	paddw	xmm4,	xmm6				; inc v
-	psllw	xmm4,	1
-	psrlw	xmm4,	1
-	
-	dec		dword [tmpHeight]
-	jg		FAST_HEIGHT
-
-
-FAST_LAST_ROW:	
-	mov		eax,	[yInverse]
-	mov		esi,	[pSrcData]
-	shr		eax,	15
-	mul		dword [dwSrcStride]
-	add		esi,	eax					; get current row address
-	
-	mov		eax,		32768
-	mov		[xInverse],		eax
-	mov		ecx,			[dwDstWidth]
-	
-FAST_LAST_ROW_WIDTH:
-	mov		eax,		[xInverse]
-	shr		eax,		16
-	
-	mov		al,			[esi+eax]
-	mov		[edi],	al
-	inc		edi
-	
-	mov		eax,		[uiScaleX]
-	add		[xInverse],	eax
-	
-	loop	FAST_LAST_ROW_WIDTH
-
-FAST_LAST_ROW_END:
-
-	add		esp,			localsize
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		pushsize
-%undef		localsize
-%undef		pSrcData
-%undef		dwSrcWidth
-%undef		dwSrcHeight
-%undef		dwSrcStride
-%undef		pDstData
-%undef		dwDstWidth
-%undef		dwDstHeight
-%undef		dwDstStride
-%undef		scale
-%undef		uiScaleX
-%undef		uiScaleY
-%undef		tmpHeight
-%undef		yInverse
-%undef		xInverse
-%undef		dstStep
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	upsampling.asm
+;*
+;*  Abstract
+;*		SIMD for pixel domain down sampling
+;*
+;*  History
+;*		10/22/2009	Created
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+
+;***********************************************************************
+; Some constants
+;***********************************************************************
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+SECTION .rodata align=16
+
+;***********************************************************************
+; Various memory constants (pshufb masks selecting the even / odd bytes of a 16-byte vector)
+;***********************************************************************
+
+ALIGN 16
+shufb_mask_low:
+	db 00h, 80h, 02h, 80h, 04h, 80h, 06h, 80h, 08h, 80h, 0ah, 80h, 0ch, 80h, 0eh, 80h
+shufb_mask_high:
+	db 01h, 80h, 03h, 80h, 05h, 80h, 07h, 80h, 09h, 80h, 0bh, 80h, 0dh, 80h, 0fh, 80h
+
+
+ALIGN 16
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx32_sse(	unsigned char* pDst, const int iDstStride,
+;					unsigned char* pSrc, const int iSrcStride,
+;					const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_sse:	; MMX 2:1 downsample: each x-iteration reads 32 bytes from two source rows and writes 16 output bytes
+	push ebx			; save the five registers this routine overwrites
+	push edx
+	push esi
+	push edi
+	push ebp
+
+	mov edi, [esp+24]	; pDst (first stack argument: 5 pushes + return address = 24 bytes)
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+	
+	sar ebp, $1			; ebp = iSrcHeight >> 1 = number of output rows (outer loop counter)
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth (reloaded for every output row)
+	sar eax, $1			; iSrcWidth >> 1 = iDstWidth
+	mov ebx, eax		; keep iDstWidth in ebx
+	sar eax, $4			; x-loop count = iDstWidth / 16 (16 output bytes per iteration)
+	neg ebx				; ebx = -iDstWidth, used below to rewind the row pointers
+	; each loop = source bandwidth: 32 bytes
+.xloops:
+	; 1st part of the horizontal loop: source bytes 0..15 of both rows
+	;               register layout, hi<- ->lo (upper case = even-offset byte, lower case = odd-offset byte)
+	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
+	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
+	;=> target (even and odd bytes separated):
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
+	movq mm0, [esi]			; 1st pSrc line
+	movq mm1, [esi+8]		; 1st pSrc line + 8
+	movq mm2, [esi+ecx]		; 2nd pSrc line
+	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
+
+	; de-interleave mm0, mm1, mm2, mm3 into mm4, mm5, mm6, mm7 (odd bytes in the high dword, even bytes in the low dword)
+	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm4, mm5		; d c D C b a B A
+	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
+
+	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm5, mm6		; h g H G f e F E
+	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
+
+	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
+	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
+	punpcklbw mm6, mm7		; l k L K j i J I
+	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
+
+	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
+	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B (mm0 is free to reuse: its data is already in mm4)
+	punpcklbw mm7, mm0 		; p o P O n m N M
+	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
+
+	; combine mm4, mm5 (1st line) and mm6, mm7 (2nd line) into all-even / all-odd vectors
+	movq mm0, mm4		; copy, because punpckldq overwrites its destination
+	punpckldq mm0, mm5 	; H G F E D C B A
+	punpckhdq mm4, mm5 	; h g f e d c b a
+
+	movq mm1, mm6
+	punpckldq mm1, mm7 	; P O N M L K J I
+	punpckhdq mm6, mm7 	; p o n m l k j i
+
+	; avg within MB horizontal width (16 x 2 lines)
+	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1; kept in mm0 until the 2nd part is done, then both halves are stored
+	
+	; 2nd part of the horizontal loop: source bytes 16..31 of both rows
+	;               register layout, hi<- ->lo (same letter convention as the 1st part)
+	;1st Line Src:	mm1: d D c C b B a A	mm2: h H g G f F e E
+	;2nd Line Src:	mm3: l L k K j J i I   	mm4: p P o O n N m M
+	;=> target (even and odd bytes separated):
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movq mm1, [esi+16]		; 1st pSrc line + 16
+	movq mm2, [esi+24]		; 1st pSrc line + 24
+	movq mm3, [esi+ecx+16]	; 2nd pSrc line + 16
+	movq mm4, [esi+ecx+24]	; 2nd pSrc line + 24
+
+	; de-interleave mm1, mm2, mm3, mm4 into mm5, mm6, mm7, mm1 (mm0 still holds the 1st-part result)
+	pshufw mm5, mm1, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm6, mm5, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm5, mm6		; d c D C b a B A
+	pshufw mm5, mm5, 0d8h  	; d c b a D C B A ; 11011000 B: mm5
+
+	pshufw mm6, mm2, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm7, mm6, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm6, mm7		; h g H G f e F E
+	pshufw mm6, mm6, 0d8h  	; h g f e H G F E ; 11011000 B: mm6
+
+	pshufw mm7, mm3, 0d8h	; l L j J k K i I ; 11011000 B
+	pshufw mm1, mm7, 04eh	; k K i I l L j J ; 01001110 B (mm1 is free to reuse: its data is already in mm5)
+	punpcklbw mm7, mm1		; l k L K j i J I
+	pshufw mm7, mm7, 0d8h  	; l k j i L K J I ; 11011000 B: mm7
+
+	pshufw mm1, mm4, 0d8h	; p P n N o O m M ; 11011000 B
+	pshufw mm2, mm1, 04eh	; o O m M p P n N ; 01001110 B
+	punpcklbw mm1, mm2 		; p o P O n m N M
+	pshufw mm1, mm1, 0d8h  	; p o n m P O N M ; 11011000 B: mm1
+
+	; combine mm5, mm6 (1st line) and mm7, mm1 (2nd line) into all-even / all-odd vectors
+	movq mm2, mm5
+	punpckldq mm2, mm6 	; H G F E D C B A
+	punpckhdq mm5, mm6 	; h g f e d c b a
+
+	movq mm3, mm7
+	punpckldq mm3, mm1 	; P O N M L K J I
+	punpckhdq mm7, mm1 	; p o n m l k j i
+
+	; avg within MB horizontal width (16 x 2 lines)
+	pavgb mm2, mm5		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+	pavgb mm3, mm7		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm2, mm3		; (temp_row1+temp_row2+1)>>1, result of the 2nd horizontal part
+
+	movq [edi  ], mm0	; output bytes 0..7 (from the 1st part)
+	movq [edi+8], mm2	; output bytes 8..15 (from the 2nd part)
+
+	; advance to the next 32 source bytes / 16 destination bytes
+	lea esi, [esi+32]
+	lea edi, [edi+16]
+
+	dec eax
+	jg near .xloops		; repeat while the x counter is still > 0 (the body always runs at least once)
+
+	; next line
+	lea esi, [esi+2*ecx]	; advance two source rows
+	lea esi, [esi+2*ebx]	; rewind by 2 * iDstWidth (the bytes consumed above when iDstWidth is a multiple of 16)
+	lea edi, [edi+edx]	; advance one destination row
+	lea edi, [edi+ebx]		; rewind by iDstWidth (the bytes written above when iDstWidth is a multiple of 16)
+
+	dec ebp
+	jg near .yloops		; repeat while output rows remain
+
+	WELSEMMS	; macro not defined in this file (presumably from asm_inc.asm; presumably emms to release the MMX state - confirm)
+	pop ebp
+	pop	edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx16_sse( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride,
+;					  const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_sse:	; MMX 2:1 downsample: each x-iteration reads 16 bytes from two source rows and writes 8 output bytes
+	push ebx			; save the five registers this routine overwrites
+	push edx
+	push esi
+	push edi
+	push ebp
+
+	mov edi, [esp+24]	; pDst (first stack argument: 5 pushes + return address = 24 bytes)
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+	
+	sar ebp, $1		; ebp = iSrcHeight >> 1 = number of output rows (outer loop counter)
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth (reloaded for every output row)
+	sar eax, $1		; iSrcWidth >> 1 = iDstWidth
+	mov ebx, eax		; keep iDstWidth in ebx
+	sar eax, $3		; x-loop count = iDstWidth / 8 (8 output bytes per iteration)
+	neg ebx			; ebx = -iDstWidth, used below to rewind the row pointers
+	; each loop = source bandwidth: 16 bytes
+.xloops:
+	; horizontal loop: 16 source bytes of both rows per iteration
+	;               register layout, hi<- ->lo (upper case = even-offset byte, lower case = odd-offset byte)
+	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
+	;2nd Line Src:	mm2: l L k K j J i I   	mm3: p P o O n N m M
+	;=> target (even and odd bytes separated):
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
+	movq mm0, [esi]			; 1st pSrc line
+	movq mm1, [esi+8]		; 1st pSrc line + 8
+	movq mm2, [esi+ecx]		; 2nd pSrc line
+	movq mm3, [esi+ecx+8]	; 2nd pSrc line + 8
+
+	; de-interleave mm0, mm1, mm2, mm3 into mm4, mm5, mm6, mm7 (odd bytes in the high dword, even bytes in the low dword)
+	pshufw mm4, mm0, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm5, mm4, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm4, mm5		; d c D C b a B A
+	pshufw mm4, mm4, 0d8h  	; d c b a D C B A ; 11011000 B: mm4
+
+	pshufw mm5, mm1, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm6, mm5, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm5, mm6		; h g H G f e F E
+	pshufw mm5, mm5, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
+
+	pshufw mm6, mm2, 0d8h	; l L j J k K i I ; 11011000 B
+	pshufw mm7, mm6, 04eh	; k K i I l L j J ; 01001110 B
+	punpcklbw mm6, mm7		; l k L K j i J I
+	pshufw mm6, mm6, 0d8h  	; l k j i L K J I ; 11011000 B: mm6
+
+	pshufw mm7, mm3, 0d8h	; p P n N o O m M ; 11011000 B
+	pshufw mm0, mm7, 04eh	; o O m M p P n N ; 01001110 B (mm0 is free to reuse: its data is already in mm4)
+	punpcklbw mm7, mm0 		; p o P O n m N M
+	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
+
+	; combine mm4, mm5 (1st line) and mm6, mm7 (2nd line) into all-even / all-odd vectors
+	movq mm0, mm4		; copy, because punpckldq overwrites its destination
+	punpckldq mm0, mm5 	; H G F E D C B A
+	punpckhdq mm4, mm5 	; h g f e d c b a
+
+	movq mm1, mm6
+	punpckldq mm1, mm7 	; P O N M L K J I
+	punpckhdq mm6, mm7 	; p o n m l k j i
+
+	; avg within MB horizontal width (16 x 2 lines)
+	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
+	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1 = the 8 output bytes of this iteration
+
+	movq [edi  ], mm0	; store 8 output bytes
+
+	; advance to the next 16 source bytes / 8 destination bytes
+	lea esi, [esi+16]
+	lea edi, [edi+8]
+
+	dec eax
+	jg near .xloops		; repeat while the x counter is still > 0 (the body always runs at least once)
+
+	; next line
+	lea esi, [esi+2*ecx]	; advance two source rows
+	lea esi, [esi+2*ebx]	; rewind by 2 * iDstWidth (the bytes consumed above when iDstWidth is a multiple of 8)
+	lea edi, [edi+edx]	; advance one destination row
+	lea edi, [edi+ebx]		; rewind by iDstWidth (the bytes written above when iDstWidth is a multiple of 8)
+
+	dec ebp
+	jg near .yloops		; repeat while output rows remain
+
+	WELSEMMS	; macro not defined in this file (presumably from asm_inc.asm; presumably emms to release the MMX state - confirm)
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx8_sse
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx8_sse( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride,
+;					  const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx8_sse:	; MMX 2:1 downsample: each x-iteration reads 8 bytes from two source rows and writes 4 output bytes
+	push ebx			; save the five registers this routine overwrites
+	push edx
+	push esi
+	push edi
+	push ebp
+
+	mov edi, [esp+24]	; pDst (first stack argument: 5 pushes + return address = 24 bytes)
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+	
+	sar ebp, $1		; ebp = iSrcHeight >> 1 = number of output rows (outer loop counter)
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth (reloaded for every output row)
+	sar eax, $1		; iSrcWidth >> 1 = iDstWidth
+	mov ebx, eax		; keep iDstWidth in ebx
+	sar eax, $2		; x-loop count = iDstWidth / 4 (4 output bytes per iteration)
+	neg ebx			; ebx = -iDstWidth, used below to rewind the row pointers
+	; each loop = source bandwidth: 8 bytes
+.xloops:
+	; horizontal loop: 8 source bytes of both rows per iteration
+	;               register layout, hi<- ->lo (upper case = even-offset byte, lower case = odd-offset byte)
+	;1st Line Src:	mm0: d D c C b B a A
+	;2nd Line Src:	mm1: h H g G f F e E
+	;=> target (even and odd bytes separated):
+	;: H G F E D C B A
+	;: h g f e d c b a
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
+	movq mm0, [esi]			; 1st pSrc line
+	movq mm1, [esi+ecx]		; 2nd pSrc line
+
+	; de-interleave mm0 into mm2 and mm1 into mm4 (odd bytes in the high dword, even bytes in the low dword)
+	pshufw mm2, mm0, 0d8h	; d D b B c C a A ; 11011000 B
+	pshufw mm3, mm2, 04eh	; c C a A d D b B ; 01001110 B
+	punpcklbw mm2, mm3		; d c D C b a B A
+	pshufw mm2, mm2, 0d8h  	; d c b a D C B A ; 11011000 B: mm2
+
+	pshufw mm4, mm1, 0d8h	; h H f F g G e E ; 11011000 B
+	pshufw mm5, mm4, 04eh	; g G e E h H f F ; 01001110 B
+	punpcklbw mm4, mm5		; h g H G f e F E
+	pshufw mm4, mm4, 0d8h  	; h g f e H G F E ; 11011000 B: mm4
+
+	; combine mm2 (1st line) and mm4 (2nd line) into all-even / all-odd vectors
+	movq mm0, mm2		; copy, because punpckldq overwrites its destination
+	punpckldq mm0, mm4 	; H G F E D C B A (low dword = 1st line, high dword = 2nd line)
+	punpckhdq mm2, mm4 	; h g f e d c b a (low dword = 1st line, high dword = 2nd line)
+
+	; avg within the 8 x 2 source block
+	pavgb mm0, mm2		; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
+	pshufw mm1, mm0, 04eh	; 01001110 B: swap the two dwords so the 2nd-line averages line up with the 1st-line ones
+	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, present in both dwords of mm0
+
+	movd [edi],	mm0	; store the low dword = 4 output bytes
+
+	; next unit
+	lea esi, [esi+8]
+	lea edi, [edi+4]
+
+	dec eax
+	jg near .xloops		; repeat while the x counter is still > 0 (the body always runs at least once)
+
+	; next line
+	lea esi, [esi+2*ecx]	; advance two source rows
+	lea esi, [esi+2*ebx]	; rewind by 2 * iDstWidth (the bytes consumed above when iDstWidth is a multiple of 4)
+	lea edi, [edi+edx]	; advance one destination row
+	lea edi, [edi+ebx]		; rewind by iDstWidth (the bytes written above when iDstWidth is a multiple of 4)
+
+	dec ebp
+	jg near .yloops		; repeat while output rows remain
+
+	WELSEMMS	; macro not defined in this file (presumably from asm_inc.asm; presumably emms to release the MMX state - confirm)
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+
+
+; got about 50% improvement over DyadicBilinearDownsamplerWidthx32_sse
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_ssse3
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx32_ssse3(	unsigned char* pDst, const int iDstStride,
+;					unsigned char* pSrc, const int iSrcStride,
+;					const int iSrcWidth, const int iSrcHeight );
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_ssse3:	; SSSE3 2:1 downsample: each x-iteration reads 32 bytes from two source rows and writes 16 output bytes
+	push ebx			; save the five registers this routine overwrites
+	push edx
+	push esi
+	push edi
+	push ebp
+
+	mov edi, [esp+24]	; pDst (first stack argument: 5 pushes + return address = 24 bytes)
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride
+	mov ebp, [esp+44]	; iSrcHeight
+	
+	sar ebp, $1			; ebp = iSrcHeight >> 1 = number of output rows (outer loop counter)
+
+	movdqa xmm7, [shufb_mask_low]	; pshufb mask: keep the even source bytes, zero the byte above each
+	movdqa xmm6, [shufb_mask_high]	; pshufb mask: move the odd source bytes down one position, zero the byte above each
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth (reloaded for every output row)
+	sar eax, $1			; iSrcWidth >> 1 = iDstWidth
+	mov ebx, eax		; keep iDstWidth in ebx
+	sar eax, $4			; x-loop count = iDstWidth / 16 (16 output bytes per iteration)
+	neg ebx				; ebx = -iDstWidth, used below to rewind the row pointers
+	; each loop = source bandwidth: 32 bytes
+.xloops:
+	; horizontal loop: 32 source bytes of both rows per iteration
+	;               register layout, hi<- ->lo (upper case = even-offset byte, lower case = odd-offset byte)
+	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
+	;				xmm1: p P o O n N m M l L k K j J i I
+	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
+	;				xmm3: p P o O n N m M l L k K j J i I
+	;=> target:
+	;: P O N M L K J I H G F E D C B A
+	;: p o n m l k j i h g f e d c b a
+	;: P ..                          A
+	;: p ..                          a
+	
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
+	movdqa xmm0, [esi]			; 1st_src_line (movdqa: the address must be 16-byte aligned)
+	movdqa xmm1, [esi+16]		; 1st_src_line + 16
+	movdqa xmm2, [esi+ecx]		; 2nd_src_line
+	movdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16
+	
+	; split even / odd bytes into word lanes, then average each horizontal pair
+	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	; another implementation for xmm4 high bits
+;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm4			; 1st line, bytes 0..15: pair averages in the low byte of each word
+
+	movdqa xmm5, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm1
+;	psrlw xmm5, 8	
+	pavgb xmm1, xmm5			; 1st line, bytes 16..31: pair averages
+
+	movdqa xmm4, xmm2
+	pshufb xmm2, xmm7
+	pshufb xmm4, xmm6
+;	psubb xmm4, xmm2
+;	psrlw xmm4, 8	
+	pavgb xmm2, xmm4			; 2nd line, bytes 0..15: pair averages
+
+	movdqa xmm5, xmm3
+	pshufb xmm3, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm3
+;	psrlw xmm5, 8	
+	pavgb xmm3, xmm5			; 2nd line, bytes 16..31: pair averages
+	
+	packuswb xmm0, xmm1			; 1st line: pack the 16 word-lane averages into 16 bytes
+	packuswb xmm2, xmm3			; 2nd line: pack the 16 word-lane averages into 16 bytes
+	pavgb xmm0, xmm2			; vertical average of the two lines = 16 output bytes
+
+	; write pDst
+	movdqa [edi], xmm0			; movdqa: the destination address must be 16-byte aligned
+
+	; advance to the next 32 source bytes / 16 destination bytes
+	lea esi, [esi+32]
+	lea edi, [edi+16]
+
+	dec eax
+	jg near .xloops		; repeat while the x counter is still > 0 (the body always runs at least once)
+
+	; next line
+	lea esi, [esi+2*ecx]	; advance two source rows
+	lea esi, [esi+2*ebx]	; rewind by 2 * iDstWidth (the bytes consumed above when iDstWidth is a multiple of 16)
+	lea edi, [edi+edx]	; advance one destination row
+	lea edi, [edi+ebx]		; rewind by iDstWidth (the bytes written above when iDstWidth is a multiple of 16)
+
+	dec ebp
+	jg near .yloops		; repeat while output rows remain
+	
+	pop ebp
+	pop	edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_ssse3
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx16_ssse3( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight );
+;	2:1 downsample in x and y: each output byte is the pavgb (round-up) average of a 2x2 source block.
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_ssse3:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; 5 pushes + return address = 24 bytes, so the first stack argument is at [esp+24]
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride	
+	mov ebp, [esp+44]	; iSrcHeight
+	
+	sar ebp, $1		; ebp = iSrcHeight >> 1 = number of output rows
+	movdqa xmm7, [shufb_mask_low]	; pshufb mask; per the lane diagrams below it keeps the low byte of each pair
+	movdqa xmm6, [shufb_mask_high]	; pshufb mask; per the lane diagrams below it keeps the high byte of each pair
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1		; eax = iSrcWidth >> 1 = iDstWidth
+	mov ebx, eax		; keep iDstWidth in ebx (negated below)
+	sar eax, $3		; eax = iDstWidth / 8 = number of .xloops passes per row
+	neg ebx			; ebx = -iDstWidth, used to rewind esi/edi after the row
+	; each .xloops pass reads 16 source bytes from each of two rows and writes 8 output bytes
+.xloops:
+	; horizontal loop: 16 source bytes per pass
+	;               mem  hi<-       ->lo
+	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
+	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
+	movdqa xmm0, [esi]			; 1st_src_line (aligned load: esi must be 16-byte aligned)
+	movdqa xmm1, [esi+ecx]		; 2nd_src_line (aligned load: esi+iSrcStride must be 16-byte aligned)
+	
+	; packing & avg
+	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	; alternative (disabled) way to extract the high bytes into xmm2
+;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a	
+	pavgb xmm0, xmm2			; line 1: horizontal average of every byte pair, one result per word
+
+	movdqa xmm3, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm3, xmm6
+;	psubb xmm3, xmm1
+;	psrlw xmm3, 8	
+	pavgb xmm1, xmm3			; line 2: horizontal average of every byte pair, one result per word
+
+	pavgb xmm0, xmm1			; vertical average of the two horizontally averaged lines
+	packuswb xmm0, xmm1			; words -> bytes; the 8 results from xmm0 land in the low qword
+
+	; write pDst
+	movq [edi], xmm0			; store the 8 output pixels (low qword only)
+
+	; next chunk: 16 source bytes / 8 destination bytes further
+	lea esi, [esi+16]
+	lea edi, [edi+8]
+
+	dec eax
+	jg near .xloops				; do-while: the body always runs at least once per row
+
+	; next line
+	lea esi, [esi+2*ecx]	; skip two source rows
+	lea esi, [esi+2*ebx]	; rewind to the start of the row [- 2 * iDstWidth]
+	lea edi, [edi+edx]		; skip one destination row
+	lea edi, [edi+ebx]		; rewind to the start of the row [- iDstWidth]
+
+	dec ebp
+	jg near .yloops			; do-while: at least one output row is always produced
+	
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+; got about 65% improvement over DyadicBilinearDownsamplerWidthx32_sse
+WELS_EXTERN DyadicBilinearDownsamplerWidthx32_sse4
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx32_sse4(	unsigned char* pDst, const int iDstStride,
+;					unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight );
+;	2:1 downsample in x and y, 32 source bytes per pass; source rows are read with movntdqa (SSE4.1).
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx32_sse4:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; 5 pushes + return address = 24 bytes, so the first stack argument is at [esp+24]
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride	
+	mov ebp, [esp+44]	; iSrcHeight
+	
+	sar ebp, $1			; ebp = iSrcHeight >> 1 = number of output rows
+
+	movdqa xmm7, [shufb_mask_low]	; pshufb mask; per the lane diagrams below it keeps the low byte of each pair
+	movdqa xmm6, [shufb_mask_high]	; pshufb mask; per the lane diagrams below it keeps the high byte of each pair
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1			; eax = iSrcWidth >> 1 = iDstWidth
+	mov ebx, eax		; keep iDstWidth in ebx (negated below)
+	sar eax, $4			; eax = iDstWidth / 16 = number of .xloops passes per row
+	neg ebx				; ebx = -iDstWidth, used to rewind esi/edi after the row
+	; each .xloops pass reads 32 source bytes from each of two rows and writes 16 output bytes
+.xloops:
+	; horizontal loop: 2 x 16 source bytes per row and pass
+	;               mem  hi<-       ->lo
+	;1st Line Src:	xmm0: h H g G f F e E d D c C b B a A
+	;				xmm1: p P o O n N m M l L k K j J i I
+	;2nd Line Src:	xmm2: h H g G f F e E d D c C b B a A
+	;				xmm3: p P o O n N m M l L k K j J i I
+	;=> target:
+	;: P O N M L K J I H G F E D C B A
+	;: p o n m l k j i h g f e d c b a
+	;: P ..                          A
+	;: p ..                          a
+	
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
+	movntdqa xmm0, [esi]			; 1st_src_line (streaming load, needs a 16-byte aligned address)
+	movntdqa xmm1, [esi+16]		; 1st_src_line + 16
+	movntdqa xmm2, [esi+ecx]		; 2nd_src_line
+	movntdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16	
+	
+	; packing & avg
+	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm4, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+;	psubb xmm4, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm4, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm4			; line 1, bytes 0..15: horizontal average of every byte pair
+
+	movdqa xmm5, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm1
+;	psrlw xmm5, 8
+	pavgb xmm1, xmm5			; line 1, bytes 16..31: horizontal average of every byte pair
+
+	movdqa xmm4, xmm2
+	pshufb xmm2, xmm7
+	pshufb xmm4, xmm6
+;	psubb xmm4, xmm2
+;	psrlw xmm4, 8
+	pavgb xmm2, xmm4			; line 2, bytes 0..15: horizontal average of every byte pair
+
+	movdqa xmm5, xmm3
+	pshufb xmm3, xmm7
+	pshufb xmm5, xmm6
+;	psubb xmm5, xmm3
+;	psrlw xmm5, 8
+	pavgb xmm3, xmm5			; line 2, bytes 16..31: horizontal average of every byte pair
+	
+	packuswb xmm0, xmm1			; line 1: 16 horizontal averages packed to bytes
+	packuswb xmm2, xmm3			; line 2: 16 horizontal averages packed to bytes
+	pavgb xmm0, xmm2			; vertical average of the two lines
+
+	; write pDst
+	movdqa [edi], xmm0			; aligned store of 16 output pixels (edi must be 16-byte aligned)
+
+	; next chunk: 32 source bytes / 16 destination bytes further
+	lea esi, [esi+32]
+	lea edi, [edi+16]
+
+	dec eax
+	jg near .xloops				; do-while: the body always runs at least once per row
+
+	; next line
+	lea esi, [esi+2*ecx]	; skip two source rows
+	lea esi, [esi+2*ebx]	; rewind to the start of the row [- 2 * iDstWidth]
+	lea edi, [edi+edx]		; skip one destination row
+	lea edi, [edi+ebx]		; rewind to the start of the row [- iDstWidth]
+
+	dec ebp
+	jg near .yloops			; do-while: at least one output row is always produced
+	
+	pop ebp
+	pop	edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+WELS_EXTERN DyadicBilinearDownsamplerWidthx16_sse4
+;***********************************************************************
+;	void DyadicBilinearDownsamplerWidthx16_sse4( unsigned char* pDst, const int iDstStride,
+;					  unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight );
+;	2:1 downsample in x and y, 16 source bytes per pass; source rows are read with movntdqa (SSE4.1).
+;***********************************************************************
+ALIGN 16
+DyadicBilinearDownsamplerWidthx16_sse4:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; 5 pushes + return address = 24 bytes, so the first stack argument is at [esp+24]
+	mov edi, [esp+24]	; pDst
+	mov edx, [esp+28]	; iDstStride
+	mov esi, [esp+32]	; pSrc
+	mov ecx, [esp+36]	; iSrcStride	
+	mov ebp, [esp+44]	; iSrcHeight
+	
+	sar ebp, $1		; ebp = iSrcHeight >> 1 = number of output rows
+	movdqa xmm7, [shufb_mask_low]	; pshufb mask; per the lane diagrams below it keeps the low byte of each pair
+	movdqa xmm6, [shufb_mask_high]	; pshufb mask; per the lane diagrams below it keeps the high byte of each pair
+
+.yloops:
+	mov eax, [esp+40]	; iSrcWidth
+	sar eax, $1		; eax = iSrcWidth >> 1 = iDstWidth
+	mov ebx, eax		; keep iDstWidth in ebx (negated below)
+	sar eax, $3		; eax = iDstWidth / 8 = number of .xloops passes per row
+	neg ebx			; ebx = -iDstWidth, used to rewind esi/edi after the row
+	; each .xloops pass reads 16 source bytes from each of two rows and writes 8 output bytes
+.xloops:
+	; horizontal loop: 16 source bytes per pass
+	;               mem  hi<-       ->lo
+	;1st line pSrc:	xmm0: h H g G f F e E d D c C b B a A
+	;2nd line pSrc:  xmm1: p P o O n N m M l L k K j J i I
+	;=> target:
+	;: H G F E D C B A, P O N M L K J I
+	;: h g f e d c b a, p o n m l k j i
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
+	movntdqa xmm0, [esi]			; 1st_src_line (streaming load, needs a 16-byte aligned address)
+	movntdqa xmm1, [esi+ecx]		; 2nd_src_line (streaming load, needs a 16-byte aligned address)
+	
+	; packing & avg
+	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
+	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
+	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
+;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
+	pavgb xmm0, xmm2			; line 1: horizontal average of every byte pair, one result per word
+
+	movdqa xmm3, xmm1
+	pshufb xmm1, xmm7
+	pshufb xmm3, xmm6
+;	psubb xmm3, xmm1
+;	psrlw xmm3, 8
+	pavgb xmm1, xmm3			; line 2: horizontal average of every byte pair, one result per word
+
+	pavgb xmm0, xmm1			; vertical average of the two horizontally averaged lines
+	packuswb xmm0, xmm1			; words -> bytes; the 8 results from xmm0 land in the low qword
+
+	; write pDst
+	movq [edi], xmm0			; store the 8 output pixels (low qword only)
+
+	; next chunk: 16 source bytes / 8 destination bytes further
+	lea esi, [esi+16]
+	lea edi, [edi+8]
+
+	dec eax
+	jg near .xloops				; do-while: the body always runs at least once per row
+
+	; next line
+	lea esi, [esi+2*ecx]	; skip two source rows
+	lea esi, [esi+2*ebx]	; rewind to the start of the row [- 2 * iDstWidth]
+	lea edi, [edi+edx]		; skip one destination row
+	lea edi, [edi+ebx]		; rewind to the start of the row [- iDstWidth]
+
+	dec ebp
+	jg near .yloops			; do-while: at least one output row is always produced
+	
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+
+
+
+
+
+WELS_EXTERN	GeneralBilinearAccurateDownsampler_sse2
+;**************************************************************************************************************
+;int GeneralBilinearAccurateDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+;							unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+;                           unsigned int uiScaleX, unsigned int uiScaleY );
+; Bilinear resample using Q15 source positions; the last column of every row and the whole last row are point-sampled.
+;**************************************************************************************************************
+; All arguments are read from the stack; iSrcWidth/iSrcHeight are never read; eax is not loaded with a return value.
+ALIGN 16
+GeneralBilinearAccurateDownsampler_sse2:
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+%define		pushsize	16
+%define		localsize	28
+%define		pDstData		esp + pushsize + localsize + 4
+%define		dwDstStride		esp + pushsize + localsize + 8
+%define		dwDstWidth		esp + pushsize + localsize + 12
+%define		dwDstHeight		esp + pushsize + localsize + 16
+%define		pSrcData		esp + pushsize + localsize + 20
+%define		dwSrcStride		esp + pushsize + localsize + 24
+%define		dwSrcWidth		esp + pushsize + localsize + 28
+%define		dwSrcHeight		esp + pushsize + localsize + 32
+%define		scale			esp + 0
+%define		uiScaleX			esp + pushsize + localsize + 36
+%define		uiScaleY			esp + pushsize + localsize + 40
+%define		tmpHeight		esp + 12
+%define		yInverse		esp + 16
+%define		xInverse		esp + 20
+%define		dstStep			esp + 24
+	sub		esp,			localsize
+	
+	pxor	xmm0,	xmm0
+	; per-pixel u increment: the 15 fractional bits of uiScaleX and their negation (both mod 32768)
+	mov		eax,	[uiScaleX]
+	and		eax,	32767
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	32767
+	movd	xmm1,		eax						; uinc(uiScaleX mod 32768)
+	movd	xmm2,		ebx						; -uinc (mod 32768)
+	psllq	xmm1,		32
+	por		xmm1,		xmm2					; 0 0  uinc  -uinc   (dword)
+	pshufd	xmm7,		xmm1,	01000100b		; xmm7: uinc -uinc uinc -uinc
+	
+	mov		eax,	[uiScaleY]
+	and		eax,	32767
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	32767
+	movd	xmm6,		eax						; vinc(uiScaleY mod 32768)
+	movd	xmm2,		ebx						; -vinc (mod 32768)
+	psllq	xmm6,		32
+	por		xmm6,		xmm2					; 0 0 vinc -vinc (dword)
+	pshufd	xmm6,		xmm6,	01010000b		; xmm6: vinc vinc -vinc -vinc
+	
+	mov		edx,		40003fffh				; two words: 16384 (high) and 16383 (low)
+	movd	xmm5,		edx
+	punpcklwd	xmm5,	xmm0					; 16384 16383
+	pshufd	xmm5,		xmm5,	01000100b		; xmm5: 16384 16383 16384 16383
+	
+
+DOWNSAMPLE:
+	
+	mov		eax,			[dwDstHeight]
+	mov		edi,			[pDstData]
+	mov		edx,			[dwDstStride]
+	mov		ecx,			[dwDstWidth]
+	sub		edx,			ecx
+	mov		[dstStep],	edx				; stride - width
+	dec		eax
+	mov		[tmpHeight],	eax				; rows produced by the interpolating loop = iDstHeight - 1
+	mov		eax,			16384
+	mov		[yInverse],		eax				; vertical source position starts at 0.5 (Q15)
+	pshufd	xmm4,		xmm5,	01010000b	; initial v to 16384 16384 16383 16383
+	cmp		dword [tmpHeight],	0
+	jle		near LAST_ROW				; iDstHeight <= 1: HEIGHT is a do-while and would emit an extra row
+HEIGHT:	
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15					; integer source row
+	mul		dword [dwSrcStride]
+	add		esi,	eax					; get current row address
+	mov		ebp,	esi
+	add		ebp,	[dwSrcStride]		; ebp = address of the source row below
+	
+	mov		eax,		16384
+	mov		[xInverse],		eax			; horizontal source position starts at 0.5 (Q15)
+	mov		ecx,			[dwDstWidth]
+	dec		ecx							; columns produced by the interpolating loop = iDstWidth - 1
+	jle		near WIDTH_END				; iDstWidth <= 1: "loop" with ecx = 0 would wrap and run 2^32 times
+	movdqa	xmm3,		xmm5			; initial u to 16384 16383 16384 16383
+	
+WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		15				; integer source column
+	
+	movd	xmm1,		[esi+eax]		; xxxxxxba
+	movd	xmm2,		[ebp+eax]		; xxxxxxdc
+	pxor	xmm0,		xmm0
+	punpcklwd	xmm1,	xmm2			; xxxxdcba
+	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
+	punpcklwd	xmm1,	xmm0			; 000d000c000b000a
+	
+	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
+	pmaddwd	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
+	movdqa	xmm0,	xmm2
+	pmuludq	xmm2,	xmm1	; weight * pixel for dwords 0 and 2 (a, c)
+	psrlq	xmm0,	32		; bring weights 1 and 3 down to the even dword slots
+	psrlq	xmm1,	32		; bring pixels b and d down to the even dword slots
+	pmuludq	xmm0,	xmm1	; weight * pixel for b and d
+	paddq	xmm2,	xmm0
+	pshufd	xmm1,	xmm2,	00001110b	; high qword -> low qword
+	paddq	xmm2,	xmm1	; low qword = sum of the four weighted pixels
+	psrlq	xmm2,	29		; drop 29 of the 30 fractional bits; the last one is used for rounding
+	
+	movd	eax,	xmm2
+	inc		eax
+	shr		eax,	1		; round to nearest
+	mov		[edi],	al
+	inc		edi
+	
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax				; advance the horizontal source position
+	
+	paddw	xmm3,		xmm7			; inc u
+	psllw	xmm3,		1
+	psrlw	xmm3,		1				; clear bit 15 of every word: u stays mod 32768
+	
+	loop	WIDTH
+
+WIDTH_END:
+	mov		eax,		[xInverse]
+	shr		eax,		15
+	mov		cl,			[esi+eax]		; last column: copy the nearest source pixel, no interpolation
+	mov		[edi],		cl
+	inc		edi
+	
+	mov		eax,		[uiScaleY]
+	add		[yInverse],	eax				; advance the vertical source position
+	add		edi,		[dstStep]		; skip the destination row padding
+	
+	paddw	xmm4,	xmm6				; inc v
+	psllw	xmm4,	1
+	psrlw	xmm4,	1					; clear bit 15 of every word: v stays mod 32768
+	
+	dec		dword [tmpHeight]
+	jg		HEIGHT
+
+
+LAST_ROW:	
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15
+	mul		dword [dwSrcStride]
+	add		esi,	eax					; get current row address
+	
+	mov		eax,		16384
+	mov		[xInverse],		eax
+	mov		ecx,			[dwDstWidth]	; NOTE(review): iDstWidth is assumed >= 1, "loop" wraps on ecx = 0
+	
+LAST_ROW_WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		15
+	
+	mov		al,			[esi+eax]		; last row: copy the nearest source pixel, no interpolation
+	mov		[edi],	al
+	inc		edi
+	
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax
+	
+	loop	LAST_ROW_WIDTH
+
+LAST_ROW_END:
+
+	add		esp,			localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		pushsize
+%undef		localsize
+%undef		pSrcData
+%undef		dwSrcWidth
+%undef		dwSrcHeight
+%undef		dwSrcStride
+%undef		pDstData
+%undef		dwDstWidth
+%undef		dwDstHeight
+%undef		dwDstStride
+%undef		scale
+%undef		uiScaleX
+%undef		uiScaleY
+%undef		tmpHeight
+%undef		yInverse
+%undef		xInverse
+%undef		dstStep
+	ret
+	
+	
+	
+	
+WELS_EXTERN	GeneralBilinearFastDownsampler_sse2
+;**************************************************************************************************************
+;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
+;				unsigned char* pSrc, const int iSrcStride, const int iSrcWidth, const int iSrcHeight,
+;               unsigned int uiScaleX, unsigned int uiScaleY );
+; Bilinear resample with 16-bit word weights (x position in Q16, y position in Q15); last column/row are point-sampled.
+;**************************************************************************************************************
+; All arguments are read from the stack; iSrcWidth/iSrcHeight are never read; eax is not loaded with a return value.
+ALIGN 16
+GeneralBilinearFastDownsampler_sse2:
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+%define		pushsize	16
+%define		localsize	28
+%define		pDstData		esp + pushsize + localsize + 4
+%define		dwDstStride		esp + pushsize + localsize + 8
+%define		dwDstWidth		esp + pushsize + localsize + 12
+%define		dwDstHeight		esp + pushsize + localsize + 16
+%define		pSrcData		esp + pushsize + localsize + 20
+%define		dwSrcStride		esp + pushsize + localsize + 24
+%define		dwSrcWidth		esp + pushsize + localsize + 28
+%define		dwSrcHeight		esp + pushsize + localsize + 32
+%define		scale			esp + 0
+%define		uiScaleX			esp + pushsize + localsize + 36
+%define		uiScaleY			esp + pushsize + localsize + 40
+%define		tmpHeight		esp + 12
+%define		yInverse		esp + 16
+%define		xInverse		esp + 20
+%define		dstStep			esp + 24
+	sub		esp,			localsize
+	
+	pxor	xmm0,	xmm0						; xmm0 stays zero for the whole function
+	mov		edx,	65535
+	mov		eax,	[uiScaleX]
+	and		eax,	edx
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	65535
+	movd	xmm1,		eax						; uinc(uiScaleX mod 65536)
+	movd	xmm2,		ebx						; -uinc (mod 65536)
+	psllq	xmm1,		32
+	por		xmm1,		xmm2					; 0 uinc 0 -uinc
+	pshuflw	xmm7,		xmm1,	10001000b		; xmm7: uinc -uinc uinc -uinc
+	
+	mov		eax,	[uiScaleY]
+	and		eax,	32767
+	mov		ebx,	eax
+	neg		ebx
+	and		ebx,	32767
+	movd	xmm6,		eax						; vinc(uiScaleY mod 32768)
+	movd	xmm2,		ebx						; -vinc (mod 32768)
+	psllq	xmm6,		32
+	por		xmm6,		xmm2					; 0 vinc 0 -vinc
+	pshuflw	xmm6,		xmm6,	10100000b		; xmm6: vinc vinc -vinc -vinc
+	
+	mov		edx,		80007fffh				; 32768 32767
+	movd	xmm5,		edx					
+	pshuflw	xmm5,		xmm5,		01000100b	; 32768 32767 32768 32767
+	mov		ebx,		16384					; rounding term (0.5 in Q15) added before the final >> 15
+	
+
+FAST_DOWNSAMPLE:
+	
+	mov		eax,			[dwDstHeight]
+	mov		edi,			[pDstData]
+	mov		edx,			[dwDstStride]
+	mov		ecx,			[dwDstWidth]
+	sub		edx,			ecx
+	mov		[dstStep],	edx				; stride - width
+	dec		eax
+	mov		[tmpHeight],	eax				; rows produced by the interpolating loop = iDstHeight - 1
+	mov		eax,		16384
+	mov		[yInverse],		eax				; vertical source position starts at 0.5 (Q15)
+	pshuflw	xmm4,		xmm5,	01010000b
+	psrlw	xmm4,		1				; initial v to 16384 16384 16383 16383
+	cmp		dword [tmpHeight],	0
+	jle		near FAST_LAST_ROW			; iDstHeight <= 1: FAST_HEIGHT is a do-while and would emit an extra row
+FAST_HEIGHT:	
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15					; integer source row
+	mul		dword [dwSrcStride]
+	add		esi,	eax					; get current row address
+	mov		ebp,	esi
+	add		ebp,	[dwSrcStride]		; ebp = address of the source row below
+	
+	mov		eax,		32768
+	mov		[xInverse],		eax			; horizontal source position starts at 0.5 (Q16)
+	mov		ecx,			[dwDstWidth]
+	dec		ecx							; columns produced by the interpolating loop = iDstWidth - 1
+	jle		near FAST_WIDTH_END			; iDstWidth <= 1: "loop" with ecx = 0 would wrap and run 2^32 times
+	movdqa	xmm3,		xmm5			; initial u to 32768 32767 32768 32767
+	
+FAST_WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		16				; integer source column
+	
+	movd	xmm1,		[esi+eax]		; xxxxxxba
+	movd	xmm2,		[ebp+eax]		; xxxxxxdc
+	punpcklwd	xmm1,	xmm2			; xxxxdcba
+	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
+	
+	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
+	pmulhuw	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
+	pmaddwd		xmm2,	xmm1	; weight * pixel, adjacent pairs summed into two dwords
+	pshufd	xmm1,	xmm2,	00000001b	; dword 1 -> dword 0
+	paddd	xmm2,	xmm1	; dword 0 = sum of the four weighted pixels
+	movd	xmm1,	ebx
+	paddd	xmm2,	xmm1	; + 16384 for round-to-nearest
+	psrld	xmm2,	15
+	
+	packuswb	xmm2,	xmm0
+	movd	eax,	xmm2
+	mov		[edi],	al
+	inc		edi
+	
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax				; advance the horizontal source position
+	
+	paddw	xmm3,		xmm7			; inc u
+	
+	loop	FAST_WIDTH
+
+FAST_WIDTH_END:
+	mov		eax,		[xInverse]
+	shr		eax,		16
+	mov		cl,			[esi+eax]		; last column: copy the nearest source pixel, no interpolation
+	mov		[edi],		cl
+	inc		edi
+	
+	mov		eax,		[uiScaleY]
+	add		[yInverse],	eax				; advance the vertical source position
+	add		edi,		[dstStep]		; skip the destination row padding
+	
+	paddw	xmm4,	xmm6				; inc v
+	psllw	xmm4,	1
+	psrlw	xmm4,	1					; clear bit 15 of every word: v stays mod 32768
+	
+	dec		dword [tmpHeight]
+	jg		FAST_HEIGHT
+
+
+FAST_LAST_ROW:	
+	mov		eax,	[yInverse]
+	mov		esi,	[pSrcData]
+	shr		eax,	15
+	mul		dword [dwSrcStride]
+	add		esi,	eax					; get current row address
+	
+	mov		eax,		32768
+	mov		[xInverse],		eax
+	mov		ecx,			[dwDstWidth]	; NOTE(review): iDstWidth is assumed >= 1, "loop" wraps on ecx = 0
+	
+FAST_LAST_ROW_WIDTH:
+	mov		eax,		[xInverse]
+	shr		eax,		16
+	
+	mov		al,			[esi+eax]		; last row: copy the nearest source pixel, no interpolation
+	mov		[edi],	al
+	inc		edi
+	
+	mov		eax,		[uiScaleX]
+	add		[xInverse],	eax
+	
+	loop	FAST_LAST_ROW_WIDTH
+
+FAST_LAST_ROW_END:
+
+	add		esp,			localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		pushsize
+%undef		localsize
+%undef		pSrcData
+%undef		dwSrcWidth
+%undef		dwSrcHeight
+%undef		dwSrcStride
+%undef		pDstData
+%undef		dwDstWidth
+%undef		dwDstHeight
+%undef		dwDstStride
+%undef		scale
+%undef		uiScaleX
+%undef		uiScaleY
+%undef		tmpHeight
+%undef		yInverse
+%undef		xInverse
+%undef		dstStep
 	ret
\ No newline at end of file
--- a/processing/src/asm/intra_pred.asm
+++ b/processing/src/asm/intra_pred.asm
@@ -1,145 +1,145 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  intra_pred.asm
-;*
-;*  Abstract
-;*      sse2 function for intra predict operations
-;*
-;*  History
-;*      18/09/2009 Created
-;*
-;*
-;*************************************************************************/
-%include "../../src/asm/asm_inc.asm"
-
-BITS 32
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-%ifdef FORMAT_COFF
-SECTION .rodata data
-%else
-SECTION .rodata align=16
-%endif
-
-
-align 16
-mmx_01bytes:		times 16	db 1
-
-;***********************************************************************
-; macros
-;***********************************************************************
-%macro  COPY_16_TIMES 2
-		movdqa		%2,	[%1-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
-%endmacro
-
-%macro  COPY_16_TIMESS 3
-		movdqa		%2,	[%1+%3-16]
-		psrldq		%2,	15
-		pmuludq		%2,	[mmx_01bytes]
-		pshufd		%2,	%2, 0
-%endmacro
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-;***********************************************************************
-; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-
-%macro SSE2_PRED_H_16X16_TWO_LINE 1
-    lea     eax,	[eax+ecx*2]
-    
-    COPY_16_TIMES eax,	xmm0
-    movdqa  [edx+%1],	xmm0
-    COPY_16_TIMESS eax,	xmm0,	ecx
-    movdqa  [edx+%1+0x10],	xmm0
-%endmacro
-
-WELS_EXTERN WelsI16x16LumaPredH_sse2
-WelsI16x16LumaPredH_sse2:
-    mov     edx, [esp+4]    ; pred
-    mov     eax, [esp+8]	; pRef
-    mov     ecx, [esp+12]   ; stride
-    
-    COPY_16_TIMES eax,	xmm0
-    movdqa  [edx],		xmm0
-    COPY_16_TIMESS eax,	xmm0,	ecx
-    movdqa  [edx+0x10],	xmm0
-    
-	SSE2_PRED_H_16X16_TWO_LINE   0x20 
-	SSE2_PRED_H_16X16_TWO_LINE   0x40
-	SSE2_PRED_H_16X16_TWO_LINE   0x60
-	SSE2_PRED_H_16X16_TWO_LINE   0x80
-	SSE2_PRED_H_16X16_TWO_LINE   0xa0
-	SSE2_PRED_H_16X16_TWO_LINE   0xc0
-	SSE2_PRED_H_16X16_TWO_LINE   0xe0
-   
-    ret
-    
-;***********************************************************************
-; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
-;***********************************************************************
-WELS_EXTERN WelsI16x16LumaPredV_sse2
-WelsI16x16LumaPredV_sse2:
-    mov     edx, [esp+4]    ; pred
-    mov     eax, [esp+8]	; pRef
-    mov     ecx, [esp+12]   ; stride
-    
-    sub     eax, ecx
-    movdqa  xmm0, [eax]
-    
-    movdqa  [edx], xmm0
-    movdqa  [edx+10h], xmm0
-    movdqa  [edx+20h], xmm0
-    movdqa  [edx+30h], xmm0
-    movdqa  [edx+40h], xmm0
-    movdqa  [edx+50h], xmm0
-    movdqa  [edx+60h], xmm0
-    movdqa  [edx+70h], xmm0
-    movdqa  [edx+80h], xmm0
-    movdqa  [edx+90h], xmm0
-    movdqa  [edx+160], xmm0 
-	movdqa  [edx+176], xmm0
-    movdqa  [edx+192], xmm0
-    movdqa  [edx+208], xmm0
-    movdqa  [edx+224], xmm0
-    movdqa  [edx+240], xmm0
-    
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  intra_pred.asm
+;*
+;*  Abstract
+;*      sse2 function for intra predict operations
+;*
+;*  History
+;*      18/09/2009 Created
+;*
+;*
+;*************************************************************************/
+%include "../../src/asm/asm_inc.asm"
+
+BITS 32
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+%ifdef FORMAT_COFF
+SECTION .rodata data
+%else
+SECTION .rodata align=16
+%endif
+
+
+align 16
+mmx_01bytes:		times 16	db 1
+
+;***********************************************************************
+; macros
+;***********************************************************************
+%macro  COPY_16_TIMES 2	; %1 = address register, %2 = xmm destination: fill %2 with 16 copies of the byte at [%1-1]
+		movdqa		%2,	[%1-16]			; aligned load of the 16 bytes ending at [%1-1]
+		psrldq		%2,	15				; keep only the top byte, shifted down to byte 0
+		pmuludq		%2,	[mmx_01bytes]	; byte * 0x01010101: replicate it across the low dword
+		pshufd		%2,	%2, 0			; broadcast the low dword to all four dwords
+%endmacro
+
+%macro  COPY_16_TIMESS 3	; %1 = address register, %2 = xmm destination, %3 = offset register: broadcast byte [%1+%3-1]
+		movdqa		%2,	[%1+%3-16]		; aligned load of the 16 bytes ending at [%1+%3-1]
+		psrldq		%2,	15				; keep only the top byte, shifted down to byte 0
+		pmuludq		%2,	[mmx_01bytes]	; byte * 0x01010101: replicate it across the low dword
+		pshufd		%2,	%2, 0			; broadcast the low dword to all four dwords
+%endmacro
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+;***********************************************************************
+; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+
+%macro SSE2_PRED_H_16X16_TWO_LINE 1	; %1 = byte offset into pred (edx) of the first of the two rows written
+    lea     eax,	[eax+ecx*2]		; advance pRef by two rows (ecx = stride)
+    ; each row is filled with the byte just left of its reference position, [eax-1] resp. [eax+ecx-1]
+    COPY_16_TIMES eax,	xmm0
+    movdqa  [edx+%1],	xmm0
+    COPY_16_TIMESS eax,	xmm0,	ecx
+    movdqa  [edx+%1+0x10],	xmm0
+%endmacro
+
+WELS_EXTERN WelsI16x16LumaPredH_sse2
+WelsI16x16LumaPredH_sse2:
+    mov     edx, [esp+4]    ; pred
+    mov     eax, [esp+8]	; pRef
+    mov     ecx, [esp+12]   ; stride
+    ; rows 0 and 1: broadcast the byte left of each row ([pRef-1], [pRef+stride-1]) into 16 bytes of pred
+    COPY_16_TIMES eax,	xmm0
+    movdqa  [edx],		xmm0
+    COPY_16_TIMESS eax,	xmm0,	ecx
+    movdqa  [edx+0x10],	xmm0
+    ; rows 2..15, two per macro call; pred rows are 16 bytes apart and written with aligned stores
+	SSE2_PRED_H_16X16_TWO_LINE   0x20 
+	SSE2_PRED_H_16X16_TWO_LINE   0x40
+	SSE2_PRED_H_16X16_TWO_LINE   0x60
+	SSE2_PRED_H_16X16_TWO_LINE   0x80
+	SSE2_PRED_H_16X16_TWO_LINE   0xa0
+	SSE2_PRED_H_16X16_TWO_LINE   0xc0
+	SSE2_PRED_H_16X16_TWO_LINE   0xe0
+   
+    ret
+    
+;***********************************************************************
+; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
+;***********************************************************************
+WELS_EXTERN WelsI16x16LumaPredV_sse2
+WelsI16x16LumaPredV_sse2:
+    mov     edx, [esp+4]    ; pred
+    mov     eax, [esp+8]	; pRef
+    mov     ecx, [esp+12]   ; stride
+    ; vertical prediction: the 16 bytes of the row above pRef are copied into all 16 rows of pred
+    sub     eax, ecx        ; eax = pRef - stride (row above)
+    movdqa  xmm0, [eax]     ; aligned load of that row
+    ; 16 aligned stores, 16 bytes apart (offsets 0x00..0x90 in hex, then 160..240 in decimal)
+    movdqa  [edx], xmm0
+    movdqa  [edx+10h], xmm0
+    movdqa  [edx+20h], xmm0
+    movdqa  [edx+30h], xmm0
+    movdqa  [edx+40h], xmm0
+    movdqa  [edx+50h], xmm0
+    movdqa  [edx+60h], xmm0
+    movdqa  [edx+70h], xmm0
+    movdqa  [edx+80h], xmm0
+    movdqa  [edx+90h], xmm0
+    movdqa  [edx+160], xmm0 
+	movdqa  [edx+176], xmm0
+    movdqa  [edx+192], xmm0
+    movdqa  [edx+208], xmm0
+    movdqa  [edx+224], xmm0
+    movdqa  [edx+240], xmm0
+    
     ret
\ No newline at end of file
--- a/processing/src/asm/sad.asm
+++ b/processing/src/asm/sad.asm
@@ -1,79 +1,79 @@
-;*!
-;* \copy
-;*     Copyright (c)  2009-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*  pixel_sse2.asm
-;*
-;*  Abstract
-;*      WelsSampleSad8x8_sse21
-;*
-;*  History
-;*      8/5/2009 Created
-;*
-;*
-;*************************************************************************/
-
-%include "asm_inc.asm"
-
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-%macro SAD_8x4 0
-	movq   xmm0,   [eax]
-	movq   xmm1,   [eax+ebx]
-	lea    eax,    [eax+2*ebx]
-	movhps xmm0,   [eax]
-	movhps xmm1,   [eax+ebx]
-
-	movq   xmm2,   [ecx]
-	movq   xmm3,   [ecx+edx]
-	lea    ecx,    [ecx+2*edx]
-	movhps xmm2,   [ecx]
-	movhps xmm3,   [ecx+edx]
-	psadbw xmm0,   xmm2
-	psadbw xmm1,   xmm3
-	paddw  xmm6,   xmm0
-	paddw  xmm6,   xmm1
-%endmacro
-
-
-  
-%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
-and    %1,  0x1f|(%3>>1)
-cmp    %1,  (32-%2)|(%3>>1)
-%endmacro
-
-
+;*!
+;* \copy
+;*     Copyright (c)  2009-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*  pixel_sse2.asm
+;*
+;*  Abstract
+;*      WelsSampleSad8x8_sse21
+;*
+;*  History
+;*      8/5/2009 Created
+;*
+;*
+;*************************************************************************/
+
+%include "asm_inc.asm"
+
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+%macro SAD_8x4 0	; adds the SAD of an 8x4 block at eax (stride ebx) vs. ecx (stride edx) into xmm6
+	movq   xmm0,   [eax]			; block 1, row 0
+	movq   xmm1,   [eax+ebx]		; block 1, row 1
+	lea    eax,    [eax+2*ebx]		; eax is left pointing at row 2 when the macro ends
+	movhps xmm0,   [eax]			; block 1, row 2 -> high half of xmm0
+	movhps xmm1,   [eax+ebx]		; block 1, row 3 -> high half of xmm1
+
+	movq   xmm2,   [ecx]			; block 2, row 0
+	movq   xmm3,   [ecx+edx]		; block 2, row 1
+	lea    ecx,    [ecx+2*edx]		; ecx is left pointing at row 2 when the macro ends
+	movhps xmm2,   [ecx]			; block 2, row 2 -> high half of xmm2
+	movhps xmm3,   [ecx+edx]		; block 2, row 3 -> high half of xmm3
+	psadbw xmm0,   xmm2				; SAD of rows 0 and 2 (one partial sum per qword)
+	psadbw xmm1,   xmm3				; SAD of rows 1 and 3 (one partial sum per qword)
+	paddw  xmm6,   xmm0				; accumulate; xmm0..xmm3 are clobbered
+	paddw  xmm6,   xmm1
+%endmacro
+
+
+  
+%macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline: destroys %1 and leaves the cmp flags for the caller's branch
+and    %1,  0x1f|(%3>>1)	; keep the address bits selected by 0x1f plus the (cacheline>>1) bit
+cmp    %1,  (32-%2)|(%3>>1)	; compare that offset with (32 - width) combined with the same (cacheline>>1) bit
+%endmacro
+
+
 %macro SSE2_GetSad8x4 0
 	movq   xmm0,   [eax]
 	movq   xmm1,   [eax+ebx]
@@ -90,12 +90,12 @@
 	psadbw xmm1,   xmm3
 	paddw  xmm6,   xmm0
 	paddw  xmm6,   xmm1
-%endmacro
+%endmacro
 
 
-;***********************************************************************
-; Code
-;***********************************************************************
+;***********************************************************************
+; Code
+;***********************************************************************
 SECTION .text
 
 WELS_EXTERN WelsSampleSad8x8_sse21
--- a/processing/src/asm/vaa.asm
+++ b/processing/src/asm/vaa.asm
@@ -1,1589 +1,1589 @@
-;*!
-;* \copy
-;*     Copyright (c)  2010-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-;*	vaa.asm
-;*
-;*	Abstract
-;*      sse2 for pVaa routines
-;*
-;*  History
-;*      04/14/2010	Created
-;*
-;*************************************************************************/
-%include "asm_inc.asm"
-BITS 32
-
-;***********************************************************************
-; Macros and other preprocessor constants
-;***********************************************************************
-
-;%macro SUM_SSE2	4	; dst, pSrc, zero, pack1_8x2
-;	movdqa %1, %2
-;	punpcklbw %1, %3
-;	punpckhbw %2, %3
-;	paddw %1, %2
-;	pmaddwd %1, %4
-;	pshufd %2, %1, 04Eh	; 01001110 B
-;	paddd %1, %2
-;	pshufd %2, %1, 0B1h	; 10110001 B
-;	paddd %1, %2
-;%endmacro	; END OF SUM_SSE2
-
-; by comparing it outperforms than phaddw(SSSE3) sets
-%macro SUM_WORD_8x2_SSE2	2	; dst(pSrc), tmp
-	; @sum_8x2 begin
-	pshufd %2, %1, 04Eh	; 01001110 B
-	paddw %1, %2
-	pshuflw %2, %1, 04Eh	; 01001110 B
-	paddw %1, %2
-	pshuflw %2, %1, 0B1h	; 10110001 B
-	paddw %1, %2
-	; end of @sum_8x2
-%endmacro	; END of SUM_WORD_8x2_SSE2
-
-%macro SUM_SQR_SSE2	3	; dst, pSrc, zero
-	movdqa %1, %2
-	punpcklbw %1, %3
-	punpckhbw %2, %3
-	pmaddwd %1, %1
-	pmaddwd %2, %2
-	paddd %1, %2
-	pshufd %2, %1, 04Eh	; 01001110 B
-	paddd %1, %2
-	pshufd %2, %1, 0B1h	; 10110001 B
-	paddd %1, %2
-%endmacro	; END OF SUM_SQR_SSE2
-
-%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
-	movdqa %1, [esi    ]	; line 0
-	movdqa %2, [esi+ecx]	; line 1
-	movdqa %3, %1
-	punpcklbw %1, xmm7
-	punpckhbw %3, xmm7
-	movdqa %4, %2
-	punpcklbw %4, xmm7
-	punpckhbw %2, xmm7
-	paddw %1, %4
-	paddw %2, %3
-	movdqa %3, [esi+ebx]	; line 2
-	movdqa %4, [esi+edx]	; line 3
-	movdqa %5, %3
-	punpcklbw %3, xmm7
-	punpckhbw %5, xmm7
-	movdqa %6, %4
-	punpcklbw %6, xmm7
-	punpckhbw %4, xmm7
-	paddw %3, %6
-	paddw %4, %5
-	paddw %1, %3	; block 0, 1
-	paddw %2, %4	; block 2, 3
-	pshufd %3, %1, 0B1h
-	pshufd %4, %2, 0B1h
-	paddw %1, %3
-	paddw %2, %4
-	movdqa %3, %1
-	movdqa %4, %2
-	pshuflw %5, %1, 0B1h
-	pshufhw %6, %3, 0B1h
-	paddw %1, %5
-	paddw %3, %6
-	pshuflw %5, %2, 0B1h
-	pshufhw %6, %4, 0B1h
-	paddw %2, %5
-	paddw %4, %6
-	punpcklwd %1, %2
-	punpckhwd %3, %4
-	punpcklwd %1, %3
-	psraw %1, $4
-%endmacro
-
-%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
-	movdqa %1, [esi    ]	; line 0
-	movdqa %2, [esi+ecx]	; line 1
-	movdqa %3, %1
-	punpcklbw %1, xmm7
-	punpckhbw %3, xmm7
-	movdqa %4, %2
-	punpcklbw %4, xmm7
-	punpckhbw %2, xmm7
-	paddw %1, %4
-	paddw %2, %3
-	movdqa %3, [esi+ebx]	; line 2
-	movdqa %4, [esi+edx]	; line 3
-	movdqa %5, %3
-	punpcklbw %3, xmm7
-	punpckhbw %5, xmm7
-	movdqa %6, %4
-	punpcklbw %6, xmm7
-	punpckhbw %4, xmm7
-	paddw %3, %6
-	paddw %4, %5
-	paddw %1, %3	; block 0, 1
-	paddw %2, %4	; block 2, 3
-	phaddw %1, %2	; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
-	phaddw %1, xmm7	; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
-	psraw %1, $4
-%endmacro
-
-%macro WELS_SAD_16x2_SSE2  0
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	movdqa	xmm3,	[esi+ebx]
-	movdqa	xmm4,	[edi+ebx]
-	psadbw	xmm1,	xmm2
-	psadbw	xmm3,	xmm4
-	paddd	xmm6,	xmm1
-	paddd	xmm6,	xmm3
-	lea		esi,	[esi+ebx*2]
-	lea		edi,	[edi+ebx*2]	
-%endmacro
-
-%macro	WELS_SAD_SUM_SQSUM_16x1_SSE2 0
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	movdqa	xmm3,	xmm1
-	psadbw	xmm3,	xmm2
-	paddd	xmm6,	xmm3
-	
-	movdqa	xmm3,	xmm1
-	psadbw	xmm3,	xmm0
-	paddd	xmm5,	xmm3
-	
-	movdqa		xmm2,	xmm1
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm2,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm2,	xmm2
-	paddd		xmm4,	xmm1
-	paddd		xmm4,	xmm2
-	
-	add		esi,	ebx
-	add		edi,	ebx
-%endmacro
-
-%macro	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 0
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	movdqa	xmm3,	xmm1
-	psadbw	xmm3,	xmm2
-	paddd	xmm7,	xmm3	; sad
-	
-	movdqa	xmm3,	xmm1
-	pmaxub	xmm3,	xmm2
-	pminub	xmm2,	xmm1
-	psubb	xmm3,	xmm2	; diff
-	
-	movdqa	xmm2,	xmm1
-	psadbw	xmm2,	xmm0
-	paddd	xmm6,	xmm2	; sum
-	
-	movdqa		xmm2,	xmm1
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm2,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm2,	xmm2
-	paddd		xmm5,	xmm1
-	paddd		xmm5,	xmm2	; sqsum
-	
-	movdqa		xmm1,	xmm3
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm3,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm3,	xmm3
-	paddd		xmm4,	xmm1
-	paddd		xmm4,	xmm3	; sqdiff
-	
-	add		esi,	ebx
-	add		edi,	ebx
-%endmacro
-
-%macro	WELS_SAD_SD_MAD_16x1_SSE2	4
-%define sad_reg			%1
-%define	sum_cur_reg		%2
-%define sum_ref_reg		%3
-%define	mad_reg			%4
-	movdqa	xmm1,		[esi]
-	movdqa	xmm2,		[edi]
-	movdqa	xmm3,		xmm1
-	psadbw	xmm3,		xmm0
-	paddd	sum_cur_reg,			xmm3	; sum_cur
-	movdqa	xmm3,		xmm2
-	psadbw	xmm3,		xmm0
-	paddd	sum_ref_reg,			xmm3	; sum_ref
-	
-	movdqa	xmm3,		xmm1
-	pmaxub	xmm3,		xmm2
-	pminub	xmm2,		xmm1
-	psubb	xmm3,		xmm2	; abs diff
-	pmaxub	mad_reg,	xmm3	; max abs diff
-	
-	psadbw	xmm3,		xmm0
-	paddd	sad_reg,	xmm3	; sad
-	
-	add			esi,		ebx
-	add			edi,		ebx
-%endmacro
-
-
-%macro	WELS_MAX_REG_SSE2	1	; xmm1, xmm2, xmm3 can be used
-%define max_reg  %1
-	movdqa	xmm1,		max_reg
-	psrldq	xmm1,		4
-	pmaxub	max_reg,	xmm1
-	movdqa	xmm1,		max_reg
-	psrldq	xmm1,		2
-	pmaxub	max_reg,	xmm1
-	movdqa	xmm1,		max_reg
-	psrldq	xmm1,		1
-	pmaxub	max_reg,	xmm1
-%endmacro
-
-%macro	WELS_SAD_BGD_SQDIFF_16x1_SSE2	4
-%define sad_reg		%1
-%define	sum_reg		%2
-%define mad_reg		%3
-%define sqdiff_reg	%4
-	movdqa		xmm1,		[esi]
-	movdqa		xmm2,		xmm1
-	movdqa		xmm3,		xmm1
-	punpcklbw	xmm2,		xmm0
-	punpckhbw	xmm3,		xmm0
-	pmaddwd		xmm2,		xmm2
-	pmaddwd		xmm3,		xmm3
-	paddd		xmm2,		xmm3
-	movdqa		xmm3,		xmm2
-	psllq		xmm2,		32
-	psrlq		xmm3,		32
-	psllq		xmm3,		32
-	paddd		xmm2,		xmm3
-	paddd		sad_reg,	xmm2		; sqsum
-	
-	movdqa	xmm2,		[edi]
-	movdqa	xmm3,		xmm1
-	psadbw	xmm3,		xmm0
-	paddd	sum_reg,			xmm3	; sum_cur
-	movdqa	xmm3,		xmm2
-	psadbw	xmm3,		xmm0
-	pslldq	xmm3,		4
-	paddd	sum_reg,			xmm3	; sum_ref
-	
-	movdqa	xmm3,		xmm1
-	pmaxub	xmm3,		xmm2
-	pminub	xmm2,		xmm1
-	psubb	xmm3,		xmm2	; abs diff
-	pmaxub	mad_reg,	xmm3	; max abs diff
-	
-	movdqa	xmm1,		xmm3
-	psadbw	xmm3,		xmm0
-	paddd	sad_reg,	xmm3	; sad
-
-	movdqa		xmm3,	xmm1
-	punpcklbw	xmm1,	xmm0
-	punpckhbw	xmm3,	xmm0
-	pmaddwd		xmm1,	xmm1
-	pmaddwd		xmm3,	xmm3
-	paddd		sqdiff_reg,	xmm1
-	paddd		sqdiff_reg,	xmm3	; sqdiff
-	
-	add		esi,	ebx
-	add		edi,	ebx
-%endmacro
-
-
-;***********************************************************************
-; Local Data (Read Only)
-;***********************************************************************
-
-;SECTION .rodata align=16
-
-;ALIGN 16
-;pack1_8x2:
-;	dw 1, 1, 1, 1, 1, 1, 1, 1
-
-;***********************************************************************
-; Code
-;***********************************************************************
-
-SECTION .text
-
-WELS_EXTERN rc_sad_frame_sse2
-;***********************************************************************
-;	uint32_t rc_sad_frame_sse2(	uint8_t *ref_orig, uint8_t *cur_orig, const int mb_width, const int iPicHeight, const int iPicStride );
-;***********************************************************************
-ALIGN 16
-rc_sad_frame_sse2:
-	push esi
-	push edi
-	push ebp
-	push ebx
-	push edx
-
-	mov esi, [esp+24]
-	mov edi, [esp+28]
-	mov ebx, [esp+32]
-	mov ecx, [esp+36]
-	mov edx, [esp+40]
-	pxor xmm0, xmm0	
-.hloop:
-	mov eax, ebx
-	mov ebp, $0
-.wloop:
-	movdqa xmm1, [esi+ebp]
-	movdqa xmm2, [edi+ebp]
-	psadbw xmm1, xmm2
-	pshufd xmm2, xmm1, 0f6h	; 11110110 B ; movhlps for float
-	paddd xmm1, xmm2
-	paddd xmm0, xmm1	
-	add ebp, 010h
-	dec eax
-	jnz near .wloop
-	lea esi, [esi+edx]
-	lea edi, [edi+edx]
-	dec ecx
-	jnz near .hloop
-
-	movd eax, xmm0
-	pop edx
-	pop ebx
-	pop ebp
-	pop edi
-	pop esi
-	ret
-
-
-WELS_EXTERN SampleVariance16x16_sse2
-;***********************************************************************
-;   void SampleVariance16x16_sse2(	uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
-;***********************************************************************
-ALIGN 16
-SampleVariance16x16_sse2:	
-	push esi
-	push edi
-	push ebx
-	
-	sub esp, 16
-	%define SUM			[esp]
-	%define SUM_CUR		[esp+4]
-	%define SQR			[esp+8]
-	%define SQR_CUR		[esp+12]
-	%define PUSH_SIZE	28	; 12 + 16	
-
-	mov edi, [esp+PUSH_SIZE+4]	; y_ref
-	mov edx, [esp+PUSH_SIZE+8]	; y_ref_stride	
-	mov esi, [esp+PUSH_SIZE+12]	; y_src
-	mov eax, [esp+PUSH_SIZE+16]	; y_src_stride
-	mov ecx, 010h				; height = 16
-
-	pxor xmm7, xmm7
-	movdqu SUM, xmm7
-
-.hloops:
-	movdqa xmm0, [edi]		; y_ref
-	movdqa xmm1, [esi]		; y_src
-	movdqa xmm2, xmm0		; store first for future process
-	movdqa xmm3, xmm1
-	; sum += diff;
-	movdqa xmm4, xmm0
-	psadbw xmm4, xmm1		; 2 parts, [0,..,15], [64,..,79]
-	; to be continued for sum
-	pshufd xmm5, xmm4, 0C6h	; 11000110 B
-	paddw xmm4, xmm5
-	movd ebx, xmm4
-	add SUM, ebx
-
-	; sqr += diff * diff;
-	pmaxub xmm0, xmm1
-	pminub xmm1, xmm2
-	psubb xmm0, xmm1				; diff	
-	SUM_SQR_SSE2 xmm1, xmm0, xmm7	; dst, pSrc, zero
-	movd ebx, xmm1
-	add SQR, ebx
-
-	; sum_cur += y_src[x];
-	movdqa xmm0, xmm3		; cur_orig
-	movdqa xmm1, xmm0
-	punpcklbw xmm0, xmm7
-	punpckhbw xmm1, xmm7
-	paddw xmm0, xmm1		; 8x2
-	SUM_WORD_8x2_SSE2 xmm0, xmm1	
-	movd ebx, xmm0
-	and ebx, 0ffffh
-	add SUM_CUR, ebx
-
-	; sqr_cur += y_src[x] * y_src[x];
-	SUM_SQR_SSE2 xmm0, xmm3, xmm7	; dst, pSrc, zero
-	movd ebx, xmm0
-	add SQR_CUR, ebx
-	
-	lea edi, [edi+edx]
-	lea esi, [esi+eax]
-	dec ecx
-	jnz near .hloops
-	
-	mov ebx, 0
-	mov bx, word SUM
-	sar ebx, 8
-	imul ebx, ebx
-	mov ecx, SQR
-	sar ecx, 8
-	sub ecx, ebx
-	mov edi, [esp+PUSH_SIZE+20]	; pMotionTexture
-	mov [edi], cx				; to store uiMotionIndex
-	mov ebx, 0
-	mov bx, word SUM_CUR
-	sar ebx, 8
-	imul ebx, ebx
-	mov ecx, SQR_CUR
-	sar ecx, 8
-	sub ecx, ebx
-	mov [edi+2], cx				; to store uiTextureIndex
-	
-	%undef SUM
-	%undef SUM_CUR
-	%undef SQR
-	%undef SQR_CUR
-	%undef PUSH_SIZE
-
-	add esp, 16	
-	pop ebx
-	pop edi
-	pop esi	
-
-	ret
-
-; , 6/7/2010
-
-%ifndef NO_DYNAMIC_VP
-WELS_EXTERN AnalysisVaaInfoIntra_sse2
-;***********************************************************************
-;	int32_t AnalysisVaaInfoIntra_sse2(	uint8_t *pDataY, const int32_t linesize );
-;***********************************************************************
-ALIGN 16
-AnalysisVaaInfoIntra_sse2:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov ebp, esp
-	and ebp, 0fh
-	sub esp, ebp
-	sub esp, 32	
-	%define PUSH_SIZE	52	; 20 + 32
-
-	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
-	mov ecx, [esp+ebp+PUSH_SIZE+8]	; linesize
-
-	mov ebx, ecx
-	sal ebx, $1			; linesize x 2 [ebx]
-	mov edx, ebx
-	add edx, ecx		; linesize x 3 [edx]
-	mov eax, ebx
-	sal eax, $1			; linesize x 4 [eax]
-	
-	pxor xmm7, xmm7
-	
-	; loops
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0	
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+8], xmm0	
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0	
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+24], xmm0
-		
-	movdqa xmm0, [esp]		; block 0~7
-	movdqa xmm1, [esp+16]	; block 8~15
-	movdqa xmm2, xmm0
-	paddw xmm0, xmm1
-	SUM_WORD_8x2_SSE2 xmm0, xmm3
-	
-	pmullw xmm1, xmm1
-	pmullw xmm2, xmm2
-	movdqa xmm3, xmm1
-	movdqa xmm4, xmm2
-	punpcklwd xmm1, xmm7
-	punpckhwd xmm3, xmm7
-	punpcklwd xmm2, xmm7
-	punpckhwd xmm4, xmm7
-	paddd xmm1, xmm2
-	paddd xmm3, xmm4
-	paddd xmm1, xmm3
-	pshufd xmm2, xmm1, 01Bh
-	paddd xmm1, xmm2
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
-	
-	movd ebx, xmm0
-	and ebx, 0ffffh		; effective low word truncated
-	mov ecx, ebx
-	imul ebx, ecx
-	sar ebx, $4
-	movd eax, xmm1
-	sub eax, ebx
-	
-	%undef PUSH_SIZE
-	add esp, 32
-	add esp, ebp
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-        
-WELS_EXTERN AnalysisVaaInfoIntra_ssse3
-;***********************************************************************
-;	int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t linesize );
-;***********************************************************************
-ALIGN 16
-AnalysisVaaInfoIntra_ssse3:
-	push ebx
-	push edx
-	push esi
-	push edi
-	push ebp
-
-	mov ebp, esp
-	and ebp, 0fh
-	sub esp, ebp
-	sub esp, 32	
-	%define PUSH_SIZE	52	; 20 + 32
-
-	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
-	mov ecx, [esp+ebp+PUSH_SIZE+8]	; linesize
-
-	mov ebx, ecx
-	sal ebx, $1			; linesize x 2 [ebx]
-	mov edx, ebx
-	add edx, ecx		; linesize x 3 [edx]
-	mov eax, ebx
-	sal eax, $1			; linesize x 4 [eax]
-	
-	pxor xmm7, xmm7
-	
-	; loops
-	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0	
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
-	movq [esp+8], xmm1	
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0	
-
-	lea esi, [esi+eax]
-	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
-	movq [esp+24], xmm1
-		
-	movdqa xmm0, [esp]		; block 0~7
-	movdqa xmm1, [esp+16]	; block 8~15
-	movdqa xmm2, xmm0
-	paddw xmm0, xmm1
-	SUM_WORD_8x2_SSE2 xmm0, xmm3	; better performance than that of phaddw sets
-
-	pmullw xmm1, xmm1
-	pmullw xmm2, xmm2
-	movdqa xmm3, xmm1
-	movdqa xmm4, xmm2
-	punpcklwd xmm1, xmm7
-	punpckhwd xmm3, xmm7
-	punpcklwd xmm2, xmm7
-	punpckhwd xmm4, xmm7
-	paddd xmm1, xmm2
-	paddd xmm3, xmm4
-	paddd xmm1, xmm3
-	pshufd xmm2, xmm1, 01Bh
-	paddd xmm1, xmm2
-	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2
-	
-	movd ebx, xmm0
-	and ebx, 0ffffh		; effective low work truncated
-	mov ecx, ebx
-	imul ebx, ecx
-	sar ebx, $4
-	movd eax, xmm1
-	sub eax, ebx
-	
-	%undef PUSH_SIZE
-	add esp, 32
-	add esp, ebp
-	pop ebp
-	pop edi
-	pop esi
-	pop edx
-	pop ebx
-	ret
-%endif
-	
-	
-
-WELS_EXTERN abs_difference_mbrow_sse2
-;*************************************************************************************************************
-;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride, 
-;								 int32_t gom_pixel_num, int32_t *pSum)
-;*************************************************************************************************************
-ALIGN 16
-abs_difference_mbrow_sse2:
-%define		ref_orig			esp + pushsize + 4
-%define		cur_orig			esp + pushsize + 8
-%define		iPicStride			esp + pushsize + 12
-%define		gom_pixel_num		esp + pushsize + 16
-%define		pSum				esp + pushsize + 20
-%define		pushsize	12
-	push	esi
-	push	edi
-	push	ebx
-	mov		esi,	[ref_orig]
-	mov		edi,	[cur_orig]
-	mov		ebx,	[iPicStride]
-	mov		eax,	[gom_pixel_num]
-	mov		ecx,	16					;MB_WIDTH_LUMA
-	pxor	xmm0,	xmm0
-mb_width_loop_p:
-	mov		edx,	esi
-	add		edx,	eax			; end address
-gom_row_loop_p:
-	movdqa	xmm1,	[esi]
-	movdqa	xmm2,	[edi]
-	psadbw	xmm1,	xmm2
-	paddd	xmm0,	xmm1
-	add		esi,	16
-	add		edi,	16
-	cmp		esi,	edx
-	jl		gom_row_loop_p
-	
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	ebx
-	add		edi,	ebx
-	loop	mb_width_loop_p
-	
-	movdqa	xmm1,	xmm0
-	psrldq	xmm1,	8
-	paddd	xmm1,	xmm0
-	movd	eax,	xmm1
-	mov		edx,	[pSum]	; pSum
-	add		[edx],	eax
-
-%undef		ref_orig
-%undef		cur_orig
-%undef		iPicStride
-%undef		gom_pixel_num
-%undef		pSum
-%undef		pushsize	
-	pop		ebx
-	pop		edi
-	pop		esi
-	ret
-
-
-
-
-WELS_EXTERN sum_sqrsum_mbrow_sse2
-;*************************************************************************************************************
-;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride, 
-;							 int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum)
-;*************************************************************************************************************
-ALIGN 16
-sum_sqrsum_mbrow_sse2:
-%define		cur_orig			esp + pushsize + 4
-%define		iPicStride			esp + pushsize + 8
-%define		gom_pixel_num		esp + pushsize + 12
-%define		pSum				esp + pushsize + 16
-%define		pSqrSum				esp + pushsize + 20
-%define		pushsize			8
-	push		esi
-	push		ebx
-	mov			esi,	[cur_orig]
-	mov			eax,	[gom_pixel_num]
-	mov			ebx,	[iPicStride]
-	mov			ecx,	16					;MB_WIDTH_LUMA
-	pxor		xmm0,	xmm0				; zero
-	pxor		xmm1,	xmm1				; sum
-	pxor		xmm2,	xmm2				; sqr sum
-mb_width_loop_i:
-	mov			edx,	esi
-	add			edx,	eax			; end address
-gom_row_loop_i:
-	movdqa		xmm3,	[esi]
-	movdqa		xmm4,	xmm3
-	psadbw		xmm4,	xmm0
-	paddd		xmm1,	xmm4
-	movdqa		xmm4,	xmm3
-	punpcklbw	xmm4,	xmm0
-	punpckhbw	xmm3,	xmm0
-	pmaddwd		xmm4,	xmm4
-	pmaddwd		xmm3,	xmm3
-	paddd		xmm2,	xmm3
-	paddd		xmm2,	xmm4
-	add			esi,	16
-	cmp			esi,	edx
-	jl			gom_row_loop_i
-	
-	sub			esi,	eax
-	add			esi,	ebx
-	loop		mb_width_loop_i
-	
-	movdqa		xmm3,	xmm1
-	psrldq		xmm3,	8
-	paddd		xmm1,	xmm3
-	movd		eax,	xmm1
-	mov			edx,	[pSum]
-	add			[edx],	eax
-	
-	movdqa		xmm3,	xmm2
-	psrldq		xmm3,	8
-	paddd		xmm2,	xmm3
-	movdqa		xmm3,	xmm2
-	psrldq		xmm3,	4
-	paddd		xmm2,	xmm3
-	movd		eax,	xmm2
-	mov			edx,	[pSqrSum]
-	add			[edx],	eax
-
-
-%undef		cur_orig
-%undef		iPicStride
-%undef		gom_pixel_num
-%undef		pSum
-%undef		pSqrSum
-%undef		pushsize	
-	pop			ebx
-	pop			esi
-	ret
-
-
-
-WELS_EXTERN VAACalcSad_sse2
-;*************************************************************************************************************
-;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
-;								int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSad_sse2:
-%define		cur_data			esp + pushsize + 4
-%define		ref_data			esp + pushsize + 8
-%define		iPicWidth			esp + pushsize + 12
-%define		iPicHeight			esp + pushsize + 16
-%define		iPicStride			esp + pushsize + 20
-%define		psadframe			esp + pushsize + 24
-%define		psad8x8				esp + pushsize + 28
-%define		pushsize	12
-	push	esi
-	push	edi
-	push	ebx
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		edx,	[psad8x8]
-	mov		eax,	ebx
-	
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4								; iPicStride*16
-	pxor	xmm0,	xmm0
-	pxor	xmm7,	xmm7		; iFrameSad
-height_loop:
-	mov		ecx,	dword [iPicWidth]
-	push	esi
-	push	edi
-width_loop:
-	pxor	xmm6,	xmm6		;
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx],		xmm6
-	psrldq	xmm6,		8
-	movd	[edx+4],	xmm6
-	
-	pxor	xmm6,	xmm6
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	WELS_SAD_16x2_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx+8],	xmm6
-	psrldq	xmm6,		8
-	movd	[edx+12],	xmm6
-	
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
-	
-	dec		ecx
-	jnz		width_loop
-	
-	pop		edi
-	pop		esi
-	add		esi,	eax
-	add		edi,	eax
-	
-	dec	dword [iPicHeight]
-	jnz		height_loop
-	
-	mov		edx,	[psadframe]
-	movdqa	xmm5,	xmm7
-	psrldq	xmm7,	8
-	paddd	xmm7,	xmm5
-	movd	[edx],	xmm7
-
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		pushsize	
-	pop		ebx
-	pop		edi
-	pop		esi
-	ret
-	
-	
-WELS_EXTERN VAACalcSadVar_sse2
-;*************************************************************************************************************
-;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight 
-;		int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadVar_sse2:
-%define		localsize		8
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		psum16x16			esp + pushsize + localsize + 32
-%define		psqsum16x16			esp + pushsize + localsize + 36
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		edx,	[psad8x8]
-	mov		eax,	ebx
-	
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	pxor	xmm0,	xmm0
-	pxor	xmm7,	xmm7		; iFrameSad
-var_height_loop:
-	mov		ecx,	dword [iPicWidth]
-	mov		[tmp_esi],	esi
-	mov		[tmp_edi],	edi
-var_width_loop:
-	pxor	xmm6,	xmm6		; hiQuad_loQuad pSad8x8
-	pxor	xmm5,	xmm5		; pSum16x16
-	pxor	xmm4,	xmm4		; sqsum_16x16
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx],		xmm6
-	psrldq	xmm6,		8
-	movd	[edx+4],	xmm6
-	
-	pxor	xmm6,	xmm6
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_16x1_SSE2
-	paddd	xmm7,		xmm6
-	movd	[edx+8],	xmm6
-	psrldq	xmm6,		8
-	movd	[edx+12],	xmm6
-	
-	mov		ebp,	[psum16x16]
-	movdqa	xmm1,	xmm5
-	psrldq	xmm1,	8
-	paddd	xmm5,	xmm1
-	movd	[ebp],	xmm5
-	add		dword [psum16x16], 4
-	
-	movdqa	xmm5,	xmm4
-	psrldq	xmm5,	8
-	paddd	xmm4,	xmm5
-	movdqa	xmm3,	xmm4
-	psrldq	xmm3,	4
-	paddd	xmm4,	xmm3
-	
-	mov		ebp,	[psqsum16x16]
-	movd	[ebp],	xmm4
-	add		dword [psqsum16x16], 4
-	
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
-	
-	dec		ecx
-	jnz		var_width_loop
-	
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax
-	add		edi,	eax
-	
-	dec	dword [iPicHeight]
-	jnz		var_height_loop
-	
-	mov		edx,	[psadframe]
-	movdqa	xmm5,	xmm7
-	psrldq	xmm7,	8
-	paddd	xmm7,	xmm5
-	movd	[edx],	xmm7
-
-	add		esp,	localsize	
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		psum16x16
-%undef		psqsum16x16
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		pushsize
-%undef		localsize
-	ret
-	
-	
-
-WELS_EXTERN VAACalcSadSsd_sse2
-;*************************************************************************************************************
-;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,  
-;	int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadSsd_sse2:
-%define		localsize		12
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		psum16x16			esp + pushsize + localsize + 32
-%define		psqsum16x16			esp + pushsize + localsize + 36
-%define		psqdiff16x16		esp + pushsize + localsize + 40
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		tmp_sadframe		esp + 8
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		ecx,	[iPicWidth]
-	mov		ecx,	[iPicHeight]
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		edx,	[psad8x8]
-	mov		eax,	ebx
-	
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	mov		ecx,	[iPicWidth]
-	mov		ecx,	[iPicHeight]
-	pxor	xmm0,	xmm0
-	movd	[tmp_sadframe],	xmm0
-sqdiff_height_loop:
-	mov		ecx,	dword [iPicWidth]
-	mov		[tmp_esi],	esi
-	mov		[tmp_edi],	edi
-sqdiff_width_loop:
-	pxor	xmm7,	xmm7		; hiQuad_loQuad pSad8x8
-	pxor	xmm6,	xmm6		; pSum16x16
-	pxor	xmm5,	xmm5		; sqsum_16x16  four dword
-	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	movdqa	xmm1,		xmm7
-	movd	[edx],		xmm7
-	psrldq	xmm7,		8
-	paddd	xmm1,		xmm7
-	movd	[edx+4],	xmm7
-	movd	ebp,		xmm1
-	add		[tmp_sadframe],	ebp
-	
-	pxor	xmm7,	xmm7
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
-	movdqa	xmm1,		xmm7
-	movd	[edx+8],	xmm7
-	psrldq	xmm7,		8
-	paddd	xmm1,		xmm7
-	movd	[edx+12],	xmm7
-	movd	ebp,		xmm1
-	add		[tmp_sadframe],	ebp
-	
-	mov		ebp,	[psum16x16]
-	movdqa	xmm1,	xmm6
-	psrldq	xmm1,	8
-	paddd	xmm6,	xmm1
-	movd	[ebp],	xmm6
-	add		dword [psum16x16], 4
-	
-	mov		ebp,	[psqsum16x16]
-	pshufd	xmm6,	xmm5,	14 ;00001110
-	paddd	xmm6,	xmm5
-	pshufd	xmm5,	xmm6,	1  ;00000001
-	paddd	xmm5,	xmm6
-	movd	[ebp],	xmm5
-	add		dword [psqsum16x16], 4
-	
-	mov		ebp,	[psqdiff16x16]
-	pshufd	xmm5,	xmm4,	14	; 00001110
-	paddd	xmm5,	xmm4
-	pshufd	xmm4,	xmm5,	1	; 00000001
-	paddd	xmm4,	xmm5
-	movd	[ebp],	xmm4
-	add		dword	[psqdiff16x16],	4
-	
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
-	
-	dec		ecx
-	jnz		sqdiff_width_loop
-	
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax
-	add		edi,	eax
-	
-	dec	dword [iPicHeight]
-	jnz		sqdiff_height_loop
-	
-	mov		ebx,	[tmp_sadframe]
-	mov		eax,	[psadframe]
-	mov		[eax],	ebx
-
-	add		esp,	localsize	
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		psum16x16
-%undef		psqsum16x16
-%undef		psqdiff16x16
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		tmp_sadframe
-%undef		pushsize
-%undef		localsize
-	ret
-	
-	
-	
-	
-
-WELS_EXTERN VAACalcSadBgd_sse2
-;*************************************************************************************************************
-;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, 
-;				int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadBgd_sse2:
-%define		localsize		12
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		p_sd8x8				esp + pushsize + localsize + 32
-%define		p_mad8x8			esp + pushsize + localsize + 36
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		tmp_ecx				esp + 8
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		eax,	ebx
-	
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	xor		ebp,	ebp
-	pxor	xmm0,	xmm0
-bgd_height_loop:
-	mov		ecx,	dword [iPicWidth]
-	mov		[tmp_esi],	esi
-	mov		[tmp_edi],	edi
-bgd_width_loop:
-	pxor	xmm7,	xmm7		; pSad8x8
-	pxor	xmm6,	xmm6		; sum_cur_8x8
-	pxor	xmm5,	xmm5		; sum_ref_8x8
-	pxor	xmm4,	xmm4		; pMad8x8
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	
-	
-	mov			edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm4
-	
-	;movdqa		xmm1,	xmm4
-	;punpcklbw	xmm1,	xmm0
-	;punpcklwd	xmm1,	xmm0
-	;movd		[edx],	xmm1
-	;punpckhbw	xmm4,	xmm0
-	;punpcklwd	xmm4,	xmm0
-	;movd		[edx+4],	xmm4
-	;add			edx,		8
-	;mov			[p_mad8x8],	edx	
-	mov			[tmp_ecx],	ecx
-	movhlps		xmm1,	xmm4
-	movd		ecx,	xmm4
-	mov			[edx],	cl
-	movd		ecx,	xmm1
-	mov			[edx+1],cl
-	add			edx,	2
-	mov			[p_mad8x8],	edx
-
-	
-	pslldq		xmm7,	4
-	pslldq		xmm6,	4
-	pslldq		xmm5,	4
-	
-	
-	pxor	xmm4,	xmm4		; pMad8x8
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	
-	mov			edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm4
-	
-	;movdqa		xmm1,	xmm4
-	;punpcklbw	xmm1,	xmm0
-	;punpcklwd	xmm1,	xmm0
-	;movd		[edx],	xmm1
-	;punpckhbw	xmm4,	xmm0
-	;punpcklwd	xmm4,	xmm0
-	;movd		[edx+4],	xmm4
-	;add			edx,		8
-	;mov			[p_mad8x8],	edx	
-	movhlps		xmm1,	xmm4
-	movd		ecx,	xmm4
-	mov			[edx],	cl
-	movd		ecx,	xmm1
-	mov			[edx+1],cl
-	add			edx,	2
-	mov			[p_mad8x8],	edx
-	
-	; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
-	
-	mov		edx,	[psad8x8]
-	pshufd	xmm1,	xmm7,	10001101b		; D3 D2 D1 D0
-	movdqa	[edx],	xmm1					
-	add		edx,	16
-	mov		[psad8x8],	edx					; sad8x8
-	
-	paddd	xmm1,	xmm7					; D1+3 D3+2 D0+1 D2+0
-	pshufd	xmm2,	xmm1,	00000011b
-	paddd	xmm1,	xmm2
-	movd	edx,	xmm1
-	add		ebp,	edx						; sad frame
-	
-	mov		edx,	[p_sd8x8]
-	psubd	xmm6,	xmm5
-	pshufd	xmm1,	xmm6,	10001101b
-	movdqa	[edx],	xmm1
-	add		edx,	16
-	mov		[p_sd8x8],	edx
-	
-	
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
-	
-	mov		ecx,	[tmp_ecx]
-	dec		ecx
-	jnz		bgd_width_loop
-	
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax
-	add		edi,	eax
-	
-	dec		dword [iPicHeight]
-	jnz		bgd_height_loop
-	
-	mov		edx,	[psadframe]
-	mov		[edx],	ebp
-
-	add		esp,	localsize	
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		p_sd8x8
-%undef		p_mad8x8
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		pushsize
-%undef		localsize
-	ret
-
-
-
-WELS_EXTERN VAACalcSadSsdBgd_sse2
-;*************************************************************************************************************
-;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, 
-;		 int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, 
-;			int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
-;*************************************************************************************************************
-
-
-ALIGN 16
-VAACalcSadSsdBgd_sse2:
-%define		localsize		16
-%define		cur_data			esp + pushsize + localsize + 4
-%define		ref_data			esp + pushsize + localsize + 8
-%define		iPicWidth			esp + pushsize + localsize + 12
-%define		iPicHeight			esp + pushsize + localsize + 16
-%define		iPicStride			esp + pushsize + localsize + 20
-%define		psadframe			esp + pushsize + localsize + 24
-%define		psad8x8				esp + pushsize + localsize + 28
-%define		psum16x16			esp + pushsize + localsize + 32
-%define		psqsum16x16			esp + pushsize + localsize + 36
-%define		psqdiff16x16		esp + pushsize + localsize + 40
-%define		p_sd8x8				esp + pushsize + localsize + 44
-%define		p_mad8x8			esp + pushsize + localsize + 48
-%define		tmp_esi				esp + 0
-%define		tmp_edi				esp + 4
-%define		tmp_sadframe		esp + 8
-%define		tmp_ecx				esp + 12
-%define		pushsize		16
-	push	ebp
-	push	esi
-	push	edi
-	push	ebx
-	sub		esp,	localsize
-	mov		esi,	[cur_data]
-	mov		edi,	[ref_data]
-	mov		ebx,	[iPicStride]
-	mov		eax,	ebx
-	
-	shr		dword [iPicWidth],	4					; iPicWidth/16
-	shr		dword [iPicHeight],	4					; iPicHeight/16
-	shl		eax,	4							; iPicStride*16
-	pxor	xmm0,	xmm0
-	movd	[tmp_sadframe],	xmm0
-sqdiff_bgd_height_loop:
-	mov		ecx,	dword [iPicWidth]
-	mov		[tmp_esi],	esi
-	mov		[tmp_edi],	edi
-sqdiff_bgd_width_loop:
-	pxor	xmm7,	xmm7		; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
-	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-	pxor	xmm5,	xmm5		; pMad8x8
-	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	
-	mov		edx,		[psad8x8]
-	movdqa	xmm2,		xmm7
-	pshufd	xmm1,		xmm2,		00001110b
-	movd	[edx],		xmm2
-	movd	[edx+4],	xmm1
-	add		edx,		8
-	mov		[psad8x8],	edx			; sad8x8
-	
-	paddd	xmm1,				xmm2
-	movd	edx,				xmm1
-	add		[tmp_sadframe],		edx			; iFrameSad
-	
-	mov		edx,		[psum16x16]
-	movdqa	xmm1,		xmm6
-	pshufd	xmm2,		xmm1,		00001110b
-	paddd	xmm1,		xmm2
-	movd	[edx],		xmm1				; sum
-	
-	mov		edx,		[p_sd8x8]
-	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
-	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
-	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
-	movq	[edx],		xmm1
-	add		edx,		8
-	mov		[p_sd8x8],	edx
-	
-	mov			edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm5
-	;movdqa		xmm1,	xmm5
-	;punpcklbw	xmm1,	xmm0
-	;punpcklwd	xmm1,	xmm0
-	;movd		[edx],	xmm1
-	;punpckhbw	xmm5,	xmm0
-	;punpcklwd	xmm5,	xmm0
-	;movd		[edx+4],	xmm5
-	;add			edx,		8
-	;mov			[p_mad8x8],	edx
-	mov			[tmp_ecx],	ecx
-	movhlps		xmm1,	xmm5
-	movd		ecx,	xmm5
-	mov			[edx],	cl
-	movd		ecx,	xmm1
-	mov			[edx+1],cl
-	add			edx,	2
-	mov			[p_mad8x8],	edx
-	
-	psrlq	xmm7,	32
-	psllq	xmm7,	32			; clear sad
-	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
-	pxor	xmm5,	xmm5		; pMad8x8
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	
-	mov		edx,		[psad8x8]
-	movdqa	xmm2,		xmm7
-	pshufd	xmm1,		xmm2,		00001110b
-	movd	[edx],		xmm2
-	movd	[edx+4],	xmm1
-	add		edx,		8
-	mov		[psad8x8],	edx			; sad8x8
-	
-	paddd	xmm1,				xmm2
-	movd	edx,				xmm1
-	add		[tmp_sadframe],		edx			; iFrameSad
-	
-	mov		edx,			[psum16x16]
-	movdqa	xmm1,			xmm6
-	pshufd	xmm2,			xmm1,		00001110b
-	paddd	xmm1,			xmm2
-	movd	ebp,			xmm1				; sum
-	add		[edx],			ebp
-	add		edx,			4
-	mov		[psum16x16],	edx
-	
-	mov		edx,			[psqsum16x16]
-	psrlq	xmm7,			32
-	pshufd	xmm2,			xmm7,		00001110b
-	paddd	xmm2,			xmm7
-	movd	[edx],			xmm2				; sqsum
-	add		edx,			4
-	mov		[psqsum16x16],	edx
-	
-	mov		edx,		[p_sd8x8]
-	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
-	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
-	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
-	movq	[edx],		xmm1
-	add		edx,		8
-	mov		[p_sd8x8],	edx
-	
-	mov		edx,		[p_mad8x8]
-	WELS_MAX_REG_SSE2	xmm5
-	;movdqa		xmm1,	xmm5
-	;punpcklbw	xmm1,	xmm0
-	;punpcklwd	xmm1,	xmm0
-	;movd		[edx],	xmm1
-	;punpckhbw	xmm5,	xmm0
-	;punpcklwd	xmm5,	xmm0
-	;movd		[edx+4],	xmm5
-	;add			edx,		8
-	;mov			[p_mad8x8],	edx	
-	movhlps		xmm1,	xmm5
-	movd		ecx,	xmm5
-	mov			[edx],	cl
-	movd		ecx,	xmm1
-	mov			[edx+1],cl
-	add			edx,	2
-	mov			[p_mad8x8],	edx
-	
-	mov		edx,		[psqdiff16x16]
-	pshufd	xmm1,		xmm4,		00001110b
-	paddd	xmm4,		xmm1
-	pshufd	xmm1,		xmm4,		00000001b
-	paddd	xmm4,		xmm1
-	movd	[edx],		xmm4
-	add		edx,		4
-	mov		[psqdiff16x16],	edx
-	
-	add		edx,	16
-	sub		esi,	eax
-	sub		edi,	eax
-	add		esi,	16
-	add		edi,	16
-	
-	mov		ecx,	[tmp_ecx]
-	dec		ecx
-	jnz		sqdiff_bgd_width_loop
-	
-	mov		esi,	[tmp_esi]
-	mov		edi,	[tmp_edi]
-	add		esi,	eax
-	add		edi,	eax
-	
-	dec	dword [iPicHeight]
-	jnz		sqdiff_bgd_height_loop
-	
-	mov		edx,	[psadframe]
-	mov		ebp,	[tmp_sadframe]
-	mov		[edx],	ebp
-
-	add		esp,	localsize	
-	pop		ebx
-	pop		edi
-	pop		esi
-	pop		ebp
-%undef		cur_data
-%undef		ref_data
-%undef		iPicWidth
-%undef		iPicHeight
-%undef		iPicStride
-%undef		psadframe
-%undef		psad8x8
-%undef		psum16x16
-%undef		psqsum16x16
-%undef		psqdiff16x16
-%undef		p_sd8x8
-%undef		p_mad8x8
-%undef		tmp_esi
-%undef		tmp_edi
-%undef		pushsize
-%undef		localsize
-	ret
+;*!
+;* \copy
+;*     Copyright (c)  2010-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+;*	vaa.asm
+;*
+;*	Abstract
+;*      sse2 for pVaa routines
+;*
+;*  History
+;*      04/14/2010	Created
+;*
+;*************************************************************************/
+%include "asm_inc.asm"
+BITS 32
+
+;***********************************************************************
+; Macros and other preprocessor constants
+;***********************************************************************
+
+;%macro SUM_SSE2	4	; dst, pSrc, zero, pack1_8x2
+;	movdqa %1, %2
+;	punpcklbw %1, %3
+;	punpckhbw %2, %3
+;	paddw %1, %2
+;	pmaddwd %1, %4
+;	pshufd %2, %1, 04Eh	; 01001110 B
+;	paddd %1, %2
+;	pshufd %2, %1, 0B1h	; 10110001 B
+;	paddd %1, %2
+;%endmacro	; END OF SUM_SSE2
+
+; by comparison, this sequence outperforms the equivalent phaddw (SSSE3) instruction sequence
+%macro SUM_WORD_8x2_SSE2	2	; dst(pSrc), tmp -- horizontal add of the 8 words of %1; %2 is clobbered
+	; @sum_8x2 begin
+	pshufd %2, %1, 04Eh	; 01001110 B: swap the two 64-bit halves
+	paddw %1, %2	; fold the high half onto the low half
+	pshuflw %2, %1, 04Eh	; 01001110 B: swap the two dwords of the low half
+	paddw %1, %2
+	pshuflw %2, %1, 0B1h	; 10110001 B: swap adjacent words of the low half
+	paddw %1, %2	; low word of %1 = sum of all 8 input words (16-bit wrapping add)
+	; end of @sum_8x2
+%endmacro	; END of SUM_WORD_8x2_SSE2
+
+%macro SUM_SQR_SSE2	3	; dst, pSrc, zero -- dst = sum of squares of the 16 bytes in pSrc; pSrc is clobbered
+	movdqa %1, %2
+	punpcklbw %1, %3	; low 8 bytes -> words (zero-extended via %3)
+	punpckhbw %2, %3	; high 8 bytes -> words, overwriting the source register
+	pmaddwd %1, %1	; square each word and add adjacent pairs into dwords
+	pmaddwd %2, %2
+	paddd %1, %2
+	pshufd %2, %1, 04Eh	; 01001110 B
+	paddd %1, %2
+	pshufd %2, %1, 0B1h	; 10110001 B
+	paddd %1, %2	; every dword of %1 now holds the full sum of squares
+%endmacro	; END OF SUM_SQR_SSE2
+
+%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4 -- averages of four 4x4 blocks from a 16x4 strip at esi; all six registers are overwritten
+	movdqa %1, [esi    ]	; line 0
+	movdqa %2, [esi+ecx]	; line 1
+	movdqa %3, %1
+	punpcklbw %1, xmm7	; xmm7 serves as the zero source for byte->word unpacking (callers clear it)
+	punpckhbw %3, xmm7
+	movdqa %4, %2
+	punpcklbw %4, xmm7
+	punpckhbw %2, xmm7
+	paddw %1, %4	; columns 0-7: line 0 + line 1
+	paddw %2, %3	; columns 8-15: line 0 + line 1
+	movdqa %3, [esi+ebx]	; line 2
+	movdqa %4, [esi+edx]	; line 3
+	movdqa %5, %3
+	punpcklbw %3, xmm7
+	punpckhbw %5, xmm7
+	movdqa %6, %4
+	punpcklbw %6, xmm7
+	punpckhbw %4, xmm7
+	paddw %3, %6
+	paddw %4, %5
+	paddw %1, %3	; block 0, 1
+	paddw %2, %4	; block 2, 3
+	pshufd %3, %1, 0B1h	; swap adjacent dwords
+	pshufd %4, %2, 0B1h
+	paddw %1, %3
+	paddw %2, %4
+	movdqa %3, %1
+	movdqa %4, %2
+	pshuflw %5, %1, 0B1h	; swap adjacent words, low half
+	pshufhw %6, %3, 0B1h	; swap adjacent words, high half
+	paddw %1, %5	; low words of %1 = sum of block 0
+	paddw %3, %6	; high words of %3 = sum of block 1
+	pshuflw %5, %2, 0B1h
+	pshufhw %6, %4, 0B1h
+	paddw %2, %5	; low words of %2 = sum of block 2
+	paddw %4, %6	; high words of %4 = sum of block 3
+	punpcklwd %1, %2
+	punpckhwd %3, %4
+	punpcklwd %1, %3	; low four words of %1 = sums of block 0, 1, 2, 3
+	psraw %1, $4	; divide by 16 (pixels per 4x4 block)
+%endmacro
+
+%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4 -- same result as VAA_AVG_BLOCK_SSE2, reduced with phaddw; all six registers are overwritten
+	movdqa %1, [esi    ]	; line 0
+	movdqa %2, [esi+ecx]	; line 1
+	movdqa %3, %1
+	punpcklbw %1, xmm7	; xmm7 serves as the zero source for byte->word unpacking (callers clear it)
+	punpckhbw %3, xmm7
+	movdqa %4, %2
+	punpcklbw %4, xmm7
+	punpckhbw %2, xmm7
+	paddw %1, %4
+	paddw %2, %3
+	movdqa %3, [esi+ebx]	; line 2
+	movdqa %4, [esi+edx]	; line 3
+	movdqa %5, %3
+	punpcklbw %3, xmm7
+	punpckhbw %5, xmm7
+	movdqa %6, %4
+	punpcklbw %6, xmm7
+	punpckhbw %4, xmm7
+	paddw %3, %6
+	paddw %4, %5
+	paddw %1, %3	; block 0, 1
+	paddw %2, %4	; block 2, 3
+	phaddw %1, %2	; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
+	phaddw %1, xmm7	; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
+	psraw %1, $4	; divide by 16 (pixels per 4x4 block)
+%endmacro
+
+%macro WELS_SAD_16x2_SSE2  0	; xmm6 += SAD of two 16-byte rows at esi vs edi; clobbers xmm1-xmm4
+	movdqa	xmm1,	[esi]
+	movdqa	xmm2,	[edi]
+	movdqa	xmm3,	[esi+ebx]	; ebx = row stride
+	movdqa	xmm4,	[edi+ebx]
+	psadbw	xmm1,	xmm2	; one partial sum per 8-byte half
+	psadbw	xmm3,	xmm4
+	paddd	xmm6,	xmm1
+	paddd	xmm6,	xmm3
+	lea		esi,	[esi+ebx*2]	; advance both pointers by two rows
+	lea		edi,	[edi+ebx*2]	
+%endmacro
+
+%macro	WELS_SAD_SUM_SQSUM_16x1_SSE2 0	; one 16-byte row: xmm6 += sad, xmm5 += sum, xmm4 += sqsum; clobbers xmm1-xmm3
+	movdqa	xmm1,	[esi]
+	movdqa	xmm2,	[edi]
+	movdqa	xmm3,	xmm1
+	psadbw	xmm3,	xmm2
+	paddd	xmm6,	xmm3	; sad between the [esi] row and the [edi] row
+	; xmm0 is used as the zero operand below (callers clear it)
+	movdqa	xmm3,	xmm1
+	psadbw	xmm3,	xmm0
+	paddd	xmm5,	xmm3	; sum of the [esi] row bytes
+	
+	movdqa		xmm2,	xmm1
+	punpcklbw	xmm1,	xmm0
+	punpckhbw	xmm2,	xmm0
+	pmaddwd		xmm1,	xmm1
+	pmaddwd		xmm2,	xmm2
+	paddd		xmm4,	xmm1
+	paddd		xmm4,	xmm2	; sum of squares of the [esi] row, spread over four dwords
+	
+	add		esi,	ebx	; advance both pointers by one row
+	add		edi,	ebx
+%endmacro
+
+%macro	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 0	; one 16-byte row: xmm7 += sad, xmm6 += sum, xmm5 += sqsum, xmm4 += sqdiff; clobbers xmm1-xmm3
+	movdqa	xmm1,	[esi]
+	movdqa	xmm2,	[edi]
+	movdqa	xmm3,	xmm1
+	psadbw	xmm3,	xmm2
+	paddd	xmm7,	xmm3	; sad
+	; |a - b| per byte computed as max(a,b) - min(a,b)
+	movdqa	xmm3,	xmm1
+	pmaxub	xmm3,	xmm2
+	pminub	xmm2,	xmm1
+	psubb	xmm3,	xmm2	; diff
+	; xmm0 is used as the zero operand below (callers clear it)
+	movdqa	xmm2,	xmm1
+	psadbw	xmm2,	xmm0
+	paddd	xmm6,	xmm2	; sum
+	
+	movdqa		xmm2,	xmm1
+	punpcklbw	xmm1,	xmm0
+	punpckhbw	xmm2,	xmm0
+	pmaddwd		xmm1,	xmm1
+	pmaddwd		xmm2,	xmm2
+	paddd		xmm5,	xmm1
+	paddd		xmm5,	xmm2	; sqsum
+	
+	movdqa		xmm1,	xmm3
+	punpcklbw	xmm1,	xmm0
+	punpckhbw	xmm3,	xmm0
+	pmaddwd		xmm1,	xmm1
+	pmaddwd		xmm3,	xmm3
+	paddd		xmm4,	xmm1
+	paddd		xmm4,	xmm3	; sqdiff
+	
+	add		esi,	ebx	; advance both pointers by one row
+	add		edi,	ebx
+%endmacro
+
+%macro	WELS_SAD_SD_MAD_16x1_SSE2	4	; sad, sum_cur, sum_ref, mad accumulators for one 16-byte row; clobbers xmm1-xmm3
+%define sad_reg			%1
+%define	sum_cur_reg		%2
+%define sum_ref_reg		%3
+%define	mad_reg			%4
+	movdqa	xmm1,		[esi]
+	movdqa	xmm2,		[edi]
+	movdqa	xmm3,		xmm1
+	psadbw	xmm3,		xmm0	; xmm0 is used as the zero operand (callers clear it)
+	paddd	sum_cur_reg,			xmm3	; sum_cur
+	movdqa	xmm3,		xmm2
+	psadbw	xmm3,		xmm0
+	paddd	sum_ref_reg,			xmm3	; sum_ref
+	; |cur - ref| per byte computed as max - min
+	movdqa	xmm3,		xmm1
+	pmaxub	xmm3,		xmm2
+	pminub	xmm2,		xmm1
+	psubb	xmm3,		xmm2	; abs diff
+	pmaxub	mad_reg,	xmm3	; max abs diff
+	
+	psadbw	xmm3,		xmm0
+	paddd	sad_reg,	xmm3	; sad
+	
+	add			esi,		ebx	; advance both pointers by one row
+	add			edi,		ebx
+%endmacro
+
+
+%macro	WELS_MAX_REG_SSE2	1	; byte-max reduction of %1; only xmm1 is used as scratch here (original note: xmm1, xmm2, xmm3 can be used)
+%define max_reg  %1
+	movdqa	xmm1,		max_reg
+	psrldq	xmm1,		4	; byte shift, zeros enter at the top
+	pmaxub	max_reg,	xmm1
+	movdqa	xmm1,		max_reg
+	psrldq	xmm1,		2
+	pmaxub	max_reg,	xmm1
+	movdqa	xmm1,		max_reg
+	psrldq	xmm1,		1
+	pmaxub	max_reg,	xmm1	; byte 0 = max of bytes 0-7, byte 8 = max of bytes 8-15
+%endmacro
+
+%macro	WELS_SAD_BGD_SQDIFF_16x1_SSE2	4	; sad(+sqsum), sum, mad, sqdiff accumulators for one 16-byte row; clobbers xmm1-xmm3
+%define sad_reg		%1
+%define	sum_reg		%2
+%define mad_reg		%3
+%define sqdiff_reg	%4
+	movdqa		xmm1,		[esi]
+	movdqa		xmm2,		xmm1
+	movdqa		xmm3,		xmm1
+	punpcklbw	xmm2,		xmm0	; xmm0 is used as the zero operand throughout (callers clear it)
+	punpckhbw	xmm3,		xmm0
+	pmaddwd		xmm2,		xmm2
+	pmaddwd		xmm3,		xmm3
+	paddd		xmm2,		xmm3
+	movdqa		xmm3,		xmm2
+	psllq		xmm2,		32	; move the even dwords into the odd dword slots
+	psrlq		xmm3,		32
+	psllq		xmm3,		32	; keep only the odd dwords
+	paddd		xmm2,		xmm3	; squares now sit in dwords 1 and 3, dwords 0 and 2 are zero
+	paddd		sad_reg,	xmm2		; sqsum
+	
+	movdqa	xmm2,		[edi]
+	movdqa	xmm3,		xmm1
+	psadbw	xmm3,		xmm0
+	paddd	sum_reg,			xmm3	; sum_cur
+	movdqa	xmm3,		xmm2
+	psadbw	xmm3,		xmm0
+	pslldq	xmm3,		4	; shift the ref sums into dwords 1 and 3, next to the cur sums
+	paddd	sum_reg,			xmm3	; sum_ref
+	
+	movdqa	xmm3,		xmm1
+	pmaxub	xmm3,		xmm2
+	pminub	xmm2,		xmm1
+	psubb	xmm3,		xmm2	; abs diff
+	pmaxub	mad_reg,	xmm3	; max abs diff
+	
+	movdqa	xmm1,		xmm3
+	psadbw	xmm3,		xmm0
+	paddd	sad_reg,	xmm3	; sad
+
+	movdqa		xmm3,	xmm1
+	punpcklbw	xmm1,	xmm0
+	punpckhbw	xmm3,	xmm0
+	pmaddwd		xmm1,	xmm1
+	pmaddwd		xmm3,	xmm3
+	paddd		sqdiff_reg,	xmm1
+	paddd		sqdiff_reg,	xmm3	; sqdiff
+	
+	add		esi,	ebx	; advance both pointers by one row
+	add		edi,	ebx
+%endmacro
+
+
+;***********************************************************************
+; Local Data (Read Only)
+;***********************************************************************
+
+;SECTION .rodata align=16
+
+;ALIGN 16
+;pack1_8x2:
+;	dw 1, 1, 1, 1, 1, 1, 1, 1
+
+;***********************************************************************
+; Code
+;***********************************************************************
+
+SECTION .text
+
+WELS_EXTERN rc_sad_frame_sse2
+;***********************************************************************
+;	uint32_t rc_sad_frame_sse2(	uint8_t *ref_orig, uint8_t *cur_orig, const int mb_width, const int iPicHeight, const int iPicStride );
+;***********************************************************************
+ALIGN 16
+rc_sad_frame_sse2:
+	push esi
+	push edi
+	push ebp
+	push ebx
+	push edx
+	; five pushes (20 bytes) plus the return address: the first stack argument is at [esp+24]
+	mov esi, [esp+24]	; 1st argument
+	mov edi, [esp+28]	; 2nd argument
+	mov ebx, [esp+32]	; 3rd argument: number of 16-byte chunks per row
+	mov ecx, [esp+36]	; 4th argument: number of rows to process
+	mov edx, [esp+40]	; 5th argument: byte step between rows
+	pxor xmm0, xmm0	
+.hloop:
+	mov eax, ebx	; eax = chunks left in this row
+	mov ebp, $0	; ebp = byte offset inside the row
+.wloop:
+	movdqa xmm1, [esi+ebp]	; aligned loads: both buffers must be 16-byte aligned
+	movdqa xmm2, [edi+ebp]
+	psadbw xmm1, xmm2
+	pshufd xmm2, xmm1, 0f6h	; 11110110 B ; movhlps for float
+	paddd xmm1, xmm2	; low dword = SAD of the whole 16-byte chunk
+	paddd xmm0, xmm1	
+	add ebp, 010h
+	dec eax
+	jnz near .wloop
+	lea esi, [esi+edx]
+	lea edi, [edi+edx]
+	dec ecx
+	jnz near .hloop
+
+	movd eax, xmm0	; return value: accumulated SAD (low dword of xmm0)
+	pop edx
+	pop ebx
+	pop ebp
+	pop edi
+	pop esi
+	ret
+
+
+WELS_EXTERN SampleVariance16x16_sse2
+;***********************************************************************
+;   void SampleVariance16x16_sse2(	uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
+;***********************************************************************
+ALIGN 16
+SampleVariance16x16_sse2:	
+	push esi
+	push edi
+	push ebx
+	; 16 bytes of locals: four dword accumulators addressed through the defines below
+	sub esp, 16
+	%define SUM			[esp]
+	%define SUM_CUR		[esp+4]
+	%define SQR			[esp+8]
+	%define SQR_CUR		[esp+12]
+	%define PUSH_SIZE	28	; 12 + 16	
+
+	mov edi, [esp+PUSH_SIZE+4]	; y_ref
+	mov edx, [esp+PUSH_SIZE+8]	; y_ref_stride	
+	mov esi, [esp+PUSH_SIZE+12]	; y_src
+	mov eax, [esp+PUSH_SIZE+16]	; y_src_stride
+	mov ecx, 010h				; height = 16
+
+	pxor xmm7, xmm7
+	movdqu SUM, xmm7	; one 16-byte store clears SUM, SUM_CUR, SQR and SQR_CUR
+
+.hloops:
+	movdqa xmm0, [edi]		; y_ref
+	movdqa xmm1, [esi]		; y_src
+	movdqa xmm2, xmm0		; store first for future process
+	movdqa xmm3, xmm1
+	; sum += diff;
+	movdqa xmm4, xmm0
+	psadbw xmm4, xmm1		; 2 parts, [0,..,15], [64,..,79]
+	; to be continued for sum
+	pshufd xmm5, xmm4, 0C6h	; 11000110 B
+	paddw xmm4, xmm5	; low dword = sum of both psadbw halves
+	movd ebx, xmm4
+	add SUM, ebx
+
+	; sqr += diff * diff;
+	pmaxub xmm0, xmm1
+	pminub xmm1, xmm2
+	psubb xmm0, xmm1				; diff	
+	SUM_SQR_SSE2 xmm1, xmm0, xmm7	; dst, pSrc, zero
+	movd ebx, xmm1
+	add SQR, ebx
+
+	; sum_cur += y_src[x];
+	movdqa xmm0, xmm3		; cur_orig
+	movdqa xmm1, xmm0
+	punpcklbw xmm0, xmm7
+	punpckhbw xmm1, xmm7
+	paddw xmm0, xmm1		; 8x2
+	SUM_WORD_8x2_SSE2 xmm0, xmm1	
+	movd ebx, xmm0
+	and ebx, 0ffffh	; only the low word carries the horizontal sum
+	add SUM_CUR, ebx
+
+	; sqr_cur += y_src[x] * y_src[x];
+	SUM_SQR_SSE2 xmm0, xmm3, xmm7	; dst, pSrc, zero
+	movd ebx, xmm0
+	add SQR_CUR, ebx
+	
+	lea edi, [edi+edx]
+	lea esi, [esi+eax]
+	dec ecx
+	jnz near .hloops
+	; first result: (SQR >> 8) - ((low 16 bits of SUM) >> 8)^2
+	mov ebx, 0
+	mov bx, word SUM
+	sar ebx, 8
+	imul ebx, ebx
+	mov ecx, SQR
+	sar ecx, 8
+	sub ecx, ebx
+	mov edi, [esp+PUSH_SIZE+20]	; pMotionTexture
+	mov [edi], cx				; to store uiMotionIndex
+	mov ebx, 0
+	mov bx, word SUM_CUR
+	sar ebx, 8
+	imul ebx, ebx
+	mov ecx, SQR_CUR
+	sar ecx, 8
+	sub ecx, ebx	; second result: (SQR_CUR >> 8) - ((low 16 bits of SUM_CUR) >> 8)^2
+	mov [edi+2], cx				; to store uiTextureIndex
+	
+	%undef SUM
+	%undef SUM_CUR
+	%undef SQR
+	%undef SQR_CUR
+	%undef PUSH_SIZE
+
+	add esp, 16	
+	pop ebx
+	pop edi
+	pop esi	
+
+	ret
+
+; , 6/7/2010
+
+%ifndef NO_DYNAMIC_VP
+WELS_EXTERN AnalysisVaaInfoIntra_sse2
+;***********************************************************************
+;	int32_t AnalysisVaaInfoIntra_sse2(	uint8_t *pDataY, const int32_t linesize );
+;***********************************************************************
+ALIGN 16
+AnalysisVaaInfoIntra_sse2:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; align esp down to 16 bytes; ebp keeps the adjustment so arguments stay reachable and esp can be restored
+	mov ebp, esp
+	and ebp, 0fh
+	sub esp, ebp
+	sub esp, 32	
+	%define PUSH_SIZE	52	; 20 + 32
+
+	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
+	mov ecx, [esp+ebp+PUSH_SIZE+8]	; linesize
+
+	mov ebx, ecx
+	sal ebx, $1			; linesize x 2 [ebx]
+	mov edx, ebx
+	add edx, ecx		; linesize x 3 [edx]
+	mov eax, ebx
+	sal eax, $1			; linesize x 4 [eax]
+	
+	pxor xmm7, xmm7	; zero register required by VAA_AVG_BLOCK_SSE2
+	
+	; loops
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp], xmm0	
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+8], xmm0	
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+16], xmm0	
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+24], xmm0
+		
+	movdqa xmm0, [esp]		; block 0~7
+	movdqa xmm1, [esp+16]	; block 8~15
+	movdqa xmm2, xmm0
+	paddw xmm0, xmm1
+	SUM_WORD_8x2_SSE2 xmm0, xmm3
+	; square each of the 16 block averages and widen to dwords before summing
+	pmullw xmm1, xmm1
+	pmullw xmm2, xmm2
+	movdqa xmm3, xmm1
+	movdqa xmm4, xmm2
+	punpcklwd xmm1, xmm7
+	punpckhwd xmm3, xmm7
+	punpcklwd xmm2, xmm7
+	punpckhwd xmm4, xmm7
+	paddd xmm1, xmm2
+	paddd xmm3, xmm4
+	paddd xmm1, xmm3
+	pshufd xmm2, xmm1, 01Bh
+	paddd xmm1, xmm2
+	pshufd xmm2, xmm1, 0B1h
+	paddd xmm1, xmm2	; low dword = sum of the squared averages
+	
+	movd ebx, xmm0
+	and ebx, 0ffffh		; effective low word truncated
+	mov ecx, ebx
+	imul ebx, ecx
+	sar ebx, $4	; (sum of averages)^2 / 16
+	movd eax, xmm1
+	sub eax, ebx	; return value: sum of squares - (sum)^2 / 16
+	
+	%undef PUSH_SIZE
+	add esp, 32
+	add esp, ebp
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+        
+WELS_EXTERN AnalysisVaaInfoIntra_ssse3
+;***********************************************************************
+;	int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t linesize );
+;***********************************************************************
+ALIGN 16
+AnalysisVaaInfoIntra_ssse3:
+	push ebx
+	push edx
+	push esi
+	push edi
+	push ebp
+	; align esp down to 16 bytes; ebp keeps the adjustment so arguments stay reachable and esp can be restored
+	mov ebp, esp
+	and ebp, 0fh
+	sub esp, ebp
+	sub esp, 32	
+	%define PUSH_SIZE	52	; 20 + 32
+
+	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
+	mov ecx, [esp+ebp+PUSH_SIZE+8]	; linesize
+
+	mov ebx, ecx
+	sal ebx, $1			; linesize x 2 [ebx]
+	mov edx, ebx
+	add edx, ecx		; linesize x 3 [edx]
+	mov eax, ebx
+	sal eax, $1			; linesize x 4 [eax]
+	
+	pxor xmm7, xmm7	; zero register required by VAA_AVG_BLOCK_SSSE3
+	
+	; loops
+	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp], xmm0	
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+	movq [esp+8], xmm1	
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+	movq [esp+16], xmm0	
+
+	lea esi, [esi+eax]
+	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
+	movq [esp+24], xmm1
+		
+	movdqa xmm0, [esp]		; block 0~7
+	movdqa xmm1, [esp+16]	; block 8~15
+	movdqa xmm2, xmm0
+	paddw xmm0, xmm1
+	SUM_WORD_8x2_SSE2 xmm0, xmm3	; better performance than that of phaddw sets
+	; square each of the 16 block averages and widen to dwords before summing
+	pmullw xmm1, xmm1
+	pmullw xmm2, xmm2
+	movdqa xmm3, xmm1
+	movdqa xmm4, xmm2
+	punpcklwd xmm1, xmm7
+	punpckhwd xmm3, xmm7
+	punpcklwd xmm2, xmm7
+	punpckhwd xmm4, xmm7
+	paddd xmm1, xmm2
+	paddd xmm3, xmm4
+	paddd xmm1, xmm3
+	pshufd xmm2, xmm1, 01Bh
+	paddd xmm1, xmm2
+	pshufd xmm2, xmm1, 0B1h
+	paddd xmm1, xmm2	; low dword = sum of the squared averages
+	
+	movd ebx, xmm0
+	and ebx, 0ffffh		; effective low word truncated
+	mov ecx, ebx
+	imul ebx, ecx
+	sar ebx, $4	; (sum of averages)^2 / 16
+	movd eax, xmm1
+	sub eax, ebx	; return value: sum of squares - (sum)^2 / 16
+	
+	%undef PUSH_SIZE
+	add esp, 32
+	add esp, ebp
+	pop ebp
+	pop edi
+	pop esi
+	pop edx
+	pop ebx
+	ret
+%endif
+	
+	
+
+WELS_EXTERN abs_difference_mbrow_sse2
+;*************************************************************************************************************
+;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride, 
+;								 int32_t gom_pixel_num, int32_t *pSum)
+;*************************************************************************************************************
+ALIGN 16
+abs_difference_mbrow_sse2:
+%define		ref_orig			esp + pushsize + 4
+%define		cur_orig			esp + pushsize + 8
+%define		iPicStride			esp + pushsize + 12
+%define		gom_pixel_num		esp + pushsize + 16
+%define		pSum				esp + pushsize + 20
+%define		pushsize	12
+	push	esi
+	push	edi
+	push	ebx
+	mov		esi,	[ref_orig]
+	mov		edi,	[cur_orig]
+	mov		ebx,	[iPicStride]
+	mov		eax,	[gom_pixel_num]	; bytes handled per row, consumed 16 at a time
+	mov		ecx,	16					;MB_WIDTH_LUMA
+	pxor	xmm0,	xmm0	; SAD accumulator, one partial sum per 64-bit half
+mb_width_loop_p:
+	mov		edx,	esi
+	add		edx,	eax			; end address
+gom_row_loop_p:
+	movdqa	xmm1,	[esi]	; aligned loads: both rows must be 16-byte aligned
+	movdqa	xmm2,	[edi]
+	psadbw	xmm1,	xmm2
+	paddd	xmm0,	xmm1
+	add		esi,	16
+	add		edi,	16
+	cmp		esi,	edx
+	jb		gom_row_loop_p	; addresses are unsigned: a signed jl would misbehave for rows straddling 0x80000000
+	; rewind to the start of the row, then step down one line
+	sub		esi,	eax
+	sub		edi,	eax
+	add		esi,	ebx
+	add		edi,	ebx
+	loop	mb_width_loop_p	; 16 rows in total (ecx counts them down)
+	
+	movdqa	xmm1,	xmm0
+	psrldq	xmm1,	8
+	paddd	xmm1,	xmm0	; fold the two partial sums into the low dword
+	movd	eax,	xmm1
+	mov		edx,	[pSum]	; pSum
+	add		[edx],	eax	; accumulate into *pSum (the previous value is kept)
+
+%undef		ref_orig
+%undef		cur_orig
+%undef		iPicStride
+%undef		gom_pixel_num
+%undef		pSum
+%undef		pushsize	
+	pop		ebx
+	pop		edi
+	pop		esi
+	ret
+
+
+
+
+WELS_EXTERN sum_sqrsum_mbrow_sse2
+;*************************************************************************************************************
+;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride, 
+;							 int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum)
+;*************************************************************************************************************
+ALIGN 16
+sum_sqrsum_mbrow_sse2:
+%define		cur_orig			esp + pushsize + 4
+%define		iPicStride			esp + pushsize + 8
+%define		gom_pixel_num		esp + pushsize + 12
+%define		pSum				esp + pushsize + 16
+%define		pSqrSum				esp + pushsize + 20
+%define		pushsize			8
+	push		esi
+	push		ebx
+	mov			esi,	[cur_orig]
+	mov			eax,	[gom_pixel_num]	; bytes handled per row, consumed 16 at a time
+	mov			ebx,	[iPicStride]
+	mov			ecx,	16					;MB_WIDTH_LUMA
+	pxor		xmm0,	xmm0				; zero
+	pxor		xmm1,	xmm1				; sum
+	pxor		xmm2,	xmm2				; sqr sum
+mb_width_loop_i:
+	mov			edx,	esi
+	add			edx,	eax			; end address
+gom_row_loop_i:
+	movdqa		xmm3,	[esi]	; aligned load: the row must be 16-byte aligned
+	movdqa		xmm4,	xmm3
+	psadbw		xmm4,	xmm0	; SAD against zero = plain byte sum
+	paddd		xmm1,	xmm4
+	movdqa		xmm4,	xmm3
+	punpcklbw	xmm4,	xmm0
+	punpckhbw	xmm3,	xmm0
+	pmaddwd		xmm4,	xmm4	; squares of the bytes, pair-summed into dwords
+	pmaddwd		xmm3,	xmm3
+	paddd		xmm2,	xmm3
+	paddd		xmm2,	xmm4
+	add			esi,	16
+	cmp			esi,	edx
+	jb			gom_row_loop_i	; addresses are unsigned: a signed jl would misbehave for rows straddling 0x80000000
+	; rewind to the start of the row, then step down one line
+	sub			esi,	eax
+	add			esi,	ebx
+	loop		mb_width_loop_i	; 16 rows in total (ecx counts them down)
+	
+	movdqa		xmm3,	xmm1
+	psrldq		xmm3,	8
+	paddd		xmm1,	xmm3	; fold the two partial sums into the low dword
+	movd		eax,	xmm1
+	mov			edx,	[pSum]
+	add			[edx],	eax	; accumulate into *pSum (the previous value is kept)
+	
+	movdqa		xmm3,	xmm2
+	psrldq		xmm3,	8
+	paddd		xmm2,	xmm3
+	movdqa		xmm3,	xmm2
+	psrldq		xmm3,	4
+	paddd		xmm2,	xmm3	; fold the four dword partial sums into the low dword
+	movd		eax,	xmm2
+	mov			edx,	[pSqrSum]
+	add			[edx],	eax	; accumulate into *pSqrSum (the previous value is kept)
+
+
+%undef		cur_orig
+%undef		iPicStride
+%undef		gom_pixel_num
+%undef		pSum
+%undef		pSqrSum
+%undef		pushsize	
+	pop			ebx
+	pop			esi
+	ret
+
+
+
+WELS_EXTERN VAACalcSad_sse2
+;*************************************************************************************************************
+;void VAACalcSad_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight
+;								int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8)
+;*************************************************************************************************************
+; Walks the picture in 16x16 blocks: writes four 8x8 SADs per block to psad8x8 and the grand total to *psadframe.
+; The stack slots holding iPicWidth and iPicHeight are overwritten in place (shifted, then counted down).
+ALIGN 16
+VAACalcSad_sse2:
+%define		cur_data			esp + pushsize + 4
+%define		ref_data			esp + pushsize + 8
+%define		iPicWidth			esp + pushsize + 12
+%define		iPicHeight			esp + pushsize + 16
+%define		iPicStride			esp + pushsize + 20
+%define		psadframe			esp + pushsize + 24
+%define		psad8x8				esp + pushsize + 28
+%define		pushsize	12
+	push	esi
+	push	edi
+	push	ebx
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		edx,	[psad8x8]
+	mov		eax,	ebx
+	
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4								; iPicStride*16
+	pxor	xmm0,	xmm0
+	pxor	xmm7,	xmm7		; iFrameSad
+height_loop:
+	mov		ecx,	dword [iPicWidth]
+	push	esi	; saved across the width loop; no esp-relative argument is read until they are popped
+	push	edi
+width_loop:
+	pxor	xmm6,	xmm6		;
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx],		xmm6	; upper 8 rows, columns 0-7
+	psrldq	xmm6,		8
+	movd	[edx+4],	xmm6	; upper 8 rows, columns 8-15
+	
+	pxor	xmm6,	xmm6
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	WELS_SAD_16x2_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx+8],	xmm6	; lower 8 rows, columns 0-7
+	psrldq	xmm6,		8
+	movd	[edx+12],	xmm6	; lower 8 rows, columns 8-15
+	
+	add		edx,	16
+	sub		esi,	eax	; back up 16 rows ...
+	sub		edi,	eax
+	add		esi,	16	; ... and move 16 bytes to the right
+	add		edi,	16
+	
+	dec		ecx
+	jnz		width_loop
+	
+	pop		edi
+	pop		esi
+	add		esi,	eax	; next row of 16x16 blocks
+	add		edi,	eax
+	
+	dec	dword [iPicHeight]
+	jnz		height_loop
+	
+	mov		edx,	[psadframe]
+	movdqa	xmm5,	xmm7
+	psrldq	xmm7,	8
+	paddd	xmm7,	xmm5	; fold the two partial sums into the low dword
+	movd	[edx],	xmm7
+
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		pushsize	
+	pop		ebx
+	pop		edi
+	pop		esi
+	ret
+	
+	
+WELS_EXTERN VAACalcSadVar_sse2
+;*************************************************************************************************************
+;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight 
+;		int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
+;*************************************************************************************************************
+; Per 16x16 block: four 8x8 SADs to psad8x8, one sum to psum16x16, one sum of squares to psqsum16x16; total SAD to *psadframe.
+; The stack slots of iPicWidth, iPicHeight, psum16x16 and psqsum16x16 are updated in place while looping.
+ALIGN 16
+VAACalcSadVar_sse2:
+%define		localsize		8
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		psum16x16			esp + pushsize + localsize + 32
+%define		psqsum16x16			esp + pushsize + localsize + 36
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		pushsize		16
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		edx,	[psad8x8]
+	mov		eax,	ebx
+	
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4							; iPicStride*16
+	pxor	xmm0,	xmm0
+	pxor	xmm7,	xmm7		; iFrameSad
+var_height_loop:
+	mov		ecx,	dword [iPicWidth]
+	mov		[tmp_esi],	esi	; remember the start of this block row
+	mov		[tmp_edi],	edi
+var_width_loop:
+	pxor	xmm6,	xmm6		; hiQuad_loQuad pSad8x8
+	pxor	xmm5,	xmm5		; pSum16x16
+	pxor	xmm4,	xmm4		; sqsum_16x16
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx],		xmm6	; upper 8 rows, columns 0-7
+	psrldq	xmm6,		8
+	movd	[edx+4],	xmm6	; upper 8 rows, columns 8-15
+	
+	pxor	xmm6,	xmm6
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_16x1_SSE2
+	paddd	xmm7,		xmm6
+	movd	[edx+8],	xmm6	; lower 8 rows, columns 0-7
+	psrldq	xmm6,		8
+	movd	[edx+12],	xmm6	; lower 8 rows, columns 8-15
+	
+	mov		ebp,	[psum16x16]	; ebp is used as a scratch pointer here
+	movdqa	xmm1,	xmm5
+	psrldq	xmm1,	8
+	paddd	xmm5,	xmm1
+	movd	[ebp],	xmm5	; sum over the whole 16x16 block
+	add		dword [psum16x16], 4
+	
+	movdqa	xmm5,	xmm4
+	psrldq	xmm5,	8
+	paddd	xmm4,	xmm5
+	movdqa	xmm3,	xmm4
+	psrldq	xmm3,	4
+	paddd	xmm4,	xmm3	; fold the four dword partial sums into the low dword
+	
+	mov		ebp,	[psqsum16x16]
+	movd	[ebp],	xmm4	; sum of squares over the whole 16x16 block
+	add		dword [psqsum16x16], 4
+	
+	add		edx,	16
+	sub		esi,	eax	; back up 16 rows ...
+	sub		edi,	eax
+	add		esi,	16	; ... and move 16 bytes to the right
+	add		edi,	16
+	
+	dec		ecx
+	jnz		var_width_loop
+	
+	mov		esi,	[tmp_esi]
+	mov		edi,	[tmp_edi]
+	add		esi,	eax	; next row of 16x16 blocks
+	add		edi,	eax
+	
+	dec	dword [iPicHeight]
+	jnz		var_height_loop
+	
+	mov		edx,	[psadframe]
+	movdqa	xmm5,	xmm7
+	psrldq	xmm7,	8
+	paddd	xmm7,	xmm5	; fold the two partial sums into the low dword
+	movd	[edx],	xmm7
+
+	add		esp,	localsize	
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		psum16x16
+%undef		psqsum16x16
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		pushsize
+%undef		localsize
+	ret
+	
+	
+
+WELS_EXTERN VAACalcSadSsd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;	int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
+;*************************************************************************************************************
+; For each 16x16 block: writes 4 dwords to psad8x8 and 1 dword each to psum16x16, psqsum16x16 and psqdiff16x16.
+; The sum of all stored SADs goes to *psadframe. Stack argument slots are updated in place (counts, output cursors).
+ALIGN 16
+VAACalcSadSsd_sse2:
+%define		localsize		12
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		psum16x16			esp + pushsize + localsize + 32
+%define		psqsum16x16			esp + pushsize + localsize + 36
+%define		psqdiff16x16		esp + pushsize + localsize + 40
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		tmp_sadframe		esp + 8
+%define		pushsize		16
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize
+	; Register roles: esi = cur_data cursor, edi = ref_data cursor, ebx = iPicStride,
+	; edx = psad8x8 cursor, eax = 16 * iPicStride, ecx = blocks left in the current row.
+	mov		esi,	[cur_data]
+	mov		edi,	[ref_data]
+	mov		ebx,	[iPicStride]
+	mov		edx,	[psad8x8]
+	mov		eax,	ebx
+
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4							; iPicStride*16
+	; xmm0 = 0: used to clear tmp_sadframe below; presumably also the zero register the
+	; WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2 macro expects -- TODO confirm against the macro.
+	pxor	xmm0,	xmm0
+	movd	[tmp_sadframe],	xmm0				; frame SAD accumulator = 0
+sqdiff_height_loop:
+	mov		ecx,	dword [iPicWidth]			; ecx = number of 16-wide blocks in this row
+	mov		[tmp_esi],	esi						; remember the row start of both planes
+	mov		[tmp_edi],	edi
+sqdiff_width_loop:
+	pxor	xmm7,	xmm7		; hiQuad_loQuad pSad8x8
+	pxor	xmm6,	xmm6		; pSum16x16
+	pxor	xmm5,	xmm5		; sqsum_16x16  four dword
+	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	movdqa	xmm1,		xmm7
+	movd	[edx],		xmm7					; psad8x8[0] = SAD held in the low qword
+	psrldq	xmm7,		8
+	paddd	xmm1,		xmm7					; low dword of xmm1 = low-qword SAD + high-qword SAD
+	movd	[edx+4],	xmm7					; psad8x8[1] = SAD held in the high qword
+	movd	ebp,		xmm1
+	add		[tmp_sadframe],	ebp				; accumulate into the frame SAD
+
+	pxor	xmm7,	xmm7						; restart the SAD accumulator for the next 8 invocations
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
+	movdqa	xmm1,		xmm7
+	movd	[edx+8],	xmm7					; psad8x8[2]
+	psrldq	xmm7,		8
+	paddd	xmm1,		xmm7
+	movd	[edx+12],	xmm7					; psad8x8[3]
+	movd	ebp,		xmm1
+	add		[tmp_sadframe],	ebp
+
+	mov		ebp,	[psum16x16]
+	movdqa	xmm1,	xmm6
+	psrldq	xmm1,	8
+	paddd	xmm6,	xmm1						; fold the high qword of the sum into the low qword
+	movd	[ebp],	xmm6
+	add		dword [psum16x16], 4				; advance the output cursor kept in the argument slot
+
+	mov		ebp,	[psqsum16x16]
+	pshufd	xmm6,	xmm5,	14 ;00001110
+	paddd	xmm6,	xmm5						; dword0 = d0+d2, dword1 = d1+d3
+	pshufd	xmm5,	xmm6,	1  ;00000001
+	paddd	xmm5,	xmm6						; dword0 = d0+d1+d2+d3 (horizontal sum of xmm5)
+	movd	[ebp],	xmm5
+	add		dword [psqsum16x16], 4
+
+	mov		ebp,	[psqdiff16x16]
+	pshufd	xmm5,	xmm4,	14	; 00001110
+	paddd	xmm5,	xmm4
+	pshufd	xmm4,	xmm5,	1	; 00000001
+	paddd	xmm4,	xmm5						; dword0 = horizontal sum of the four sqdiff dwords
+	movd	[ebp],	xmm4
+	add		dword	[psqdiff16x16],	4
+
+	add		edx,	16							; four psad8x8 dwords were written for this block
+	sub		esi,	eax							; back up 16 rows (presumably each macro call stepped one row -- confirm)
+	sub		edi,	eax
+	add		esi,	16							; move 16 pixels to the right
+	add		edi,	16
+
+	dec		ecx
+	jnz		sqdiff_width_loop
+
+	mov		esi,	[tmp_esi]					; back to the start of this row of blocks ...
+	mov		edi,	[tmp_edi]
+	add		esi,	eax							; ... then down 16 rows
+	add		edi,	eax
+
+	dec	dword [iPicHeight]
+	jnz		sqdiff_height_loop
+
+	mov		ebx,	[tmp_sadframe]
+	mov		eax,	[psadframe]
+	mov		[eax],	ebx							; *psadframe = accumulated frame SAD
+
+	add		esp,	localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		psum16x16
+%undef		psqsum16x16
+%undef		psqdiff16x16
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		tmp_sadframe
+%undef		pushsize
+%undef		localsize
+	ret
+	
+	
+	
+	
+
+WELS_EXTERN VAACalcSadBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;				int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+; Per 16x16 block: stores 4 dwords to psad8x8, 4 dwords to p_sd8x8 and 4 bytes to p_mad8x8; the sum of all
+; SADs goes to *psadframe. psad8x8 and p_sd8x8 are written with movdqa, so both must be 16-byte aligned.
+ALIGN 16
+VAACalcSadBgd_sse2:
+%define		localsize		12
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		p_sd8x8				esp + pushsize + localsize + 32
+%define		p_mad8x8			esp + pushsize + localsize + 36
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		tmp_ecx				esp + 8
+%define		pushsize		16
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize
+	mov		esi,	[cur_data]					; esi = cur_data cursor
+	mov		edi,	[ref_data]					; edi = ref_data cursor
+	mov		ebx,	[iPicStride]
+	mov		eax,	ebx
+
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4							; iPicStride*16
+	xor		ebp,	ebp							; ebp = frame SAD accumulator
+	pxor	xmm0,	xmm0						; presumably the zero register expected by the macros -- confirm
+bgd_height_loop:
+	mov		ecx,	dword [iPicWidth]			; ecx = number of 16-wide blocks in this row
+	mov		[tmp_esi],	esi						; remember the row start of both planes
+	mov		[tmp_edi],	edi
+bgd_width_loop:
+	pxor	xmm7,	xmm7		; pSad8x8
+	pxor	xmm6,	xmm6		; sum_cur_8x8
+	pxor	xmm5,	xmm5		; sum_ref_8x8
+	pxor	xmm4,	xmm4		; pMad8x8
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+
+	; First 8 invocations done: emit two p_mad8x8 bytes.
+	mov			edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm4					; presumably leaves each 8-byte half's maximum in its byte 0 -- confirm
+
+	;movdqa		xmm1,	xmm4
+	;punpcklbw	xmm1,	xmm0
+	;punpcklwd	xmm1,	xmm0
+	;movd		[edx],	xmm1
+	;punpckhbw	xmm4,	xmm0
+	;punpcklwd	xmm4,	xmm0
+	;movd		[edx+4],	xmm4
+	;add			edx,		8
+	;mov			[p_mad8x8],	edx
+	mov			[tmp_ecx],	ecx					; ecx is needed as scratch for the byte stores; restored before "dec ecx"
+	movhlps		xmm1,	xmm4					; xmm1 low qword = xmm4 high qword
+	movd		ecx,	xmm4
+	mov			[edx],	cl						; byte 0 of the low half
+	movd		ecx,	xmm1
+	mov			[edx+1],cl						; byte 0 of the high half
+	add			edx,	2
+	mov			[p_mad8x8],	edx					; advance the output cursor kept in the argument slot
+
+	; Shift the first-half results up one dword so the second half can accumulate below them.
+	pslldq		xmm7,	4
+	pslldq		xmm6,	4
+	pslldq		xmm5,	4
+
+
+	pxor	xmm4,	xmm4		; pMad8x8
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+
+	mov			edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm4
+
+	;movdqa		xmm1,	xmm4
+	;punpcklbw	xmm1,	xmm0
+	;punpcklwd	xmm1,	xmm0
+	;movd		[edx],	xmm1
+	;punpckhbw	xmm4,	xmm0
+	;punpcklwd	xmm4,	xmm0
+	;movd		[edx+4],	xmm4
+	;add			edx,		8
+	;mov			[p_mad8x8],	edx
+	movhlps		xmm1,	xmm4
+	movd		ecx,	xmm4
+	mov			[edx],	cl
+	movd		ecx,	xmm1
+	mov			[edx+1],cl
+	add			edx,	2
+	mov			[p_mad8x8],	edx
+
+	; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
+
+	mov		edx,	[psad8x8]
+	pshufd	xmm1,	xmm7,	10001101b		; D3 D2 D1 D0
+	movdqa	[edx],	xmm1					; aligned 16-byte store of the four SADs
+	add		edx,	16
+	mov		[psad8x8],	edx					; sad8x8
+
+	paddd	xmm1,	xmm7					; D1+3 D3+2 D0+1 D2+0
+	pshufd	xmm2,	xmm1,	00000011b		; dword0 of xmm2 = dword3 of xmm1 (D1+D3)
+	paddd	xmm1,	xmm2					; dword0 = D0+D1+D2+D3
+	movd	edx,	xmm1
+	add		ebp,	edx						; sad frame
+
+	mov		edx,	[p_sd8x8]
+	psubd	xmm6,	xmm5					; sum_cur - sum_ref, per dword
+	pshufd	xmm1,	xmm6,	10001101b		; same reordering as for the SADs above
+	movdqa	[edx],	xmm1					; aligned 16-byte store of the four differences
+	add		edx,	16
+	mov		[p_sd8x8],	edx
+
+	; Step to the next block: back up 16 rows (eax = 16*stride), then 16 pixels to the right.
+	sub		esi,	eax
+	sub		edi,	eax
+	add		esi,	16
+	add		edi,	16
+
+	mov		ecx,	[tmp_ecx]				; restore the block counter saved before the byte stores
+	dec		ecx
+	jnz		bgd_width_loop
+
+	mov		esi,	[tmp_esi]				; back to the start of this row of blocks ...
+	mov		edi,	[tmp_edi]
+	add		esi,	eax						; ... then down 16 rows
+	add		edi,	eax
+
+	dec		dword [iPicHeight]
+	jnz		bgd_height_loop
+
+	mov		edx,	[psadframe]
+	mov		[edx],	ebp						; *psadframe = accumulated frame SAD
+
+	add		esp,	localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		p_sd8x8
+%undef		p_mad8x8
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		tmp_ecx
+%undef		pushsize
+%undef		localsize
+	ret
+
+
+
+WELS_EXTERN VAACalcSadSsdBgd_sse2
+;*************************************************************************************************************
+;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;		 int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
+;			int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
+;*************************************************************************************************************
+; Per 16x16 block: 4 dwords to psad8x8, 4 dwords to p_sd8x8, 4 bytes to p_mad8x8 and 1 dword each to psum16x16,
+; psqsum16x16 and psqdiff16x16; the sum of all SADs goes to *psadframe. Stack argument slots serve as cursors.
+ALIGN 16
+VAACalcSadSsdBgd_sse2:
+%define		localsize		16
+%define		cur_data			esp + pushsize + localsize + 4
+%define		ref_data			esp + pushsize + localsize + 8
+%define		iPicWidth			esp + pushsize + localsize + 12
+%define		iPicHeight			esp + pushsize + localsize + 16
+%define		iPicStride			esp + pushsize + localsize + 20
+%define		psadframe			esp + pushsize + localsize + 24
+%define		psad8x8				esp + pushsize + localsize + 28
+%define		psum16x16			esp + pushsize + localsize + 32
+%define		psqsum16x16			esp + pushsize + localsize + 36
+%define		psqdiff16x16		esp + pushsize + localsize + 40
+%define		p_sd8x8				esp + pushsize + localsize + 44
+%define		p_mad8x8			esp + pushsize + localsize + 48
+%define		tmp_esi				esp + 0
+%define		tmp_edi				esp + 4
+%define		tmp_sadframe		esp + 8
+%define		tmp_ecx				esp + 12
+%define		pushsize		16
+	push	ebp
+	push	esi
+	push	edi
+	push	ebx
+	sub		esp,	localsize
+	mov		esi,	[cur_data]					; esi = cur_data cursor
+	mov		edi,	[ref_data]					; edi = ref_data cursor
+	mov		ebx,	[iPicStride]
+	mov		eax,	ebx
+
+	shr		dword [iPicWidth],	4					; iPicWidth/16
+	shr		dword [iPicHeight],	4					; iPicHeight/16
+	shl		eax,	4							; iPicStride*16
+	pxor	xmm0,	xmm0
+	movd	[tmp_sadframe],	xmm0				; frame SAD accumulator = 0
+sqdiff_bgd_height_loop:
+	mov		ecx,	dword [iPicWidth]			; ecx = number of 16-wide blocks in this row
+	mov		[tmp_esi],	esi						; remember the row start of both planes
+	mov		[tmp_edi],	edi
+sqdiff_bgd_width_loop:
+	pxor	xmm7,	xmm7		; pSad8x8 interleaves sqsum16x16:  sqsum1 sad1 sqsum0 sad0
+	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+	pxor	xmm5,	xmm5		; pMad8x8
+	pxor	xmm4,	xmm4		; sqdiff_16x16	four Dword
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+
+	mov		edx,		[psad8x8]
+	movdqa	xmm2,		xmm7
+	pshufd	xmm1,		xmm2,		00001110b	; dword0 of xmm1 = dword2 of xmm2 (sad1)
+	movd	[edx],		xmm2					; sad0
+	movd	[edx+4],	xmm1					; sad1
+	add		edx,		8
+	mov		[psad8x8],	edx			; sad8x8
+
+	paddd	xmm1,				xmm2			; dword0 = sad0 + sad1
+	movd	edx,				xmm1
+	add		[tmp_sadframe],		edx			; iFrameSad
+
+	mov		edx,		[psum16x16]
+	movdqa	xmm1,		xmm6
+	pshufd	xmm2,		xmm1,		00001110b
+	paddd	xmm1,		xmm2					; dword0 = Scur0 + Scur1
+	movd	[edx],		xmm1				; sum
+
+	mov		edx,		[p_sd8x8]
+	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
+	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
+	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
+	movq	[edx],		xmm1
+	add		edx,		8
+	mov		[p_sd8x8],	edx
+
+	mov			edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm5					; presumably leaves each 8-byte half's maximum in its byte 0 -- confirm
+	;movdqa		xmm1,	xmm5
+	;punpcklbw	xmm1,	xmm0
+	;punpcklwd	xmm1,	xmm0
+	;movd		[edx],	xmm1
+	;punpckhbw	xmm5,	xmm0
+	;punpcklwd	xmm5,	xmm0
+	;movd		[edx+4],	xmm5
+	;add			edx,		8
+	;mov			[p_mad8x8],	edx
+	mov			[tmp_ecx],	ecx					; ecx is needed as scratch for the byte stores; restored before "dec ecx"
+	movhlps		xmm1,	xmm5					; xmm1 low qword = xmm5 high qword
+	movd		ecx,	xmm5
+	mov			[edx],	cl						; byte 0 of the low half
+	movd		ecx,	xmm1
+	mov			[edx+1],cl						; byte 0 of the high half
+	add			edx,	2
+	mov			[p_mad8x8],	edx
+
+	psrlq	xmm7,	32
+	psllq	xmm7,	32			; clear sad
+	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
+	pxor	xmm5,	xmm5		; pMad8x8
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
+
+	mov		edx,		[psad8x8]
+	movdqa	xmm2,		xmm7
+	pshufd	xmm1,		xmm2,		00001110b
+	movd	[edx],		xmm2
+	movd	[edx+4],	xmm1
+	add		edx,		8
+	mov		[psad8x8],	edx			; sad8x8
+
+	paddd	xmm1,				xmm2
+	movd	edx,				xmm1
+	add		[tmp_sadframe],		edx			; iFrameSad
+
+	mov		edx,			[psum16x16]
+	movdqa	xmm1,			xmm6
+	pshufd	xmm2,			xmm1,		00001110b
+	paddd	xmm1,			xmm2
+	movd	ebp,			xmm1				; sum
+	add		[edx],			ebp					; add to the first-half sum already stored at [edx]
+	add		edx,			4
+	mov		[psum16x16],	edx
+
+	mov		edx,			[psqsum16x16]
+	psrlq	xmm7,			32					; bring sqsum0/sqsum1 down into dwords 0 and 2
+	pshufd	xmm2,			xmm7,		00001110b
+	paddd	xmm2,			xmm7				; dword0 = sqsum0 + sqsum1
+	movd	[edx],			xmm2				; sqsum
+	add		edx,			4
+	mov		[psqsum16x16],	edx
+
+	mov		edx,		[p_sd8x8]
+	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
+	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
+	pshufd	xmm1,		xmm6,		00001000b			;  xx xx diff1 diff0
+	movq	[edx],		xmm1
+	add		edx,		8
+	mov		[p_sd8x8],	edx
+
+	mov		edx,		[p_mad8x8]
+	WELS_MAX_REG_SSE2	xmm5
+	;movdqa		xmm1,	xmm5
+	;punpcklbw	xmm1,	xmm0
+	;punpcklwd	xmm1,	xmm0
+	;movd		[edx],	xmm1
+	;punpckhbw	xmm5,	xmm0
+	;punpcklwd	xmm5,	xmm0
+	;movd		[edx+4],	xmm5
+	;add			edx,		8
+	;mov			[p_mad8x8],	edx
+	movhlps		xmm1,	xmm5
+	movd		ecx,	xmm5
+	mov			[edx],	cl
+	movd		ecx,	xmm1
+	mov			[edx+1],cl
+	add			edx,	2
+	mov			[p_mad8x8],	edx
+
+	mov		edx,		[psqdiff16x16]
+	pshufd	xmm1,		xmm4,		00001110b
+	paddd	xmm4,		xmm1
+	pshufd	xmm1,		xmm4,		00000001b
+	paddd	xmm4,		xmm1					; dword0 = horizontal sum of the four sqdiff dwords
+	movd	[edx],		xmm4
+	add		edx,		4
+	mov		[psqdiff16x16],	edx
+
+	sub		esi,	eax							; back up 16 rows (eax = 16*stride)
+	sub		edi,	eax
+	add		esi,	16							; move 16 pixels to the right
+	add		edi,	16
+
+	mov		ecx,	[tmp_ecx]					; restore the block counter saved before the byte stores
+	dec		ecx
+	jnz		sqdiff_bgd_width_loop
+
+	mov		esi,	[tmp_esi]					; back to the start of this row of blocks ...
+	mov		edi,	[tmp_edi]
+	add		esi,	eax							; ... then down 16 rows
+	add		edi,	eax
+
+	dec	dword [iPicHeight]
+	jnz		sqdiff_bgd_height_loop
+
+	mov		edx,	[psadframe]
+	mov		ebp,	[tmp_sadframe]
+	mov		[edx],	ebp							; *psadframe = accumulated frame SAD
+	add		esp,	localsize
+	pop		ebx
+	pop		edi
+	pop		esi
+	pop		ebp
+%undef		cur_data
+%undef		ref_data
+%undef		iPicWidth
+%undef		iPicHeight
+%undef		iPicStride
+%undef		psadframe
+%undef		psad8x8
+%undef		psum16x16
+%undef		psqsum16x16
+%undef		psqdiff16x16
+%undef		p_sd8x8
+%undef		p_mad8x8
+%undef		tmp_esi
+%undef		tmp_edi
+%undef		tmp_sadframe
+%undef		tmp_ecx
+%undef		pushsize
+%undef		localsize
+	ret
--- a/processing/src/common/WelsVP.def
+++ b/processing/src/common/WelsVP.def
@@ -1,36 +1,36 @@
-;*!
-;* \copy
-;*     Copyright (c)  2011-2013, Cisco Systems
-;*     All rights reserved.
-;*
-;*     Redistribution and use in source and binary forms, with or without
-;*     modification, are permitted provided that the following conditions
-;*     are met:
-;*
-;*        * Redistributions of source code must retain the above copyright
-;*          notice, this list of conditions and the following disclaimer.
-;*
-;*        * Redistributions in binary form must reproduce the above copyright
-;*          notice, this list of conditions and the following disclaimer in
-;*          the documentation and/or other materials provided with the
-;*          distribution.
-;*
-;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
-;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-;*     POSSIBILITY OF SUCH DAMAGE.
-;*
-;*
-
-LIBRARY		    welsvp.dll
-EXPORTS
-                CreateVpInterface    PRIVATE
+;*!
+;* \copy
+;*     Copyright (c)  2011-2013, Cisco Systems
+;*     All rights reserved.
+;*
+;*     Redistribution and use in source and binary forms, with or without
+;*     modification, are permitted provided that the following conditions
+;*     are met:
+;*
+;*        * Redistributions of source code must retain the above copyright
+;*          notice, this list of conditions and the following disclaimer.
+;*
+;*        * Redistributions in binary form must reproduce the above copyright
+;*          notice, this list of conditions and the following disclaimer in
+;*          the documentation and/or other materials provided with the
+;*          distribution.
+;*
+;*     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;*     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;*     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+;*     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+;*     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+;*     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+;*     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+;*     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;*     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+;*     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+;*     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+;*     POSSIBILITY OF SUCH DAMAGE.
+;*
+;*
+
+LIBRARY		    welsvp.dll
+EXPORTS
+                CreateVpInterface    PRIVATE
                 DestroyVpInterface   PRIVATE      
\ No newline at end of file
--- a/testbin/layer2.cfg
+++ b/testbin/layer2.cfg
@@ -1,39 +1,39 @@
-# Layer Configuration File
-
-
-#============================== INPUT / OUTPUT ==============================
-SourceWidth     320                     # Input  frame width
-SourceHeight    192                    # Input  frame height
-FrameRateIn     12                      # Input  frame rate [Hz]
-FrameRateOut    12                     # Output frame rate [Hz]
-InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
-ReconFile       rec_layer2.yuv          # Reconstructed file
-
-#============================== CODING ==============================
-ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
-
-InitialQP       24			# Quantization parameters for base quality layer
-#================================ RATE CONTROL ===============================
-SpatialBitrate		600		# Unit: kbps, controled by DisableRC also
-#============================== MultiSlice Slice Argument ==============================
-# for S/M Slice(s) mode settings
-SliceMode			0		# 0: sigle slice mode; >0: multiple slices mode, see below;
-SliceSize			1500
-SliceNum			1		# multiple slices number specified
-
-SlicesAssign0		960		# count number of MBs in slice #0
-SlicesAssign1		0		# count number of MBs in slice #1
-SlicesAssign2		0		# count number of MBs in slice #2
-SlicesAssign3		0		# count number of MBs in slice #3 -- seting here is for better testing
-SlicesAssign4		0		# count number of MBs in slice #4
-SlicesAssign5		0		# count number of MBs in slice #5
-SlicesAssign6		0		# count number of MBs in slice #6
-SlicesAssign7		0		# count number of MBs in slice #7
-
-### DESIGN OF SLICE MODE ####
-# 0 SM_SINGLE_SLICE			| SliceNum==1
-# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
-# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
-# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
-# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
-
+# Layer Configuration File
+
+
+#============================== INPUT / OUTPUT ==============================
+SourceWidth     320                     # Input  frame width
+SourceHeight    192                    # Input  frame height
+FrameRateIn     12                      # Input  frame rate [Hz]
+FrameRateOut    12                     # Output frame rate [Hz]
+InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
+ReconFile       rec_layer2.yuv          # Reconstructed file
+
+#============================== CODING ==============================
+ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
+
+InitialQP       24			# Quantization parameters for base quality layer
+#================================ RATE CONTROL ===============================
+SpatialBitrate		600		# Unit: kbps, controlled by DisableRC also
+#============================== MultiSlice Slice Argument ==============================
+# for S/M Slice(s) mode settings
+SliceMode			0		# 0: single slice mode; >0: multiple slices mode, see below;
+SliceSize			1500
+SliceNum			1		# multiple slices number specified
+
+SlicesAssign0		960		# count number of MBs in slice #0
+SlicesAssign1		0		# count number of MBs in slice #1
+SlicesAssign2		0		# count number of MBs in slice #2
+SlicesAssign3		0		# count number of MBs in slice #3 -- setting here is for better testing
+SlicesAssign4		0		# count number of MBs in slice #4
+SlicesAssign5		0		# count number of MBs in slice #5
+SlicesAssign6		0		# count number of MBs in slice #6
+SlicesAssign7		0		# count number of MBs in slice #7
+
+### DESIGN OF SLICE MODE ####
+# 0 SM_SINGLE_SLICE			| SliceNum==1
+# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
+# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
+# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Typically a single row of MBs per slice, plus a slice size constraint, which may involve re-encoding
+# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
+
--- a/testbin/layer2_vd.cfg
+++ b/testbin/layer2_vd.cfg
@@ -1,39 +1,39 @@
-# Layer Configuration File
-
-
-#============================== INPUT / OUTPUT ==============================
-SourceWidth     320                     # Input  frame width
-SourceHeight    192                    # Input  frame height
-FrameRateIn     12                      # Input  frame rate [Hz]
-FrameRateOut    12                     # Output frame rate [Hz]
-InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
-ReconFile       rec_layer2.yuv          # Reconstructed file
-
-#============================== CODING ==============================
-ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
-
-InitialQP       24			# Quantization parameters for base quality layer
-#================================ RATE CONTROL ===============================
-SpatialBitrate		600		# Unit: kbps, controled by DisableRC also
-#============================== MultiSlice Slice Argument ==============================
-# for S/M Slice(s) mode settings
-SliceMode			0		# 0: sigle slice mode; >0: multiple slices mode, see below;
-SliceSize			1500
-SliceNum			1		# multiple slices number specified
-
-SlicesAssign0		960		# count number of MBs in slice #0
-SlicesAssign1		0		# count number of MBs in slice #1
-SlicesAssign2		0		# count number of MBs in slice #2
-SlicesAssign3		0		# count number of MBs in slice #3 -- seting here is for better testing
-SlicesAssign4		0		# count number of MBs in slice #4
-SlicesAssign5		0		# count number of MBs in slice #5
-SlicesAssign6		0		# count number of MBs in slice #6
-SlicesAssign7		0		# count number of MBs in slice #7
-
-### DESIGN OF SLICE MODE, 100804, Sijia ####
-# 0 SM_SINGLE_SLICE			| SliceNum==1
-# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
-# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
-# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Specially for TP. Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
-# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
-
+# Layer Configuration File
+
+
+#============================== INPUT / OUTPUT ==============================
+SourceWidth     320                     # Input  frame width
+SourceHeight    192                    # Input  frame height
+FrameRateIn     12                      # Input  frame rate [Hz]
+FrameRateOut    12                     # Output frame rate [Hz]
+InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
+ReconFile       rec_layer2.yuv          # Reconstructed file
+
+#============================== CODING ==============================
+ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
+
+InitialQP       24			# Quantization parameters for base quality layer
+#================================ RATE CONTROL ===============================
+SpatialBitrate		600		# Unit: kbps, controlled by DisableRC also
+#============================== MultiSlice Slice Argument ==============================
+# for S/M Slice(s) mode settings
+SliceMode			0		# 0: single slice mode; >0: multiple slices mode, see below;
+SliceSize			1500
+SliceNum			1		# multiple slices number specified
+
+SlicesAssign0		960		# count number of MBs in slice #0
+SlicesAssign1		0		# count number of MBs in slice #1
+SlicesAssign2		0		# count number of MBs in slice #2
+SlicesAssign3		0		# count number of MBs in slice #3 -- setting here is for better testing
+SlicesAssign4		0		# count number of MBs in slice #4
+SlicesAssign5		0		# count number of MBs in slice #5
+SlicesAssign6		0		# count number of MBs in slice #6
+SlicesAssign7		0		# count number of MBs in slice #7
+
+### DESIGN OF SLICE MODE, 100804, Sijia ####
+# 0 SM_SINGLE_SLICE			| SliceNum==1
+# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
+# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
+# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Specially for TP. Typically a single row of MBs per slice, plus a slice size constraint, which may involve re-encoding
+# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
+
--- a/testbin/layer2_vd_rc.cfg
+++ b/testbin/layer2_vd_rc.cfg
@@ -1,39 +1,39 @@
-# Layer Configuration File
-
-
-#============================== INPUT / OUTPUT ==============================
-SourceWidth     320                     # Input  frame width
-SourceHeight    192                    # Input  frame height
-FrameRateIn     12                      # Input  frame rate [Hz]
-FrameRateOut    12                     # Output frame rate [Hz]
-InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
-ReconFile       rec_layer2.yuv          # Reconstructed file
-
-#============================== CODING ==============================
-ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
-
-InitialQP       24			# Quantization parameters for base quality layer
-#================================ RATE CONTROL ===============================
-SpatialBitrate		600		# Unit: kbps, controled by DisableRC also
-#============================== MultiSlice Slice Argument ==============================
-# for S/M Slice(s) mode settings
-SliceMode			0		# 0: sigle slice mode; >0: multiple slices mode, see below;
-SliceSize			1500
-SliceNum			1		# multiple slices number specified
-
-SlicesAssign0		960		# count number of MBs in slice #0
-SlicesAssign1		0		# count number of MBs in slice #1
-SlicesAssign2		0		# count number of MBs in slice #2
-SlicesAssign3		0		# count number of MBs in slice #3 -- seting here is for better testing
-SlicesAssign4		0		# count number of MBs in slice #4
-SlicesAssign5		0		# count number of MBs in slice #5
-SlicesAssign6		0		# count number of MBs in slice #6
-SlicesAssign7		0		# count number of MBs in slice #7
-
-### DESIGN OF SLICE MODE, 100804, Sijia ####
-# 0 SM_SINGLE_SLICE			| SliceNum==1
-# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
-# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
-# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Specially for TP. Typical of single row of mbs each slice?+ slice size constraint which including re-encoding
-# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
-
+# Layer Configuration File
+
+
+#============================== INPUT / OUTPUT ==============================
+SourceWidth     320                     # Input  frame width
+SourceHeight    192                    # Input  frame height
+FrameRateIn     12                      # Input  frame rate [Hz]
+FrameRateOut    12                     # Output frame rate [Hz]
+InputFile       CiscoVT2people_320x192_12fps.yuv # Input  file
+ReconFile       rec_layer2.yuv          # Reconstructed file
+
+#============================== CODING ==============================
+ProfileIdc      66          # value of profile_idc (or 0 for auto detection)
+
+InitialQP       24			# Quantization parameters for base quality layer
+#================================ RATE CONTROL ===============================
+SpatialBitrate		600		# Unit: kbps, controlled by DisableRC also
+#============================== MultiSlice Slice Argument ==============================
+# for S/M Slice(s) mode settings
+SliceMode			0		# 0: single slice mode; >0: multiple slices mode, see below;
+SliceSize			1500
+SliceNum			1		# multiple slices number specified
+
+SlicesAssign0		960		# count number of MBs in slice #0
+SlicesAssign1		0		# count number of MBs in slice #1
+SlicesAssign2		0		# count number of MBs in slice #2
+SlicesAssign3		0		# count number of MBs in slice #3 -- setting here is for better testing
+SlicesAssign4		0		# count number of MBs in slice #4
+SlicesAssign5		0		# count number of MBs in slice #5
+SlicesAssign6		0		# count number of MBs in slice #6
+SlicesAssign7		0		# count number of MBs in slice #7
+
+### DESIGN OF SLICE MODE, 100804, Sijia ####
+# 0 SM_SINGLE_SLICE			| SliceNum==1
+# 1 SM_FIXEDSLCNUM_SLICE	| according to SliceNum			| Enabled dynamic slicing for multi-thread
+# 2 SM_RASTER_SLICE			| according to SlicesAssign		| Need input of MB numbers each slice. In addition, if other constraint in slice_argument is presented, need to follow the constraints. Typically if MB num and slice size are both constrained, re-encoding may be involved.
+# 3 SM_ROWMB_SLICE			| according to PictureMBHeight	| Specially for TP. Typically a single row of MBs per slice, plus a slice size constraint, which may involve re-encoding
+# 4 SM_DYN_SLICE			| according to SliceSize		| Dynamic slicing (have no idea about slice_nums until encoding current frame)
+
--- a/testbin/welsenc.cfg
+++ b/testbin/welsenc.cfg
@@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile              test.264               # Bitstream file
-MaxFrameRate            30                     # Maximum frame rate [Hz]
-FramesToBeEncoded       -1                    # Number of frames (at input frame rate)
-
-GOPSize                 4                     # GOP Size (at maximum frame rate), 16
-IntraPeriod            0                    # Intra Period ( multipler of GoP size or -1)
-EnableSpsPpsIDAddition  1
-
-EnableFrameCropping 	1 		       # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
-LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
-
-InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-
-#============================== RATE CONTROL ==============================
-EnableRC				1						# ENABLE RC
-TargetBitrate			5000				    # Unit: kbps, controled by EnableRC also
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise                   0              # Enable Denoise (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization			1			# Enable Adaptive Quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference             0              # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod                       30             # Long Term Reference Marking Period 
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
-												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
-												# Can be disabled when no inter spatial layer prediction in case of its value as 0
-NumLayers              1                      # Number of layers
-//LayerCfg                layer0.cfg		# Layer 0 configuration file
-//LayerCfg                layer1.cfg		# Layer 1 configuration file
-LayerCfg                layer2.cfg		# Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile              test.264               # Bitstream file
+MaxFrameRate            30                     # Maximum frame rate [Hz]
+FramesToBeEncoded       -1                    # Number of frames (at input frame rate)
+
+GOPSize                 4                     # GOP Size (at maximum frame rate), 16
+IntraPeriod            0                    # Intra Period (multiple of GOP size, or -1)
+EnableSpsPpsIDAddition  1
+
+EnableFrameCropping 	1 		       # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off, 
+                                               # 2: on except for slice boundaries,
+                                               # 3: two stage. slice boundaries on in second stage
+                                               # 4: Luma on but Chroma off (w.r.t. idc=0)  
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+                                               # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
+LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
+
+InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off, 
+                                               # 2: on except for slice boundaries,
+                                               # 3: two stage. slice boundaries on in second stage
+                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)  
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+                                               # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
+
+#============================== RATE CONTROL ==============================
+EnableRC				1						# ENABLE RC
+TargetBitrate			5000				    # Unit: kbps, controlled by EnableRC also
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise                   0              # Enable Denoise (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization			1			# Enable Adaptive Quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference             0              # Enable Long Term Reference (1: enable, 0: disable)
+LtrMarkPeriod                       30             # Long Term Reference Marking Period 
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
+												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+												# Can be disabled when no inter spatial layer prediction in case of its value as 0
+NumLayers              1                      # Number of layers
+//LayerCfg                layer0.cfg		# Layer 0 configuration file
+//LayerCfg                layer1.cfg		# Layer 1 configuration file
+LayerCfg                layer2.cfg		# Layer 2 configuration file
--- a/testbin/welsenc_vd_1d.cfg
+++ b/testbin/welsenc_vd_1d.cfg
@@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile              test_vd_1d.264               # Bitstream file
-MaxFrameRate            30                     # Maximum frame rate [Hz]
-FramesToBeEncoded       -1                    # Number of frames (at input frame rate)
-
-GOPSize                 4                     # GOP Size (at maximum frame rate), 16
-IntraPeriod            0                    # Intra Period ( multipler of GoP size or -1)
-EnableSpsPpsIDAddition  1
-
-EnableFrameCropping 	1 		       # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
-LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
-
-InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-
-#============================== RATE CONTROL ==============================
-EnableRC				0						# ENABLE RC
-TargetBitrate			5000				    # Unit: kbps, controled by EnableRC also
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise                   0              # Enable Denoise (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization			0			# Enable Adaptive Quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference             1              # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod                       30             # Long Term Reference Marking Period 
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
-												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
-												# Can be disabled when no inter spatial layer prediction in case of its value as 0
-NumLayers              1                      # Number of layers
-//LayerCfg                layer0_vd.cfg		# Layer 0 configuration file
-//LayerCfg                layer1_vd.cfg		# Layer 1 configuration file
-LayerCfg                layer2_vd.cfg		# Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile              test_vd_1d.264               # Bitstream file
+MaxFrameRate            30                     # Maximum frame rate [Hz]
+FramesToBeEncoded       -1                    # Number of frames (at input frame rate)
+
+GOPSize                 4                     # GOP Size (at maximum frame rate), 16
+IntraPeriod            0                    # Intra Period (multiple of GOP size, or -1)
+EnableSpsPpsIDAddition  1
+
+EnableFrameCropping 	1 		       # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off, 
+                                               # 2: on except for slice boundaries,
+                                               # 3: two stage. slice boundaries on in second stage
+                                               # 4: Luma on but Chroma off (w.r.t. idc=0)  
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+                                               # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
+LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
+
+InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off, 
+                                               # 2: on except for slice boundaries,
+                                               # 3: two stage. slice boundaries on in second stage
+                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)  
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+                                               # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
+
+#============================== RATE CONTROL ==============================
+EnableRC				0						# ENABLE RC
+TargetBitrate			5000				    # Unit: kbps, controlled by EnableRC also
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise                   0              # Enable Denoise (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization			0			# Enable Adaptive Quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference             1              # Enable Long Term Reference (1: enable, 0: disable)
+LtrMarkPeriod                       30             # Long Term Reference Marking Period 
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
+												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+												# Can be disabled when no inter spatial layer prediction in case of its value as 0
+NumLayers              1                      # Number of layers
+//LayerCfg                layer0_vd.cfg		# Layer 0 configuration file
+//LayerCfg                layer1_vd.cfg		# Layer 1 configuration file
+LayerCfg                layer2_vd.cfg		# Layer 2 configuration file
--- a/testbin/welsenc_vd_rc.cfg
+++ b/testbin/welsenc_vd_rc.cfg
@@ -1,63 +1,63 @@
-# Cisco Scalable H.264/AVC Extension Encoder Configuration File
-
-#============================== GENERAL ==============================
-OutputFile              test_vd_rc.264               # Bitstream file
-MaxFrameRate            30                     # Maximum frame rate [Hz]
-FramesToBeEncoded       -1                    # Number of frames (at input frame rate), -1
-
-GOPSize                 8                     # GOP Size (at maximum frame rate), 16
-IntraPeriod            0                    # Intra Period ( multipler of GoP size or -1)
-EnableSpsPpsIDAddition  1
-
-EnableFrameCropping 	1 		       # enable frame cropping flag
-
-#============================== LOOP FILTER ==============================
-LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
-LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
-
-InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off, 
-                                               # 2: on except for slice boundaries,
-                                               # 3: two stage. slice boundries on in second stage
-                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)  
-                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
-                                               # 6: Luma on in two stage. slice boundries on in second stage, but Chroma off (w.r.t. idc=3)
-InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
-InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
-
-#============================== SOFTWARE IMPLEMENTATION ==============================
-MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
-
-#============================== RATE CONTROL ==============================
-EnableRC			1						# ENABLE RC
-TargetBitrate			600				    # Unit: kbps, controled by EnableRC also
-
-#============================== DENOISE CONTROL ==============================
-EnableDenoise                   1              # Enable Denoise (1: enable, 0: disable)
-
-#============================== SCENE CHANGE DETECTION CONTROL =======================
-EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
-
-#============================== BACKGROUND DETECTION CONTROL ==============================
-EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
-
-#============================== ADAPTIVE QUANTIZATION CONTROL =======================
-EnableAdaptiveQuantization			1			# Enable Adaptive Quantization (1: enable, 0: disable)
-
-#============================== LONG TERM REFERENCE CONTROL ==============================
-EnableLongTermReference             1              # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod                       30             # Long Term Reference Marking Period 
-
-#============================== LAYER DEFINITION ==============================
-PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
-												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
-												# Can be disabled when no inter spatial layer prediction in case of its value as 0
-NumLayers              1                      # Number of layers
-//LayerCfg                layer0_vd.cfg		# Layer 0 configuration file
-//LayerCfg                layer1_vd.cfg		# Layer 1 configuration file
-LayerCfg                layer2_vd_rc.cfg		# Layer 2 configuration file
+# Cisco Scalable H.264/AVC Extension Encoder Configuration File
+
+#============================== GENERAL ==============================
+OutputFile              test_vd_rc.264               # Bitstream file
+MaxFrameRate            30                     # Maximum frame rate [Hz]
+FramesToBeEncoded       -1                    # Number of frames (at input frame rate), -1
+
+GOPSize                 8                     # GOP Size (at maximum frame rate), 16
+IntraPeriod            0                    # Intra Period (multiple of GOP size, or -1)
+EnableSpsPpsIDAddition  1
+
+EnableFrameCropping 	1 		       # enable frame cropping flag
+
+#============================== LOOP FILTER ==============================
+LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off, 
+                                               # 2: on except for slice boundaries,
+                                               # 3: two stage. slice boundaries on in second stage
+                                               # 4: Luma on but Chroma off (w.r.t. idc=0)  
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+                                               # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
+LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
+
+InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off, 
+                                               # 2: on except for slice boundaries,
+                                               # 3: two stage. slice boundaries on in second stage
+                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)  
+                                               # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
+                                               # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
+InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
+InterLayerLoopFilterBetaOffset    0            # BetaOffset for inter-layer deblocking
+
+#============================== SOFTWARE IMPLEMENTATION ==============================
+MultipleThreadIdc			    1	# 0: auto(dynamic imp. internal encoder); 1: multiple threads imp. disabled; > 1: count number of threads;
+
+#============================== RATE CONTROL ==============================
+EnableRC			1						# ENABLE RC
+TargetBitrate			600				    # Unit: kbps, controlled by EnableRC also
+
+#============================== DENOISE CONTROL ==============================
+EnableDenoise                   1              # Enable Denoise (1: enable, 0: disable)
+
+#============================== SCENE CHANGE DETECTION CONTROL =======================
+EnableSceneChangeDetection			1			# Enable Scene Change Detection (1: enable, 0: disable)
+
+#============================== BACKGROUND DETECTION CONTROL ==============================
+EnableBackgroundDetection		 1     # BGD control(1: enable, 0: disable)
+
+#============================== ADAPTIVE QUANTIZATION CONTROL =======================
+EnableAdaptiveQuantization			1			# Enable Adaptive Quantization (1: enable, 0: disable)
+
+#============================== LONG TERM REFERENCE CONTROL ==============================
+EnableLongTermReference             1              # Enable Long Term Reference (1: enable, 0: disable)
+LtrMarkPeriod                       30             # Long Term Reference Marking Period 
+
+#============================== LAYER DEFINITION ==============================
+PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
+												# It shall always be on in SVC contexts (i.e. when there are CGS/MGS/spatial enhancement layers)
+												# Can be disabled when no inter spatial layer prediction in case of its value as 0
+NumLayers              1                      # Number of layers
+//LayerCfg                layer0_vd.cfg		# Layer 0 configuration file
+//LayerCfg                layer1_vd.cfg		# Layer 1 configuration file
+LayerCfg                layer2_vd_rc.cfg		# Layer 2 configuration file