shithub: openh264

Download patch

ref: 65b339815efb168f48e256231299a932f397fb51
parent: ddcfc09c495c81dd6cf4824d1f3f345481c75d61
author: Martin Storsjö <martin@martin.st>
date: Sun Jan 5 09:16:22 EST 2014

Get rid of trailing whitespace in the assembly source files

--- a/codec/common/asm_inc.asm
+++ b/codec/common/asm_inc.asm
@@ -154,7 +154,7 @@
 %define  PUSHRFLAGS     pushfq
 %define  POPRFLAGS      popfq
 %define  retrq          rax
-%define  retrd          eax 
+%define  retrd          eax
 
 %elifdef X86_32 ; X86_32 ;************************************
 
@@ -233,7 +233,7 @@
 %macro LOAD_4_PARA 0
     %ifdef X86_32
         push r3
-        %assign  push_num push_num+1	
+        %assign  push_num push_num+1
         mov r0, [esp + push_num*4 + 4]
         mov r1, [esp + push_num*4 + 8]
         mov r2, [esp + push_num*4 + 12]
@@ -245,7 +245,7 @@
     %ifdef X86_32
         push r3
         push r4
-        %assign  push_num push_num+2	
+        %assign  push_num push_num+2
         mov r0, [esp + push_num*4 + 4]
         mov r1, [esp + push_num*4 + 8]
         mov r2, [esp + push_num*4 + 12]
@@ -261,7 +261,7 @@
 	push r3
         push r4
         push r5
-        %assign  push_num push_num+3	
+        %assign  push_num push_num+3
         mov r0, [esp + push_num*4 + 4]
         mov r1, [esp + push_num*4 + 8]
         mov r2, [esp + push_num*4 + 12]
@@ -280,7 +280,7 @@
         push r4
         push r5
         push r6
-        %assign  push_num push_num+4	
+        %assign  push_num push_num+4
         mov r0, [esp + push_num*4 + 4]
         mov r1, [esp + push_num*4 + 8]
         mov r2, [esp + push_num*4 + 12]
@@ -334,7 +334,7 @@
             movsx %1, %2
     %endif
 %endmacro
- 
+
 %macro WELS_EXTERN 1
     %ifdef PREFIX
         global _%1
--- a/codec/common/cpuid.asm
+++ b/codec/common/cpuid.asm
@@ -81,17 +81,17 @@
 %ifdef       WIN64
 
 WelsCPUId:
-    push     rbx        
-    push     rdx    
- 
+    push     rbx
+    push     rdx
+
     mov      eax,     ecx
     mov      rcx,     [r9]
-    cpuid  
+    cpuid
     mov      [r9],    ecx
     mov      [r8],    ebx
-    mov      rcx,    [rsp + 2*8 + 40]        
+    mov      rcx,    [rsp + 2*8 + 40]
     mov      [rcx],   edx
-    pop      rdx 
+    pop      rdx
     mov      [rdx],   eax
 
     pop      rbx
@@ -103,8 +103,8 @@
     push     rcx
     push     rdx
 
-    mov      eax,     edi 
-    mov      rcx,     [rcx]   
+    mov      eax,     edi
+    mov      rcx,     [rcx]
     cpuid
     mov      [r8],    edx
     pop      rdx
@@ -156,9 +156,9 @@
 %elifdef   UNIX64
         mov eax, edi
         mov ecx, esi
-%else 
+%else
         mov eax, [esp+4]
-        mov ecx, [esp+8]  
+        mov ecx, [esp+8]
 %endif
 
         ; refer to detection of AVX addressed in INTEL AVX manual document
--- a/codec/common/deblock.asm
+++ b/codec/common/deblock.asm
@@ -57,264 +57,264 @@
 
 SECTION .text
 
-%ifdef  WIN64 
+%ifdef  WIN64
 
 
 WELS_EXTERN   DeblockLumaLt4V_sse2
 
 DeblockLumaLt4V_sse2:
-  push        rbp      
+  push        rbp
   mov         r11,[rsp + 16 + 20h]  ; pTC
-  sub         rsp,1B0h                                                       
-  lea         rbp,[rsp+20h]                                                  
-  movd        xmm4,r8d                                                                                                  
-  movd        xmm2,r9d                                                       
-  mov         qword [rbp+180h],r12                                       
-  mov         r10,rcx                                                        
-  movsxd      r12,edx                                                        
-  add         edx,edx                                                        
-  movsxd      rdx,edx                                                        
-  sub         r10,r12                                                        
-  movsx       r8d,byte [r11]                                             
-  pxor        xmm3,xmm3                                                      
-  punpcklwd   xmm2,xmm2                                                      
-  movaps      [rbp+50h],xmm14                                    
-  lea         rax,[r12+r12*2]                                                
-  movdqa      xmm14,[rdx+rcx]                                    
-  neg         rax                                                            
-  pshufd      xmm0,xmm2,0                                                    
-  movd        xmm2,r8d                                                       
-  movsx       edx,byte [r11+1]                                           
-  movsx       r8d,byte [r11+2]                                           
-  movsx       r11d,byte [r11+3]                                          
-  movaps      [rbp+70h],xmm12                                    
-  movd        xmm1,edx                                                       
-  movaps      [rbp+80h],xmm11                                    
-  movd        xmm12,r8d                                                      
-  movd        xmm11,r11d                                                     
-  movdqa      xmm5, [rax+rcx]                                     
-  lea         rax,[r12+r12]                                                  
-  punpcklwd   xmm12,xmm12                                                    
-  neg         rax                                                            
-  punpcklwd   xmm11,xmm11                                                    
-  movaps      [rbp],xmm8                                         
-  movdqa      xmm8, [r10]                                         
-  punpcklwd   xmm2,xmm2                                                      
-  punpcklwd   xmm1,xmm1                                                      
-  punpcklqdq  xmm12,xmm12                                                    
-  punpcklqdq  xmm11,xmm11                                                    
-  punpcklqdq  xmm2,xmm2                                                      
-  punpcklqdq  xmm1,xmm1                                                      
-  shufps      xmm12,xmm11,88h                                                
-  movdqa      xmm11,xmm8                                                     
-  movaps      [rbp+30h],xmm9                                     
-  movdqa      xmm9,[rcx]                                         
-  shufps      xmm2,xmm1,88h                                                  
-  movdqa      xmm1,xmm5                                                      
-  punpcklbw   xmm11,xmm3                                                     
-  movaps      [rbp+20h],xmm6                                     
-  movaps      [rbp+60h],xmm13                                    
-  movdqa      xmm13,xmm11                                                    
-  movaps      [rbp+90h],xmm10                                    
-  movdqa      xmm10,xmm9                                                     
-  movdqa      xmm6,[rax+rcx]                                     
-  punpcklbw   xmm1,xmm3                                                      
-  movaps      [rbp+0A0h],xmm12                                   
-  psubw       xmm13,xmm1                                                     
-  movaps      [rbp+40h],xmm15                                    
-  movdqa      xmm15,xmm14                                                    
-  movaps      [rbp+10h],xmm7                                     
-  movdqa      xmm7,xmm6                                                      
-  punpcklbw   xmm10,xmm3                                                     
-  movdqa      xmm12,[r12+rcx]                                    
-  punpcklbw   xmm7,xmm3                                                      
-  punpcklbw   xmm12,xmm3                                                     
-  punpcklbw   xmm15,xmm3                                                     
-  pabsw       xmm3,xmm13                                                     
-  movdqa      xmm13,xmm10                                                    
-  psubw       xmm13,xmm15                                                    
-  movdqa      [rbp+0F0h],xmm15                                   
-  pabsw       xmm15,xmm13                                                    
-  movdqa      xmm13,xmm11                                                    
-  movdqa      [rbp+0B0h],xmm1                                    
-  movdqa      xmm1,xmm0                                                      
-  pavgw       xmm13,xmm10                                                    
-  pcmpgtw     xmm1,xmm3                                                      
-  movdqa      [rbp+120h],xmm13                                   
-  movaps      xmm13,xmm2                                                     
-  punpcklwd   xmm4,xmm4                                                      
-  movdqa      xmm3,xmm0                                                      
-  movdqa      [rbp+100h],xmm1                                    
-  psubw       xmm13,xmm1                                                     
-  movdqa      xmm1,xmm10                                                     
-  pcmpgtw     xmm3,xmm15                                                     
-  pshufd      xmm4,xmm4,0                                                    
-  psubw       xmm1,xmm11                                                     
-  movdqa      [rbp+0D0h],xmm10                                   
-  psubw       xmm13,xmm3                                                     
-  movdqa      [rbp+110h],xmm3                                    
-  pabsw       xmm15,xmm1                                                     
-  movdqa      xmm3,xmm4                                                      
-  psubw       xmm10,xmm12                                                    
-  pcmpgtw     xmm3,xmm15                                                     
-  pabsw       xmm15,xmm10                                                    
-  movdqa      xmm10,xmm0                                                     
-  psllw       xmm1,2                                                         
-  movdqa      [rbp+0C0h],xmm11                                   
-  psubw       xmm11,xmm7                                                     
-  pcmpgtw     xmm10,xmm15                                                    
-  pabsw       xmm11,xmm11                                                    
-  movdqa      xmm15,xmm0                                                     
-  pand        xmm3,xmm10                                                     
-  pcmpgtw     xmm15,xmm11                                                    
-  movaps      xmm11,xmm2                                                     
-  pxor        xmm10,xmm10                                                    
-  pand        xmm3,xmm15                                                     
-  pcmpgtw     xmm11,xmm10                                                    
-  pcmpeqw     xmm10,xmm2                                                     
-  por         xmm11,xmm10                                                    
-  pand        xmm3,xmm11                                                     
-  movdqa      xmm11,xmm7                                                     
-  psubw       xmm11,xmm12                                                    
-  pxor        xmm15,xmm15                                                    
-  paddw       xmm11,xmm1                                                     
-  psubw       xmm15,xmm13                                                    
-  movdqa      [rbp+0E0h],xmm12                                   
-  paddw       xmm11,[FOUR_16B_SSE2] 
-  pxor        xmm12,xmm12                                                    
-  psraw       xmm11,3                                                        
-  punpckhbw   xmm8,xmm12                                                     
-  pmaxsw      xmm15,xmm11                                                    
-  punpckhbw   xmm5,xmm12                                                     
-  movdqa      xmm11,xmm8                                                     
-  pminsw      xmm13,xmm15                                                    
-  psubw       xmm11,xmm5                                                     
-  punpckhbw   xmm9,xmm12                                                     
-  pand        xmm13,xmm3                                                     
-  movdqa      [rbp+130h],xmm13                                   
-  pabsw       xmm13,xmm11                                                    
-  punpckhbw   xmm14,xmm12                                                    
-  movdqa      xmm11,xmm9                                                     
-  psubw       xmm11,xmm14                                                    
-  movdqa      xmm15,xmm0                                                     
-  movdqa      [rbp+140h],xmm14                                   
-  pabsw       xmm14,xmm11                                                    
-  movdqa      xmm11,xmm8                                                     
-  pcmpgtw     xmm15,xmm14                                                    
-  movdqa      xmm1,[r12+rcx]                                     
-  pavgw       xmm11,xmm9                                                     
-  movdqa      [rbp+170h],xmm11                                   
-  movdqa      xmm10,xmm9                                                     
-  punpckhbw   xmm6,xmm12                                                     
-  psubw       xmm10,xmm8                                                     
-  punpckhbw   xmm1,xmm12                                                     
-  movdqa      xmm12,xmm0                                                     
-  movaps      xmm11,[rbp+0A0h]                                   
-  pcmpgtw     xmm12,xmm13                                                    
-  movaps      xmm13,xmm11                                                    
-  psubw       xmm13,xmm12                                                    
-  movdqa      [rbp+160h],xmm15                                   
-  psubw       xmm13,xmm15                                                    
-  movdqa      xmm15,xmm9                                                     
-  psubw       xmm15,xmm1                                                     
-  movdqa      [rbp+150h],xmm12                                   
-  pabsw       xmm12,xmm10                                                    
-  pabsw       xmm14,xmm15                                                    
-  movdqa      xmm15,xmm8                                                     
-  pcmpgtw     xmm4,xmm12                                                     
-  movdqa      xmm12,xmm0                                                     
-  psubw       xmm15,xmm6                                                     
-  pcmpgtw     xmm12,xmm14                                                    
-  pabsw       xmm14,xmm15                                                    
-  psllw       xmm10,2                                                        
-  pcmpgtw     xmm0,xmm14                                                     
-  movdqa      xmm14,xmm6                                                     
-  psubw       xmm14,xmm1                                                     
-  pand        xmm4,xmm12                                                     
-  paddw       xmm14,xmm10                                                    
-  pand        xmm4,xmm0                                                      
-  paddw       xmm14,[FOUR_16B_SSE2] 
-  pxor        xmm15,xmm15                                                    
-  movaps      xmm12,xmm11                                                    
-  psubw       xmm15,xmm13                                                    
-  pxor        xmm0,xmm0                                                      
-  psraw       xmm14,3                                                        
-  pcmpgtw     xmm12,xmm0                                                     
-  pcmpeqw     xmm0,xmm11                                                     
-  pmaxsw      xmm15,xmm14                                                    
-  por         xmm12,xmm0                                                     
-  movdqa      xmm0,[rbp+120h]                                    
-  pminsw      xmm13,xmm15                                                    
-  movdqa      xmm15,[rbp+0B0h]                                   
-  movdqa      xmm10,xmm7                                                     
-  pand        xmm4,xmm12                                                     
-  paddw       xmm15,xmm0                                                     
-  pxor        xmm12,xmm12                                                    
-  paddw       xmm10,xmm7                                                     
-  movdqa      xmm14,xmm12                                                    
-  psubw       xmm15,xmm10                                                    
-  psubw       xmm14,xmm2                                                     
-  psraw       xmm15,1                                                        
-  pmaxsw      xmm15,xmm14                                                    
-  movdqa      xmm10,xmm6                                                     
-  pminsw      xmm15,xmm2                                                     
-  paddw       xmm10,xmm6                                                     
-  pand        xmm15,xmm3                                                     
-  psubw       xmm12,xmm11                                                    
-  pand        xmm15,[rbp+100h]                                   
-  pand        xmm13,xmm4                                                     
-  paddw       xmm7,xmm15                                                     
-  paddw       xmm8,xmm13                                                     
-  movdqa      xmm15,[rbp+170h]                                   
-  psubw       xmm9,xmm13                                                     
-  paddw       xmm5,xmm15                                                     
-  psubw       xmm5,xmm10                                                     
-  psraw       xmm5,1                                                         
-  pmaxsw      xmm5,xmm12                                                     
-  pminsw      xmm5,xmm11                                                     
-  pand        xmm5,xmm4                                                      
-  pand        xmm5,[rbp+150h]                                    
-  paddw       xmm6,xmm5                                                      
-  movdqa      xmm5,[rbp+0C0h]                                    
-  packuswb    xmm7,xmm6                                                      
-  movdqa      xmm6,[rbp+130h]                                    
-  paddw       xmm5,xmm6                                                      
-  packuswb    xmm5,xmm8                                                      
-  movdqa      xmm8,[rbp+0D0h]                                    
-  psubw       xmm8,xmm6                                                      
-  movdqa      xmm6,[rbp+0F0h]                                    
-  paddw       xmm6,xmm0                                                      
-  movdqa      xmm0,[rbp+0E0h]                                    
-  packuswb    xmm8,xmm9                                                      
-  movdqa      xmm9,xmm0                                                      
-  paddw       xmm9,xmm0                                                      
-  psubw       xmm6,xmm9                                                      
-  psraw       xmm6,1                                                         
-  pmaxsw      xmm14,xmm6                                                     
-  pminsw      xmm2,xmm14                                                     
-  pand        xmm2,xmm3                                                      
-  pand        xmm2,[rbp+110h]                                    
-  paddw       xmm0,xmm2                                                      
-  movdqa      xmm2,[rbp+140h]                                    
-  paddw       xmm2,xmm15                                                     
-  movdqa      xmm15,xmm1                                                     
-  paddw       xmm15,xmm1                                                     
-  psubw       xmm2,xmm15                                                     
-  psraw       xmm2,1                                                         
-  pmaxsw      xmm12,xmm2                                                     
-  pminsw      xmm11,xmm12                                                    
-  pand        xmm11,xmm4                                                     
-  pand        xmm11,[rbp+160h]                                   
-  paddw       xmm1,xmm11                                                     
-  movdqa      [rax+rcx],xmm7                                     
-  movdqa      [r10],xmm5                                         
-  packuswb    xmm0,xmm1                                                      
-  movdqa      [rcx],xmm8                                         
-  movdqa      [r12+rcx],xmm0                                                                        
-  mov         r12,qword [rbp+180h]                                       
-  lea         rsp,[rbp+190h]                                                 
-  pop         rbp                                                            
-  ret                                                                        
+  sub         rsp,1B0h
+  lea         rbp,[rsp+20h]
+  movd        xmm4,r8d
+  movd        xmm2,r9d
+  mov         qword [rbp+180h],r12
+  mov         r10,rcx
+  movsxd      r12,edx
+  add         edx,edx
+  movsxd      rdx,edx
+  sub         r10,r12
+  movsx       r8d,byte [r11]
+  pxor        xmm3,xmm3
+  punpcklwd   xmm2,xmm2
+  movaps      [rbp+50h],xmm14
+  lea         rax,[r12+r12*2]
+  movdqa      xmm14,[rdx+rcx]
+  neg         rax
+  pshufd      xmm0,xmm2,0
+  movd        xmm2,r8d
+  movsx       edx,byte [r11+1]
+  movsx       r8d,byte [r11+2]
+  movsx       r11d,byte [r11+3]
+  movaps      [rbp+70h],xmm12
+  movd        xmm1,edx
+  movaps      [rbp+80h],xmm11
+  movd        xmm12,r8d
+  movd        xmm11,r11d
+  movdqa      xmm5, [rax+rcx]
+  lea         rax,[r12+r12]
+  punpcklwd   xmm12,xmm12
+  neg         rax
+  punpcklwd   xmm11,xmm11
+  movaps      [rbp],xmm8
+  movdqa      xmm8, [r10]
+  punpcklwd   xmm2,xmm2
+  punpcklwd   xmm1,xmm1
+  punpcklqdq  xmm12,xmm12
+  punpcklqdq  xmm11,xmm11
+  punpcklqdq  xmm2,xmm2
+  punpcklqdq  xmm1,xmm1
+  shufps      xmm12,xmm11,88h
+  movdqa      xmm11,xmm8
+  movaps      [rbp+30h],xmm9
+  movdqa      xmm9,[rcx]
+  shufps      xmm2,xmm1,88h
+  movdqa      xmm1,xmm5
+  punpcklbw   xmm11,xmm3
+  movaps      [rbp+20h],xmm6
+  movaps      [rbp+60h],xmm13
+  movdqa      xmm13,xmm11
+  movaps      [rbp+90h],xmm10
+  movdqa      xmm10,xmm9
+  movdqa      xmm6,[rax+rcx]
+  punpcklbw   xmm1,xmm3
+  movaps      [rbp+0A0h],xmm12
+  psubw       xmm13,xmm1
+  movaps      [rbp+40h],xmm15
+  movdqa      xmm15,xmm14
+  movaps      [rbp+10h],xmm7
+  movdqa      xmm7,xmm6
+  punpcklbw   xmm10,xmm3
+  movdqa      xmm12,[r12+rcx]
+  punpcklbw   xmm7,xmm3
+  punpcklbw   xmm12,xmm3
+  punpcklbw   xmm15,xmm3
+  pabsw       xmm3,xmm13
+  movdqa      xmm13,xmm10
+  psubw       xmm13,xmm15
+  movdqa      [rbp+0F0h],xmm15
+  pabsw       xmm15,xmm13
+  movdqa      xmm13,xmm11
+  movdqa      [rbp+0B0h],xmm1
+  movdqa      xmm1,xmm0
+  pavgw       xmm13,xmm10
+  pcmpgtw     xmm1,xmm3
+  movdqa      [rbp+120h],xmm13
+  movaps      xmm13,xmm2
+  punpcklwd   xmm4,xmm4
+  movdqa      xmm3,xmm0
+  movdqa      [rbp+100h],xmm1
+  psubw       xmm13,xmm1
+  movdqa      xmm1,xmm10
+  pcmpgtw     xmm3,xmm15
+  pshufd      xmm4,xmm4,0
+  psubw       xmm1,xmm11
+  movdqa      [rbp+0D0h],xmm10
+  psubw       xmm13,xmm3
+  movdqa      [rbp+110h],xmm3
+  pabsw       xmm15,xmm1
+  movdqa      xmm3,xmm4
+  psubw       xmm10,xmm12
+  pcmpgtw     xmm3,xmm15
+  pabsw       xmm15,xmm10
+  movdqa      xmm10,xmm0
+  psllw       xmm1,2
+  movdqa      [rbp+0C0h],xmm11
+  psubw       xmm11,xmm7
+  pcmpgtw     xmm10,xmm15
+  pabsw       xmm11,xmm11
+  movdqa      xmm15,xmm0
+  pand        xmm3,xmm10
+  pcmpgtw     xmm15,xmm11
+  movaps      xmm11,xmm2
+  pxor        xmm10,xmm10
+  pand        xmm3,xmm15
+  pcmpgtw     xmm11,xmm10
+  pcmpeqw     xmm10,xmm2
+  por         xmm11,xmm10
+  pand        xmm3,xmm11
+  movdqa      xmm11,xmm7
+  psubw       xmm11,xmm12
+  pxor        xmm15,xmm15
+  paddw       xmm11,xmm1
+  psubw       xmm15,xmm13
+  movdqa      [rbp+0E0h],xmm12
+  paddw       xmm11,[FOUR_16B_SSE2]
+  pxor        xmm12,xmm12
+  psraw       xmm11,3
+  punpckhbw   xmm8,xmm12
+  pmaxsw      xmm15,xmm11
+  punpckhbw   xmm5,xmm12
+  movdqa      xmm11,xmm8
+  pminsw      xmm13,xmm15
+  psubw       xmm11,xmm5
+  punpckhbw   xmm9,xmm12
+  pand        xmm13,xmm3
+  movdqa      [rbp+130h],xmm13
+  pabsw       xmm13,xmm11
+  punpckhbw   xmm14,xmm12
+  movdqa      xmm11,xmm9
+  psubw       xmm11,xmm14
+  movdqa      xmm15,xmm0
+  movdqa      [rbp+140h],xmm14
+  pabsw       xmm14,xmm11
+  movdqa      xmm11,xmm8
+  pcmpgtw     xmm15,xmm14
+  movdqa      xmm1,[r12+rcx]
+  pavgw       xmm11,xmm9
+  movdqa      [rbp+170h],xmm11
+  movdqa      xmm10,xmm9
+  punpckhbw   xmm6,xmm12
+  psubw       xmm10,xmm8
+  punpckhbw   xmm1,xmm12
+  movdqa      xmm12,xmm0
+  movaps      xmm11,[rbp+0A0h]
+  pcmpgtw     xmm12,xmm13
+  movaps      xmm13,xmm11
+  psubw       xmm13,xmm12
+  movdqa      [rbp+160h],xmm15
+  psubw       xmm13,xmm15
+  movdqa      xmm15,xmm9
+  psubw       xmm15,xmm1
+  movdqa      [rbp+150h],xmm12
+  pabsw       xmm12,xmm10
+  pabsw       xmm14,xmm15
+  movdqa      xmm15,xmm8
+  pcmpgtw     xmm4,xmm12
+  movdqa      xmm12,xmm0
+  psubw       xmm15,xmm6
+  pcmpgtw     xmm12,xmm14
+  pabsw       xmm14,xmm15
+  psllw       xmm10,2
+  pcmpgtw     xmm0,xmm14
+  movdqa      xmm14,xmm6
+  psubw       xmm14,xmm1
+  pand        xmm4,xmm12
+  paddw       xmm14,xmm10
+  pand        xmm4,xmm0
+  paddw       xmm14,[FOUR_16B_SSE2]
+  pxor        xmm15,xmm15
+  movaps      xmm12,xmm11
+  psubw       xmm15,xmm13
+  pxor        xmm0,xmm0
+  psraw       xmm14,3
+  pcmpgtw     xmm12,xmm0
+  pcmpeqw     xmm0,xmm11
+  pmaxsw      xmm15,xmm14
+  por         xmm12,xmm0
+  movdqa      xmm0,[rbp+120h]
+  pminsw      xmm13,xmm15
+  movdqa      xmm15,[rbp+0B0h]
+  movdqa      xmm10,xmm7
+  pand        xmm4,xmm12
+  paddw       xmm15,xmm0
+  pxor        xmm12,xmm12
+  paddw       xmm10,xmm7
+  movdqa      xmm14,xmm12
+  psubw       xmm15,xmm10
+  psubw       xmm14,xmm2
+  psraw       xmm15,1
+  pmaxsw      xmm15,xmm14
+  movdqa      xmm10,xmm6
+  pminsw      xmm15,xmm2
+  paddw       xmm10,xmm6
+  pand        xmm15,xmm3
+  psubw       xmm12,xmm11
+  pand        xmm15,[rbp+100h]
+  pand        xmm13,xmm4
+  paddw       xmm7,xmm15
+  paddw       xmm8,xmm13
+  movdqa      xmm15,[rbp+170h]
+  psubw       xmm9,xmm13
+  paddw       xmm5,xmm15
+  psubw       xmm5,xmm10
+  psraw       xmm5,1
+  pmaxsw      xmm5,xmm12
+  pminsw      xmm5,xmm11
+  pand        xmm5,xmm4
+  pand        xmm5,[rbp+150h]
+  paddw       xmm6,xmm5
+  movdqa      xmm5,[rbp+0C0h]
+  packuswb    xmm7,xmm6
+  movdqa      xmm6,[rbp+130h]
+  paddw       xmm5,xmm6
+  packuswb    xmm5,xmm8
+  movdqa      xmm8,[rbp+0D0h]
+  psubw       xmm8,xmm6
+  movdqa      xmm6,[rbp+0F0h]
+  paddw       xmm6,xmm0
+  movdqa      xmm0,[rbp+0E0h]
+  packuswb    xmm8,xmm9
+  movdqa      xmm9,xmm0
+  paddw       xmm9,xmm0
+  psubw       xmm6,xmm9
+  psraw       xmm6,1
+  pmaxsw      xmm14,xmm6
+  pminsw      xmm2,xmm14
+  pand        xmm2,xmm3
+  pand        xmm2,[rbp+110h]
+  paddw       xmm0,xmm2
+  movdqa      xmm2,[rbp+140h]
+  paddw       xmm2,xmm15
+  movdqa      xmm15,xmm1
+  paddw       xmm15,xmm1
+  psubw       xmm2,xmm15
+  psraw       xmm2,1
+  pmaxsw      xmm12,xmm2
+  pminsw      xmm11,xmm12
+  pand        xmm11,xmm4
+  pand        xmm11,[rbp+160h]
+  paddw       xmm1,xmm11
+  movdqa      [rax+rcx],xmm7
+  movdqa      [r10],xmm5
+  packuswb    xmm0,xmm1
+  movdqa      [rcx],xmm8
+  movdqa      [r12+rcx],xmm0
+  mov         r12,qword [rbp+180h]
+  lea         rsp,[rbp+190h]
+  pop         rbp
+  ret
 
 
 WELS_EXTERN   DeblockLumaEq4V_sse2
@@ -321,462 +321,462 @@
 
 ALIGN  16
 DeblockLumaEq4V_sse2:
-  mov         rax,rsp 
-  push        rbx  
-  push        rbp  
-  push        rsi  
-  push        rdi  
-  sub         rsp,1D8h 
-  movaps      [rax-38h],xmm6 
-  movaps      [rax-48h],xmm7 
-  movaps      [rax-58h],xmm8 
-  pxor        xmm1,xmm1 
-  movsxd      r10,edx 
-  mov         rbp,rcx 
-  mov         r11d,r8d 
-  mov         rdx,rcx 
-  mov         rdi,rbp 
-  mov         rbx,rbp 
-  movdqa      xmm5,[rbp] 
-  movaps      [rax-68h],xmm9 
-  movaps      [rax-78h],xmm10 
-  punpcklbw   xmm5,xmm1 
-  movaps      [rax-88h],xmm11 
-  movaps      [rax-98h],xmm12 
-  movaps      [rax-0A8h],xmm13 
-  movaps      [rax-0B8h],xmm14 
-  movdqa      xmm14,[r10+rbp] 
-  movaps      [rax-0C8h],xmm15 
-  lea         eax,[r10*4] 
-  movsxd      r8,eax 
-  lea         eax,[r10+r10*2] 
-  movsxd      rcx,eax 
-  lea         eax,[r10+r10] 
-  sub         rdx,r8 
-  punpcklbw   xmm14,xmm1 
-  movdqa      [rsp+90h],xmm5 
-  movdqa      [rsp+30h],xmm14 
-  movsxd      rsi,eax 
-  movsx       eax,r11w 
-  sub         rdi,rcx 
-  sub         rbx,rsi 
-  mov         r8,rbp 
-  sub         r8,r10 
-  movd        xmm0,eax 
-  movsx       eax,r9w 
-  movdqa      xmm12,[rdi] 
-  movdqa      xmm6, [rsi+rbp] 
-  movdqa      xmm13,[rbx] 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm11,xmm0,0 
-  punpcklbw   xmm13,xmm1 
-  punpcklbw   xmm6,xmm1 
-  movdqa      xmm8,[r8] 
-  movd        xmm0,eax 
-  movdqa      xmm10,xmm11 
-  mov         eax,2 
-  punpcklbw   xmm8,xmm1 
-  punpcklbw   xmm12,xmm1 
-  cwde             
-  punpcklwd   xmm0,xmm0 
-  psraw       xmm10,2 
-  movdqa      xmm1,xmm8 
-  movdqa      [rsp+0F0h],xmm13 
-  movdqa      [rsp+0B0h],xmm8 
-  pshufd      xmm7,xmm0,0 
-  psubw       xmm1,xmm13 
-  movdqa      xmm0,xmm5 
-  movdqa      xmm4,xmm7 
-  movdqa      xmm2,xmm7 
-  psubw       xmm0,xmm8 
-  pabsw       xmm3,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,xmm5 
-  movdqa      [rsp+40h],xmm7 
-  movdqa      [rsp+60h],xmm6 
-  pcmpgtw     xmm4,xmm0 
-  psubw       xmm1,xmm14 
-  pabsw       xmm0,xmm1 
-  pcmpgtw     xmm2,xmm0 
-  pand        xmm4,xmm2 
-  movdqa      xmm0,xmm11 
-  pcmpgtw     xmm0,xmm3 
-  pand        xmm4,xmm0 
-  movd        xmm0,eax 
-  movdqa      [rsp+20h],xmm4 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm2,xmm0,0 
-  paddw       xmm10,xmm2 
-  movdqa      [rsp+0A0h],xmm2 
-  movdqa      xmm15,xmm7 
-  pxor        xmm4,xmm4 
-  movdqa      xmm0,xmm8 
-  psubw       xmm0,xmm12 
-  mov         eax,4 
-  pabsw       xmm0,xmm0 
-  movdqa      xmm1,xmm10 
-  cwde             
-  pcmpgtw     xmm15,xmm0 
-  pcmpgtw     xmm1,xmm3 
-  movdqa      xmm3,xmm7 
-  movdqa      xmm7,[rdx] 
-  movdqa      xmm0,xmm5 
-  psubw       xmm0,xmm6 
-  pand        xmm15,xmm1 
-  punpcklbw   xmm7,xmm4 
-  movdqa      xmm9,xmm15 
-  pabsw       xmm0,xmm0 
-  psllw       xmm7,1 
-  pandn       xmm9,xmm12 
-  pcmpgtw     xmm3,xmm0 
-  paddw       xmm7,xmm12 
-  movd        xmm0,eax 
-  pand        xmm3,xmm1 
-  paddw       xmm7,xmm12 
-  punpcklwd   xmm0,xmm0 
-  paddw       xmm7,xmm12 
-  pshufd      xmm1,xmm0,0 
-  paddw       xmm7,xmm13 
-  movdqa      xmm0,xmm3 
-  pandn       xmm0,xmm6 
-  paddw       xmm7,xmm8 
-  movdqa      [rsp+70h],xmm1 
-  paddw       xmm7,xmm5 
-  movdqa      [rsp+120h],xmm0 
-  movdqa      xmm0,[rcx+rbp] 
-  punpcklbw   xmm0,xmm4 
-  paddw       xmm7,xmm1 
-  movdqa      xmm4,xmm15 
-  psllw       xmm0,1 
-  psraw       xmm7,3 
-  paddw       xmm0,xmm6 
-  pand        xmm7,xmm15 
-  paddw       xmm0,xmm6 
-  paddw       xmm0,xmm6 
-  paddw       xmm0,xmm14 
-  movdqa      xmm6,xmm15 
-  paddw       xmm0,xmm5 
-  pandn       xmm6,xmm13 
-  paddw       xmm0,xmm8 
-  paddw       xmm0,xmm1 
-  psraw       xmm0,3 
-  movdqa      xmm1,xmm12 
-  paddw       xmm1,xmm13 
-  pand        xmm0,xmm3 
-  movdqa      [rsp+100h],xmm0 
-  movdqa      xmm0,xmm8 
-  paddw       xmm0,xmm5 
-  paddw       xmm1,xmm0 
-  movdqa      xmm0,xmm3 
-  paddw       xmm1,xmm2 
-  psraw       xmm1,2 
-  pandn       xmm0,xmm14 
-  pand        xmm4,xmm1 
-  movdqa      [rsp+0E0h],xmm0 
-  movdqa      xmm0,xmm5 
-  paddw       xmm0,xmm8 
-  movdqa      xmm1,[rsp+60h] 
-  paddw       xmm1,xmm14 
-  movdqa      xmm14,xmm3 
-  paddw       xmm1,xmm0 
-  movdqa      xmm0,xmm8 
-  paddw       xmm0,[rsp+30h] 
-  paddw       xmm1,xmm2 
-  psraw       xmm1,2 
-  pand        xmm14,xmm1 
-  movdqa      xmm1,xmm13 
-  paddw       xmm1,xmm13 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm2 
-  psraw       xmm1,2 
-  movdqa      xmm0,[rsp+30h] 
-  movdqa      xmm2,xmm13 
-  movdqa      xmm5,xmm15 
-  paddw       xmm0,[rsp+70h] 
-  pandn       xmm5,xmm1 
-  paddw       xmm2,xmm8 
-  movdqa      xmm8,[rsp+90h] 
-  movdqa      xmm1,xmm12 
-  paddw       xmm2,xmm8 
-  psllw       xmm2,1 
-  paddw       xmm2,xmm0 
-  paddw       xmm1,xmm2 
-  movdqa      xmm0,xmm8 
-  movdqa      xmm8,xmm3 
-  movdqa      xmm2,[rsp+30h] 
-  paddw       xmm0,xmm13 
-  psraw       xmm1,3 
-  pand        xmm15,xmm1 
-  movdqa      xmm1,xmm2 
-  paddw       xmm1,xmm2 
-  paddw       xmm2,[rsp+90h] 
-  paddw       xmm2,[rsp+0B0h] 
-  paddw       xmm1,xmm0 
-  movdqa      xmm0,xmm13 
-  movdqa      xmm13,[r8] 
-  paddw       xmm0, [rsp+70h] 
-  paddw       xmm1, [rsp+0A0h] 
-  psllw       xmm2,1 
-  paddw       xmm2,xmm0 
-  psraw       xmm1,2 
-  movdqa      xmm0, [rdi] 
-  pandn       xmm8,xmm1 
-  movdqa      xmm1, [rsp+60h] 
-  paddw       xmm1,xmm2 
-  movdqa      xmm2, [rbx] 
-  psraw       xmm1,3 
-  pand        xmm3,xmm1 
-  movdqa      xmm1, [rbp] 
-  movdqa      [rsp+0D0h],xmm3 
-  pxor        xmm3,xmm3 
-  punpckhbw   xmm0,xmm3 
-  punpckhbw   xmm1,xmm3 
-  punpckhbw   xmm13,xmm3 
-  movdqa      [rsp+0C0h],xmm0 
-  movdqa      xmm0,[r10+rbp] 
-  movdqa      [rsp],xmm1 
-  punpckhbw   xmm0,xmm3 
-  punpckhbw   xmm2,xmm3 
-  movdqa      [rsp+80h],xmm0 
-  movdqa      xmm0,[rsi+rbp] 
-  movdqa      [rsp+10h],xmm13 
-  punpckhbw   xmm0,xmm3 
-  movdqa      [rsp+50h],xmm0 
-  movdqa      xmm0,xmm1 
-  movdqa      xmm1,xmm13 
-  psubw       xmm0,xmm13 
-  psubw       xmm1,xmm2 
-  pabsw       xmm3,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,[rsp] 
-  movdqa      xmm13,[rsp+40h] 
-  movdqa      [rsp+110h],xmm2 
-  psubw       xmm1, [rsp+80h] 
-  pcmpgtw     xmm13,xmm0 
-  pcmpgtw     xmm11,xmm3 
-  pabsw       xmm0,xmm1 
-  pcmpgtw     xmm10,xmm3 
-  movdqa      xmm1, [rsp+40h] 
-  movdqa      xmm2,xmm1 
-  movdqa      xmm3,xmm1 
-  pcmpgtw     xmm2,xmm0 
-  movdqa      xmm0, [rsp+10h] 
-  pand        xmm13,xmm2 
-  pand        xmm13,xmm11 
-  movdqa      xmm11,[rsp+0C0h] 
-  psubw       xmm0,xmm11 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm3,xmm0 
-  pand        xmm3,xmm10 
-  movdqa      xmm0,[rsp] 
-  psubw       xmm0,[rsp+50h] 
-  movdqa      xmm2,[rdx] 
-  pabsw       xmm0,xmm0 
-  por         xmm7,xmm9 
-  movdqa      xmm9,[rsp+20h] 
-  pcmpgtw     xmm1,xmm0 
-  pand        xmm9,xmm7 
-  movdqa      xmm7,[rsp+20h] 
-  movdqa      xmm0,xmm7 
-  pandn       xmm0,xmm12 
-  movdqa      xmm12,[rsp+110h] 
-  pand        xmm1,xmm10 
-  movdqa      xmm10,[rsp+70h] 
-  movdqa      [rsp+40h],xmm1 
-  movdqa      xmm1,xmm13 
-  por         xmm9,xmm0 
-  pxor        xmm0,xmm0 
-  por         xmm4,xmm6 
-  movdqa      xmm6,xmm7 
-  punpckhbw   xmm2,xmm0 
-  por         xmm15,xmm5 
-  movdqa      xmm5,[rsp+20h] 
-  movdqa      xmm0,xmm3 
-  psllw       xmm2,1 
-  pandn       xmm0,xmm11 
-  pand        xmm6,xmm4 
-  movdqa      xmm4,[rsp] 
-  paddw       xmm2,xmm11 
-  pand        xmm5,xmm15 
-  movdqa      xmm15,[rsp+20h] 
-  paddw       xmm2,xmm11 
-  paddw       xmm2,xmm11 
-  paddw       xmm2,xmm12 
-  paddw       xmm2,[rsp+10h] 
-  paddw       xmm2,[rsp] 
-  paddw       xmm2,xmm10 
-  psraw       xmm2,3 
-  pand        xmm2,xmm3 
-  por         xmm2,xmm0 
-  pand        xmm1,xmm2 
-  movdqa      xmm0,xmm13 
-  movdqa      xmm2,xmm11 
-  pandn       xmm0,xmm11 
-  paddw       xmm2,xmm12 
-  por         xmm1,xmm0 
-  packuswb    xmm9,xmm1 
-  movdqa      xmm0,xmm7 
-  movdqa      xmm7,[rsp+0A0h] 
-  pandn       xmm0,[rsp+0F0h] 
-  movdqa      xmm1,xmm3 
-  por         xmm6,xmm0 
-  movdqa      xmm0,[rsp+10h] 
-  paddw       xmm0,xmm4 
-  paddw       xmm2,xmm0 
-  paddw       xmm2,xmm7 
-  movdqa      xmm0,xmm3 
-  pandn       xmm0,xmm12 
-  psraw       xmm2,2 
-  pand        xmm1,xmm2 
-  por         xmm1,xmm0 
-  movdqa      xmm2,xmm13 
-  movdqa      xmm0,xmm13 
-  pand        xmm2,xmm1 
-  pandn       xmm0,xmm12 
-  movdqa      xmm1,xmm12 
-  paddw       xmm1,[rsp+10h] 
-  por         xmm2,xmm0 
-  movdqa      xmm0,xmm15 
-  pandn       xmm0,[rsp+0B0h] 
-  paddw       xmm1,xmm4 
-  packuswb    xmm6,xmm2 
-  movdqa      xmm2,xmm3 
-  psllw       xmm1,1 
-  por         xmm5,xmm0 
-  movdqa      xmm0,[rsp+80h] 
-  paddw       xmm0,xmm10 
-  paddw       xmm1,xmm0 
-  paddw       xmm11,xmm1 
-  psraw       xmm11,3 
-  movdqa      xmm1,xmm12 
-  pand        xmm2,xmm11 
-  paddw       xmm1,xmm12 
-  movdqa      xmm11,[rsp+80h] 
-  movdqa      xmm0, [rsp+10h] 
-  por         xmm14,[rsp+0E0h] 
-  paddw       xmm0,xmm11 
-  movdqa      xmm4,xmm15 
-  paddw       xmm1,xmm0 
-  movdqa      xmm0,xmm13 
-  paddw       xmm1,xmm7 
-  psraw       xmm1,2 
-  pandn       xmm3,xmm1 
-  por         xmm2,xmm3 
-  movdqa      xmm1,xmm13 
-  movdqa      xmm3,[rsp+10h] 
-  pandn       xmm0,xmm3 
-  pand        xmm1,xmm2 
-  movdqa      xmm2,xmm11 
-  paddw       xmm2,[rsp] 
-  por         xmm1,xmm0 
-  movdqa      xmm0,[rsp+0D0h] 
-  por         xmm0,xmm8 
-  paddw       xmm2,xmm3 
-  packuswb    xmm5,xmm1 
-  movdqa      xmm8,[rsp+40h] 
-  movdqa      xmm1,[rsp+50h] 
-  movdqa      xmm3,xmm8 
-  pand        xmm4,xmm0 
-  psllw       xmm2,1 
-  movdqa      xmm0,xmm15 
-  pandn       xmm0,[rsp+90h] 
-  por         xmm4,xmm0 
-  movdqa      xmm0,xmm12 
-  paddw       xmm0,xmm10 
-  paddw       xmm2,xmm0 
-  paddw       xmm1,xmm2 
-  movdqa      xmm0,[rsp] 
-  movdqa      xmm2,xmm11 
-  paddw       xmm0,xmm12 
-  movdqa      xmm12,[rsp] 
-  paddw       xmm2,xmm11 
-  paddw       xmm2,xmm0 
-  psraw       xmm1,3 
-  movdqa      xmm0,xmm8 
-  pand        xmm3,xmm1 
-  paddw       xmm2,xmm7 
-  movdqa      xmm1,xmm13 
-  psraw       xmm2,2 
-  pandn       xmm0,xmm2 
-  por         xmm3,xmm0 
-  movdqa      xmm2,[rsp+50h] 
-  movdqa      xmm0,xmm13 
-  pandn       xmm0,xmm12 
-  pand        xmm1,xmm3 
-  paddw       xmm2,xmm11 
-  movdqa      xmm3,xmm15 
-  por         xmm1,xmm0 
-  pand        xmm3,xmm14 
-  movdqa      xmm14,[rsp+10h] 
-  movdqa      xmm0,xmm15 
-  pandn       xmm0,[rsp+30h] 
-  packuswb    xmm4,xmm1 
-  movdqa      xmm1,xmm8 
-  por         xmm3,xmm0 
-  movdqa      xmm0,xmm12 
-  paddw       xmm0,xmm14 
-  paddw       xmm2,xmm0 
-  paddw       xmm2,xmm7 
-  movdqa      xmm0,xmm8 
-  pandn       xmm0,xmm11 
-  psraw       xmm2,2 
-  pand        xmm1,xmm2 
-  por         xmm1,xmm0 
-  movdqa      xmm2,xmm13 
-  movdqa      xmm0,xmm13 
-  pandn       xmm0,xmm11 
-  pand        xmm2,xmm1 
-  movdqa      xmm1,xmm15 
-  por         xmm2,xmm0 
-  packuswb    xmm3,xmm2 
-  movdqa      xmm0,[rsp+100h] 
-  por         xmm0,[rsp+120h] 
-  pand        xmm1,xmm0 
-  movdqa      xmm2,[rcx+rbp] 
-  movdqa      xmm7,[rsp+50h] 
-  pandn       xmm15,[rsp+60h] 
-  lea         r11,[rsp+1D8h] 
-  pxor        xmm0,xmm0 
-  por         xmm1,xmm15 
-  movaps      xmm15,[r11-0A8h] 
-  movdqa      [rdi],xmm9 
-  movaps      xmm9,[r11-48h] 
-  punpckhbw   xmm2,xmm0 
-  psllw       xmm2,1 
-  paddw       xmm2,xmm7 
-  paddw       xmm2,xmm7 
-  movdqa      [rbx],xmm6 
-  movaps      xmm6,[r11-18h] 
-  paddw       xmm2,xmm7 
-  paddw       xmm2,xmm11 
-  movaps      xmm11,[r11-68h] 
-  paddw       xmm2,xmm12 
-  movaps      xmm12,[r11-78h] 
-  paddw       xmm2,xmm14 
-  paddw       xmm2,xmm10 
-  psraw       xmm2,3 
-  movaps      xmm10,[r11-58h] 
-  movaps      xmm14,[r11-98h] 
-  movdqa      xmm0,xmm13 
-  pand        xmm2,xmm8 
-  pandn       xmm8,xmm7 
-  pandn       xmm13,xmm7 
-  por         xmm2,xmm8 
-  movaps      xmm7,[r11-28h] 
-  movaps      xmm8,[r11-38h] 
-  movdqa      [r8],xmm5 
-  pand        xmm0,xmm2 
-  por         xmm0,xmm13 
-  packuswb    xmm1,xmm0 
-  movaps      xmm13,[r11-88h] 
-  movdqa      [rbp],xmm4 
-  movdqa      [r10+rbp],xmm3 
-  movdqa      [rsi+rbp],xmm1 
-  mov         rsp,r11 
-  pop         rdi  
-  pop         rsi  
-  pop         rbp  
-  pop         rbx  
+  mov         rax,rsp  ; rax = rsp at entry; base address for the xmm save slots below
+  push        rbx
+  push        rbp
+  push        rsi
+  push        rdi
+  sub         rsp,1D8h  ; scratch vectors live at [rsp]..[rsp+12Fh]; the xmm save slots sit above them
+  movaps      [rax-38h],xmm6  ; xmm6-xmm15 are spilled here and reloaded before returning
+  movaps      [rax-48h],xmm7
+  movaps      [rax-58h],xmm8
+  pxor        xmm1,xmm1  ; zero register used to widen bytes to words
+  movsxd      r10,edx  ; r10 = 2nd argument, sign-extended; used as the row stride
+  mov         rbp,rcx  ; rbp = 1st argument = address of row 0
+  mov         r11d,r8d  ; keep the 3rd argument; r8 is overwritten with a pointer below
+  mov         rdx,rcx
+  mov         rdi,rbp
+  mov         rbx,rbp
+  movdqa      xmm5,[rbp]  ; row 0 (movdqa: the address must be 16-byte aligned)
+  movaps      [rax-68h],xmm9
+  movaps      [rax-78h],xmm10
+  punpcklbw   xmm5,xmm1  ; low 8 bytes of row 0 as words
+  movaps      [rax-88h],xmm11
+  movaps      [rax-98h],xmm12
+  movaps      [rax-0A8h],xmm13
+  movaps      [rax-0B8h],xmm14
+  movdqa      xmm14,[r10+rbp]  ; row +1
+  movaps      [rax-0C8h],xmm15
+  lea         eax,[r10*4]  ; 4*stride, computed in 32 bits
+  movsxd      r8,eax
+  lea         eax,[r10+r10*2]  ; 3*stride
+  movsxd      rcx,eax  ; rcx = 3*stride
+  lea         eax,[r10+r10]  ; 2*stride
+  sub         rdx,r8  ; rdx = address of row -4
+  punpcklbw   xmm14,xmm1
+  movdqa      [rsp+90h],xmm5  ; [rsp+90h] = row 0, low-half words
+  movdqa      [rsp+30h],xmm14  ; [rsp+30h] = row +1, low-half words
+  movsxd      rsi,eax  ; rsi = 2*stride
+  movsx       eax,r11w  ; low word of the 3rd argument
+  sub         rdi,rcx  ; rdi = address of row -3
+  sub         rbx,rsi  ; rbx = address of row -2
+  mov         r8,rbp
+  sub         r8,r10  ; r8 = address of row -1
+  movd        xmm0,eax
+  movsx       eax,r9w  ; low word of the 4th argument
+  movdqa      xmm12,[rdi]
+  movdqa      xmm6, [rsi+rbp]  ; row +2
+  movdqa      xmm13,[rbx]
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm11,xmm0,0  ; xmm11 = 3rd argument replicated into all 8 words
+  punpcklbw   xmm13,xmm1
+  punpcklbw   xmm6,xmm1
+  movdqa      xmm8,[r8]
+  movd        xmm0,eax
+  movdqa      xmm10,xmm11
+  mov         eax,2
+  punpcklbw   xmm8,xmm1
+  punpcklbw   xmm12,xmm1
+  cwde  ; sign-extends ax into eax; eax is 2 so its value does not change
+  punpcklwd   xmm0,xmm0
+  psraw       xmm10,2
+  movdqa      xmm1,xmm8
+  movdqa      [rsp+0F0h],xmm13  ; [rsp+0F0h] = row -2, low-half words
+  movdqa      [rsp+0B0h],xmm8  ; [rsp+0B0h] = row -1, low-half words
+  pshufd      xmm7,xmm0,0  ; xmm7 = 4th argument replicated into all 8 words
+  psubw       xmm1,xmm13
+  movdqa      xmm0,xmm5
+  movdqa      xmm4,xmm7
+  movdqa      xmm2,xmm7
+  psubw       xmm0,xmm8
+  pabsw       xmm3,xmm0  ; xmm3 = |row 0 - row -1|
+  pabsw       xmm0,xmm1  ; xmm0 = |row -1 - row -2|
+  movdqa      xmm1,xmm5
+  movdqa      [rsp+40h],xmm7
+  movdqa      [rsp+60h],xmm6  ; [rsp+60h] = row +2, low-half words
+  pcmpgtw     xmm4,xmm0
+  psubw       xmm1,xmm14
+  pabsw       xmm0,xmm1  ; xmm0 = |row 0 - row +1|
+  pcmpgtw     xmm2,xmm0
+  pand        xmm4,xmm2
+  movdqa      xmm0,xmm11
+  pcmpgtw     xmm0,xmm3
+  pand        xmm4,xmm0
+  movd        xmm0,eax
+  movdqa      [rsp+20h],xmm4  ; low-half lane mask: all three difference tests passed
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm2,xmm0,0  ; xmm2 = 2 in every word
+  paddw       xmm10,xmm2  ; xmm10 = (3rd argument >> 2) + 2
+  movdqa      [rsp+0A0h],xmm2
+  movdqa      xmm15,xmm7
+  pxor        xmm4,xmm4
+  movdqa      xmm0,xmm8
+  psubw       xmm0,xmm12
+  mov         eax,4
+  pabsw       xmm0,xmm0
+  movdqa      xmm1,xmm10
+  cwde
+  pcmpgtw     xmm15,xmm0
+  pcmpgtw     xmm1,xmm3
+  movdqa      xmm3,xmm7
+  movdqa      xmm7,[rdx]  ; row -4
+  movdqa      xmm0,xmm5
+  psubw       xmm0,xmm6
+  pand        xmm15,xmm1
+  punpcklbw   xmm7,xmm4
+  movdqa      xmm9,xmm15
+  pabsw       xmm0,xmm0
+  psllw       xmm7,1
+  pandn       xmm9,xmm12
+  pcmpgtw     xmm3,xmm0
+  paddw       xmm7,xmm12
+  movd        xmm0,eax
+  pand        xmm3,xmm1
+  paddw       xmm7,xmm12
+  punpcklwd   xmm0,xmm0
+  paddw       xmm7,xmm12
+  pshufd      xmm1,xmm0,0  ; xmm1 = 4 in every word (rounding term)
+  paddw       xmm7,xmm13
+  movdqa      xmm0,xmm3
+  pandn       xmm0,xmm6
+  paddw       xmm7,xmm8
+  movdqa      [rsp+70h],xmm1
+  paddw       xmm7,xmm5
+  movdqa      [rsp+120h],xmm0
+  movdqa      xmm0,[rcx+rbp]  ; row +3
+  punpcklbw   xmm0,xmm4
+  paddw       xmm7,xmm1
+  movdqa      xmm4,xmm15
+  psllw       xmm0,1
+  psraw       xmm7,3  ; (2*row-4 + 3*row-3 + row-2 + row-1 + row0 + 4) >> 3
+  paddw       xmm0,xmm6
+  pand        xmm7,xmm15
+  paddw       xmm0,xmm6
+  paddw       xmm0,xmm6
+  paddw       xmm0,xmm14
+  movdqa      xmm6,xmm15
+  paddw       xmm0,xmm5
+  pandn       xmm6,xmm13
+  paddw       xmm0,xmm8
+  paddw       xmm0,xmm1
+  psraw       xmm0,3  ; (2*row+3 + 3*row+2 + row+1 + row0 + row-1 + 4) >> 3
+  movdqa      xmm1,xmm12
+  paddw       xmm1,xmm13
+  pand        xmm0,xmm3
+  movdqa      [rsp+100h],xmm0
+  movdqa      xmm0,xmm8
+  paddw       xmm0,xmm5
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm3
+  paddw       xmm1,xmm2
+  psraw       xmm1,2
+  pandn       xmm0,xmm14
+  pand        xmm4,xmm1
+  movdqa      [rsp+0E0h],xmm0
+  movdqa      xmm0,xmm5
+  paddw       xmm0,xmm8
+  movdqa      xmm1,[rsp+60h]
+  paddw       xmm1,xmm14
+  movdqa      xmm14,xmm3
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm8
+  paddw       xmm0,[rsp+30h]
+  paddw       xmm1,xmm2
+  psraw       xmm1,2
+  pand        xmm14,xmm1
+  movdqa      xmm1,xmm13
+  paddw       xmm1,xmm13
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm2
+  psraw       xmm1,2
+  movdqa      xmm0,[rsp+30h]
+  movdqa      xmm2,xmm13
+  movdqa      xmm5,xmm15
+  paddw       xmm0,[rsp+70h]
+  pandn       xmm5,xmm1
+  paddw       xmm2,xmm8
+  movdqa      xmm8,[rsp+90h]
+  movdqa      xmm1,xmm12
+  paddw       xmm2,xmm8
+  psllw       xmm2,1
+  paddw       xmm2,xmm0
+  paddw       xmm1,xmm2
+  movdqa      xmm0,xmm8
+  movdqa      xmm8,xmm3
+  movdqa      xmm2,[rsp+30h]
+  paddw       xmm0,xmm13
+  psraw       xmm1,3
+  pand        xmm15,xmm1
+  movdqa      xmm1,xmm2
+  paddw       xmm1,xmm2
+  paddw       xmm2,[rsp+90h]
+  paddw       xmm2,[rsp+0B0h]
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm13
+  movdqa      xmm13,[r8]  ; rows are loaded again from here on to widen their high 8 bytes
+  paddw       xmm0, [rsp+70h]
+  paddw       xmm1, [rsp+0A0h]
+  psllw       xmm2,1
+  paddw       xmm2,xmm0
+  psraw       xmm1,2
+  movdqa      xmm0, [rdi]
+  pandn       xmm8,xmm1
+  movdqa      xmm1, [rsp+60h]
+  paddw       xmm1,xmm2
+  movdqa      xmm2, [rbx]
+  psraw       xmm1,3
+  pand        xmm3,xmm1
+  movdqa      xmm1, [rbp]
+  movdqa      [rsp+0D0h],xmm3
+  pxor        xmm3,xmm3
+  punpckhbw   xmm0,xmm3
+  punpckhbw   xmm1,xmm3
+  punpckhbw   xmm13,xmm3
+  movdqa      [rsp+0C0h],xmm0  ; [rsp+0C0h] = row -3, high-half words
+  movdqa      xmm0,[r10+rbp]
+  movdqa      [rsp],xmm1  ; [rsp] = row 0, high-half words
+  punpckhbw   xmm0,xmm3
+  punpckhbw   xmm2,xmm3
+  movdqa      [rsp+80h],xmm0  ; [rsp+80h] = row +1, high-half words
+  movdqa      xmm0,[rsi+rbp]
+  movdqa      [rsp+10h],xmm13  ; [rsp+10h] = row -1, high-half words
+  punpckhbw   xmm0,xmm3
+  movdqa      [rsp+50h],xmm0  ; [rsp+50h] = row +2, high-half words
+  movdqa      xmm0,xmm1
+  movdqa      xmm1,xmm13
+  psubw       xmm0,xmm13
+  psubw       xmm1,xmm2
+  pabsw       xmm3,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,[rsp]
+  movdqa      xmm13,[rsp+40h]
+  movdqa      [rsp+110h],xmm2  ; [rsp+110h] = row -2, high-half words
+  psubw       xmm1, [rsp+80h]
+  pcmpgtw     xmm13,xmm0
+  pcmpgtw     xmm11,xmm3
+  pabsw       xmm0,xmm1
+  pcmpgtw     xmm10,xmm3
+  movdqa      xmm1, [rsp+40h]
+  movdqa      xmm2,xmm1
+  movdqa      xmm3,xmm1
+  pcmpgtw     xmm2,xmm0
+  movdqa      xmm0, [rsp+10h]
+  pand        xmm13,xmm2
+  pand        xmm13,xmm11  ; xmm13 = high-half lane mask (same three tests as [rsp+20h])
+  movdqa      xmm11,[rsp+0C0h]
+  psubw       xmm0,xmm11
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm3,xmm0
+  pand        xmm3,xmm10
+  movdqa      xmm0,[rsp]
+  psubw       xmm0,[rsp+50h]
+  movdqa      xmm2,[rdx]
+  pabsw       xmm0,xmm0
+  por         xmm7,xmm9
+  movdqa      xmm9,[rsp+20h]
+  pcmpgtw     xmm1,xmm0
+  pand        xmm9,xmm7
+  movdqa      xmm7,[rsp+20h]
+  movdqa      xmm0,xmm7
+  pandn       xmm0,xmm12
+  movdqa      xmm12,[rsp+110h]
+  pand        xmm1,xmm10
+  movdqa      xmm10,[rsp+70h]
+  movdqa      [rsp+40h],xmm1  ; slot reused: now holds a high-half comparison mask
+  movdqa      xmm1,xmm13
+  por         xmm9,xmm0
+  pxor        xmm0,xmm0
+  por         xmm4,xmm6
+  movdqa      xmm6,xmm7
+  punpckhbw   xmm2,xmm0
+  por         xmm15,xmm5
+  movdqa      xmm5,[rsp+20h]
+  movdqa      xmm0,xmm3
+  psllw       xmm2,1
+  pandn       xmm0,xmm11
+  pand        xmm6,xmm4
+  movdqa      xmm4,[rsp]
+  paddw       xmm2,xmm11
+  pand        xmm5,xmm15
+  movdqa      xmm15,[rsp+20h]
+  paddw       xmm2,xmm11
+  paddw       xmm2,xmm11
+  paddw       xmm2,xmm12
+  paddw       xmm2,[rsp+10h]
+  paddw       xmm2,[rsp]
+  paddw       xmm2,xmm10
+  psraw       xmm2,3
+  pand        xmm2,xmm3
+  por         xmm2,xmm0
+  pand        xmm1,xmm2
+  movdqa      xmm0,xmm13
+  movdqa      xmm2,xmm11
+  pandn       xmm0,xmm11
+  paddw       xmm2,xmm12
+  por         xmm1,xmm0
+  packuswb    xmm9,xmm1  ; new row -3: low-half and high-half words packed back to bytes
+  movdqa      xmm0,xmm7
+  movdqa      xmm7,[rsp+0A0h]
+  pandn       xmm0,[rsp+0F0h]
+  movdqa      xmm1,xmm3
+  por         xmm6,xmm0
+  movdqa      xmm0,[rsp+10h]
+  paddw       xmm0,xmm4
+  paddw       xmm2,xmm0
+  paddw       xmm2,xmm7
+  movdqa      xmm0,xmm3
+  pandn       xmm0,xmm12
+  psraw       xmm2,2
+  pand        xmm1,xmm2
+  por         xmm1,xmm0
+  movdqa      xmm2,xmm13
+  movdqa      xmm0,xmm13
+  pand        xmm2,xmm1
+  pandn       xmm0,xmm12
+  movdqa      xmm1,xmm12
+  paddw       xmm1,[rsp+10h]
+  por         xmm2,xmm0
+  movdqa      xmm0,xmm15
+  pandn       xmm0,[rsp+0B0h]
+  paddw       xmm1,xmm4
+  packuswb    xmm6,xmm2  ; new row -2
+  movdqa      xmm2,xmm3
+  psllw       xmm1,1
+  por         xmm5,xmm0
+  movdqa      xmm0,[rsp+80h]
+  paddw       xmm0,xmm10
+  paddw       xmm1,xmm0
+  paddw       xmm11,xmm1
+  psraw       xmm11,3
+  movdqa      xmm1,xmm12
+  pand        xmm2,xmm11
+  paddw       xmm1,xmm12
+  movdqa      xmm11,[rsp+80h]
+  movdqa      xmm0, [rsp+10h]
+  por         xmm14,[rsp+0E0h]
+  paddw       xmm0,xmm11
+  movdqa      xmm4,xmm15
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm13
+  paddw       xmm1,xmm7
+  psraw       xmm1,2
+  pandn       xmm3,xmm1
+  por         xmm2,xmm3
+  movdqa      xmm1,xmm13
+  movdqa      xmm3,[rsp+10h]
+  pandn       xmm0,xmm3
+  pand        xmm1,xmm2
+  movdqa      xmm2,xmm11
+  paddw       xmm2,[rsp]
+  por         xmm1,xmm0
+  movdqa      xmm0,[rsp+0D0h]
+  por         xmm0,xmm8
+  paddw       xmm2,xmm3
+  packuswb    xmm5,xmm1  ; new row -1
+  movdqa      xmm8,[rsp+40h]
+  movdqa      xmm1,[rsp+50h]
+  movdqa      xmm3,xmm8
+  pand        xmm4,xmm0
+  psllw       xmm2,1
+  movdqa      xmm0,xmm15
+  pandn       xmm0,[rsp+90h]
+  por         xmm4,xmm0
+  movdqa      xmm0,xmm12
+  paddw       xmm0,xmm10
+  paddw       xmm2,xmm0
+  paddw       xmm1,xmm2
+  movdqa      xmm0,[rsp]
+  movdqa      xmm2,xmm11
+  paddw       xmm0,xmm12
+  movdqa      xmm12,[rsp]
+  paddw       xmm2,xmm11
+  paddw       xmm2,xmm0
+  psraw       xmm1,3
+  movdqa      xmm0,xmm8
+  pand        xmm3,xmm1
+  paddw       xmm2,xmm7
+  movdqa      xmm1,xmm13
+  psraw       xmm2,2
+  pandn       xmm0,xmm2
+  por         xmm3,xmm0
+  movdqa      xmm2,[rsp+50h]
+  movdqa      xmm0,xmm13
+  pandn       xmm0,xmm12
+  pand        xmm1,xmm3
+  paddw       xmm2,xmm11
+  movdqa      xmm3,xmm15
+  por         xmm1,xmm0
+  pand        xmm3,xmm14
+  movdqa      xmm14,[rsp+10h]
+  movdqa      xmm0,xmm15
+  pandn       xmm0,[rsp+30h]
+  packuswb    xmm4,xmm1  ; new row 0
+  movdqa      xmm1,xmm8
+  por         xmm3,xmm0
+  movdqa      xmm0,xmm12
+  paddw       xmm0,xmm14
+  paddw       xmm2,xmm0
+  paddw       xmm2,xmm7
+  movdqa      xmm0,xmm8
+  pandn       xmm0,xmm11
+  psraw       xmm2,2
+  pand        xmm1,xmm2
+  por         xmm1,xmm0
+  movdqa      xmm2,xmm13
+  movdqa      xmm0,xmm13
+  pandn       xmm0,xmm11
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm15
+  por         xmm2,xmm0
+  packuswb    xmm3,xmm2  ; new row +1
+  movdqa      xmm0,[rsp+100h]
+  por         xmm0,[rsp+120h]
+  pand        xmm1,xmm0
+  movdqa      xmm2,[rcx+rbp]
+  movdqa      xmm7,[rsp+50h]
+  pandn       xmm15,[rsp+60h]
+  lea         r11,[rsp+1D8h]  ; r11 = rsp as it was before the sub (entry rsp - 20h)
+  pxor        xmm0,xmm0
+  por         xmm1,xmm15
+  movaps      xmm15,[r11-0A8h]  ; reloads of the spilled xmm6-xmm15 are interleaved with the remaining work
+  movdqa      [rdi],xmm9  ; store row -3
+  movaps      xmm9,[r11-48h]
+  punpckhbw   xmm2,xmm0
+  psllw       xmm2,1
+  paddw       xmm2,xmm7
+  paddw       xmm2,xmm7
+  movdqa      [rbx],xmm6  ; store row -2
+  movaps      xmm6,[r11-18h]
+  paddw       xmm2,xmm7
+  paddw       xmm2,xmm11
+  movaps      xmm11,[r11-68h]
+  paddw       xmm2,xmm12
+  movaps      xmm12,[r11-78h]
+  paddw       xmm2,xmm14
+  paddw       xmm2,xmm10
+  psraw       xmm2,3
+  movaps      xmm10,[r11-58h]
+  movaps      xmm14,[r11-98h]
+  movdqa      xmm0,xmm13
+  pand        xmm2,xmm8
+  pandn       xmm8,xmm7
+  pandn       xmm13,xmm7
+  por         xmm2,xmm8
+  movaps      xmm7,[r11-28h]
+  movaps      xmm8,[r11-38h]
+  movdqa      [r8],xmm5  ; store row -1
+  pand        xmm0,xmm2
+  por         xmm0,xmm13
+  packuswb    xmm1,xmm0  ; new row +2
+  movaps      xmm13,[r11-88h]
+  movdqa      [rbp],xmm4  ; store row 0
+  movdqa      [r10+rbp],xmm3  ; store row +1
+  movdqa      [rsi+rbp],xmm1  ; store row +2
+  mov         rsp,r11  ; release the frame; the four pushed registers are popped next
+  pop         rdi
+  pop         rsi
+  pop         rbp
+  pop         rbx
   ret
 
 
@@ -784,161 +784,178 @@
 
 ALIGN  16
 DeblockChromaLt4V_sse2:
-  mov         rax,rsp 
-  push        rbx  
-  push        rdi     
-  sub         rsp,0C8h 
+  ; Win64 ABI. rcx and rdx point at row 0 of two pixel buffers, r8d is the row
+  ; stride and r9w a threshold; iBeta and pTC are read from the caller's stack.
+  ; Rows -2, -1, 0 and +1 of each buffer are read 8 bytes wide; rows -1 and 0
+  ; are rewritten.
+  mov         rax,rsp  ; rax = rsp at entry; base for stack arguments and xmm save slots
+  push        rbx
+  push        rdi
+  sub         rsp,0C8h  ; 8 bytes pad + 160 bytes xmm save area + 32 bytes of locals at [rsp]
+  ; xmm6-xmm15 are callee-saved on Win64 and are all overwritten below. Spill
+  ; them into the exact slots the existing reloads read ([r11-18h]..[r11-78h],
+  ; [rsp+40h], [rsp+30h], [rsp+20h]); rax still holds the entry rsp here.
+  movaps      [rax-28h],xmm6
+  movaps      [rax-38h],xmm7
+  movaps      [rax-48h],xmm8
+  movaps      [rax-58h],xmm9
+  movaps      [rax-68h],xmm10
+  movaps      [rax-78h],xmm11
+  movaps      [rax-88h],xmm12
+  movaps      [rax-98h],xmm13
+  movaps      [rax-0A8h],xmm14
+  movaps      [rax-0B8h],xmm15
   mov         r10,qword [rax + 30h]  ; pTC
-  pxor        xmm1,xmm1 
-  mov         rbx,rcx 
-  movsxd      r11,r8d 
-  movsx       ecx,byte [r10] 
-  movsx       r8d,byte [r10+2] 
-  mov         rdi,rdx 
-  movq        xmm2,[rbx] 
-  movq        xmm9,[r11+rbx] 
-  movsx       edx,byte [r10+1] 
-  mov         word [rsp+2],cx 
-  mov         word [rsp],cx 
-  movsx       eax,byte [r10+3] 
-  mov         word [rsp+6],dx 
-  mov         word [rsp+4],dx 
-  movdqa      xmm11,xmm1 
-  mov         word [rsp+0Eh],ax 
-  mov         word [rsp+0Ch],ax 
-  lea         eax,[r11+r11] 
-  movsxd      rcx,eax 
-  mov         rax,rbx 
-  mov         rdx,rdi 
-  sub         rax,rcx 
-  mov         word [rsp+0Ah],r8w 
-  mov         word [rsp+8],r8w 
-  movdqa      xmm6,[rsp] 
-  movdqa      xmm7,xmm6 
-  movq        xmm13, [rax] 
-  mov         rax,rdi 
-  sub         rax,rcx 
-  mov         rcx,rbx 
-  pcmpgtw     xmm7,xmm1 
-  psubw       xmm11,xmm6 
-  sub         rcx,r11 
-  sub         rdx,r11 
-  movq        xmm0,[rax] 
-  movsx       eax,r9w 
-  movq        xmm15,[rcx] 
-  punpcklqdq  xmm13,xmm0 
-  movq        xmm0, [rdx] 
-  movdqa      xmm4,xmm13 
-  punpcklqdq  xmm15,xmm0 
-  movq        xmm0, [rdi] 
-  punpcklbw   xmm4,xmm1 
-  movdqa      xmm12,xmm15 
-  punpcklqdq  xmm2,xmm0 
-  movq        xmm0, [r11+rdi] 
-  punpcklbw   xmm12,xmm1 
-  movdqa      xmm14,xmm2 
-  punpcklqdq  xmm9,xmm0 
-  punpckhbw   xmm2,xmm1 
-  punpcklbw   xmm14,xmm1 
-  movd        xmm0,eax 
+  pxor        xmm1,xmm1
+  mov         rbx,rcx  ; rbx = 1st argument: first buffer, row 0
+  movsxd      r11,r8d  ; r11 = row stride
+  movsx       ecx,byte [r10]
+  movsx       r8d,byte [r10+2]
+  mov         rdi,rdx  ; rdi = 2nd argument: second buffer, row 0
+  movq        xmm2,[rbx]
+  movq        xmm9,[r11+rbx]
+  movsx       edx,byte [r10+1]
+  mov         word [rsp+2],cx  ; each of the four tc bytes is stored twice, as adjacent words
+  mov         word [rsp],cx
+  movsx       eax,byte [r10+3]
+  mov         word [rsp+6],dx
+  mov         word [rsp+4],dx
+  movdqa      xmm11,xmm1
+  mov         word [rsp+0Eh],ax
+  mov         word [rsp+0Ch],ax
+  lea         eax,[r11+r11]
+  movsxd      rcx,eax
+  mov         rax,rbx
+  mov         rdx,rdi
+  sub         rax,rcx
+  mov         word [rsp+0Ah],r8w
+  mov         word [rsp+8],r8w
+  movdqa      xmm6,[rsp]  ; xmm6 = tc0,tc0,tc1,tc1,tc2,tc2,tc3,tc3 as words
+  movdqa      xmm7,xmm6
+  movq        xmm13, [rax]
+  mov         rax,rdi
+  sub         rax,rcx
+  mov         rcx,rbx
+  pcmpgtw     xmm7,xmm1  ; xmm7 = lanes where tc > 0
+  psubw       xmm11,xmm6  ; xmm11 = -tc
+  sub         rcx,r11  ; rcx = first buffer, row -1
+  sub         rdx,r11  ; rdx = second buffer, row -1
+  movq        xmm0,[rax]
+  movsx       eax,r9w
+  movq        xmm15,[rcx]
+  punpcklqdq  xmm13,xmm0
+  movq        xmm0, [rdx]
+  movdqa      xmm4,xmm13
+  punpcklqdq  xmm15,xmm0
+  movq        xmm0, [rdi]
+  punpcklbw   xmm4,xmm1
+  movdqa      xmm12,xmm15
+  punpcklqdq  xmm2,xmm0
+  movq        xmm0, [r11+rdi]
+  punpcklbw   xmm12,xmm1
+  movdqa      xmm14,xmm2
+  punpcklqdq  xmm9,xmm0
+  punpckhbw   xmm2,xmm1
+  punpcklbw   xmm14,xmm1
+  movd        xmm0,eax
   movsx       eax,word [rsp + 0C8h + 38h] ; iBeta
-  punpckhbw   xmm13,xmm1 
-  punpckhbw   xmm15,xmm1 
-  movdqa      xmm3,xmm9 
-  movdqa      [rsp+10h],xmm2 
-  punpcklwd   xmm0,xmm0 
-  punpckhbw   xmm9,xmm1 
-  punpcklbw   xmm3,xmm1 
-  movdqa      xmm1,xmm14 
-  pshufd      xmm10,xmm0,0 
-  movd        xmm0,eax 
-  mov         eax,4 
-  cwde             
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm8,xmm0,0 
-  movd        xmm0,eax 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm5,xmm0,0 
-  psubw       xmm1,xmm12 
-  movdqa      xmm2,xmm10 
-  lea         r11,[rsp+0C8h] 
-  psllw       xmm1,2 
-  movdqa      xmm0,xmm4 
-  psubw       xmm4,xmm12 
-  psubw       xmm0,xmm3 
-  psubw       xmm3,xmm14 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm5 
-  movdqa      xmm0,xmm11 
-  psraw       xmm1,3 
-  pmaxsw      xmm0,xmm1 
-  pminsw      xmm6,xmm0 
-  movdqa      xmm1,xmm8 
-  movdqa      xmm0,xmm12 
-  psubw       xmm0,xmm14 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm2,xmm0 
-  pabsw       xmm0,xmm4 
-  pcmpgtw     xmm1,xmm0 
-  pabsw       xmm0,xmm3 
-  movdqa      xmm3,[rsp] 
-  pand        xmm2,xmm1 
-  movdqa      xmm1,xmm8 
-  pcmpgtw     xmm1,xmm0 
-  movdqa      xmm0,xmm13 
-  pand        xmm2,xmm1 
-  psubw       xmm0,xmm9 
-  psubw       xmm13,xmm15 
-  pand        xmm2,xmm7 
-  pand        xmm6,xmm2 
-  paddw       xmm12,xmm6 
-  psubw       xmm14,xmm6 
-  movdqa      xmm2,[rsp+10h] 
-  movaps      xmm6,[r11-18h] 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm15 
-  psubw       xmm9,xmm2 
-  psllw       xmm1,2 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm5 
-  movdqa      xmm0,xmm15 
-  psubw       xmm0,xmm2 
-  psraw       xmm1,3 
-  pmaxsw      xmm11,xmm1 
-  pabsw       xmm0,xmm0 
-  movdqa      xmm1,xmm8 
-  pcmpgtw     xmm10,xmm0 
-  pabsw       xmm0,xmm13 
-  pminsw      xmm3,xmm11 
-  movaps      xmm11,[r11-68h] 
-  movaps      xmm13,[rsp+40h] 
-  pcmpgtw     xmm1,xmm0 
-  pabsw       xmm0,xmm9 
-  movaps      xmm9, [r11-48h] 
-  pand        xmm10,xmm1 
-  pcmpgtw     xmm8,xmm0 
-  pand        xmm10,xmm8 
-  pand        xmm10,xmm7 
-  movaps      xmm8,[r11-38h] 
-  movaps      xmm7,[r11-28h] 
-  pand        xmm3,xmm10 
-  paddw       xmm15,xmm3 
-  psubw       xmm2,xmm3 
-  movaps      xmm10,[r11-58h] 
-  packuswb    xmm12,xmm15 
-  movaps      xmm15,[rsp+20h] 
-  packuswb    xmm14,xmm2 
-  movq        [rcx],xmm12 
-  movq        [rbx],xmm14 
-  psrldq      xmm12,8 
-  psrldq      xmm14,8 
-  movq        [rdx],xmm12 
-  movaps      xmm12,[r11-78h] 
-  movq        [rdi],xmm14 
-  movaps      xmm14,[rsp+30h] 
-  mov         rsp,r11 
-  pop         rdi  
-  pop         rbx  
+  punpckhbw   xmm13,xmm1
+  punpckhbw   xmm15,xmm1
+  movdqa      xmm3,xmm9
+  movdqa      [rsp+10h],xmm2  ; spill second-buffer row 0 (words); xmm2 is reused below
+  punpcklwd   xmm0,xmm0
+  punpckhbw   xmm9,xmm1
+  punpcklbw   xmm3,xmm1
+  movdqa      xmm1,xmm14
+  pshufd      xmm10,xmm0,0  ; xmm10 = r9w threshold in every word
+  movd        xmm0,eax
+  mov         eax,4
+  cwde
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm8,xmm0,0  ; xmm8 = iBeta in every word
+  movd        xmm0,eax
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm5,xmm0,0  ; xmm5 = 4 in every word (rounding term)
+  psubw       xmm1,xmm12
+  movdqa      xmm2,xmm10
+  lea         r11,[rsp+0C8h]  ; r11 = rsp before the sub (entry rsp - 10h); the stride is no longer needed
+  psllw       xmm1,2
+  movdqa      xmm0,xmm4
+  psubw       xmm4,xmm12
+  psubw       xmm0,xmm3
+  psubw       xmm3,xmm14
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm11
+  psraw       xmm1,3  ; (4*(row0 - row-1) + (row-2 - row+1) + 4) >> 3, first buffer
+  pmaxsw      xmm0,xmm1
+  pminsw      xmm6,xmm0  ; delta clamped to [-tc, tc]
+  movdqa      xmm1,xmm8
+  movdqa      xmm0,xmm12
+  psubw       xmm0,xmm14
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm2,xmm0
+  pabsw       xmm0,xmm4
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm3
+  movdqa      xmm3,[rsp]
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm1,xmm0
+  movdqa      xmm0,xmm13
+  pand        xmm2,xmm1
+  psubw       xmm0,xmm9
+  psubw       xmm13,xmm15
+  pand        xmm2,xmm7
+  pand        xmm6,xmm2
+  paddw       xmm12,xmm6  ; first buffer: row -1 += delta
+  psubw       xmm14,xmm6  ; first buffer: row 0 -= delta
+  movdqa      xmm2,[rsp+10h]
+  movaps      xmm6,[r11-18h]  ; reload caller's xmm6 (spilled in the prologue)
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm15
+  psubw       xmm9,xmm2
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm15
+  psubw       xmm0,xmm2
+  psraw       xmm1,3
+  pmaxsw      xmm11,xmm1
+  pabsw       xmm0,xmm0
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm10,xmm0
+  pabsw       xmm0,xmm13
+  pminsw      xmm3,xmm11
+  movaps      xmm11,[r11-68h]
+  movaps      xmm13,[rsp+40h]
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm9
+  movaps      xmm9, [r11-48h]
+  pand        xmm10,xmm1
+  pcmpgtw     xmm8,xmm0
+  pand        xmm10,xmm8
+  pand        xmm10,xmm7
+  movaps      xmm8,[r11-38h]
+  movaps      xmm7,[r11-28h]
+  pand        xmm3,xmm10
+  paddw       xmm15,xmm3
+  psubw       xmm2,xmm3
+  movaps      xmm10,[r11-58h]
+  packuswb    xmm12,xmm15  ; row -1 of both buffers: first in the low qword, second in the high
+  movaps      xmm15,[rsp+20h]
+  packuswb    xmm14,xmm2  ; row 0 of both buffers, same layout
+  movq        [rcx],xmm12
+  movq        [rbx],xmm14
+  psrldq      xmm12,8
+  psrldq      xmm14,8
+  movq        [rdx],xmm12
+  movaps      xmm12,[r11-78h]
+  movq        [rdi],xmm14
+  movaps      xmm14,[rsp+30h]
+  mov         rsp,r11  ; release the frame
+  pop         rdi
+  pop         rbx
   ret
 
 
@@ -945,151 +945,151 @@
 WELS_EXTERN   DeblockChromaEq4V_sse2
 ALIGN 16
 DeblockChromaEq4V_sse2:
-  mov         rax,rsp 
-  push        rbx  
-  sub         rsp,90h 
-  pxor        xmm1,xmm1 
-  mov         r11,rcx 
-  mov         rbx,rdx 
-  mov         r10d,r9d   
-  movq        xmm13,[r11] 
-  lea         eax,[r8+r8] 
-  movsxd      r9,eax 
-  mov         rax,rcx 
-  sub         rax,r9 
-  movq        xmm14,[rax] 
-  mov         rax,rdx 
-  sub         rax,r9 
-  movq        xmm0,[rax] 
-  movsxd      rax,r8d 
-  sub         rcx,rax 
-  sub         rdx,rax 
-  movq        xmm12,[rax+r11] 
-  movq        xmm10,[rcx] 
-  punpcklqdq  xmm14,xmm0 
-  movdqa      xmm8,xmm14 
-  movq        xmm0,[rdx] 
-  punpcklbw   xmm8,xmm1 
-  punpckhbw   xmm14,xmm1 
-  punpcklqdq  xmm10,xmm0 
-  movq        xmm0,[rbx] 
-  movdqa      xmm5,xmm10 
-  punpcklqdq  xmm13,xmm0 
-  movq        xmm0, [rax+rbx] 
-  punpcklbw   xmm5,xmm1 
-  movsx       eax,r10w 
-  movdqa      xmm9,xmm13 
-  punpcklqdq  xmm12,xmm0 
-  punpcklbw   xmm9,xmm1 
-  punpckhbw   xmm10,xmm1 
-  movd        xmm0,eax 
+  mov         rax,rsp                 ; inputs as used below: rcx, rdx = two byte buffers, r8d = row pitch, r9w = first threshold
+  push        rbx
+  sub         rsp,90h                 ; reserve 90h bytes of stack
+  pxor        xmm1,xmm1               ; xmm1 = 0, used to widen bytes to words
+  mov         r11,rcx                 ; r11 = 1st buffer pointer, kept unmodified
+  mov         rbx,rdx                 ; rbx = 2nd buffer pointer, kept unmodified
+  mov         r10d,r9d                ; keep the r9 argument (presumably iAlpha - confirm); r9 is reused below
+  movq        xmm13,[r11]             ; row 0 of 1st buffer (8 bytes)
+  lea         eax,[r8+r8]             ; eax = 2*pitch; overwrites the rsp copy made on entry, which was never read
+  movsxd      r9,eax                  ; r9 = sign-extended 2*pitch
+  mov         rax,rcx
+  sub         rax,r9
+  movq        xmm14,[rax]             ; row -2 of 1st buffer
+  mov         rax,rdx
+  sub         rax,r9
+  movq        xmm0,[rax]              ; row -2 of 2nd buffer
+  movsxd      rax,r8d                 ; rax = sign-extended pitch
+  sub         rcx,rax                 ; rcx = 1st buffer - pitch (row -1)
+  sub         rdx,rax                 ; rdx = 2nd buffer - pitch (row -1)
+  movq        xmm12,[rax+r11]         ; row +1 of 1st buffer
+  movq        xmm10,[rcx]             ; row -1 of 1st buffer
+  punpcklqdq  xmm14,xmm0              ; xmm14 = row -2: 1st buffer | 2nd buffer
+  movdqa      xmm8,xmm14
+  movq        xmm0,[rdx]              ; row -1 of 2nd buffer
+  punpcklbw   xmm8,xmm1               ; xmm8  = row -2 as words, 1st-buffer half
+  punpckhbw   xmm14,xmm1              ; xmm14 = row -2 as words, 2nd-buffer half
+  punpcklqdq  xmm10,xmm0              ; xmm10 = row -1: 1st buffer | 2nd buffer
+  movq        xmm0,[rbx]              ; row 0 of 2nd buffer
+  movdqa      xmm5,xmm10
+  punpcklqdq  xmm13,xmm0              ; xmm13 = row 0: 1st buffer | 2nd buffer
+  movq        xmm0, [rax+rbx]         ; row +1 of 2nd buffer
+  punpcklbw   xmm5,xmm1               ; xmm5  = row -1 as words, 1st-buffer half
+  movsx       eax,r10w                ; eax = low 16 bits of the r9 argument, sign-extended
+  movdqa      xmm9,xmm13
+  punpcklqdq  xmm12,xmm0              ; xmm12 = row +1: 1st buffer | 2nd buffer
+  punpcklbw   xmm9,xmm1               ; xmm9  = row 0 as words, 1st-buffer half
+  punpckhbw   xmm10,xmm1              ; xmm10 = row -1 as words, 2nd-buffer half
+  movd        xmm0,eax                ; first threshold into xmm0 (broadcast happens after the iBeta load)
   movsx       eax,word [rsp + 90h + 8h + 28h]   ; iBeta
-  punpckhbw   xmm13,xmm1 
-  movdqa      xmm7,xmm12 
-  punpcklwd   xmm0,xmm0 
-  punpckhbw   xmm12,xmm1 
-  pshufd      xmm11,xmm0,0 
-  punpcklbw   xmm7,xmm1 
-  movd        xmm0,eax 
-  movdqa      xmm1,xmm8 
-  psubw       xmm1,xmm5 
-  punpcklwd   xmm0,xmm0 
-  movdqa      xmm6,xmm11 
-  pshufd      xmm3,xmm0,0 
-  movdqa      xmm0,xmm5 
-  psubw       xmm0,xmm9 
-  movdqa      xmm2,xmm3 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm6,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,xmm3 
-  pcmpgtw     xmm2,xmm0 
-  pand        xmm6,xmm2 
-  movdqa      xmm0,xmm7 
-  movdqa      xmm2,xmm3 
-  psubw       xmm0,xmm9 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm1,xmm0 
-  pand        xmm6,xmm1 
-  movdqa      xmm0,xmm10 
-  movdqa      xmm1,xmm14 
-  psubw       xmm0,xmm13 
-  psubw       xmm1,xmm10 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm11,xmm0 
-  pabsw       xmm0,xmm1 
-  pcmpgtw     xmm2,xmm0 
-  pand        xmm11,xmm2 
-  movdqa      xmm0,xmm12 
-  movdqa      xmm4,xmm6 
-  movdqa      xmm1,xmm8 
-  mov         eax,2 
-  cwde             
-  paddw       xmm1,xmm8 
-  psubw       xmm0,xmm13 
-  paddw       xmm1,xmm5 
-  pabsw       xmm0,xmm0 
-  movdqa      xmm2,xmm14 
-  paddw       xmm1,xmm7 
-  pcmpgtw     xmm3,xmm0 
-  paddw       xmm2,xmm14 
-  movd        xmm0,eax 
-  pand        xmm11,xmm3 
-  paddw       xmm7,xmm7 
-  paddw       xmm2,xmm10 
-  punpcklwd   xmm0,xmm0 
-  paddw       xmm2,xmm12 
-  paddw       xmm12,xmm12 
-  pshufd      xmm3,xmm0,0 
-  paddw       xmm7,xmm9 
-  paddw       xmm12,xmm13 
-  movdqa      xmm0,xmm6 
-  paddw       xmm1,xmm3 
-  pandn       xmm0,xmm5 
-  paddw       xmm7,xmm8 
-  psraw       xmm1,2 
-  paddw       xmm12,xmm14 
-  paddw       xmm7,xmm3 
-  movaps      xmm14,[rsp] 
-  pand        xmm4,xmm1 
-  paddw       xmm12,xmm3 
-  psraw       xmm7,2 
-  movdqa      xmm1,xmm11 
-  por         xmm4,xmm0 
-  psraw       xmm12,2 
-  paddw       xmm2,xmm3 
-  movdqa      xmm0,xmm11 
-  pandn       xmm0,xmm10 
-  psraw       xmm2,2 
-  pand        xmm1,xmm2 
-  por         xmm1,xmm0 
-  packuswb    xmm4,xmm1 
-  movdqa      xmm0,xmm11 
-  movdqa      xmm1,xmm6 
-  pand        xmm1,xmm7 
-  movaps      xmm7,[rsp+70h] 
-  movq        [rcx],xmm4 
-  pandn       xmm6,xmm9 
-  pandn       xmm11,xmm13 
-  pand        xmm0,xmm12 
-  por         xmm1,xmm6 
-  por         xmm0,xmm11 
-  psrldq      xmm4,8 
-  packuswb    xmm1,xmm0 
-  movq        [r11],xmm1 
-  psrldq      xmm1,8 
-  movq        [rdx],xmm4 
-  lea         r11,[rsp+90h] 
-  movaps      xmm6,[r11-10h] 
-  movaps      xmm8,[r11-30h] 
-  movaps      xmm9,[r11-40h] 
-  movq        [rbx],xmm1 
-  movaps      xmm10,[r11-50h] 
-  movaps      xmm11,[r11-60h] 
-  movaps      xmm12,[r11-70h] 
-  movaps      xmm13,[r11-80h] 
-  mov         rsp,r11 
-  pop         rbx  
+  punpckhbw   xmm13,xmm1              ; xmm13 = row 0 as words, 2nd-buffer half (rows are relative to the rcx/rdx pointers passed in)
+  movdqa      xmm7,xmm12
+  punpcklwd   xmm0,xmm0
+  punpckhbw   xmm12,xmm1              ; xmm12 = row +1 as words, 2nd-buffer half
+  pshufd      xmm11,xmm0,0            ; xmm11 = r9w threshold broadcast to 8 words
+  punpcklbw   xmm7,xmm1               ; xmm7  = row +1 as words, 1st-buffer half
+  movd        xmm0,eax                ; eax holds iBeta here (loaded just above)
+  movdqa      xmm1,xmm8
+  psubw       xmm1,xmm5               ; row(-2) - row(-1), 1st half
+  punpcklwd   xmm0,xmm0
+  movdqa      xmm6,xmm11
+  pshufd      xmm3,xmm0,0             ; xmm3 = iBeta broadcast to 8 words
+  movdqa      xmm0,xmm5
+  psubw       xmm0,xmm9               ; row(-1) - row(0), 1st half
+  movdqa      xmm2,xmm3
+  pabsw       xmm0,xmm0               ; NOTE(review): pabsw is an SSSE3 instruction although the routine is named _sse2 - confirm CPU dispatch
+  pcmpgtw     xmm6,xmm0               ; threshold > |row(-1) - row(0)|
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm3
+  pcmpgtw     xmm2,xmm0               ; iBeta > |row(-2) - row(-1)|
+  pand        xmm6,xmm2
+  movdqa      xmm0,xmm7
+  movdqa      xmm2,xmm3
+  psubw       xmm0,xmm9               ; row(+1) - row(0), 1st half
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm1,xmm0               ; iBeta > |row(+1) - row(0)|
+  pand        xmm6,xmm1               ; xmm6 = per-pixel filter mask, 1st half
+  movdqa      xmm0,xmm10
+  movdqa      xmm1,xmm14
+  psubw       xmm0,xmm13              ; row(-1) - row(0), 2nd half
+  psubw       xmm1,xmm10              ; row(-2) - row(-1), 2nd half
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm11,xmm0
+  pabsw       xmm0,xmm1
+  pcmpgtw     xmm2,xmm0
+  pand        xmm11,xmm2
+  movdqa      xmm0,xmm12
+  movdqa      xmm4,xmm6
+  movdqa      xmm1,xmm8
+  mov         eax,2                   ; rounding constant
+  cwde                                ; sign-extends ax into eax; eax stays 2
+  paddw       xmm1,xmm8               ; 2*row(-2), 1st half
+  psubw       xmm0,xmm13              ; row(+1) - row(0), 2nd half
+  paddw       xmm1,xmm5               ; + row(-1)
+  pabsw       xmm0,xmm0
+  movdqa      xmm2,xmm14
+  paddw       xmm1,xmm7               ; + row(+1)
+  pcmpgtw     xmm3,xmm0
+  paddw       xmm2,xmm14              ; 2*row(-2), 2nd half
+  movd        xmm0,eax
+  pand        xmm11,xmm3              ; xmm11 = per-pixel filter mask, 2nd half
+  paddw       xmm7,xmm7               ; 2*row(+1), 1st half
+  paddw       xmm2,xmm10              ; + row(-1)
+  punpcklwd   xmm0,xmm0
+  paddw       xmm2,xmm12              ; + row(+1)
+  paddw       xmm12,xmm12             ; 2*row(+1), 2nd half
+  pshufd      xmm3,xmm0,0             ; xmm3 = 2 in every word
+  paddw       xmm7,xmm9               ; + row(0)
+  paddw       xmm12,xmm13             ; + row(0)
+  movdqa      xmm0,xmm6
+  paddw       xmm1,xmm3
+  pandn       xmm0,xmm5               ; unfiltered row(-1) where the mask is clear
+  paddw       xmm7,xmm8               ; + row(-2)
+  psraw       xmm1,2                  ; new row(-1) = (2*row(-2) + row(-1) + row(+1) + 2) >> 2, 1st half
+  paddw       xmm12,xmm14             ; + row(-2)
+  paddw       xmm7,xmm3
+  movaps      xmm14,[rsp]             ; NOTE(review): no instruction in this routine stores to [rsp]; this and the xmm reloads below read slots never written here - confirm whether the matching saves were lost
+  pand        xmm4,xmm1
+  paddw       xmm12,xmm3
+  psraw       xmm7,2                  ; new row(0) = (2*row(+1) + row(0) + row(-2) + 2) >> 2, 1st half
+  movdqa      xmm1,xmm11
+  por         xmm4,xmm0               ; blended row(-1), 1st half
+  psraw       xmm12,2                 ; new row(0), 2nd half
+  paddw       xmm2,xmm3
+  movdqa      xmm0,xmm11
+  pandn       xmm0,xmm10
+  psraw       xmm2,2                  ; new row(-1), 2nd half
+  pand        xmm1,xmm2
+  por         xmm1,xmm0               ; blended row(-1), 2nd half
+  packuswb    xmm4,xmm1               ; row(-1) back to bytes: low 8 = 1st buffer, high 8 = 2nd buffer
+  movdqa      xmm0,xmm11
+  movdqa      xmm1,xmm6
+  pand        xmm1,xmm7
+  movaps      xmm7,[rsp+70h]          ; reload from a slot not written in this routine (see note on xmm14)
+  movq        [rcx],xmm4              ; store row(-1) of 1st buffer (rcx was moved back one pitch earlier)
+  pandn       xmm6,xmm9
+  pandn       xmm11,xmm13
+  pand        xmm0,xmm12
+  por         xmm1,xmm6               ; blended row(0), 1st half
+  por         xmm0,xmm11              ; blended row(0), 2nd half
+  psrldq      xmm4,8
+  packuswb    xmm1,xmm0               ; row(0) back to bytes: low 8 = 1st buffer, high 8 = 2nd buffer
+  movq        [r11],xmm1              ; store row(0) of 1st buffer
+  psrldq      xmm1,8
+  movq        [rdx],xmm4              ; store row(-1) of 2nd buffer
+  lea         r11,[rsp+90h]           ; r11 = rsp value right after the push of rbx
+  movaps      xmm6,[r11-10h]          ; xmm6..xmm13 reloads: same never-written-slot concern as xmm14 above
+  movaps      xmm8,[r11-30h]
+  movaps      xmm9,[r11-40h]
+  movq        [rbx],xmm1              ; store row(0) of 2nd buffer
+  movaps      xmm10,[r11-50h]
+  movaps      xmm11,[r11-60h]
+  movaps      xmm12,[r11-70h]
+  movaps      xmm13,[r11-80h]
+  mov         rsp,r11                 ; release the 90h-byte frame
+  pop         rbx
   ret
 
 
@@ -1099,263 +1099,263 @@
 WELS_EXTERN   DeblockChromaEq4H_sse2
 ALIGN  16
 DeblockChromaEq4H_sse2:
-  mov         rax,rsp 
-  mov         [rax+20h],rbx 
-  push        rdi  
-  sub         rsp,140h    
-  mov         rdi,rdx 
-  lea         eax,[r8*4] 
-  movsxd      r10,eax 
-  mov         eax,[rcx-2] 
-  mov         [rsp+10h],eax 
-  lea         rbx,[r10+rdx-2] 
-  lea         r11,[r10+rcx-2] 
-  movdqa      xmm5,[rsp+10h] 
-  movsxd      r10,r8d 
-  mov         eax,[r10+rcx-2] 
-  lea         rdx,[r10+r10*2] 
-  mov         [rsp+20h],eax 
-  mov         eax,[rcx+r10*2-2] 
-  mov         [rsp+30h],eax 
-  mov         eax,[rdx+rcx-2] 
-  movdqa      xmm2,[rsp+20h] 
-  mov         [rsp+40h],eax 
-  mov         eax, [rdi-2] 
-  movdqa      xmm4,[rsp+30h] 
-  mov         [rsp+50h],eax 
-  mov         eax,[r10+rdi-2] 
-  movdqa      xmm3,[rsp+40h] 
-  mov         [rsp+60h],eax 
-  mov         eax,[rdi+r10*2-2] 
-  punpckldq   xmm5,[rsp+50h] 
-  mov         [rsp+70h],eax 
-  mov         eax, [rdx+rdi-2] 
-  punpckldq   xmm2, [rsp+60h] 
-  mov          [rsp+80h],eax 
-  mov         eax,[r11] 
-  punpckldq   xmm4, [rsp+70h] 
-  mov         [rsp+50h],eax 
-  mov         eax,[rbx] 
-  punpckldq   xmm3,[rsp+80h] 
-  mov         [rsp+60h],eax 
-  mov         eax,[r10+r11] 
-  movdqa      xmm0, [rsp+50h] 
-  punpckldq   xmm0, [rsp+60h] 
-  punpcklqdq  xmm5,xmm0 
-  movdqa      [rsp+50h],xmm0 
-  mov         [rsp+50h],eax 
-  mov         eax,[r10+rbx] 
-  movdqa      xmm0,[rsp+50h] 
-  movdqa      xmm1,xmm5 
-  mov         [rsp+60h],eax 
-  mov         eax,[r11+r10*2] 
-  punpckldq   xmm0, [rsp+60h] 
-  punpcklqdq  xmm2,xmm0 
-  punpcklbw   xmm1,xmm2 
-  punpckhbw   xmm5,xmm2 
-  movdqa      [rsp+50h],xmm0 
-  mov         [rsp+50h],eax 
-  mov         eax,[rbx+r10*2] 
-  movdqa      xmm0,[rsp+50h] 
-  mov         [rsp+60h],eax 
-  mov         eax, [rdx+r11] 
-  movdqa      xmm15,xmm1 
-  punpckldq   xmm0,[rsp+60h] 
-  punpcklqdq  xmm4,xmm0 
-  movdqa      [rsp+50h],xmm0 
-  mov         [rsp+50h],eax 
-  mov         eax, [rdx+rbx] 
-  movdqa      xmm0,[rsp+50h] 
-  mov         [rsp+60h],eax 
-  punpckldq   xmm0, [rsp+60h] 
-  punpcklqdq  xmm3,xmm0 
-  movdqa      xmm0,xmm4 
-  punpcklbw   xmm0,xmm3 
-  punpckhbw   xmm4,xmm3 
-  punpcklwd   xmm15,xmm0 
-  punpckhwd   xmm1,xmm0 
-  movdqa      xmm0,xmm5 
-  movdqa      xmm12,xmm15 
-  punpcklwd   xmm0,xmm4 
-  punpckhwd   xmm5,xmm4 
-  punpckldq   xmm12,xmm0 
-  punpckhdq   xmm15,xmm0 
-  movdqa      xmm0,xmm1 
-  movdqa      xmm11,xmm12 
-  punpckldq   xmm0,xmm5 
-  punpckhdq   xmm1,xmm5 
-  punpcklqdq  xmm11,xmm0 
-  punpckhqdq  xmm12,xmm0 
-  movsx       eax,r9w 
-  movdqa      xmm14,xmm15 
-  punpcklqdq  xmm14,xmm1 
-  punpckhqdq  xmm15,xmm1 
-  pxor        xmm1,xmm1 
-  movd        xmm0,eax 
-  movdqa      xmm4,xmm12 
-  movdqa      xmm8,xmm11 
-  movsx       eax,word [rsp+170h] ; iBeta
-  punpcklwd   xmm0,xmm0 
-  punpcklbw   xmm4,xmm1 
-  punpckhbw   xmm12,xmm1 
-  movdqa      xmm9,xmm14 
-  movdqa      xmm7,xmm15 
-  movdqa      xmm10,xmm15 
-  pshufd      xmm13,xmm0,0 
-  punpcklbw   xmm9,xmm1 
-  punpckhbw   xmm14,xmm1 
-  movdqa      xmm6,xmm13 
-  movd        xmm0,eax 
-  movdqa      [rsp],xmm11 
-  mov         eax,2 
-  cwde             
-  punpckhbw   xmm11,xmm1 
-  punpckhbw   xmm10,xmm1 
-  punpcklbw   xmm7,xmm1 
-  punpcklwd   xmm0,xmm0 
-  punpcklbw   xmm8,xmm1 
-  pshufd      xmm3,xmm0,0 
-  movdqa      xmm1,xmm8 
-  movdqa      xmm0,xmm4 
-  psubw       xmm0,xmm9 
-  psubw       xmm1,xmm4 
-  movdqa      xmm2,xmm3 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm6,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,xmm3 
-  pcmpgtw     xmm2,xmm0 
-  pand        xmm6,xmm2 
-  movdqa      xmm0,xmm7 
-  movdqa      xmm2,xmm3 
-  psubw       xmm0,xmm9 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm1,xmm0 
-  pand        xmm6,xmm1 
-  movdqa      xmm0,xmm12 
-  movdqa      xmm1,xmm11 
-  psubw       xmm0,xmm14 
-  psubw       xmm1,xmm12 
-  movdqa      xmm5,xmm6 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm13,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,xmm8 
-  pcmpgtw     xmm2,xmm0 
-  paddw       xmm1,xmm8 
-  movdqa      xmm0,xmm10 
-  pand        xmm13,xmm2 
-  psubw       xmm0,xmm14 
-  paddw       xmm1,xmm4 
-  movdqa      xmm2,xmm11 
-  pabsw       xmm0,xmm0 
-  paddw       xmm2,xmm11 
-  paddw       xmm1,xmm7 
-  pcmpgtw     xmm3,xmm0 
-  paddw       xmm2,xmm12 
-  movd        xmm0,eax 
-  pand        xmm13,xmm3 
-  paddw       xmm2,xmm10 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm3,xmm0,0 
-  movdqa      xmm0,xmm6 
-  paddw       xmm1,xmm3 
-  pandn       xmm0,xmm4 
-  paddw       xmm2,xmm3 
-  psraw       xmm1,2 
-  pand        xmm5,xmm1 
-  por         xmm5,xmm0 
-  paddw       xmm7,xmm7 
-  paddw       xmm10,xmm10 
-  psraw       xmm2,2 
-  movdqa      xmm1,xmm13 
-  movdqa      xmm0,xmm13 
-  pandn       xmm0,xmm12 
-  pand        xmm1,xmm2 
-  paddw       xmm7,xmm9 
-  por         xmm1,xmm0 
-  paddw       xmm10,xmm14 
-  paddw       xmm7,xmm8 
-  movdqa      xmm0,xmm13 
-  packuswb    xmm5,xmm1 
-  paddw       xmm7,xmm3 
-  paddw       xmm10,xmm11 
-  movdqa      xmm1,xmm6 
-  paddw       xmm10,xmm3 
-  pandn       xmm6,xmm9 
-  psraw       xmm7,2 
-  pand        xmm1,xmm7 
-  psraw       xmm10,2 
-  pandn       xmm13,xmm14 
-  pand        xmm0,xmm10 
-  por         xmm1,xmm6 
-  movdqa      xmm6,[rsp] 
-  movdqa      xmm4,xmm6 
-  por         xmm0,xmm13 
-  punpcklbw   xmm4,xmm5 
-  punpckhbw   xmm6,xmm5 
-  movdqa      xmm3,xmm4 
-  packuswb    xmm1,xmm0 
-  movdqa      xmm0,xmm1 
-  punpckhbw   xmm1,xmm15 
-  punpcklbw   xmm0,xmm15 
-  punpcklwd   xmm3,xmm0 
-  punpckhwd   xmm4,xmm0 
-  movdqa      xmm0,xmm6 
-  movdqa      xmm2,xmm3 
-  punpcklwd   xmm0,xmm1 
-  punpckhwd   xmm6,xmm1 
-  movdqa      xmm1,xmm4 
-  punpckldq   xmm2,xmm0 
-  punpckhdq   xmm3,xmm0 
-  punpckldq   xmm1,xmm6 
-  movdqa      xmm0,xmm2 
-  punpcklqdq  xmm0,xmm1 
-  punpckhdq   xmm4,xmm6 
-  punpckhqdq  xmm2,xmm1 
-  movdqa      [rsp+10h],xmm0 
-  movdqa      [rsp+60h],xmm2 
-  movdqa      xmm0,xmm3 
-  mov         eax,[rsp+10h] 
-  mov         [rcx-2],eax 
-  mov         eax,[rsp+60h] 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm3,xmm4 
-  mov         [r10+rcx-2],eax 
-  movdqa      [rsp+20h],xmm0 
-  mov         eax, [rsp+20h] 
-  movdqa      [rsp+70h],xmm3 
-  mov         [rcx+r10*2-2],eax 
-  mov         eax,[rsp+70h] 
-  mov         [rdx+rcx-2],eax 
-  mov         eax,[rsp+18h] 
-  mov         [r11],eax 
-  mov         eax,[rsp+68h] 
-  mov         [r10+r11],eax 
-  mov         eax,[rsp+28h] 
-  mov         [r11+r10*2],eax 
-  mov         eax,[rsp+78h] 
-  mov         [rdx+r11],eax 
-  mov         eax,[rsp+14h] 
-  mov         [rdi-2],eax 
-  mov         eax,[rsp+64h] 
-  mov         [r10+rdi-2],eax 
-  mov         eax,[rsp+24h] 
-  mov         [rdi+r10*2-2],eax 
-  mov         eax, [rsp+74h] 
-  mov         [rdx+rdi-2],eax 
-  mov         eax, [rsp+1Ch] 
-  mov         [rbx],eax 
-  mov         eax, [rsp+6Ch] 
-  mov         [r10+rbx],eax 
-  mov         eax,[rsp+2Ch] 
-  mov         [rbx+r10*2],eax 
-  mov         eax,[rsp+7Ch] 
-  mov         [rdx+rbx],eax  
-  lea         r11,[rsp+140h] 
-  mov         rbx, [r11+28h]    
-  mov         rsp,r11 
-  pop         rdi  
+  mov         rax,rsp                 ; inputs as used below: rcx, rdx = two byte buffers (A, B), r8d = row pitch, r9w = first threshold
+  mov         [rax+20h],rbx           ; save rbx at [entry rsp + 20h]; reloaded at the end via [r11+28h]
+  push        rdi
+  sub         rsp,140h                ; scratch frame used for staging rows and for the transposed output
+  mov         rdi,rdx                 ; rdi = buffer B pointer (rdx is reused as 3*pitch below)
+  lea         eax,[r8*4]
+  movsxd      r10,eax                 ; r10 = 4*pitch (temporarily)
+  mov         eax,[rcx-2]             ; A row 0: 4 bytes starting 2 bytes before the pointer
+  mov         [rsp+10h],eax           ; staged in the low dword of a 16-byte slot; only that dword is meaningful
+  lea         rbx,[r10+rdx-2]         ; rbx = B + 4*pitch - 2 (B row 4)
+  lea         r11,[r10+rcx-2]         ; r11 = A + 4*pitch - 2 (A row 4)
+  movdqa      xmm5,[rsp+10h]
+  movsxd      r10,r8d                 ; r10 = pitch
+  mov         eax,[r10+rcx-2]         ; A row 1
+  lea         rdx,[r10+r10*2]         ; rdx = 3*pitch
+  mov         [rsp+20h],eax
+  mov         eax,[rcx+r10*2-2]       ; A row 2
+  mov         [rsp+30h],eax
+  mov         eax,[rdx+rcx-2]         ; A row 3
+  movdqa      xmm2,[rsp+20h]
+  mov         [rsp+40h],eax
+  mov         eax, [rdi-2]            ; B row 0
+  movdqa      xmm4,[rsp+30h]
+  mov         [rsp+50h],eax
+  mov         eax,[r10+rdi-2]         ; B row 1
+  movdqa      xmm3,[rsp+40h]
+  mov         [rsp+60h],eax
+  mov         eax,[rdi+r10*2-2]       ; B row 2
+  punpckldq   xmm5,[rsp+50h]          ; xmm5 low qword = A0 | B0
+  mov         [rsp+70h],eax
+  mov         eax, [rdx+rdi-2]        ; B row 3
+  punpckldq   xmm2, [rsp+60h]         ; xmm2 low qword = A1 | B1
+  mov          [rsp+80h],eax
+  mov         eax,[r11]               ; A row 4
+  punpckldq   xmm4, [rsp+70h]         ; xmm4 low qword = A2 | B2
+  mov         [rsp+50h],eax
+  mov         eax,[rbx]               ; B row 4
+  punpckldq   xmm3,[rsp+80h]          ; xmm3 low qword = A3 | B3
+  mov         [rsp+60h],eax
+  mov         eax,[r10+r11]           ; A row 5
+  movdqa      xmm0, [rsp+50h]
+  punpckldq   xmm0, [rsp+60h]         ; xmm0 low qword = A4 | B4
+  punpcklqdq  xmm5,xmm0               ; xmm5 = A0 B0 A4 B4
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax           ; overwrite the low dword with the next row
+  mov         eax,[r10+rbx]           ; B row 5
+  movdqa      xmm0,[rsp+50h]
+  movdqa      xmm1,xmm5
+  mov         [rsp+60h],eax
+  mov         eax,[r11+r10*2]         ; A row 6
+  punpckldq   xmm0, [rsp+60h]         ; xmm0 low qword = A5 | B5
+  punpcklqdq  xmm2,xmm0               ; xmm2 = A1 B1 A5 B5
+  punpcklbw   xmm1,xmm2               ; start of the byte transpose: interleave rows 0/1
+  punpckhbw   xmm5,xmm2               ; interleave rows 4/5
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax,[rbx+r10*2]         ; B row 6
+  movdqa      xmm0,[rsp+50h]
+  mov         [rsp+60h],eax
+  mov         eax, [rdx+r11]          ; A row 7
+  movdqa      xmm15,xmm1
+  punpckldq   xmm0,[rsp+60h]          ; xmm0 low qword = A6 | B6
+  punpcklqdq  xmm4,xmm0               ; xmm4 = A2 B2 A6 B6
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax, [rdx+rbx]          ; B row 7
+  movdqa      xmm0,[rsp+50h]
+  mov         [rsp+60h],eax
+  punpckldq   xmm0, [rsp+60h]         ; xmm0 low qword = A7 | B7
+  punpcklqdq  xmm3,xmm0               ; xmm3 = A3 B3 A7 B7
+  movdqa      xmm0,xmm4
+  punpcklbw   xmm0,xmm3               ; interleave rows 2/3
+  punpckhbw   xmm4,xmm3               ; interleave rows 6/7
+  punpcklwd   xmm15,xmm0              ; A rows 0-3, grouped by column
+  punpckhwd   xmm1,xmm0               ; B rows 0-3, grouped by column
+  movdqa      xmm0,xmm5
+  movdqa      xmm12,xmm15
+  punpcklwd   xmm0,xmm4               ; A rows 4-7, grouped by column
+  punpckhwd   xmm5,xmm4               ; B rows 4-7, grouped by column
+  punpckldq   xmm12,xmm0              ; A: columns 0 and 1 (8 rows each)
+  punpckhdq   xmm15,xmm0              ; A: columns 2 and 3
+  movdqa      xmm0,xmm1
+  movdqa      xmm11,xmm12
+  punpckldq   xmm0,xmm5               ; B: columns 0 and 1
+  punpckhdq   xmm1,xmm5               ; B: columns 2 and 3
+  punpcklqdq  xmm11,xmm0              ; xmm11 = column at byte offset -2 (A rows | B rows)
+  punpckhqdq  xmm12,xmm0              ; xmm12 = column at byte offset -1
+  movsx       eax,r9w                 ; eax = first threshold (presumably iAlpha - confirm), sign-extended
+  movdqa      xmm14,xmm15
+  punpcklqdq  xmm14,xmm1              ; xmm14 = column at byte offset 0
+  punpckhqdq  xmm15,xmm1              ; xmm15 = column at byte offset +1
+  pxor        xmm1,xmm1               ; xmm1 = 0 for byte-to-word widening
+  movd        xmm0,eax
+  movdqa      xmm4,xmm12
+  movdqa      xmm8,xmm11
+  movsx       eax,word [rsp+170h] ; iBeta
+  punpcklwd   xmm0,xmm0
+  punpcklbw   xmm4,xmm1               ; xmm4  = col(-1) as words, A half
+  punpckhbw   xmm12,xmm1              ; xmm12 = col(-1) as words, B half
+  movdqa      xmm9,xmm14
+  movdqa      xmm7,xmm15
+  movdqa      xmm10,xmm15
+  pshufd      xmm13,xmm0,0            ; xmm13 = first threshold broadcast to 8 words
+  punpcklbw   xmm9,xmm1               ; xmm9  = col(0) as words, A half
+  punpckhbw   xmm14,xmm1              ; xmm14 = col(0) as words, B half
+  movdqa      xmm6,xmm13
+  movd        xmm0,eax
+  movdqa      [rsp],xmm11             ; keep the untouched col(-2) bytes for the transpose back
+  mov         eax,2                   ; rounding constant
+  cwde                                ; sign-extends ax into eax; eax stays 2
+  punpckhbw   xmm11,xmm1              ; xmm11 = col(-2) as words, B half
+  punpckhbw   xmm10,xmm1              ; xmm10 = col(+1) as words, B half
+  punpcklbw   xmm7,xmm1               ; xmm7  = col(+1) as words, A half
+  punpcklwd   xmm0,xmm0
+  punpcklbw   xmm8,xmm1               ; xmm8  = col(-2) as words, A half
+  pshufd      xmm3,xmm0,0             ; xmm3 = iBeta broadcast to 8 words
+  movdqa      xmm1,xmm8
+  movdqa      xmm0,xmm4
+  psubw       xmm0,xmm9               ; col(-1) - col(0), A half
+  psubw       xmm1,xmm4               ; col(-2) - col(-1), A half
+  movdqa      xmm2,xmm3
+  pabsw       xmm0,xmm0               ; NOTE(review): pabsw is an SSSE3 instruction although the routine is named _sse2 - confirm CPU dispatch
+  pcmpgtw     xmm6,xmm0               ; threshold > |col(-1) - col(0)|
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm3
+  pcmpgtw     xmm2,xmm0               ; iBeta > |col(-2) - col(-1)|
+  pand        xmm6,xmm2
+  movdqa      xmm0,xmm7
+  movdqa      xmm2,xmm3
+  psubw       xmm0,xmm9               ; col(+1) - col(0), A half
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm1,xmm0               ; iBeta > |col(+1) - col(0)|
+  pand        xmm6,xmm1               ; xmm6 = per-pixel filter mask, A half
+  movdqa      xmm0,xmm12
+  movdqa      xmm1,xmm11
+  psubw       xmm0,xmm14              ; col(-1) - col(0), B half
+  psubw       xmm1,xmm12              ; col(-2) - col(-1), B half
+  movdqa      xmm5,xmm6
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm13,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm2,xmm0
+  paddw       xmm1,xmm8               ; 2*col(-2), A half
+  movdqa      xmm0,xmm10
+  pand        xmm13,xmm2
+  psubw       xmm0,xmm14              ; col(+1) - col(0), B half
+  paddw       xmm1,xmm4               ; + col(-1)
+  movdqa      xmm2,xmm11
+  pabsw       xmm0,xmm0
+  paddw       xmm2,xmm11              ; 2*col(-2), B half
+  paddw       xmm1,xmm7               ; + col(+1)
+  pcmpgtw     xmm3,xmm0
+  paddw       xmm2,xmm12              ; + col(-1)
+  movd        xmm0,eax
+  pand        xmm13,xmm3              ; xmm13 = per-pixel filter mask, B half
+  paddw       xmm2,xmm10              ; + col(+1)
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm3,xmm0,0             ; xmm3 = 2 in every word
+  movdqa      xmm0,xmm6
+  paddw       xmm1,xmm3
+  pandn       xmm0,xmm4               ; unfiltered col(-1) where the mask is clear
+  paddw       xmm2,xmm3
+  psraw       xmm1,2                  ; new col(-1) = (2*col(-2) + col(-1) + col(+1) + 2) >> 2, A half
+  pand        xmm5,xmm1
+  por         xmm5,xmm0               ; blended col(-1), A half
+  paddw       xmm7,xmm7               ; 2*col(+1), A half
+  paddw       xmm10,xmm10             ; 2*col(+1), B half
+  psraw       xmm2,2                  ; new col(-1), B half
+  movdqa      xmm1,xmm13
+  movdqa      xmm0,xmm13
+  pandn       xmm0,xmm12
+  pand        xmm1,xmm2
+  paddw       xmm7,xmm9               ; + col(0)
+  por         xmm1,xmm0               ; blended col(-1), B half
+  paddw       xmm10,xmm14             ; + col(0)
+  paddw       xmm7,xmm8               ; + col(-2)
+  movdqa      xmm0,xmm13
+  packuswb    xmm5,xmm1               ; xmm5 = filtered col(-1) as 16 bytes
+  paddw       xmm7,xmm3
+  paddw       xmm10,xmm11             ; + col(-2)
+  movdqa      xmm1,xmm6
+  paddw       xmm10,xmm3
+  pandn       xmm6,xmm9
+  psraw       xmm7,2                  ; new col(0) = (2*col(+1) + col(0) + col(-2) + 2) >> 2, A half
+  pand        xmm1,xmm7
+  psraw       xmm10,2                 ; new col(0), B half
+  pandn       xmm13,xmm14
+  pand        xmm0,xmm10
+  por         xmm1,xmm6               ; blended col(0), A half
+  movdqa      xmm6,[rsp]              ; original col(-2) bytes saved before filtering
+  movdqa      xmm4,xmm6
+  por         xmm0,xmm13              ; blended col(0), B half
+  punpcklbw   xmm4,xmm5               ; transpose back: pair col(-2)/col(-1), A rows
+  punpckhbw   xmm6,xmm5               ; pair col(-2)/col(-1), B rows
+  movdqa      xmm3,xmm4
+  packuswb    xmm1,xmm0               ; xmm1 = filtered col(0) as 16 bytes
+  movdqa      xmm0,xmm1
+  punpckhbw   xmm1,xmm15              ; pair col(0)/col(+1), B rows
+  punpcklbw   xmm0,xmm15              ; pair col(0)/col(+1), A rows
+  punpcklwd   xmm3,xmm0               ; A rows 0-3, 4 bytes each
+  punpckhwd   xmm4,xmm0               ; A rows 4-7
+  movdqa      xmm0,xmm6
+  movdqa      xmm2,xmm3
+  punpcklwd   xmm0,xmm1               ; B rows 0-3
+  punpckhwd   xmm6,xmm1               ; B rows 4-7
+  movdqa      xmm1,xmm4
+  punpckldq   xmm2,xmm0               ; A0 B0 A1 B1
+  punpckhdq   xmm3,xmm0               ; A2 B2 A3 B3
+  punpckldq   xmm1,xmm6               ; A4 B4 A5 B5
+  movdqa      xmm0,xmm2
+  punpcklqdq  xmm0,xmm1               ; A0 B0 A4 B4
+  punpckhdq   xmm4,xmm6               ; A6 B6 A7 B7
+  punpckhqdq  xmm2,xmm1               ; A1 B1 A5 B5
+  movdqa      [rsp+10h],xmm0          ; slot 10h: rows 0 and 4 (A0, B0, A4, B4)
+  movdqa      [rsp+60h],xmm2          ; slot 60h: rows 1 and 5
+  movdqa      xmm0,xmm3
+  mov         eax,[rsp+10h]
+  mov         [rcx-2],eax             ; A row 0
+  mov         eax,[rsp+60h]
+  punpcklqdq  xmm0,xmm4               ; A2 B2 A6 B6
+  punpckhqdq  xmm3,xmm4               ; A3 B3 A7 B7
+  mov         [r10+rcx-2],eax         ; A row 1
+  movdqa      [rsp+20h],xmm0          ; slot 20h: rows 2 and 6
+  mov         eax, [rsp+20h]
+  movdqa      [rsp+70h],xmm3          ; slot 70h: rows 3 and 7
+  mov         [rcx+r10*2-2],eax       ; A row 2
+  mov         eax,[rsp+70h]
+  mov         [rdx+rcx-2],eax         ; A row 3
+  mov         eax,[rsp+18h]
+  mov         [r11],eax               ; A row 4
+  mov         eax,[rsp+68h]
+  mov         [r10+r11],eax           ; A row 5
+  mov         eax,[rsp+28h]
+  mov         [r11+r10*2],eax         ; A row 6
+  mov         eax,[rsp+78h]
+  mov         [rdx+r11],eax           ; A row 7
+  mov         eax,[rsp+14h]
+  mov         [rdi-2],eax             ; B row 0
+  mov         eax,[rsp+64h]
+  mov         [r10+rdi-2],eax         ; B row 1
+  mov         eax,[rsp+24h]
+  mov         [rdi+r10*2-2],eax       ; B row 2
+  mov         eax, [rsp+74h]
+  mov         [rdx+rdi-2],eax         ; B row 3
+  mov         eax, [rsp+1Ch]
+  mov         [rbx],eax               ; B row 4
+  mov         eax, [rsp+6Ch]
+  mov         [r10+rbx],eax           ; B row 5
+  mov         eax,[rsp+2Ch]
+  mov         [rbx+r10*2],eax         ; B row 6
+  mov         eax,[rsp+7Ch]
+  mov         [rdx+rbx],eax           ; B row 7
+  lea         r11,[rsp+140h]          ; NOTE(review): xmm6-xmm15 were overwritten above and are never saved or restored in this routine - confirm against the target ABI
+  mov         rbx, [r11+28h]          ; restore rbx from the slot written on entry ([entry rsp + 20h])
+  mov         rsp,r11                 ; release the 140h-byte frame
+  pop         rdi
   ret
 
 
@@ -1363,283 +1363,283 @@
 WELS_EXTERN DeblockChromaLt4H_sse2
 ALIGN  16
 DeblockChromaLt4H_sse2:
-  mov         rax,rsp 
-  push        rbx  
-  push        rbp  
-  push        rsi  
-  push        rdi  
-  push        r12  
-  sub         rsp,170h  
-  
-  movsxd      rsi,r8d 
-  lea         eax,[r8*4] 
-  mov         r11d,r9d 
-  movsxd      r10,eax 
-  mov         eax, [rcx-2] 
-  mov         r12,rdx 
-  mov         [rsp+40h],eax 
-  mov         eax, [rsi+rcx-2] 
-  lea         rbx,[r10+rcx-2] 
-  movdqa      xmm5,[rsp+40h] 
-  mov         [rsp+50h],eax 
-  mov         eax, [rcx+rsi*2-2] 
-  lea         rbp,[r10+rdx-2] 
-  movdqa      xmm2, [rsp+50h] 
-  mov         [rsp+60h],eax 
-  lea         r10,[rsi+rsi*2] 
-  mov         rdi,rcx 
-  mov         eax,[r10+rcx-2] 
-  movdqa      xmm4,[rsp+60h] 
-  mov         [rsp+70h],eax 
-  mov         eax,[rdx-2] 
-  mov         [rsp+80h],eax 
-  mov         eax, [rsi+rdx-2] 
-  movdqa      xmm3,[rsp+70h] 
-  mov         [rsp+90h],eax 
-  mov         eax,[rdx+rsi*2-2] 
-  punpckldq   xmm5,[rsp+80h] 
-  mov         [rsp+0A0h],eax 
-  mov         eax, [r10+rdx-2] 
-  punpckldq   xmm2,[rsp+90h] 
-  mov         [rsp+0B0h],eax 
-  mov         eax, [rbx] 
-  punpckldq   xmm4,[rsp+0A0h] 
-  mov         [rsp+80h],eax 
-  mov         eax,[rbp] 
-  punpckldq   xmm3,[rsp+0B0h] 
-  mov         [rsp+90h],eax 
-  mov         eax,[rsi+rbx] 
-  movdqa      xmm0,[rsp+80h] 
-  punpckldq   xmm0,[rsp+90h] 
-  punpcklqdq  xmm5,xmm0 
-  movdqa      [rsp+80h],xmm0 
-  mov         [rsp+80h],eax 
-  mov         eax,[rsi+rbp] 
-  movdqa      xmm0,[rsp+80h] 
-  movdqa      xmm1,xmm5 
-  mov         [rsp+90h],eax 
-  mov         eax,[rbx+rsi*2] 
-  punpckldq   xmm0,[rsp+90h] 
-  punpcklqdq  xmm2,xmm0 
-  punpcklbw   xmm1,xmm2 
-  punpckhbw   xmm5,xmm2 
-  movdqa      [rsp+80h],xmm0 
-  mov         [rsp+80h],eax 
-  mov         eax,[rbp+rsi*2] 
-  movdqa      xmm0, [rsp+80h] 
-  mov         [rsp+90h],eax 
-  mov         eax,[r10+rbx] 
-  movdqa      xmm7,xmm1 
-  punpckldq   xmm0,[rsp+90h] 
-  punpcklqdq  xmm4,xmm0 
-  movdqa      [rsp+80h],xmm0 
-  mov         [rsp+80h],eax 
-  mov         eax, [r10+rbp] 
-  movdqa      xmm0,[rsp+80h] 
-  mov         [rsp+90h],eax 
-  punpckldq   xmm0,[rsp+90h] 
-  punpcklqdq  xmm3,xmm0 
-  movdqa      xmm0,xmm4 
-  punpcklbw   xmm0,xmm3 
-  punpckhbw   xmm4,xmm3 
-  punpcklwd   xmm7,xmm0 
-  punpckhwd   xmm1,xmm0 
-  movdqa      xmm0,xmm5 
-  movdqa      xmm6,xmm7 
-  punpcklwd   xmm0,xmm4 
-  punpckhwd   xmm5,xmm4 
-  punpckldq   xmm6,xmm0 
-  punpckhdq   xmm7,xmm0 
-  movdqa      xmm0,xmm1 
-  punpckldq   xmm0,xmm5 
+  mov         rax,rsp
+  push        rbx
+  push        rbp
+  push        rsi
+  push        rdi
+  push        r12
+  sub         rsp,170h
+
+  movsxd      rsi,r8d
+  lea         eax,[r8*4]
+  mov         r11d,r9d
+  movsxd      r10,eax
+  mov         eax, [rcx-2]
+  mov         r12,rdx
+  mov         [rsp+40h],eax
+  mov         eax, [rsi+rcx-2]
+  lea         rbx,[r10+rcx-2]
+  movdqa      xmm5,[rsp+40h]
+  mov         [rsp+50h],eax
+  mov         eax, [rcx+rsi*2-2]
+  lea         rbp,[r10+rdx-2]
+  movdqa      xmm2, [rsp+50h]
+  mov         [rsp+60h],eax
+  lea         r10,[rsi+rsi*2]
+  mov         rdi,rcx
+  mov         eax,[r10+rcx-2]
+  movdqa      xmm4,[rsp+60h]
+  mov         [rsp+70h],eax
+  mov         eax,[rdx-2]
+  mov         [rsp+80h],eax
+  mov         eax, [rsi+rdx-2]
+  movdqa      xmm3,[rsp+70h]
+  mov         [rsp+90h],eax
+  mov         eax,[rdx+rsi*2-2]
+  punpckldq   xmm5,[rsp+80h]
+  mov         [rsp+0A0h],eax
+  mov         eax, [r10+rdx-2]
+  punpckldq   xmm2,[rsp+90h]
+  mov         [rsp+0B0h],eax
+  mov         eax, [rbx]
+  punpckldq   xmm4,[rsp+0A0h]
+  mov         [rsp+80h],eax
+  mov         eax,[rbp]
+  punpckldq   xmm3,[rsp+0B0h]
+  mov         [rsp+90h],eax
+  mov         eax,[rsi+rbx]
+  movdqa      xmm0,[rsp+80h]
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm5,xmm0
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax,[rsi+rbp]
+  movdqa      xmm0,[rsp+80h]
+  movdqa      xmm1,xmm5
+  mov         [rsp+90h],eax
+  mov         eax,[rbx+rsi*2]
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm2,xmm0
+  punpcklbw   xmm1,xmm2
+  punpckhbw   xmm5,xmm2
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax,[rbp+rsi*2]
+  movdqa      xmm0, [rsp+80h]
+  mov         [rsp+90h],eax
+  mov         eax,[r10+rbx]
+  movdqa      xmm7,xmm1
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm4,xmm0
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax, [r10+rbp]
+  movdqa      xmm0,[rsp+80h]
+  mov         [rsp+90h],eax
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm3,xmm0
+  movdqa      xmm0,xmm4
+  punpcklbw   xmm0,xmm3
+  punpckhbw   xmm4,xmm3
+  punpcklwd   xmm7,xmm0
+  punpckhwd   xmm1,xmm0
+  movdqa      xmm0,xmm5
+  movdqa      xmm6,xmm7
+  punpcklwd   xmm0,xmm4
+  punpckhwd   xmm5,xmm4
+  punpckldq   xmm6,xmm0
+  punpckhdq   xmm7,xmm0
+  movdqa      xmm0,xmm1
+  punpckldq   xmm0,xmm5
   mov         rax, [rsp+1C8h]    ; pTC
-  punpckhdq   xmm1,xmm5 
-  movdqa      xmm9,xmm6 
-  punpckhqdq  xmm6,xmm0 
-  punpcklqdq  xmm9,xmm0 
-  movdqa      xmm2,xmm7 
-  movdqa      xmm13,xmm6 
-  movdqa      xmm4,xmm9 
-  movdqa      [rsp+10h],xmm9 
-  punpcklqdq  xmm2,xmm1 
-  punpckhqdq  xmm7,xmm1 
-  pxor        xmm1,xmm1 
-  movsx       ecx,byte [rax+3] 
-  movsx       edx,byte [rax+2] 
-  movsx       r8d,byte [rax+1] 
-  movsx       r9d,byte [rax] 
-  movdqa      xmm10,xmm1 
-  movdqa      xmm15,xmm2 
-  punpckhbw   xmm2,xmm1 
-  punpckhbw   xmm6,xmm1 
-  punpcklbw   xmm4,xmm1 
-  movsx       eax,r11w 
-  mov         word [rsp+0Eh],cx 
-  mov         word [rsp+0Ch],cx 
-  movdqa      xmm3,xmm7 
-  movdqa      xmm8,xmm7 
-  movdqa      [rsp+20h],xmm7 
-  punpcklbw   xmm15,xmm1 
-  punpcklbw   xmm13,xmm1 
-  punpcklbw   xmm3,xmm1 
-  mov         word [rsp+0Ah],dx 
-  mov         word [rsp+8],dx 
-  mov         word [rsp+6],r8w 
-  movd        xmm0,eax 
-  movdqa      [rsp+30h],xmm6 
-  punpckhbw   xmm9,xmm1 
-  punpckhbw   xmm8,xmm1 
-  punpcklwd   xmm0,xmm0 
+  punpckhdq   xmm1,xmm5
+  movdqa      xmm9,xmm6
+  punpckhqdq  xmm6,xmm0
+  punpcklqdq  xmm9,xmm0
+  movdqa      xmm2,xmm7
+  movdqa      xmm13,xmm6
+  movdqa      xmm4,xmm9
+  movdqa      [rsp+10h],xmm9
+  punpcklqdq  xmm2,xmm1
+  punpckhqdq  xmm7,xmm1
+  pxor        xmm1,xmm1
+  movsx       ecx,byte [rax+3]
+  movsx       edx,byte [rax+2]
+  movsx       r8d,byte [rax+1]
+  movsx       r9d,byte [rax]
+  movdqa      xmm10,xmm1
+  movdqa      xmm15,xmm2
+  punpckhbw   xmm2,xmm1
+  punpckhbw   xmm6,xmm1
+  punpcklbw   xmm4,xmm1
+  movsx       eax,r11w
+  mov         word [rsp+0Eh],cx
+  mov         word [rsp+0Ch],cx
+  movdqa      xmm3,xmm7
+  movdqa      xmm8,xmm7
+  movdqa      [rsp+20h],xmm7
+  punpcklbw   xmm15,xmm1
+  punpcklbw   xmm13,xmm1
+  punpcklbw   xmm3,xmm1
+  mov         word [rsp+0Ah],dx
+  mov         word [rsp+8],dx
+  mov         word [rsp+6],r8w
+  movd        xmm0,eax
+  movdqa      [rsp+30h],xmm6
+  punpckhbw   xmm9,xmm1
+  punpckhbw   xmm8,xmm1
+  punpcklwd   xmm0,xmm0
   movsx       eax,word [rsp+1C0h]   ; iBeta
-  mov         word [rsp+4],r8w 
-  mov         word [rsp+2],r9w 
-  pshufd      xmm12,xmm0,0 
-  mov         word [rsp],r9w 
-  movd        xmm0,eax 
-  mov         eax,4 
-  cwde             
-  movdqa      xmm14, [rsp] 
-  movdqa      [rsp],xmm2 
-  movdqa      xmm2,xmm12 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm11,xmm0,0 
-  psubw       xmm10,xmm14 
-  movd        xmm0,eax 
-  movdqa      xmm7,xmm14 
-  movdqa      xmm6,xmm14 
-  pcmpgtw     xmm7,xmm1 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm5,xmm0,0 
-  movdqa      xmm0,xmm4 
-  movdqa      xmm1,xmm15 
-  psubw       xmm4,xmm13 
-  psubw       xmm0,xmm3 
-  psubw       xmm1,xmm13 
-  psubw       xmm3,xmm15 
-  psllw       xmm1,2 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm5 
-  movdqa      xmm0,xmm10 
-  psraw       xmm1,3 
-  pmaxsw      xmm0,xmm1 
-  pminsw      xmm6,xmm0 
-  movdqa      xmm1,xmm11 
-  movdqa      xmm0,xmm13 
-  psubw       xmm0,xmm15 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm2,xmm0 
-  pabsw       xmm0,xmm4 
-  pcmpgtw     xmm1,xmm0 
-  pabsw       xmm0,xmm3 
-  pand        xmm2,xmm1 
-  movdqa      xmm1,xmm11 
-  movdqa      xmm3,[rsp+30h] 
-  pcmpgtw     xmm1,xmm0 
-  movdqa      xmm0,xmm9 
-  pand        xmm2,xmm1 
-  psubw       xmm0,xmm8 
-  psubw       xmm9,xmm3 
-  pand        xmm2,xmm7 
-  pand        xmm6,xmm2 
-  psubw       xmm15,xmm6 
-  paddw       xmm13,xmm6 
-  movdqa      xmm2,[rsp] 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  psubw       xmm8,xmm2 
-  psllw       xmm1,2 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm5 
-  movdqa      xmm0,xmm3 
-  movdqa      xmm5,[rsp+10h] 
-  psubw       xmm0,xmm2 
-  psraw       xmm1,3 
-  movdqa      xmm4,xmm5 
-  pabsw       xmm0,xmm0 
-  pmaxsw      xmm10,xmm1 
-  movdqa      xmm1,xmm11 
-  pcmpgtw     xmm12,xmm0 
-  pabsw       xmm0,xmm9 
-  pminsw      xmm14,xmm10 
-  pcmpgtw     xmm1,xmm0 
-  pabsw       xmm0,xmm8 
-  pcmpgtw     xmm11,xmm0 
-  pand        xmm12,xmm1 
-  movdqa      xmm1,[rsp+20h] 
-  pand        xmm12,xmm11 
-  pand        xmm12,xmm7 
-  pand        xmm14,xmm12 
-  paddw       xmm3,xmm14 
-  psubw       xmm2,xmm14 
-  packuswb    xmm13,xmm3 
-  packuswb    xmm15,xmm2 
-  punpcklbw   xmm4,xmm13 
-  punpckhbw   xmm5,xmm13 
-  movdqa      xmm0,xmm15 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm15,xmm1 
-  movdqa      xmm3,xmm4 
-  punpcklwd   xmm3,xmm0 
-  punpckhwd   xmm4,xmm0 
-  movdqa      xmm0,xmm5 
-  movdqa      xmm2,xmm3 
-  movdqa      xmm1,xmm4 
-  punpcklwd   xmm0,xmm15 
-  punpckhwd   xmm5,xmm15 
-  punpckldq   xmm2,xmm0 
-  punpckhdq   xmm3,xmm0 
-  punpckldq   xmm1,xmm5 
-  movdqa      xmm0,xmm2 
-  punpcklqdq  xmm0,xmm1 
-  punpckhdq   xmm4,xmm5 
-  punpckhqdq  xmm2,xmm1 
-  movdqa      [rsp+40h],xmm0 
-  movdqa      xmm0,xmm3 
-  movdqa      [rsp+90h],xmm2 
-  mov         eax,[rsp+40h] 
-  mov         [rdi-2],eax 
-  mov         eax, [rsp+90h] 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm3,xmm4 
-  mov         [rsi+rdi-2],eax 
-  movdqa      [rsp+50h],xmm0 
-  mov         eax,[rsp+50h] 
-  movdqa      [rsp+0A0h],xmm3 
-  mov         [rdi+rsi*2-2],eax 
-  mov         eax,[rsp+0A0h] 
-  mov         [r10+rdi-2],eax 
-  mov         eax,[rsp+48h] 
-  mov         [rbx],eax 
-  mov         eax,[rsp+98h] 
-  mov         [rsi+rbx],eax 
-  mov         eax,[rsp+58h] 
-  mov         [rbx+rsi*2],eax 
-  mov         eax, [rsp+0A8h] 
-  mov         [r10+rbx],eax 
-  mov         eax, [rsp+44h] 
-  mov         [r12-2],eax 
-  mov         eax,[rsp+94h] 
-  mov         [rsi+r12-2],eax 
-  mov         eax,[rsp+54h] 
-  mov         [r12+rsi*2-2],eax 
-  mov         eax, [rsp+0A4h] 
-  mov         [r10+r12-2],eax 
-  mov         eax,[rsp+4Ch] 
-  mov         [rbp],eax 
-  mov         eax,[rsp+9Ch] 
-  mov         [rsi+rbp],eax 
-  mov         eax, [rsp+5Ch] 
-  mov         [rbp+rsi*2],eax 
-  mov         eax,[rsp+0ACh] 
-  mov         [r10+rbp],eax   
-  lea         r11,[rsp+170h]    
-  mov         rsp,r11 
-  pop         r12  
-  pop         rdi  
-  pop         rsi  
-  pop         rbp  
-  pop         rbx  
-  ret 
+  mov         word [rsp+4],r8w
+  mov         word [rsp+2],r9w
+  pshufd      xmm12,xmm0,0
+  mov         word [rsp],r9w
+  movd        xmm0,eax
+  mov         eax,4
+  cwde
+  movdqa      xmm14, [rsp]
+  movdqa      [rsp],xmm2
+  movdqa      xmm2,xmm12
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm11,xmm0,0
+  psubw       xmm10,xmm14
+  movd        xmm0,eax
+  movdqa      xmm7,xmm14
+  movdqa      xmm6,xmm14
+  pcmpgtw     xmm7,xmm1
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm5,xmm0,0
+  movdqa      xmm0,xmm4
+  movdqa      xmm1,xmm15
+  psubw       xmm4,xmm13
+  psubw       xmm0,xmm3
+  psubw       xmm1,xmm13
+  psubw       xmm3,xmm15
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm10
+  psraw       xmm1,3
+  pmaxsw      xmm0,xmm1
+  pminsw      xmm6,xmm0
+  movdqa      xmm1,xmm11
+  movdqa      xmm0,xmm13
+  psubw       xmm0,xmm15
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm2,xmm0
+  pabsw       xmm0,xmm4
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm3
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm11
+  movdqa      xmm3,[rsp+30h]
+  pcmpgtw     xmm1,xmm0
+  movdqa      xmm0,xmm9
+  pand        xmm2,xmm1
+  psubw       xmm0,xmm8
+  psubw       xmm9,xmm3
+  pand        xmm2,xmm7
+  pand        xmm6,xmm2
+  psubw       xmm15,xmm6
+  paddw       xmm13,xmm6
+  movdqa      xmm2,[rsp]
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  psubw       xmm8,xmm2
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm3
+  movdqa      xmm5,[rsp+10h]
+  psubw       xmm0,xmm2
+  psraw       xmm1,3
+  movdqa      xmm4,xmm5
+  pabsw       xmm0,xmm0
+  pmaxsw      xmm10,xmm1
+  movdqa      xmm1,xmm11
+  pcmpgtw     xmm12,xmm0
+  pabsw       xmm0,xmm9
+  pminsw      xmm14,xmm10
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm8
+  pcmpgtw     xmm11,xmm0
+  pand        xmm12,xmm1
+  movdqa      xmm1,[rsp+20h]
+  pand        xmm12,xmm11
+  pand        xmm12,xmm7
+  pand        xmm14,xmm12
+  paddw       xmm3,xmm14
+  psubw       xmm2,xmm14
+  packuswb    xmm13,xmm3
+  packuswb    xmm15,xmm2
+  punpcklbw   xmm4,xmm13
+  punpckhbw   xmm5,xmm13
+  movdqa      xmm0,xmm15
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm15,xmm1
+  movdqa      xmm3,xmm4
+  punpcklwd   xmm3,xmm0
+  punpckhwd   xmm4,xmm0
+  movdqa      xmm0,xmm5
+  movdqa      xmm2,xmm3
+  movdqa      xmm1,xmm4
+  punpcklwd   xmm0,xmm15
+  punpckhwd   xmm5,xmm15
+  punpckldq   xmm2,xmm0
+  punpckhdq   xmm3,xmm0
+  punpckldq   xmm1,xmm5
+  movdqa      xmm0,xmm2
+  punpcklqdq  xmm0,xmm1
+  punpckhdq   xmm4,xmm5
+  punpckhqdq  xmm2,xmm1
+  movdqa      [rsp+40h],xmm0
+  movdqa      xmm0,xmm3
+  movdqa      [rsp+90h],xmm2
+  mov         eax,[rsp+40h]
+  mov         [rdi-2],eax
+  mov         eax, [rsp+90h]
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm3,xmm4
+  mov         [rsi+rdi-2],eax
+  movdqa      [rsp+50h],xmm0
+  mov         eax,[rsp+50h]
+  movdqa      [rsp+0A0h],xmm3
+  mov         [rdi+rsi*2-2],eax
+  mov         eax,[rsp+0A0h]
+  mov         [r10+rdi-2],eax
+  mov         eax,[rsp+48h]
+  mov         [rbx],eax
+  mov         eax,[rsp+98h]
+  mov         [rsi+rbx],eax
+  mov         eax,[rsp+58h]
+  mov         [rbx+rsi*2],eax
+  mov         eax, [rsp+0A8h]
+  mov         [r10+rbx],eax
+  mov         eax, [rsp+44h]
+  mov         [r12-2],eax
+  mov         eax,[rsp+94h]
+  mov         [rsi+r12-2],eax
+  mov         eax,[rsp+54h]
+  mov         [r12+rsi*2-2],eax
+  mov         eax, [rsp+0A4h]
+  mov         [r10+r12-2],eax
+  mov         eax,[rsp+4Ch]
+  mov         [rbp],eax
+  mov         eax,[rsp+9Ch]
+  mov         [rsi+rbp],eax
+  mov         eax, [rsp+5Ch]
+  mov         [rbp+rsi*2],eax
+  mov         eax,[rsp+0ACh]
+  mov         [r10+rbp],eax
+  lea         r11,[rsp+170h]
+  mov         rsp,r11
+  pop         r12
+  pop         rdi
+  pop         rsi
+  pop         rbp
+  pop         rbx
+  ret
 
 
 
@@ -1649,258 +1649,258 @@
 WELS_EXTERN   DeblockLumaLt4V_sse2
 
 DeblockLumaLt4V_sse2:
-  push        rbp      
-  mov         r11,r8  ; pTC                                                    
-  sub         rsp,1B0h                                                       
-  lea         rbp,[rsp+20h]                                                  
-  movd        xmm4,edx                                                                                                  
-  movd        xmm2,ecx                                                       
-  mov         qword [rbp+180h],r12                                       
-  mov         r10,rdi                                                        
-  movsxd      r12,esi                                                        
+  push        rbp                            ; DeblockLumaLt4V_sse2 prologue: save caller's rbp (popped again right before ret)
+  mov         r11,r8  ; pTC: r11 = pointer whose bytes [r11]..[r11+3] are read as four signed 8-bit values below
+  sub         rsp,1B0h                       ; reserve 1B0h bytes of stack for register spills and temporaries
+  lea         rbp,[rsp+20h]                  ; rbp = base of the local frame; all [rbp+..] slots used below sit above rsp+20h
+  movd        xmm4,edx                       ; xmm4 = low dword of rdx, broadcast to 8 words later (threshold; presumably alpha - confirm against the C prototype)
+  movd        xmm2,ecx                       ; xmm2 = low dword of rcx, broadcast to 8 words later (threshold; presumably beta - confirm against the C prototype)
+  mov         qword [rbp+180h],r12           ; preserve r12; it is reloaded from this slot in the epilogue
+  mov         r10,rdi                        ; r10 = copy of the pixel pointer in rdi (moved back by one row further down)
+  movsxd      r12,esi                        ; r12 = esi sign-extended to 64 bits; used as the row stride in every address below
   add         rsi,rsi
-  movsxd      rdx,esi 
-  sub         r10,r12                                                        
-  movsx       r8d,byte [r11]                                             
-  pxor        xmm3,xmm3                                                      
-  punpcklwd   xmm2,xmm2                                                      
-  movaps      [rbp+50h],xmm14                                    
-  lea         rax,[r12+r12*2]                                                
-  movdqa      xmm14,[rdx+rdi]                                    
-  neg         rax                                                            
-  pshufd      xmm0,xmm2,0                                                    
-  movd        xmm2,r8d                                                       
-  movsx       rsi,byte [r11+1]                                           
-  movsx       r8d,byte [r11+2]                                           
-  movsx       r11d,byte [r11+3]                                          
-  movaps      [rbp+70h],xmm12                                    
-  movd        xmm1,esi                                                      
-  movaps      [rbp+80h],xmm11                                    
-  movd        xmm12,r8d                                                      
-  movd        xmm11,r11d                                                     
-  movdqa      xmm5, [rax+rdi]                                     
-  lea         rax,[r12+r12]                                                  
-  punpcklwd   xmm12,xmm12                                                    
-  neg         rax                                                            
-  punpcklwd   xmm11,xmm11                                                    
-  movaps      [rbp],xmm8                                         
-  movdqa      xmm8, [r10]                                         
-  punpcklwd   xmm2,xmm2                                                      
-  punpcklwd   xmm1,xmm1                                                      
-  punpcklqdq  xmm12,xmm12                                                    
-  punpcklqdq  xmm11,xmm11                                                    
-  punpcklqdq  xmm2,xmm2                                                      
-  punpcklqdq  xmm1,xmm1                                                      
-  shufps      xmm12,xmm11,88h                                                
-  movdqa      xmm11,xmm8                                                     
-  movaps      [rbp+30h],xmm9                                     
-  movdqa      xmm9,[rdi]                                         
-  shufps      xmm2,xmm1,88h                                                  
-  movdqa      xmm1,xmm5                                                      
-  punpcklbw   xmm11,xmm3                                                     
-  movaps      [rbp+20h],xmm6                                     
-  movaps      [rbp+60h],xmm13                                    
-  movdqa      xmm13,xmm11                                                    
-  movaps      [rbp+90h],xmm10                                    
-  movdqa      xmm10,xmm9                                                     
-  movdqa      xmm6,[rax+rdi]                                     
-  punpcklbw   xmm1,xmm3                                                      
-  movaps      [rbp+0A0h],xmm12                                   
-  psubw       xmm13,xmm1                                                     
-  movaps      [rbp+40h],xmm15                                    
-  movdqa      xmm15,xmm14                                                    
-  movaps      [rbp+10h],xmm7                                     
-  movdqa      xmm7,xmm6                                                      
-  punpcklbw   xmm10,xmm3                                                     
-  movdqa      xmm12,[r12+rdi]                                    
-  punpcklbw   xmm7,xmm3                                                      
-  punpcklbw   xmm12,xmm3                                                     
-  punpcklbw   xmm15,xmm3                                                     
-  pabsw       xmm3,xmm13                                                     
-  movdqa      xmm13,xmm10                                                    
-  psubw       xmm13,xmm15                                                    
-  movdqa      [rbp+0F0h],xmm15                                   
-  pabsw       xmm15,xmm13                                                    
-  movdqa      xmm13,xmm11                                                    
-  movdqa      [rbp+0B0h],xmm1                                    
-  movdqa      xmm1,xmm0                                                      
-  pavgw       xmm13,xmm10                                                    
-  pcmpgtw     xmm1,xmm3                                                      
-  movdqa      [rbp+120h],xmm13                                   
-  movaps      xmm13,xmm2                                                     
-  punpcklwd   xmm4,xmm4                                                      
-  movdqa      xmm3,xmm0                                                      
-  movdqa      [rbp+100h],xmm1                                    
-  psubw       xmm13,xmm1                                                     
-  movdqa      xmm1,xmm10                                                     
-  pcmpgtw     xmm3,xmm15                                                     
-  pshufd      xmm4,xmm4,0                                                    
-  psubw       xmm1,xmm11                                                     
-  movdqa      [rbp+0D0h],xmm10                                   
-  psubw       xmm13,xmm3                                                     
-  movdqa      [rbp+110h],xmm3                                    
-  pabsw       xmm15,xmm1                                                     
-  movdqa      xmm3,xmm4                                                      
-  psubw       xmm10,xmm12                                                    
-  pcmpgtw     xmm3,xmm15                                                     
-  pabsw       xmm15,xmm10                                                    
-  movdqa      xmm10,xmm0                                                     
-  psllw       xmm1,2                                                         
-  movdqa      [rbp+0C0h],xmm11                                   
-  psubw       xmm11,xmm7                                                     
-  pcmpgtw     xmm10,xmm15                                                    
-  pabsw       xmm11,xmm11                                                    
-  movdqa      xmm15,xmm0                                                     
-  pand        xmm3,xmm10                                                     
-  pcmpgtw     xmm15,xmm11                                                    
-  movaps      xmm11,xmm2                                                     
-  pxor        xmm10,xmm10                                                    
-  pand        xmm3,xmm15                                                     
-  pcmpgtw     xmm11,xmm10                                                    
-  pcmpeqw     xmm10,xmm2                                                     
-  por         xmm11,xmm10                                                    
-  pand        xmm3,xmm11                                                     
-  movdqa      xmm11,xmm7                                                     
-  psubw       xmm11,xmm12                                                    
-  pxor        xmm15,xmm15                                                    
-  paddw       xmm11,xmm1                                                     
-  psubw       xmm15,xmm13                                                    
-  movdqa      [rbp+0E0h],xmm12                                   
-  paddw       xmm11,[FOUR_16B_SSE2] 
-  pxor        xmm12,xmm12                                                    
-  psraw       xmm11,3                                                        
-  punpckhbw   xmm8,xmm12                                                     
-  pmaxsw      xmm15,xmm11                                                    
-  punpckhbw   xmm5,xmm12                                                     
-  movdqa      xmm11,xmm8                                                     
-  pminsw      xmm13,xmm15                                                    
-  psubw       xmm11,xmm5                                                     
-  punpckhbw   xmm9,xmm12                                                     
-  pand        xmm13,xmm3                                                     
-  movdqa      [rbp+130h],xmm13                                   
-  pabsw       xmm13,xmm11                                                    
-  punpckhbw   xmm14,xmm12                                                    
-  movdqa      xmm11,xmm9                                                     
-  psubw       xmm11,xmm14                                                    
-  movdqa      xmm15,xmm0                                                     
-  movdqa      [rbp+140h],xmm14                                   
-  pabsw       xmm14,xmm11                                                    
-  movdqa      xmm11,xmm8                                                     
-  pcmpgtw     xmm15,xmm14                                                    
-  movdqa      xmm1,[r12+rdi]                                     
-  pavgw       xmm11,xmm9                                                     
-  movdqa      [rbp+170h],xmm11                                   
-  movdqa      xmm10,xmm9                                                     
-  punpckhbw   xmm6,xmm12                                                     
-  psubw       xmm10,xmm8                                                     
-  punpckhbw   xmm1,xmm12                                                     
-  movdqa      xmm12,xmm0                                                     
-  movaps      xmm11,[rbp+0A0h]                                   
-  pcmpgtw     xmm12,xmm13                                                    
-  movaps      xmm13,xmm11                                                    
-  psubw       xmm13,xmm12                                                    
-  movdqa      [rbp+160h],xmm15                                   
-  psubw       xmm13,xmm15                                                    
-  movdqa      xmm15,xmm9                                                     
-  psubw       xmm15,xmm1                                                     
-  movdqa      [rbp+150h],xmm12                                   
-  pabsw       xmm12,xmm10                                                    
-  pabsw       xmm14,xmm15                                                    
-  movdqa      xmm15,xmm8                                                     
-  pcmpgtw     xmm4,xmm12                                                     
-  movdqa      xmm12,xmm0                                                     
-  psubw       xmm15,xmm6                                                     
-  pcmpgtw     xmm12,xmm14                                                    
-  pabsw       xmm14,xmm15                                                    
-  psllw       xmm10,2                                                        
-  pcmpgtw     xmm0,xmm14                                                     
-  movdqa      xmm14,xmm6                                                     
-  psubw       xmm14,xmm1                                                     
-  pand        xmm4,xmm12                                                     
-  paddw       xmm14,xmm10                                                    
-  pand        xmm4,xmm0                                                      
-  paddw       xmm14,[FOUR_16B_SSE2] 
-  pxor        xmm15,xmm15                                                    
-  movaps      xmm12,xmm11                                                    
-  psubw       xmm15,xmm13                                                    
-  pxor        xmm0,xmm0                                                      
-  psraw       xmm14,3                                                        
-  pcmpgtw     xmm12,xmm0                                                     
-  pcmpeqw     xmm0,xmm11                                                     
-  pmaxsw      xmm15,xmm14                                                    
-  por         xmm12,xmm0                                                     
-  movdqa      xmm0,[rbp+120h]                                    
-  pminsw      xmm13,xmm15                                                    
-  movdqa      xmm15,[rbp+0B0h]                                   
-  movdqa      xmm10,xmm7                                                     
-  pand        xmm4,xmm12                                                     
-  paddw       xmm15,xmm0                                                     
-  pxor        xmm12,xmm12                                                    
-  paddw       xmm10,xmm7                                                     
-  movdqa      xmm14,xmm12                                                    
-  psubw       xmm15,xmm10                                                    
-  psubw       xmm14,xmm2                                                     
-  psraw       xmm15,1                                                        
-  pmaxsw      xmm15,xmm14                                                    
-  movdqa      xmm10,xmm6                                                     
-  pminsw      xmm15,xmm2                                                     
-  paddw       xmm10,xmm6                                                     
-  pand        xmm15,xmm3                                                     
-  psubw       xmm12,xmm11                                                    
-  pand        xmm15,[rbp+100h]                                   
-  pand        xmm13,xmm4                                                     
-  paddw       xmm7,xmm15                                                     
-  paddw       xmm8,xmm13                                                     
-  movdqa      xmm15,[rbp+170h]                                   
-  psubw       xmm9,xmm13                                                     
-  paddw       xmm5,xmm15                                                     
-  psubw       xmm5,xmm10                                                     
-  psraw       xmm5,1                                                         
-  pmaxsw      xmm5,xmm12                                                     
-  pminsw      xmm5,xmm11                                                     
-  pand        xmm5,xmm4                                                      
-  pand        xmm5,[rbp+150h]                                    
-  paddw       xmm6,xmm5                                                      
-  movdqa      xmm5,[rbp+0C0h]                                    
-  packuswb    xmm7,xmm6                                                      
-  movdqa      xmm6,[rbp+130h]                                    
-  paddw       xmm5,xmm6                                                      
-  packuswb    xmm5,xmm8                                                      
-  movdqa      xmm8,[rbp+0D0h]                                    
-  psubw       xmm8,xmm6                                                      
-  movdqa      xmm6,[rbp+0F0h]                                    
-  paddw       xmm6,xmm0                                                      
-  movdqa      xmm0,[rbp+0E0h]                                    
-  packuswb    xmm8,xmm9                                                      
-  movdqa      xmm9,xmm0                                                      
-  paddw       xmm9,xmm0                                                      
-  psubw       xmm6,xmm9                                                      
-  psraw       xmm6,1                                                         
-  pmaxsw      xmm14,xmm6                                                     
-  pminsw      xmm2,xmm14                                                     
-  pand        xmm2,xmm3                                                      
-  pand        xmm2,[rbp+110h]                                    
-  paddw       xmm0,xmm2                                                      
-  movdqa      xmm2,[rbp+140h]                                    
-  paddw       xmm2,xmm15                                                     
-  movdqa      xmm15,xmm1                                                     
-  paddw       xmm15,xmm1                                                     
-  psubw       xmm2,xmm15                                                     
-  psraw       xmm2,1                                                         
-  pmaxsw      xmm12,xmm2                                                     
-  pminsw      xmm11,xmm12                                                    
-  pand        xmm11,xmm4                                                     
-  pand        xmm11,[rbp+160h]                                   
-  paddw       xmm1,xmm11                                                     
-  movdqa      [rax+rdi],xmm7                                     
-  movdqa      [r10],xmm5                                         
-  packuswb    xmm0,xmm1                                                      
-  movdqa      [rdi],xmm8                                         
-  movdqa      [r12+rdi],xmm0                                                                        
-  mov         r12,qword [rbp+180h]                                       
-  lea         rsp,[rbp+190h]                                                 
-  pop         rbp                                                            
-  ret 
+  movsxd      rdx,esi                        ; body of the 16-byte-wide edge filter; rdx = 2*stride (rsi was doubled by the add rsi,rsi just above)
+  sub         r10,r12                        ; r10 = rdi - stride (address of the row called p0 in these notes)
+  movsx       r8d,byte [r11]                 ; r8d = pTC[0], sign-extended
+  pxor        xmm3,xmm3                      ; xmm3 = 0: zero half used when widening low bytes to words
+  punpcklwd   xmm2,xmm2                      ; first half of the word broadcast completed by pshufd below
+  movaps      [rbp+50h],xmm14                ; spill incoming xmm14; note the xmm6-xmm15 spills in this routine are never reloaded
+  lea         rax,[r12+r12*2]                ; rax = 3*stride
+  movdqa      xmm14,[rdx+rdi]                ; q2 := 16 bytes at rdi + 2*stride (movdqa: address must be 16-byte aligned)
+  neg         rax                            ; rax = -3*stride
+  pshufd      xmm0,xmm2,0                    ; xmm0 = 8 copies of the low word of ecx; called threshold B below
+  movd        xmm2,r8d                       ; xmm2 = pTC[0]
+  movsx       rsi,byte [r11+1]               ; rsi = pTC[1] (rsi is free: stride already lives in r12/rdx)
+  movsx       r8d,byte [r11+2]               ; r8d = pTC[2]
+  movsx       r11d,byte [r11+3]              ; r11d = pTC[3]; the pTC pointer itself is overwritten here
+  movaps      [rbp+70h],xmm12                ; spill incoming xmm12
+  movd        xmm1,esi                       ; xmm1 = pTC[1]
+  movaps      [rbp+80h],xmm11                ; spill incoming xmm11
+  movd        xmm12,r8d                      ; xmm12 = pTC[2]
+  movd        xmm11,r11d                     ; xmm11 = pTC[3]
+  movdqa      xmm5, [rax+rdi]                ; p2 := 16 bytes at rdi - 3*stride
+  lea         rax,[r12+r12]                  ; rax = 2*stride
+  punpcklwd   xmm12,xmm12                    ; duplicate the low word of pTC[2] into a dword
+  neg         rax                            ; rax = -2*stride; kept until the final store of the p1 row
+  punpcklwd   xmm11,xmm11                    ; same for pTC[3]
+  movaps      [rbp],xmm8                     ; spill incoming xmm8
+  movdqa      xmm8, [r10]                    ; p0 := 16 bytes at rdi - stride
+  punpcklwd   xmm2,xmm2                      ; same for pTC[0]
+  punpcklwd   xmm1,xmm1                      ; same for pTC[1]
+  punpcklqdq  xmm12,xmm12                    ; copy the low qword up so dword 2 also holds the duplicated word
+  punpcklqdq  xmm11,xmm11
+  punpcklqdq  xmm2,xmm2
+  punpcklqdq  xmm1,xmm1
+  shufps      xmm12,xmm11,88h                ; xmm12 = words {pTC[2] x4, pTC[3] x4}: tc for the high 8 pixels
+  movdqa      xmm11,xmm8
+  movaps      [rbp+30h],xmm9                 ; spill incoming xmm9
+  movdqa      xmm9,[rdi]                     ; q0 := 16 bytes at rdi
+  shufps      xmm2,xmm1,88h                  ; xmm2 = words {pTC[0] x4, pTC[1] x4}: tc for the low 8 pixels
+  movdqa      xmm1,xmm5
+  punpcklbw   xmm11,xmm3                     ; xmm11 = low 8 bytes of p0 zero-extended to words
+  movaps      [rbp+20h],xmm6                 ; spill incoming xmm6
+  movaps      [rbp+60h],xmm13                ; spill incoming xmm13
+  movdqa      xmm13,xmm11
+  movaps      [rbp+90h],xmm10                ; spill incoming xmm10
+  movdqa      xmm10,xmm9
+  movdqa      xmm6,[rax+rdi]                 ; p1 := 16 bytes at rdi - 2*stride
+  punpcklbw   xmm1,xmm3                      ; xmm1 = p2 (low 8 pixels) as words
+  movaps      [rbp+0A0h],xmm12               ; park tc for the high half; reloaded into xmm11 for the second pass
+  psubw       xmm13,xmm1                     ; p0 - p2
+  movaps      [rbp+40h],xmm15                ; spill incoming xmm15
+  movdqa      xmm15,xmm14
+  movaps      [rbp+10h],xmm7                 ; spill incoming xmm7
+  movdqa      xmm7,xmm6
+  punpcklbw   xmm10,xmm3                     ; xmm10 = q0 low as words
+  movdqa      xmm12,[r12+rdi]                ; q1 := 16 bytes at rdi + stride
+  punpcklbw   xmm7,xmm3                      ; xmm7 = p1 low as words
+  punpcklbw   xmm12,xmm3                     ; xmm12 = q1 low as words
+  punpcklbw   xmm15,xmm3                     ; xmm15 = q2 low as words
+  pabsw       xmm3,xmm13                     ; |p0 - p2| (xmm3 stops being the zero register here)
+  movdqa      xmm13,xmm10
+  psubw       xmm13,xmm15                    ; q0 - q2
+  movdqa      [rbp+0F0h],xmm15               ; save q2 low for the q1 update
+  pabsw       xmm15,xmm13                    ; |q0 - q2|
+  movdqa      xmm13,xmm11
+  movdqa      [rbp+0B0h],xmm1                ; save p2 low for the p1 update
+  movdqa      xmm1,xmm0
+  pavgw       xmm13,xmm10                    ; (p0 + q0 + 1) >> 1
+  pcmpgtw     xmm1,xmm3                      ; mask: B > |p0 - p2|
+  movdqa      [rbp+120h],xmm13               ; save the p0/q0 average (low half)
+  movaps      xmm13,xmm2                     ; start from tc (low half)
+  punpcklwd   xmm4,xmm4                      ; first half of the word broadcast of the edx value
+  movdqa      xmm3,xmm0
+  movdqa      [rbp+100h],xmm1                ; save mask B > |p0 - p2| (gates the p1 update)
+  psubw       xmm13,xmm1                     ; true lanes are -1, so this adds 1 to tc where B > |p0 - p2|
+  movdqa      xmm1,xmm10
+  pcmpgtw     xmm3,xmm15                     ; mask: B > |q0 - q2|
+  pshufd      xmm4,xmm4,0                    ; xmm4 = 8 copies of the low word of edx; called threshold A below
+  psubw       xmm1,xmm11                     ; q0 - p0
+  movdqa      [rbp+0D0h],xmm10               ; save q0 low
+  psubw       xmm13,xmm3                     ; adds 1 to tc where B > |q0 - q2|; xmm13 = widened clamp bound
+  movdqa      [rbp+110h],xmm3                ; save mask B > |q0 - q2| (gates the q1 update)
+  pabsw       xmm15,xmm1                     ; |q0 - p0|
+  movdqa      xmm3,xmm4
+  psubw       xmm10,xmm12                    ; q0 - q1
+  pcmpgtw     xmm3,xmm15                     ; mask: A > |q0 - p0|
+  pabsw       xmm15,xmm10                    ; |q0 - q1|
+  movdqa      xmm10,xmm0
+  psllw       xmm1,2                         ; 4*(q0 - p0)
+  movdqa      [rbp+0C0h],xmm11               ; save p0 low
+  psubw       xmm11,xmm7                     ; p0 - p1
+  pcmpgtw     xmm10,xmm15                    ; mask: B > |q0 - q1|
+  pabsw       xmm11,xmm11                    ; |p0 - p1|
+  movdqa      xmm15,xmm0
+  pand        xmm3,xmm10
+  pcmpgtw     xmm15,xmm11                    ; mask: B > |p0 - p1|
+  movaps      xmm11,xmm2
+  pxor        xmm10,xmm10
+  pand        xmm3,xmm15
+  pcmpgtw     xmm11,xmm10                    ; tc > 0
+  pcmpeqw     xmm10,xmm2                     ; tc == 0
+  por         xmm11,xmm10                    ; tc >= 0
+  pand        xmm3,xmm11                     ; xmm3 = combined filter-enable mask for the low 8 pixels
+  movdqa      xmm11,xmm7
+  psubw       xmm11,xmm12                    ; p1 - q1
+  pxor        xmm15,xmm15
+  paddw       xmm11,xmm1                     ; + 4*(q0 - p0)
+  psubw       xmm15,xmm13                    ; negated clamp bound
+  movdqa      [rbp+0E0h],xmm12               ; save q1 low
+  paddw       xmm11,[FOUR_16B_SSE2]          ; constant defined outside this view; presumably 4 per word as rounding term - confirm
+  pxor        xmm12,xmm12                    ; xmm12 = 0: zero half for widening the high 8 bytes
+  psraw       xmm11,3                        ; arithmetic >> 3 gives the raw delta
+  punpckhbw   xmm8,xmm12                     ; xmm8 = p0 high 8 bytes as words
+  pmaxsw      xmm15,xmm11                    ; max(-bound, delta)
+  punpckhbw   xmm5,xmm12                     ; xmm5 = p2 high as words
+  movdqa      xmm11,xmm8
+  pminsw      xmm13,xmm15                    ; delta clamped to [-bound, +bound]
+  psubw       xmm11,xmm5                     ; p0 - p2 (high half)
+  punpckhbw   xmm9,xmm12                     ; xmm9 = q0 high as words
+  pand        xmm13,xmm3                     ; zero the delta in lanes where filtering is disabled
+  movdqa      [rbp+130h],xmm13               ; save delta for the low 8 pixels
+  pabsw       xmm13,xmm11                    ; |p0 - p2| (high half)
+  punpckhbw   xmm14,xmm12                    ; xmm14 = q2 high as words
+  movdqa      xmm11,xmm9
+  psubw       xmm11,xmm14                    ; q0 - q2 (high half)
+  movdqa      xmm15,xmm0
+  movdqa      [rbp+140h],xmm14               ; save q2 high for the q1 update
+  pabsw       xmm14,xmm11                    ; |q0 - q2|
+  movdqa      xmm11,xmm8
+  pcmpgtw     xmm15,xmm14                    ; mask: B > |q0 - q2| (high half)
+  movdqa      xmm1,[r12+rdi]                 ; reload the q1 row to extract its high half
+  pavgw       xmm11,xmm9                     ; (p0 + q0 + 1) >> 1 (high half)
+  movdqa      [rbp+170h],xmm11               ; save the p0/q0 average (high half)
+  movdqa      xmm10,xmm9
+  punpckhbw   xmm6,xmm12                     ; xmm6 = p1 high as words
+  psubw       xmm10,xmm8                     ; q0 - p0 (high half)
+  punpckhbw   xmm1,xmm12                     ; xmm1 = q1 high as words
+  movdqa      xmm12,xmm0
+  movaps      xmm11,[rbp+0A0h]               ; xmm11 = tc for the high 8 pixels (stored earlier)
+  pcmpgtw     xmm12,xmm13                    ; mask: B > |p0 - p2| (high half)
+  movaps      xmm13,xmm11
+  psubw       xmm13,xmm12                    ; adds 1 to tc where B > |p0 - p2|
+  movdqa      [rbp+160h],xmm15               ; save mask B > |q0 - q2| (high half)
+  psubw       xmm13,xmm15                    ; adds 1 to tc where B > |q0 - q2|; xmm13 = clamp bound (high half)
+  movdqa      xmm15,xmm9
+  psubw       xmm15,xmm1                     ; q0 - q1
+  movdqa      [rbp+150h],xmm12               ; save mask B > |p0 - p2| (high half)
+  pabsw       xmm12,xmm10                    ; |q0 - p0|
+  pabsw       xmm14,xmm15                    ; |q0 - q1|
+  movdqa      xmm15,xmm8
+  pcmpgtw     xmm4,xmm12                     ; mask: A > |q0 - p0|; threshold A is consumed in place
+  movdqa      xmm12,xmm0
+  psubw       xmm15,xmm6                     ; p0 - p1
+  pcmpgtw     xmm12,xmm14                    ; mask: B > |q0 - q1|
+  pabsw       xmm14,xmm15                    ; |p0 - p1|
+  psllw       xmm10,2                        ; 4*(q0 - p0)
+  pcmpgtw     xmm0,xmm14                     ; mask: B > |p0 - p1|; threshold B is consumed in place
+  movdqa      xmm14,xmm6
+  psubw       xmm14,xmm1                     ; p1 - q1
+  pand        xmm4,xmm12
+  paddw       xmm14,xmm10                    ; + 4*(q0 - p0)
+  pand        xmm4,xmm0
+  paddw       xmm14,[FOUR_16B_SSE2]          ; same external constant as in the low-half pass
+  pxor        xmm15,xmm15
+  movaps      xmm12,xmm11
+  psubw       xmm15,xmm13                    ; negated clamp bound (high half)
+  pxor        xmm0,xmm0
+  psraw       xmm14,3                        ; raw delta (high half)
+  pcmpgtw     xmm12,xmm0                     ; tc > 0
+  pcmpeqw     xmm0,xmm11                     ; tc == 0
+  pmaxsw      xmm15,xmm14
+  por         xmm12,xmm0                     ; tc >= 0
+  movdqa      xmm0,[rbp+120h]                ; xmm0 = p0/q0 average (low half)
+  pminsw      xmm13,xmm15                    ; delta clamped to [-bound, +bound] (high half)
+  movdqa      xmm15,[rbp+0B0h]               ; xmm15 = p2 low
+  movdqa      xmm10,xmm7
+  pand        xmm4,xmm12                     ; xmm4 = combined filter-enable mask for the high 8 pixels
+  paddw       xmm15,xmm0                     ; p2 + avg
+  pxor        xmm12,xmm12
+  paddw       xmm10,xmm7                     ; 2*p1
+  movdqa      xmm14,xmm12
+  psubw       xmm15,xmm10                    ; p2 + avg - 2*p1
+  psubw       xmm14,xmm2                     ; xmm14 = -tc (low half), reused for the q1 clamp
+  psraw       xmm15,1
+  pmaxsw      xmm15,xmm14
+  movdqa      xmm10,xmm6
+  pminsw      xmm15,xmm2                     ; p1 adjustment clamped to [-tc, +tc]
+  paddw       xmm10,xmm6                     ; 2*p1 (high half)
+  pand        xmm15,xmm3                     ; only in filter-enabled lanes
+  psubw       xmm12,xmm11                    ; xmm12 = -tc (high half), reused for the q1 clamp
+  pand        xmm15,[rbp+100h]               ; and only where B > |p0 - p2|
+  pand        xmm13,xmm4                     ; zero the high-half delta in disabled lanes
+  paddw       xmm7,xmm15                     ; new p1 (low half)
+  paddw       xmm8,xmm13                     ; new p0 (high half) = p0 + delta
+  movdqa      xmm15,[rbp+170h]               ; xmm15 = p0/q0 average (high half)
+  psubw       xmm9,xmm13                     ; new q0 (high half) = q0 - delta
+  paddw       xmm5,xmm15                     ; p2 + avg (high half)
+  psubw       xmm5,xmm10                     ; - 2*p1
+  psraw       xmm5,1
+  pmaxsw      xmm5,xmm12
+  pminsw      xmm5,xmm11                     ; clamped to [-tc, +tc]
+  pand        xmm5,xmm4
+  pand        xmm5,[rbp+150h]                ; only where B > |p0 - p2| (high half)
+  paddw       xmm6,xmm5                      ; new p1 (high half)
+  movdqa      xmm5,[rbp+0C0h]                ; xmm5 = p0 low
+  packuswb    xmm7,xmm6                      ; xmm7 = new p1 row, words packed to bytes with unsigned saturation
+  movdqa      xmm6,[rbp+130h]                ; xmm6 = delta (low half)
+  paddw       xmm5,xmm6                      ; new p0 (low half) = p0 + delta
+  packuswb    xmm5,xmm8                      ; xmm5 = new p0 row
+  movdqa      xmm8,[rbp+0D0h]                ; xmm8 = q0 low
+  psubw       xmm8,xmm6                      ; new q0 (low half) = q0 - delta
+  movdqa      xmm6,[rbp+0F0h]                ; xmm6 = q2 low
+  paddw       xmm6,xmm0                      ; q2 + avg
+  movdqa      xmm0,[rbp+0E0h]                ; xmm0 = q1 low
+  packuswb    xmm8,xmm9                      ; xmm8 = new q0 row
+  movdqa      xmm9,xmm0
+  paddw       xmm9,xmm0                      ; 2*q1
+  psubw       xmm6,xmm9                      ; q2 + avg - 2*q1
+  psraw       xmm6,1
+  pmaxsw      xmm14,xmm6                     ; xmm14 held -tc (low half)
+  pminsw      xmm2,xmm14                     ; q1 adjustment clamped to [-tc, +tc]
+  pand        xmm2,xmm3
+  pand        xmm2,[rbp+110h]                ; only where B > |q0 - q2|
+  paddw       xmm0,xmm2                      ; new q1 (low half)
+  movdqa      xmm2,[rbp+140h]                ; xmm2 = q2 high
+  paddw       xmm2,xmm15                     ; q2 + avg (high half)
+  movdqa      xmm15,xmm1
+  paddw       xmm15,xmm1                     ; 2*q1 (high half)
+  psubw       xmm2,xmm15
+  psraw       xmm2,1
+  pmaxsw      xmm12,xmm2                     ; xmm12 held -tc (high half)
+  pminsw      xmm11,xmm12                    ; clamped to [-tc, +tc]
+  pand        xmm11,xmm4
+  pand        xmm11,[rbp+160h]               ; only where B > |q0 - q2| (high half)
+  paddw       xmm1,xmm11                     ; new q1 (high half)
+  movdqa      [rax+rdi],xmm7                 ; store new p1 row at rdi - 2*stride
+  movdqa      [r10],xmm5                     ; store new p0 row at rdi - stride
+  packuswb    xmm0,xmm1                      ; xmm0 = new q1 row
+  movdqa      [rdi],xmm8                     ; store new q0 row at rdi
+  movdqa      [r12+rdi],xmm0                 ; store new q1 row at rdi + stride; the p2 and q2 rows are not written
+  mov         r12,qword [rbp+180h]           ; restore the r12 value saved in the prologue
+  lea         rsp,[rbp+190h]                 ; rbp was rsp+20h, so this is rsp+1B0h: undoes the sub rsp,1B0h
+  pop         rbp                            ; restore caller's rbp
+  ret
 
 
 WELS_EXTERN DeblockLumaEq4V_sse2
@@ -1907,637 +1907,637 @@
 
 ALIGN  16
 DeblockLumaEq4V_sse2:
-  mov         rax,rsp 
-  push        rbx  
-  push        rbp   
+  mov         rax,rsp  ; rax = rsp at entry; routine rewrites six 16-byte rows, rbp-3*r10 .. rbp+2*r10, in place
+  push        rbx  ; rbx and rbp are used as row pointers below and popped before ret
+  push        rbp
   mov         r8,   rdx
   mov         r9,   rcx
   mov         rcx,  rdi
   mov         rdx,  rsi
-  sub         rsp,1D8h 
-  movaps      [rax-38h],xmm6 
-  movaps      [rax-48h],xmm7 
-  movaps      [rax-58h],xmm8 
-  pxor        xmm1,xmm1 
-  movsxd      r10,edx 
-  mov         rbp,rcx 
-  mov         r11d,r8d 
-  mov         rdx,rcx 
-  mov         rdi,rbp 
-  mov         rbx,rbp 
-  movdqa      xmm5,[rbp] 
-  movaps      [rax-68h],xmm9 
-  movaps      [rax-78h],xmm10 
-  punpcklbw   xmm5,xmm1 
-  movaps      [rax-88h],xmm11 
-  movaps      [rax-98h],xmm12 
-  movaps      [rax-0A8h],xmm13 
-  movaps      [rax-0B8h],xmm14 
-  movdqa      xmm14,[r10+rbp] 
-  movaps      [rax-0C8h],xmm15 
-  lea         eax,[r10*4] 
-  movsxd      r8,eax 
-  lea         eax,[r10+r10*2] 
-  movsxd      rcx,eax 
-  lea         eax,[r10+r10] 
-  sub         rdx,r8 
-  punpcklbw   xmm14,xmm1 
-  movdqa      [rsp+90h],xmm5 
-  movdqa      [rsp+30h],xmm14 
-  movsxd      rsi,eax 
-  movsx       eax,r11w 
-  sub         rdi,rcx 
-  sub         rbx,rsi 
-  mov         r8,rbp 
-  sub         r8,r10 
-  movd        xmm0,eax 
-  movsx       eax,r9w 
-  movdqa      xmm12,[rdi] 
-  movdqa      xmm6, [rsi+rbp] 
-  movdqa      xmm13,[rbx] 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm11,xmm0,0 
-  punpcklbw   xmm13,xmm1 
-  punpcklbw   xmm6,xmm1 
-  movdqa      xmm8,[r8] 
-  movd        xmm0,eax 
-  movdqa      xmm10,xmm11 
-  mov         eax,2 
-  punpcklbw   xmm8,xmm1 
-  punpcklbw   xmm12,xmm1 
-  cwde             
-  punpcklwd   xmm0,xmm0 
-  psraw       xmm10,2 
-  movdqa      xmm1,xmm8 
-  movdqa      [rsp+0F0h],xmm13 
-  movdqa      [rsp+0B0h],xmm8 
-  pshufd      xmm7,xmm0,0 
-  psubw       xmm1,xmm13 
-  movdqa      xmm0,xmm5 
-  movdqa      xmm4,xmm7 
-  movdqa      xmm2,xmm7 
-  psubw       xmm0,xmm8 
-  pabsw       xmm3,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,xmm5 
-  movdqa      [rsp+40h],xmm7 
-  movdqa      [rsp+60h],xmm6 
-  pcmpgtw     xmm4,xmm0 
-  psubw       xmm1,xmm14 
-  pabsw       xmm0,xmm1 
-  pcmpgtw     xmm2,xmm0 
-  pand        xmm4,xmm2 
-  movdqa      xmm0,xmm11 
-  pcmpgtw     xmm0,xmm3 
-  pand        xmm4,xmm0 
-  movd        xmm0,eax 
-  movdqa      [rsp+20h],xmm4 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm2,xmm0,0 
-  paddw       xmm10,xmm2 
-  movdqa      [rsp+0A0h],xmm2 
-  movdqa      xmm15,xmm7 
-  pxor        xmm4,xmm4 
-  movdqa      xmm0,xmm8 
-  psubw       xmm0,xmm12 
-  mov         eax,4 
-  pabsw       xmm0,xmm0 
-  movdqa      xmm1,xmm10 
-  cwde             
-  pcmpgtw     xmm15,xmm0 
-  pcmpgtw     xmm1,xmm3 
-  movdqa      xmm3,xmm7 
-  movdqa      xmm7,[rdx] 
-  movdqa      xmm0,xmm5 
-  psubw       xmm0,xmm6 
-  pand        xmm15,xmm1 
-  punpcklbw   xmm7,xmm4 
-  movdqa      xmm9,xmm15 
-  pabsw       xmm0,xmm0 
-  psllw       xmm7,1 
-  pandn       xmm9,xmm12 
-  pcmpgtw     xmm3,xmm0 
-  paddw       xmm7,xmm12 
-  movd        xmm0,eax 
-  pand        xmm3,xmm1 
-  paddw       xmm7,xmm12 
-  punpcklwd   xmm0,xmm0 
-  paddw       xmm7,xmm12 
-  pshufd      xmm1,xmm0,0 
-  paddw       xmm7,xmm13 
-  movdqa      xmm0,xmm3 
-  pandn       xmm0,xmm6 
-  paddw       xmm7,xmm8 
-  movdqa      [rsp+70h],xmm1 
-  paddw       xmm7,xmm5 
-  movdqa      [rsp+120h],xmm0 
-  movdqa      xmm0,[rcx+rbp] 
-  punpcklbw   xmm0,xmm4 
-  paddw       xmm7,xmm1 
-  movdqa      xmm4,xmm15 
-  psllw       xmm0,1 
-  psraw       xmm7,3 
-  paddw       xmm0,xmm6 
-  pand        xmm7,xmm15 
-  paddw       xmm0,xmm6 
-  paddw       xmm0,xmm6 
-  paddw       xmm0,xmm14 
-  movdqa      xmm6,xmm15 
-  paddw       xmm0,xmm5 
-  pandn       xmm6,xmm13 
-  paddw       xmm0,xmm8 
-  paddw       xmm0,xmm1 
-  psraw       xmm0,3 
-  movdqa      xmm1,xmm12 
-  paddw       xmm1,xmm13 
-  pand        xmm0,xmm3 
-  movdqa      [rsp+100h],xmm0 
-  movdqa      xmm0,xmm8 
-  paddw       xmm0,xmm5 
-  paddw       xmm1,xmm0 
-  movdqa      xmm0,xmm3 
-  paddw       xmm1,xmm2 
-  psraw       xmm1,2 
-  pandn       xmm0,xmm14 
-  pand        xmm4,xmm1 
-  movdqa      [rsp+0E0h],xmm0 
-  movdqa      xmm0,xmm5 
-  paddw       xmm0,xmm8 
-  movdqa      xmm1,[rsp+60h] 
-  paddw       xmm1,xmm14 
-  movdqa      xmm14,xmm3 
-  paddw       xmm1,xmm0 
-  movdqa      xmm0,xmm8 
-  paddw       xmm0,[rsp+30h] 
-  paddw       xmm1,xmm2 
-  psraw       xmm1,2 
-  pand        xmm14,xmm1 
-  movdqa      xmm1,xmm13 
-  paddw       xmm1,xmm13 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm2 
-  psraw       xmm1,2 
-  movdqa      xmm0,[rsp+30h] 
-  movdqa      xmm2,xmm13 
-  movdqa      xmm5,xmm15 
-  paddw       xmm0,[rsp+70h] 
-  pandn       xmm5,xmm1 
-  paddw       xmm2,xmm8 
-  movdqa      xmm8,[rsp+90h] 
-  movdqa      xmm1,xmm12 
-  paddw       xmm2,xmm8 
-  psllw       xmm2,1 
-  paddw       xmm2,xmm0 
-  paddw       xmm1,xmm2 
-  movdqa      xmm0,xmm8 
-  movdqa      xmm8,xmm3 
-  movdqa      xmm2,[rsp+30h] 
-  paddw       xmm0,xmm13 
-  psraw       xmm1,3 
-  pand        xmm15,xmm1 
-  movdqa      xmm1,xmm2 
-  paddw       xmm1,xmm2 
-  paddw       xmm2,[rsp+90h] 
-  paddw       xmm2,[rsp+0B0h] 
-  paddw       xmm1,xmm0 
-  movdqa      xmm0,xmm13 
-  movdqa      xmm13,[r8] 
-  paddw       xmm0, [rsp+70h] 
-  paddw       xmm1, [rsp+0A0h] 
-  psllw       xmm2,1 
-  paddw       xmm2,xmm0 
-  psraw       xmm1,2 
-  movdqa      xmm0, [rdi] 
-  pandn       xmm8,xmm1 
-  movdqa      xmm1, [rsp+60h] 
-  paddw       xmm1,xmm2 
-  movdqa      xmm2, [rbx] 
-  psraw       xmm1,3 
-  pand        xmm3,xmm1 
-  movdqa      xmm1, [rbp] 
-  movdqa      [rsp+0D0h],xmm3 
-  pxor        xmm3,xmm3 
-  punpckhbw   xmm0,xmm3 
-  punpckhbw   xmm1,xmm3 
-  punpckhbw   xmm13,xmm3 
-  movdqa      [rsp+0C0h],xmm0 
-  movdqa      xmm0,[r10+rbp] 
-  movdqa      [rsp],xmm1 
-  punpckhbw   xmm0,xmm3 
-  punpckhbw   xmm2,xmm3 
-  movdqa      [rsp+80h],xmm0 
-  movdqa      xmm0,[rsi+rbp] 
-  movdqa      [rsp+10h],xmm13 
-  punpckhbw   xmm0,xmm3 
-  movdqa      [rsp+50h],xmm0 
-  movdqa      xmm0,xmm1 
-  movdqa      xmm1,xmm13 
-  psubw       xmm0,xmm13 
-  psubw       xmm1,xmm2 
-  pabsw       xmm3,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,[rsp] 
-  movdqa      xmm13,[rsp+40h] 
-  movdqa      [rsp+110h],xmm2 
-  psubw       xmm1, [rsp+80h] 
-  pcmpgtw     xmm13,xmm0 
-  pcmpgtw     xmm11,xmm3 
-  pabsw       xmm0,xmm1 
-  pcmpgtw     xmm10,xmm3 
-  movdqa      xmm1, [rsp+40h] 
-  movdqa      xmm2,xmm1 
-  movdqa      xmm3,xmm1 
-  pcmpgtw     xmm2,xmm0 
-  movdqa      xmm0, [rsp+10h] 
-  pand        xmm13,xmm2 
-  pand        xmm13,xmm11 
-  movdqa      xmm11,[rsp+0C0h] 
-  psubw       xmm0,xmm11 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm3,xmm0 
-  pand        xmm3,xmm10 
-  movdqa      xmm0,[rsp] 
-  psubw       xmm0,[rsp+50h] 
-  movdqa      xmm2,[rdx] 
-  pabsw       xmm0,xmm0 
-  por         xmm7,xmm9 
-  movdqa      xmm9,[rsp+20h] 
-  pcmpgtw     xmm1,xmm0 
-  pand        xmm9,xmm7 
-  movdqa      xmm7,[rsp+20h] 
-  movdqa      xmm0,xmm7 
-  pandn       xmm0,xmm12 
-  movdqa      xmm12,[rsp+110h] 
-  pand        xmm1,xmm10 
-  movdqa      xmm10,[rsp+70h] 
-  movdqa      [rsp+40h],xmm1 
-  movdqa      xmm1,xmm13 
-  por         xmm9,xmm0 
-  pxor        xmm0,xmm0 
-  por         xmm4,xmm6 
-  movdqa      xmm6,xmm7 
-  punpckhbw   xmm2,xmm0 
-  por         xmm15,xmm5 
-  movdqa      xmm5,[rsp+20h] 
-  movdqa      xmm0,xmm3 
-  psllw       xmm2,1 
-  pandn       xmm0,xmm11 
-  pand        xmm6,xmm4 
-  movdqa      xmm4,[rsp] 
-  paddw       xmm2,xmm11 
-  pand        xmm5,xmm15 
-  movdqa      xmm15,[rsp+20h] 
-  paddw       xmm2,xmm11 
-  paddw       xmm2,xmm11 
-  paddw       xmm2,xmm12 
-  paddw       xmm2,[rsp+10h] 
-  paddw       xmm2,[rsp] 
-  paddw       xmm2,xmm10 
-  psraw       xmm2,3 
-  pand        xmm2,xmm3 
-  por         xmm2,xmm0 
-  pand        xmm1,xmm2 
-  movdqa      xmm0,xmm13 
-  movdqa      xmm2,xmm11 
-  pandn       xmm0,xmm11 
-  paddw       xmm2,xmm12 
-  por         xmm1,xmm0 
-  packuswb    xmm9,xmm1 
-  movdqa      xmm0,xmm7 
-  movdqa      xmm7,[rsp+0A0h] 
-  pandn       xmm0,[rsp+0F0h] 
-  movdqa      xmm1,xmm3 
-  por         xmm6,xmm0 
-  movdqa      xmm0,[rsp+10h] 
-  paddw       xmm0,xmm4 
-  paddw       xmm2,xmm0 
-  paddw       xmm2,xmm7 
-  movdqa      xmm0,xmm3 
-  pandn       xmm0,xmm12 
-  psraw       xmm2,2 
-  pand        xmm1,xmm2 
-  por         xmm1,xmm0 
-  movdqa      xmm2,xmm13 
-  movdqa      xmm0,xmm13 
-  pand        xmm2,xmm1 
-  pandn       xmm0,xmm12 
-  movdqa      xmm1,xmm12 
-  paddw       xmm1,[rsp+10h] 
-  por         xmm2,xmm0 
-  movdqa      xmm0,xmm15 
-  pandn       xmm0,[rsp+0B0h] 
-  paddw       xmm1,xmm4 
-  packuswb    xmm6,xmm2 
-  movdqa      xmm2,xmm3 
-  psllw       xmm1,1 
-  por         xmm5,xmm0 
-  movdqa      xmm0,[rsp+80h] 
-  paddw       xmm0,xmm10 
-  paddw       xmm1,xmm0 
-  paddw       xmm11,xmm1 
-  psraw       xmm11,3 
-  movdqa      xmm1,xmm12 
-  pand        xmm2,xmm11 
-  paddw       xmm1,xmm12 
-  movdqa      xmm11,[rsp+80h] 
-  movdqa      xmm0, [rsp+10h] 
-  por         xmm14,[rsp+0E0h] 
-  paddw       xmm0,xmm11 
-  movdqa      xmm4,xmm15 
-  paddw       xmm1,xmm0 
-  movdqa      xmm0,xmm13 
-  paddw       xmm1,xmm7 
-  psraw       xmm1,2 
-  pandn       xmm3,xmm1 
-  por         xmm2,xmm3 
-  movdqa      xmm1,xmm13 
-  movdqa      xmm3,[rsp+10h] 
-  pandn       xmm0,xmm3 
-  pand        xmm1,xmm2 
-  movdqa      xmm2,xmm11 
-  paddw       xmm2,[rsp] 
-  por         xmm1,xmm0 
-  movdqa      xmm0,[rsp+0D0h] 
-  por         xmm0,xmm8 
-  paddw       xmm2,xmm3 
-  packuswb    xmm5,xmm1 
-  movdqa      xmm8,[rsp+40h] 
-  movdqa      xmm1,[rsp+50h] 
-  movdqa      xmm3,xmm8 
-  pand        xmm4,xmm0 
-  psllw       xmm2,1 
-  movdqa      xmm0,xmm15 
-  pandn       xmm0,[rsp+90h] 
-  por         xmm4,xmm0 
-  movdqa      xmm0,xmm12 
-  paddw       xmm0,xmm10 
-  paddw       xmm2,xmm0 
-  paddw       xmm1,xmm2 
-  movdqa      xmm0,[rsp] 
-  movdqa      xmm2,xmm11 
-  paddw       xmm0,xmm12 
-  movdqa      xmm12,[rsp] 
-  paddw       xmm2,xmm11 
-  paddw       xmm2,xmm0 
-  psraw       xmm1,3 
-  movdqa      xmm0,xmm8 
-  pand        xmm3,xmm1 
-  paddw       xmm2,xmm7 
-  movdqa      xmm1,xmm13 
-  psraw       xmm2,2 
-  pandn       xmm0,xmm2 
-  por         xmm3,xmm0 
-  movdqa      xmm2,[rsp+50h] 
-  movdqa      xmm0,xmm13 
-  pandn       xmm0,xmm12 
-  pand        xmm1,xmm3 
-  paddw       xmm2,xmm11 
-  movdqa      xmm3,xmm15 
-  por         xmm1,xmm0 
-  pand        xmm3,xmm14 
-  movdqa      xmm14,[rsp+10h] 
-  movdqa      xmm0,xmm15 
-  pandn       xmm0,[rsp+30h] 
-  packuswb    xmm4,xmm1 
-  movdqa      xmm1,xmm8 
-  por         xmm3,xmm0 
-  movdqa      xmm0,xmm12 
-  paddw       xmm0,xmm14 
-  paddw       xmm2,xmm0 
-  paddw       xmm2,xmm7 
-  movdqa      xmm0,xmm8 
-  pandn       xmm0,xmm11 
-  psraw       xmm2,2 
-  pand        xmm1,xmm2 
-  por         xmm1,xmm0 
-  movdqa      xmm2,xmm13 
-  movdqa      xmm0,xmm13 
-  pandn       xmm0,xmm11 
-  pand        xmm2,xmm1 
-  movdqa      xmm1,xmm15 
-  por         xmm2,xmm0 
-  packuswb    xmm3,xmm2 
-  movdqa      xmm0,[rsp+100h] 
-  por         xmm0,[rsp+120h] 
-  pand        xmm1,xmm0 
-  movdqa      xmm2,[rcx+rbp] 
-  movdqa      xmm7,[rsp+50h] 
-  pandn       xmm15,[rsp+60h] 
-  lea         r11,[rsp+1D8h] 
-  pxor        xmm0,xmm0 
-  por         xmm1,xmm15 
-  movaps      xmm15,[r11-0A8h] 
-  movdqa      [rdi],xmm9 
-  movaps      xmm9,[r11-48h] 
-  punpckhbw   xmm2,xmm0 
-  psllw       xmm2,1 
-  paddw       xmm2,xmm7 
-  paddw       xmm2,xmm7 
-  movdqa      [rbx],xmm6 
-  movaps      xmm6,[r11-18h] 
-  paddw       xmm2,xmm7 
-  paddw       xmm2,xmm11 
-  movaps      xmm11,[r11-68h] 
-  paddw       xmm2,xmm12 
-  movaps      xmm12,[r11-78h] 
-  paddw       xmm2,xmm14 
-  paddw       xmm2,xmm10 
-  psraw       xmm2,3 
-  movaps      xmm10,[r11-58h] 
-  movaps      xmm14,[r11-98h] 
-  movdqa      xmm0,xmm13 
-  pand        xmm2,xmm8 
-  pandn       xmm8,xmm7 
-  pandn       xmm13,xmm7 
-  por         xmm2,xmm8 
-  movaps      xmm7,[r11-28h] 
-  movaps      xmm8,[r11-38h] 
-  movdqa      [r8],xmm5 
-  pand        xmm0,xmm2 
-  por         xmm0,xmm13 
-  packuswb    xmm1,xmm0 
-  movaps      xmm13,[r11-88h] 
-  movdqa      [rbp],xmm4 
-  movdqa      [r10+rbp],xmm3 
-  movdqa      [rsi+rbp],xmm1 
-  mov         rsp,r11   
-  pop         rbp  
-  pop         rbx  
+  sub         rsp,1D8h  ; reserve 1D8h bytes of stack scratch for widened rows, masks and constants
+  movaps      [rax-38h],xmm6  ; xmm6..xmm15 are spilled at fixed offsets below rax (= entry rsp)
+  movaps      [rax-48h],xmm7
+  movaps      [rax-58h],xmm8
+  pxor        xmm1,xmm1  ; xmm1 = 0, the zero source for byte->word unpacking
+  movsxd      r10,edx  ; r10 = sign-extended low 32 bits of incoming rsi (row stride, presumably - confirm)
+  mov         rbp,rcx  ; rbp = incoming rdi, base address of the rows accessed below
+  mov         r11d,r8d  ; r11d = low 32 bits of incoming rdx
+  mov         rdx,rcx
+  mov         rdi,rbp
+  mov         rbx,rbp
+  movdqa      xmm5,[rbp]  ; aligned 16-byte load of row [rbp]
+  movaps      [rax-68h],xmm9
+  movaps      [rax-78h],xmm10
+  punpcklbw   xmm5,xmm1  ; xmm5 = low 8 bytes of row [rbp] zero-extended to words
+  movaps      [rax-88h],xmm11
+  movaps      [rax-98h],xmm12
+  movaps      [rax-0A8h],xmm13
+  movaps      [rax-0B8h],xmm14
+  movdqa      xmm14,[r10+rbp]  ; row [rbp + r10]
+  movaps      [rax-0C8h],xmm15
+  lea         eax,[r10*4]
+  movsxd      r8,eax  ; r8 = 4*r10 (computed in 32 bits, then sign-extended)
+  lea         eax,[r10+r10*2]
+  movsxd      rcx,eax  ; rcx = 3*r10
+  lea         eax,[r10+r10]
+  sub         rdx,r8  ; rdx = rbp - 4*r10
+  punpcklbw   xmm14,xmm1
+  movdqa      [rsp+90h],xmm5  ; keep low words of row [rbp]
+  movdqa      [rsp+30h],xmm14  ; keep low words of row [rbp + r10]
+  movsxd      rsi,eax  ; rsi = 2*r10
+  movsx       eax,r11w  ; low 16 bits of incoming rdx, sign-extended
+  sub         rdi,rcx  ; rdi = rbp - 3*r10
+  sub         rbx,rsi  ; rbx = rbp - 2*r10
+  mov         r8,rbp
+  sub         r8,r10  ; r8 = rbp - r10
+  movd        xmm0,eax
+  movsx       eax,r9w  ; low 16 bits of incoming rcx, sign-extended
+  movdqa      xmm12,[rdi]
+  movdqa      xmm6, [rsi+rbp]
+  movdqa      xmm13,[rbx]
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm11,xmm0,0  ; xmm11 = 16-bit value from incoming rdx in all 8 words (alpha threshold, presumably - confirm)
+  punpcklbw   xmm13,xmm1
+  punpcklbw   xmm6,xmm1
+  movdqa      xmm8,[r8]
+  movd        xmm0,eax
+  movdqa      xmm10,xmm11
+  mov         eax,2
+  punpcklbw   xmm8,xmm1
+  punpcklbw   xmm12,xmm1
+  cwde  ; sign-extends ax into eax; eax was just loaded with 2, so the value is unchanged
+  punpcklwd   xmm0,xmm0
+  psraw       xmm10,2
+  movdqa      xmm1,xmm8
+  movdqa      [rsp+0F0h],xmm13
+  movdqa      [rsp+0B0h],xmm8
+  pshufd      xmm7,xmm0,0  ; xmm7 = 16-bit value from incoming rcx in all 8 words (beta threshold, presumably - confirm)
+  psubw       xmm1,xmm13
+  movdqa      xmm0,xmm5
+  movdqa      xmm4,xmm7
+  movdqa      xmm2,xmm7
+  psubw       xmm0,xmm8
+  pabsw       xmm3,xmm0  ; xmm3 = |row[rbp] - row[rbp-r10]| (low half). NOTE(review): pabsw is an SSSE3 instruction although the symbol is suffixed _sse2
+  pabsw       xmm0,xmm1  ; |row[rbp-r10] - row[rbp-2*r10]|
+  movdqa      xmm1,xmm5
+  movdqa      [rsp+40h],xmm7
+  movdqa      [rsp+60h],xmm6
+  pcmpgtw     xmm4,xmm0
+  psubw       xmm1,xmm14
+  pabsw       xmm0,xmm1  ; |row[rbp] - row[rbp+r10]|
+  pcmpgtw     xmm2,xmm0
+  pand        xmm4,xmm2
+  movdqa      xmm0,xmm11
+  pcmpgtw     xmm0,xmm3
+  pand        xmm4,xmm0
+  movd        xmm0,eax
+  movdqa      [rsp+20h],xmm4  ; low-half mask: all three threshold comparisons above passed
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm2,xmm0,0  ; xmm2 = 2 in every word
+  paddw       xmm10,xmm2  ; xmm10 = (xmm11 >> 2) + 2 per word
+  movdqa      [rsp+0A0h],xmm2
+  movdqa      xmm15,xmm7
+  pxor        xmm4,xmm4
+  movdqa      xmm0,xmm8
+  psubw       xmm0,xmm12
+  mov         eax,4
+  pabsw       xmm0,xmm0
+  movdqa      xmm1,xmm10
+  cwde  ; no effect on the value: eax = 4 here
+  pcmpgtw     xmm15,xmm0
+  pcmpgtw     xmm1,xmm3
+  movdqa      xmm3,xmm7
+  movdqa      xmm7,[rdx]
+  movdqa      xmm0,xmm5
+  psubw       xmm0,xmm6
+  pand        xmm15,xmm1  ; xmm15 = mask selecting the wider filter for the rows above rbp (low half)
+  punpcklbw   xmm7,xmm4
+  movdqa      xmm9,xmm15
+  pabsw       xmm0,xmm0
+  psllw       xmm7,1
+  pandn       xmm9,xmm12
+  pcmpgtw     xmm3,xmm0
+  paddw       xmm7,xmm12
+  movd        xmm0,eax
+  pand        xmm3,xmm1  ; xmm3 = the corresponding mask for the rows at and below rbp (low half)
+  paddw       xmm7,xmm12
+  punpcklwd   xmm0,xmm0
+  paddw       xmm7,xmm12
+  pshufd      xmm1,xmm0,0  ; xmm1 = 4 in every word (rounding term for the >>3 sums)
+  paddw       xmm7,xmm13
+  movdqa      xmm0,xmm3
+  pandn       xmm0,xmm6
+  paddw       xmm7,xmm8
+  movdqa      [rsp+70h],xmm1
+  paddw       xmm7,xmm5
+  movdqa      [rsp+120h],xmm0
+  movdqa      xmm0,[rcx+rbp]  ; row [rbp + 3*r10]
+  punpcklbw   xmm0,xmm4
+  paddw       xmm7,xmm1
+  movdqa      xmm4,xmm15
+  psllw       xmm0,1
+  psraw       xmm7,3  ; xmm7 = (2*row[-4] + 3*row[-3] + row[-2] + row[-1] + row[0] + 4) >> 3, rows indexed in units of r10 from rbp
+  paddw       xmm0,xmm6
+  pand        xmm7,xmm15
+  paddw       xmm0,xmm6
+  paddw       xmm0,xmm6
+  paddw       xmm0,xmm14
+  movdqa      xmm6,xmm15
+  paddw       xmm0,xmm5
+  pandn       xmm6,xmm13
+  paddw       xmm0,xmm8
+  paddw       xmm0,xmm1
+  psraw       xmm0,3  ; xmm0 = (2*row[3] + 3*row[2] + row[1] + row[0] + row[-1] + 4) >> 3
+  movdqa      xmm1,xmm12
+  paddw       xmm1,xmm13
+  pand        xmm0,xmm3
+  movdqa      [rsp+100h],xmm0
+  movdqa      xmm0,xmm8
+  paddw       xmm0,xmm5
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm3
+  paddw       xmm1,xmm2
+  psraw       xmm1,2
+  pandn       xmm0,xmm14
+  pand        xmm4,xmm1
+  movdqa      [rsp+0E0h],xmm0
+  movdqa      xmm0,xmm5
+  paddw       xmm0,xmm8
+  movdqa      xmm1,[rsp+60h]
+  paddw       xmm1,xmm14
+  movdqa      xmm14,xmm3
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm8
+  paddw       xmm0,[rsp+30h]
+  paddw       xmm1,xmm2
+  psraw       xmm1,2
+  pand        xmm14,xmm1
+  movdqa      xmm1,xmm13
+  paddw       xmm1,xmm13
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm2
+  psraw       xmm1,2
+  movdqa      xmm0,[rsp+30h]
+  movdqa      xmm2,xmm13
+  movdqa      xmm5,xmm15
+  paddw       xmm0,[rsp+70h]
+  pandn       xmm5,xmm1
+  paddw       xmm2,xmm8
+  movdqa      xmm8,[rsp+90h]
+  movdqa      xmm1,xmm12
+  paddw       xmm2,xmm8
+  psllw       xmm2,1
+  paddw       xmm2,xmm0
+  paddw       xmm1,xmm2
+  movdqa      xmm0,xmm8
+  movdqa      xmm8,xmm3
+  movdqa      xmm2,[rsp+30h]
+  paddw       xmm0,xmm13
+  psraw       xmm1,3
+  pand        xmm15,xmm1
+  movdqa      xmm1,xmm2
+  paddw       xmm1,xmm2
+  paddw       xmm2,[rsp+90h]
+  paddw       xmm2,[rsp+0B0h]
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm13
+  movdqa      xmm13,[r8]  ; reload the full rows to process their upper 8 bytes
+  paddw       xmm0, [rsp+70h]
+  paddw       xmm1, [rsp+0A0h]
+  psllw       xmm2,1
+  paddw       xmm2,xmm0
+  psraw       xmm1,2
+  movdqa      xmm0, [rdi]
+  pandn       xmm8,xmm1
+  movdqa      xmm1, [rsp+60h]
+  paddw       xmm1,xmm2
+  movdqa      xmm2, [rbx]
+  psraw       xmm1,3
+  pand        xmm3,xmm1
+  movdqa      xmm1, [rbp]
+  movdqa      [rsp+0D0h],xmm3
+  pxor        xmm3,xmm3  ; xmm3 = 0 for the high-half unpacks
+  punpckhbw   xmm0,xmm3  ; punpckhbw: upper 8 bytes of each row zero-extended to words
+  punpckhbw   xmm1,xmm3
+  punpckhbw   xmm13,xmm3
+  movdqa      [rsp+0C0h],xmm0
+  movdqa      xmm0,[r10+rbp]
+  movdqa      [rsp],xmm1
+  punpckhbw   xmm0,xmm3
+  punpckhbw   xmm2,xmm3
+  movdqa      [rsp+80h],xmm0
+  movdqa      xmm0,[rsi+rbp]
+  movdqa      [rsp+10h],xmm13
+  punpckhbw   xmm0,xmm3
+  movdqa      [rsp+50h],xmm0
+  movdqa      xmm0,xmm1
+  movdqa      xmm1,xmm13
+  psubw       xmm0,xmm13
+  psubw       xmm1,xmm2
+  pabsw       xmm3,xmm0
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,[rsp]
+  movdqa      xmm13,[rsp+40h]
+  movdqa      [rsp+110h],xmm2
+  psubw       xmm1, [rsp+80h]
+  pcmpgtw     xmm13,xmm0
+  pcmpgtw     xmm11,xmm3
+  pabsw       xmm0,xmm1
+  pcmpgtw     xmm10,xmm3
+  movdqa      xmm1, [rsp+40h]
+  movdqa      xmm2,xmm1
+  movdqa      xmm3,xmm1
+  pcmpgtw     xmm2,xmm0
+  movdqa      xmm0, [rsp+10h]
+  pand        xmm13,xmm2
+  pand        xmm13,xmm11  ; xmm13 = high-half counterpart of the mask kept at [rsp+20h]
+  movdqa      xmm11,[rsp+0C0h]
+  psubw       xmm0,xmm11
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm3,xmm0
+  pand        xmm3,xmm10
+  movdqa      xmm0,[rsp]
+  psubw       xmm0,[rsp+50h]
+  movdqa      xmm2,[rdx]
+  pabsw       xmm0,xmm0
+  por         xmm7,xmm9
+  movdqa      xmm9,[rsp+20h]
+  pcmpgtw     xmm1,xmm0
+  pand        xmm9,xmm7
+  movdqa      xmm7,[rsp+20h]
+  movdqa      xmm0,xmm7
+  pandn       xmm0,xmm12
+  movdqa      xmm12,[rsp+110h]
+  pand        xmm1,xmm10
+  movdqa      xmm10,[rsp+70h]
+  movdqa      [rsp+40h],xmm1
+  movdqa      xmm1,xmm13
+  por         xmm9,xmm0
+  pxor        xmm0,xmm0
+  por         xmm4,xmm6
+  movdqa      xmm6,xmm7
+  punpckhbw   xmm2,xmm0
+  por         xmm15,xmm5
+  movdqa      xmm5,[rsp+20h]
+  movdqa      xmm0,xmm3
+  psllw       xmm2,1
+  pandn       xmm0,xmm11
+  pand        xmm6,xmm4
+  movdqa      xmm4,[rsp]
+  paddw       xmm2,xmm11
+  pand        xmm5,xmm15
+  movdqa      xmm15,[rsp+20h]
+  paddw       xmm2,xmm11
+  paddw       xmm2,xmm11
+  paddw       xmm2,xmm12
+  paddw       xmm2,[rsp+10h]
+  paddw       xmm2,[rsp]
+  paddw       xmm2,xmm10
+  psraw       xmm2,3
+  pand        xmm2,xmm3
+  por         xmm2,xmm0
+  pand        xmm1,xmm2
+  movdqa      xmm0,xmm13
+  movdqa      xmm2,xmm11
+  pandn       xmm0,xmm11
+  paddw       xmm2,xmm12
+  por         xmm1,xmm0
+  packuswb    xmm9,xmm1  ; xmm9 = new row [rbp - 3*r10]: low and high halves packed back to bytes with unsigned saturation
+  movdqa      xmm0,xmm7
+  movdqa      xmm7,[rsp+0A0h]
+  pandn       xmm0,[rsp+0F0h]
+  movdqa      xmm1,xmm3
+  por         xmm6,xmm0
+  movdqa      xmm0,[rsp+10h]
+  paddw       xmm0,xmm4
+  paddw       xmm2,xmm0
+  paddw       xmm2,xmm7
+  movdqa      xmm0,xmm3
+  pandn       xmm0,xmm12
+  psraw       xmm2,2
+  pand        xmm1,xmm2
+  por         xmm1,xmm0
+  movdqa      xmm2,xmm13
+  movdqa      xmm0,xmm13
+  pand        xmm2,xmm1
+  pandn       xmm0,xmm12
+  movdqa      xmm1,xmm12
+  paddw       xmm1,[rsp+10h]
+  por         xmm2,xmm0
+  movdqa      xmm0,xmm15
+  pandn       xmm0,[rsp+0B0h]
+  paddw       xmm1,xmm4
+  packuswb    xmm6,xmm2  ; xmm6 = new row [rbp - 2*r10]
+  movdqa      xmm2,xmm3
+  psllw       xmm1,1
+  por         xmm5,xmm0
+  movdqa      xmm0,[rsp+80h]
+  paddw       xmm0,xmm10
+  paddw       xmm1,xmm0
+  paddw       xmm11,xmm1
+  psraw       xmm11,3
+  movdqa      xmm1,xmm12
+  pand        xmm2,xmm11
+  paddw       xmm1,xmm12
+  movdqa      xmm11,[rsp+80h]
+  movdqa      xmm0, [rsp+10h]
+  por         xmm14,[rsp+0E0h]
+  paddw       xmm0,xmm11
+  movdqa      xmm4,xmm15
+  paddw       xmm1,xmm0
+  movdqa      xmm0,xmm13
+  paddw       xmm1,xmm7
+  psraw       xmm1,2
+  pandn       xmm3,xmm1
+  por         xmm2,xmm3
+  movdqa      xmm1,xmm13
+  movdqa      xmm3,[rsp+10h]
+  pandn       xmm0,xmm3
+  pand        xmm1,xmm2
+  movdqa      xmm2,xmm11
+  paddw       xmm2,[rsp]
+  por         xmm1,xmm0
+  movdqa      xmm0,[rsp+0D0h]
+  por         xmm0,xmm8
+  paddw       xmm2,xmm3
+  packuswb    xmm5,xmm1  ; xmm5 = new row [rbp - r10]
+  movdqa      xmm8,[rsp+40h]
+  movdqa      xmm1,[rsp+50h]
+  movdqa      xmm3,xmm8
+  pand        xmm4,xmm0
+  psllw       xmm2,1
+  movdqa      xmm0,xmm15
+  pandn       xmm0,[rsp+90h]
+  por         xmm4,xmm0
+  movdqa      xmm0,xmm12
+  paddw       xmm0,xmm10
+  paddw       xmm2,xmm0
+  paddw       xmm1,xmm2
+  movdqa      xmm0,[rsp]
+  movdqa      xmm2,xmm11
+  paddw       xmm0,xmm12
+  movdqa      xmm12,[rsp]
+  paddw       xmm2,xmm11
+  paddw       xmm2,xmm0
+  psraw       xmm1,3
+  movdqa      xmm0,xmm8
+  pand        xmm3,xmm1
+  paddw       xmm2,xmm7
+  movdqa      xmm1,xmm13
+  psraw       xmm2,2
+  pandn       xmm0,xmm2
+  por         xmm3,xmm0
+  movdqa      xmm2,[rsp+50h]
+  movdqa      xmm0,xmm13
+  pandn       xmm0,xmm12
+  pand        xmm1,xmm3
+  paddw       xmm2,xmm11
+  movdqa      xmm3,xmm15
+  por         xmm1,xmm0
+  pand        xmm3,xmm14
+  movdqa      xmm14,[rsp+10h]
+  movdqa      xmm0,xmm15
+  pandn       xmm0,[rsp+30h]
+  packuswb    xmm4,xmm1  ; xmm4 = new row [rbp]
+  movdqa      xmm1,xmm8
+  por         xmm3,xmm0
+  movdqa      xmm0,xmm12
+  paddw       xmm0,xmm14
+  paddw       xmm2,xmm0
+  paddw       xmm2,xmm7
+  movdqa      xmm0,xmm8
+  pandn       xmm0,xmm11
+  psraw       xmm2,2
+  pand        xmm1,xmm2
+  por         xmm1,xmm0
+  movdqa      xmm2,xmm13
+  movdqa      xmm0,xmm13
+  pandn       xmm0,xmm11
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm15
+  por         xmm2,xmm0
+  packuswb    xmm3,xmm2  ; xmm3 = new row [rbp + r10]
+  movdqa      xmm0,[rsp+100h]
+  por         xmm0,[rsp+120h]
+  pand        xmm1,xmm0
+  movdqa      xmm2,[rcx+rbp]
+  movdqa      xmm7,[rsp+50h]
+  pandn       xmm15,[rsp+60h]
+  lea         r11,[rsp+1D8h]  ; r11 = rsp as it was just before the `sub rsp,1D8h` (entry rsp - 10h after the two pushes)
+  pxor        xmm0,xmm0
+  por         xmm1,xmm15
+  movaps      xmm15,[r11-0A8h]  ; xmm6..xmm15 reloads are interleaved with the remaining arithmetic and stores
+  movdqa      [rdi],xmm9  ; write back row [rbp - 3*r10]
+  movaps      xmm9,[r11-48h]
+  punpckhbw   xmm2,xmm0
+  psllw       xmm2,1
+  paddw       xmm2,xmm7
+  paddw       xmm2,xmm7
+  movdqa      [rbx],xmm6  ; write back row [rbp - 2*r10]
+  movaps      xmm6,[r11-18h]  ; NOTE(review): [r11-18h] is entry rsp - 28h, but xmm6 was spilled to [rax-38h] = entry rsp - 38h; the reload offsets do not match the spill offsets, and the [rsp+120h] scratch slot overlaps the xmm15 spill at [rax-0C8h] - verify whether these registers need preserving on this ABI
+  paddw       xmm2,xmm7
+  paddw       xmm2,xmm11
+  movaps      xmm11,[r11-68h]
+  paddw       xmm2,xmm12
+  movaps      xmm12,[r11-78h]
+  paddw       xmm2,xmm14
+  paddw       xmm2,xmm10
+  psraw       xmm2,3
+  movaps      xmm10,[r11-58h]
+  movaps      xmm14,[r11-98h]
+  movdqa      xmm0,xmm13
+  pand        xmm2,xmm8
+  pandn       xmm8,xmm7
+  pandn       xmm13,xmm7
+  por         xmm2,xmm8
+  movaps      xmm7,[r11-28h]
+  movaps      xmm8,[r11-38h]
+  movdqa      [r8],xmm5  ; write back row [rbp - r10]
+  pand        xmm0,xmm2
+  por         xmm0,xmm13
+  packuswb    xmm1,xmm0  ; xmm1 = new row [rbp + 2*r10]
+  movaps      xmm13,[r11-88h]
+  movdqa      [rbp],xmm4  ; write back row [rbp]
+  movdqa      [r10+rbp],xmm3  ; write back row [rbp + r10]
+  movdqa      [rsi+rbp],xmm1  ; write back row [rbp + 2*r10]
+  mov         rsp,r11  ; release the 1D8h-byte scratch area
+  pop         rbp
+  pop         rbx
   ret
 
 WELS_EXTERN  DeblockChromaLt4V_sse2
-ALIGN  16 
-DeblockChromaLt4V_sse2: 
-  mov         rax,rsp 
-  push        rbx  
-  push        rbp    
+ALIGN  16
+DeblockChromaLt4V_sse2:  ; rewrites 8 bytes at each of [rbx-r11], [rbx], [rdi-r11], [rdi]; rbx/rdi come from incoming rdi/rsi
+  mov         rax,rsp  ; NOTE(review): this value is never read; eax is reloaded below before any use
+  push        rbx  ; rbx and rbp are used below and popped before ret
+  push        rbp
   mov         r10,  rdx
   mov         r11,  rcx
   mov         rcx,  rdi
-  mov         rdx,  rsi  
+  mov         rdx,  rsi  ; together with the neighbouring moves: rcx/rdx/r8/r9 <- incoming rdi/rsi/rdx/rcx, rbp <- r8, r10 <- r9
   mov         rsi,  r10
   mov         r10,  r9
   mov         rbp,  r8
   mov         r8,   rsi
   mov         r9,   r11
-  sub         rsp,0C8h   
-  pxor        xmm1,xmm1 
-  mov         rbx,rcx 
-  movsxd      r11,r8d 
-  movsx       ecx,byte [r10] 
-  movsx       r8d,byte [r10+2] 
-  mov         rdi,rdx 
-  movq        xmm2,[rbx] 
-  movq        xmm9,[r11+rbx] 
-  movsx       edx,byte [r10+1] 
-  mov         word [rsp+2],cx 
-  mov         word [rsp],cx 
-  movsx       eax,byte [r10+3] 
-  mov         word [rsp+6],dx 
-  mov         word [rsp+4],dx 
-  movdqa      xmm11,xmm1 
-  mov         word [rsp+0Eh],ax 
-  mov         word [rsp+0Ch],ax 
-  lea         eax,[r11+r11] 
-  movsxd      rcx,eax 
-  mov         rax,rbx 
-  mov         rdx,rdi 
-  sub         rax,rcx 
-  mov         word [rsp+0Ah],r8w 
-  mov         word [rsp+8],r8w 
-  movdqa      xmm6,[rsp] 
-  movdqa      xmm7,xmm6 
-  movq        xmm13, [rax] 
-  mov         rax,rdi 
-  sub         rax,rcx 
-  mov         rcx,rbx 
-  pcmpgtw     xmm7,xmm1 
-  psubw       xmm11,xmm6 
-  sub         rcx,r11 
-  sub         rdx,r11 
-  movq        xmm0,[rax] 
-  movsx       eax,r9w 
-  movq        xmm15,[rcx] 
-  punpcklqdq  xmm13,xmm0 
-  movq        xmm0, [rdx] 
-  movdqa      xmm4,xmm13 
-  punpcklqdq  xmm15,xmm0 
-  movq        xmm0, [rdi] 
-  punpcklbw   xmm4,xmm1 
-  movdqa      xmm12,xmm15 
-  punpcklqdq  xmm2,xmm0 
-  movq        xmm0, [r11+rdi] 
-  punpcklbw   xmm12,xmm1 
-  movdqa      xmm14,xmm2 
-  punpcklqdq  xmm9,xmm0 
-  punpckhbw   xmm2,xmm1 
-  punpcklbw   xmm14,xmm1 
-  movd        xmm0,eax 
+  sub         rsp,0C8h  ; reserve 0C8h bytes of stack scratch
+  pxor        xmm1,xmm1  ; xmm1 = 0, the zero source for byte->word unpacking
+  mov         rbx,rcx  ; rbx = incoming rdi (first 8-byte-wide plane, presumably - confirm)
+  movsxd      r11,r8d  ; r11 = sign-extended low 32 bits of incoming rdx (row stride, presumably - confirm)
+  movsx       ecx,byte [r10]  ; four signed bytes are read through the incoming r9 pointer: [r10], [r10+1], [r10+2], [r10+3]
+  movsx       r8d,byte [r10+2]
+  mov         rdi,rdx  ; rdi = incoming rsi (second plane, presumably - confirm)
+  movq        xmm2,[rbx]
+  movq        xmm9,[r11+rbx]
+  movsx       edx,byte [r10+1]
+  mov         word [rsp+2],cx  ; each of the four bytes is written to two adjacent words of [rsp..rsp+0Fh]
+  mov         word [rsp],cx
+  movsx       eax,byte [r10+3]
+  mov         word [rsp+6],dx
+  mov         word [rsp+4],dx
+  movdqa      xmm11,xmm1
+  mov         word [rsp+0Eh],ax
+  mov         word [rsp+0Ch],ax
+  lea         eax,[r11+r11]
+  movsxd      rcx,eax  ; rcx = 2*r11 (computed in 32 bits, then sign-extended)
+  mov         rax,rbx
+  mov         rdx,rdi
+  sub         rax,rcx  ; rax = rbx - 2*r11
+  mov         word [rsp+0Ah],r8w
+  mov         word [rsp+8],r8w
+  movdqa      xmm6,[rsp]  ; xmm6 = the 8 per-word clamp values assembled above (tc, presumably - confirm)
+  movdqa      xmm7,xmm6
+  movq        xmm13, [rax]
+  mov         rax,rdi
+  sub         rax,rcx  ; rax = rdi - 2*r11
+  mov         rcx,rbx
+  pcmpgtw     xmm7,xmm1  ; xmm7 = mask of words whose clamp value is > 0
+  psubw       xmm11,xmm6  ; xmm11 = -xmm6 (lower clamp bound)
+  sub         rcx,r11  ; rcx = rbx - r11
+  sub         rdx,r11  ; rdx = rdi - r11
+  movq        xmm0,[rax]
+  movsx       eax,r9w  ; low 16 bits of incoming rcx, sign-extended
+  movq        xmm15,[rcx]
+  punpcklqdq  xmm13,xmm0  ; each row register holds 8 bytes from the rbx plane (low qword) and 8 from the rdi plane (high qword)
+  movq        xmm0, [rdx]
+  movdqa      xmm4,xmm13
+  punpcklqdq  xmm15,xmm0
+  movq        xmm0, [rdi]
+  punpcklbw   xmm4,xmm1
+  movdqa      xmm12,xmm15
+  punpcklqdq  xmm2,xmm0
+  movq        xmm0, [r11+rdi]
+  punpcklbw   xmm12,xmm1
+  movdqa      xmm14,xmm2
+  punpcklqdq  xmm9,xmm0
+  punpckhbw   xmm2,xmm1
+  punpcklbw   xmm14,xmm1
+  movd        xmm0,eax
   mov         eax, ebp ; iBeta
-  punpckhbw   xmm13,xmm1 
-  punpckhbw   xmm15,xmm1 
-  movdqa      xmm3,xmm9 
-  movdqa      [rsp+10h],xmm2 
-  punpcklwd   xmm0,xmm0 
-  punpckhbw   xmm9,xmm1 
-  punpcklbw   xmm3,xmm1 
-  movdqa      xmm1,xmm14 
-  pshufd      xmm10,xmm0,0 
-  movd        xmm0,eax 
-  mov         eax,4 
-  cwde             
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm8,xmm0,0 
-  movd        xmm0,eax 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm5,xmm0,0 
-  psubw       xmm1,xmm12 
-  movdqa      xmm2,xmm10 
-  lea         r11,[rsp+0C8h] 
-  psllw       xmm1,2 
-  movdqa      xmm0,xmm4 
-  psubw       xmm4,xmm12 
-  psubw       xmm0,xmm3 
-  psubw       xmm3,xmm14 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm5 
-  movdqa      xmm0,xmm11 
-  psraw       xmm1,3 
-  pmaxsw      xmm0,xmm1 
-  pminsw      xmm6,xmm0 
-  movdqa      xmm1,xmm8 
-  movdqa      xmm0,xmm12 
-  psubw       xmm0,xmm14 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm2,xmm0 
-  pabsw       xmm0,xmm4 
-  pcmpgtw     xmm1,xmm0 
-  pabsw       xmm0,xmm3 
-  movdqa      xmm3,[rsp] 
-  pand        xmm2,xmm1 
-  movdqa      xmm1,xmm8 
-  pcmpgtw     xmm1,xmm0 
-  movdqa      xmm0,xmm13 
-  pand        xmm2,xmm1 
-  psubw       xmm0,xmm9 
-  psubw       xmm13,xmm15 
-  pand        xmm2,xmm7 
-  pand        xmm6,xmm2 
-  paddw       xmm12,xmm6 
-  psubw       xmm14,xmm6 
-  movdqa      xmm2,[rsp+10h] 
-  movaps      xmm6,[r11-18h] 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm15 
-  psubw       xmm9,xmm2 
-  psllw       xmm1,2 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm5 
-  movdqa      xmm0,xmm15 
-  psubw       xmm0,xmm2 
-  psraw       xmm1,3 
-  pmaxsw      xmm11,xmm1 
-  pabsw       xmm0,xmm0 
-  movdqa      xmm1,xmm8 
-  pcmpgtw     xmm10,xmm0 
-  pabsw       xmm0,xmm13 
-  pminsw      xmm3,xmm11 
-  movaps      xmm11,[r11-68h] 
-  movaps      xmm13,[rsp+40h] 
-  pcmpgtw     xmm1,xmm0 
-  pabsw       xmm0,xmm9 
-  movaps      xmm9, [r11-48h] 
-  pand        xmm10,xmm1 
-  pcmpgtw     xmm8,xmm0 
-  pand        xmm10,xmm8 
-  pand        xmm10,xmm7 
-  movaps      xmm8,[r11-38h] 
-  movaps      xmm7,[r11-28h] 
-  pand        xmm3,xmm10 
-  paddw       xmm15,xmm3 
-  psubw       xmm2,xmm3 
-  movaps      xmm10,[r11-58h] 
-  packuswb    xmm12,xmm15 
-  movaps      xmm15,[rsp+20h] 
-  packuswb    xmm14,xmm2 
-  movq        [rcx],xmm12 
-  movq        [rbx],xmm14 
-  psrldq      xmm12,8 
-  psrldq      xmm14,8 
-  movq        [rdx],xmm12 
-  movaps      xmm12,[r11-78h] 
-  movq        [rdi],xmm14 
-  movaps      xmm14,[rsp+30h] 
-  mov         rsp,r11 
-  pop         rbp  
-  pop         rbx  
+  punpckhbw   xmm13,xmm1
+  punpckhbw   xmm15,xmm1
+  movdqa      xmm3,xmm9
+  movdqa      [rsp+10h],xmm2  ; keep the high-half words of row [rbx] / [rdi]
+  punpcklwd   xmm0,xmm0
+  punpckhbw   xmm9,xmm1
+  punpcklbw   xmm3,xmm1
+  movdqa      xmm1,xmm14
+  pshufd      xmm10,xmm0,0  ; xmm10 = 16-bit value from incoming rcx in all 8 words (alpha threshold, presumably - confirm)
+  movd        xmm0,eax
+  mov         eax,4
+  cwde  ; sign-extends ax into eax; eax was just loaded with 4, so the value is unchanged
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm8,xmm0,0  ; xmm8 = low 16 bits of ebp (iBeta) in all 8 words
+  movd        xmm0,eax
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm5,xmm0,0  ; xmm5 = 4 in every word (rounding term)
+  psubw       xmm1,xmm12
+  movdqa      xmm2,xmm10
+  lea         r11,[rsp+0C8h]  ; r11 = rsp as it was just before the `sub rsp,0C8h`
+  psllw       xmm1,2
+  movdqa      xmm0,xmm4
+  psubw       xmm4,xmm12
+  psubw       xmm0,xmm3
+  psubw       xmm3,xmm14
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm11
+  psraw       xmm1,3  ; xmm1 = (((row[0] - row[-1]) << 2) + (row[-2] - row[1]) + 4) >> 3, rows indexed in units of r11
+  pmaxsw      xmm0,xmm1
+  pminsw      xmm6,xmm0  ; xmm6 = delta clamped to [-xmm6, +xmm6]
+  movdqa      xmm1,xmm8
+  movdqa      xmm0,xmm12
+  psubw       xmm0,xmm14
+  pabsw       xmm0,xmm0  ; NOTE(review): pabsw is an SSSE3 instruction although the symbol is suffixed _sse2
+  pcmpgtw     xmm2,xmm0
+  pabsw       xmm0,xmm4
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm3
+  movdqa      xmm3,[rsp]
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm1,xmm0
+  movdqa      xmm0,xmm13
+  pand        xmm2,xmm1
+  psubw       xmm0,xmm9
+  psubw       xmm13,xmm15
+  pand        xmm2,xmm7
+  pand        xmm6,xmm2  ; zero the delta wherever any threshold test failed or the clamp value was not > 0
+  paddw       xmm12,xmm6  ; row[-1] += delta (low half)
+  psubw       xmm14,xmm6  ; row[0]  -= delta (low half)
+  movdqa      xmm2,[rsp+10h]
+  movaps      xmm6,[r11-18h]  ; NOTE(review): xmm6..xmm15 are reloaded from stack slots that the visible body never writes; the loaded values are indeterminate - verify whether these reloads are needed on this ABI
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm15
+  psubw       xmm9,xmm2
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm15
+  psubw       xmm0,xmm2
+  psraw       xmm1,3  ; same delta computation for the high-half words
+  pmaxsw      xmm11,xmm1
+  pabsw       xmm0,xmm0
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm10,xmm0
+  pabsw       xmm0,xmm13
+  pminsw      xmm3,xmm11
+  movaps      xmm11,[r11-68h]
+  movaps      xmm13,[rsp+40h]
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm9
+  movaps      xmm9, [r11-48h]
+  pand        xmm10,xmm1
+  pcmpgtw     xmm8,xmm0
+  pand        xmm10,xmm8
+  pand        xmm10,xmm7
+  movaps      xmm8,[r11-38h]
+  movaps      xmm7,[r11-28h]
+  pand        xmm3,xmm10
+  paddw       xmm15,xmm3
+  psubw       xmm2,xmm3
+  movaps      xmm10,[r11-58h]
+  packuswb    xmm12,xmm15  ; pack row[-1] back to 16 bytes with unsigned saturation
+  movaps      xmm15,[rsp+20h]
+  packuswb    xmm14,xmm2  ; pack row[0] back to 16 bytes
+  movq        [rcx],xmm12  ; low 8 bytes -> [rbx - r11]
+  movq        [rbx],xmm14  ; low 8 bytes -> [rbx]
+  psrldq      xmm12,8
+  psrldq      xmm14,8
+  movq        [rdx],xmm12  ; high 8 bytes -> [rdi - r11]
+  movaps      xmm12,[r11-78h]
+  movq        [rdi],xmm14  ; high 8 bytes -> [rdi]
+  movaps      xmm14,[rsp+30h]
+  mov         rsp,r11  ; release the 0C8h-byte scratch area
+  pop         rbp
+  pop         rbx
   ret
 
 WELS_EXTERN   DeblockChromaEq4V_sse2
 ALIGN 16
 DeblockChromaEq4V_sse2:
-  mov         rax,rsp 
-  push        rbx  
+  mov         rax,rsp
+  push        rbx
   push        rbp
 
   mov         rbp, r8
@@ -2545,143 +2545,143 @@
   mov         r9, rcx
   mov         rcx, rdi
   mov         rdx, rsi
-  
-  sub         rsp,90h 
-  pxor        xmm1,xmm1 
-  mov         r11,rcx 
-  mov         rbx,rdx 
-  mov         r10d,r9d   
-  movq        xmm13,[r11] 
-  lea         eax,[r8+r8] 
-  movsxd      r9,eax 
-  mov         rax,rcx 
-  sub         rax,r9 
-  movq        xmm14,[rax] 
-  mov         rax,rdx 
-  sub         rax,r9 
-  movq        xmm0,[rax] 
-  movsxd      rax,r8d 
-  sub         rcx,rax 
-  sub         rdx,rax 
-  movq        xmm12,[rax+r11] 
-  movq        xmm10,[rcx] 
-  punpcklqdq  xmm14,xmm0 
-  movdqa      xmm8,xmm14 
-  movq        xmm0,[rdx] 
-  punpcklbw   xmm8,xmm1 
-  punpckhbw   xmm14,xmm1 
-  punpcklqdq  xmm10,xmm0 
-  movq        xmm0,[rbx] 
-  movdqa      xmm5,xmm10 
-  punpcklqdq  xmm13,xmm0 
-  movq        xmm0, [rax+rbx] 
-  punpcklbw   xmm5,xmm1 
-  movsx       eax,r10w 
-  movdqa      xmm9,xmm13 
-  punpcklqdq  xmm12,xmm0 
-  punpcklbw   xmm9,xmm1 
-  punpckhbw   xmm10,xmm1 
-  movd        xmm0,eax 
+
+  sub         rsp,90h             ; reserve 0x90 bytes of stack
+  pxor        xmm1,xmm1           ; xmm1 = 0, the zero operand for the byte->word unpacks below
+  mov         r11,rcx             ; r11 = rcx, kept as the unmodified first pointer
+  mov         rbx,rdx             ; rbx = rdx, kept as the unmodified second pointer
+  mov         r10d,r9d            ; r10d = r9d (rcx on entry); its low word is broadcast as a threshold later
+  movq        xmm13,[r11]         ; 8 bytes at [r11]
+  lea         eax,[r8+r8]         ; eax = 2*r8; r8 is set outside the lines shown here, presumably the row pitch - TODO confirm
+  movsxd      r9,eax              ; r9 = sign-extended 2*r8
+  mov         rax,rcx
+  sub         rax,r9              ; rax = r11 - 2*r8
+  movq        xmm14,[rax]         ; 8 bytes at [r11 - 2*r8]
+  mov         rax,rdx
+  sub         rax,r9              ; rax = rbx - 2*r8
+  movq        xmm0,[rax]          ; 8 bytes at [rbx - 2*r8]
+  movsxd      rax,r8d             ; rax = sign-extended r8d
+  sub         rcx,rax             ; rcx = r11 - r8 (also the store address used later)
+  sub         rdx,rax             ; rdx = rbx - r8 (also the store address used later)
+  movq        xmm12,[rax+r11]     ; 8 bytes at [r11 + r8]
+  movq        xmm10,[rcx]         ; 8 bytes at [r11 - r8]
+  punpcklqdq  xmm14,xmm0          ; xmm14 = { [r11-2*r8] bytes, [rbx-2*r8] bytes }
+  movdqa      xmm8,xmm14
+  movq        xmm0,[rdx]          ; 8 bytes at [rbx - r8]
+  punpcklbw   xmm8,xmm1           ; xmm8  = low 8 bytes of the -2*r8 rows, zero-extended to words
+  punpckhbw   xmm14,xmm1          ; xmm14 = high 8 bytes of the -2*r8 rows, zero-extended to words
+  punpcklqdq  xmm10,xmm0          ; xmm10 = { [r11-r8] bytes, [rbx-r8] bytes }
+  movq        xmm0,[rbx]          ; 8 bytes at [rbx]
+  movdqa      xmm5,xmm10
+  punpcklqdq  xmm13,xmm0          ; xmm13 = { [r11] bytes, [rbx] bytes }
+  movq        xmm0, [rax+rbx]     ; 8 bytes at [rbx + r8]
+  punpcklbw   xmm5,xmm1           ; xmm5  = low 8 bytes of the -r8 rows as words
+  movsx       eax,r10w            ; eax = sign-extended low word of r10
+  movdqa      xmm9,xmm13
+  punpcklqdq  xmm12,xmm0          ; xmm12 = { [r11+r8] bytes, [rbx+r8] bytes }
+  punpcklbw   xmm9,xmm1           ; xmm9  = low 8 bytes of the +0 rows as words
+  punpckhbw   xmm10,xmm1          ; xmm10 = high 8 bytes of the -r8 rows as words
+  movd        xmm0,eax            ; xmm0 low dword = threshold taken from r10w
   mov         eax, ebp   ; iBeta
-  punpckhbw   xmm13,xmm1 
-  movdqa      xmm7,xmm12 
-  punpcklwd   xmm0,xmm0 
-  punpckhbw   xmm12,xmm1 
-  pshufd      xmm11,xmm0,0 
-  punpcklbw   xmm7,xmm1 
-  movd        xmm0,eax 
-  movdqa      xmm1,xmm8 
-  psubw       xmm1,xmm5 
-  punpcklwd   xmm0,xmm0 
-  movdqa      xmm6,xmm11 
-  pshufd      xmm3,xmm0,0 
-  movdqa      xmm0,xmm5 
-  psubw       xmm0,xmm9 
-  movdqa      xmm2,xmm3 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm6,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,xmm3 
-  pcmpgtw     xmm2,xmm0 
-  pand        xmm6,xmm2 
-  movdqa      xmm0,xmm7 
-  movdqa      xmm2,xmm3 
-  psubw       xmm0,xmm9 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm1,xmm0 
-  pand        xmm6,xmm1 
-  movdqa      xmm0,xmm10 
-  movdqa      xmm1,xmm14 
-  psubw       xmm0,xmm13 
-  psubw       xmm1,xmm10 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm11,xmm0 
-  pabsw       xmm0,xmm1 
-  pcmpgtw     xmm2,xmm0 
-  pand        xmm11,xmm2 
-  movdqa      xmm0,xmm12 
-  movdqa      xmm4,xmm6 
-  movdqa      xmm1,xmm8 
-  mov         eax,2 
-  cwde             
-  paddw       xmm1,xmm8 
-  psubw       xmm0,xmm13 
-  paddw       xmm1,xmm5 
-  pabsw       xmm0,xmm0 
-  movdqa      xmm2,xmm14 
-  paddw       xmm1,xmm7 
-  pcmpgtw     xmm3,xmm0 
-  paddw       xmm2,xmm14 
-  movd        xmm0,eax 
-  pand        xmm11,xmm3 
-  paddw       xmm7,xmm7 
-  paddw       xmm2,xmm10 
-  punpcklwd   xmm0,xmm0 
-  paddw       xmm2,xmm12 
-  paddw       xmm12,xmm12 
-  pshufd      xmm3,xmm0,0 
-  paddw       xmm7,xmm9 
-  paddw       xmm12,xmm13 
-  movdqa      xmm0,xmm6 
-  paddw       xmm1,xmm3 
-  pandn       xmm0,xmm5 
-  paddw       xmm7,xmm8 
-  psraw       xmm1,2 
-  paddw       xmm12,xmm14 
-  paddw       xmm7,xmm3 
-  ;movaps      xmm14,[rsp] 
-  pand        xmm4,xmm1 
-  paddw       xmm12,xmm3 
-  psraw       xmm7,2 
-  movdqa      xmm1,xmm11 
-  por         xmm4,xmm0 
-  psraw       xmm12,2 
-  paddw       xmm2,xmm3 
-  movdqa      xmm0,xmm11 
-  pandn       xmm0,xmm10 
-  psraw       xmm2,2 
-  pand        xmm1,xmm2 
-  por         xmm1,xmm0 
-  packuswb    xmm4,xmm1 
-  movdqa      xmm0,xmm11 
-  movdqa      xmm1,xmm6 
-  pand        xmm1,xmm7 
-  movq        [rcx],xmm4 
-  pandn       xmm6,xmm9 
-  pandn       xmm11,xmm13 
-  pand        xmm0,xmm12 
-  por         xmm1,xmm6 
-  por         xmm0,xmm11 
-  psrldq      xmm4,8 
-  packuswb    xmm1,xmm0 
-  movq        [r11],xmm1 
-  psrldq      xmm1,8 
-  movq        [rdx],xmm4 
-  lea         r11,[rsp+90h] 
-  movq        [rbx],xmm1 
-  mov         rsp,r11 
+  punpckhbw   xmm13,xmm1          ; xmm13 = high 8 bytes of the +0 rows as words
+  movdqa      xmm7,xmm12
+  punpcklwd   xmm0,xmm0           ; duplicate the low word so pshufd can broadcast it
+  punpckhbw   xmm12,xmm1          ; xmm12 = high 8 bytes of the +r8 rows as words
+  pshufd      xmm11,xmm0,0        ; xmm11 = r10w threshold in all 8 words
+  punpcklbw   xmm7,xmm1           ; xmm7  = low 8 bytes of the +r8 rows as words
+  movd        xmm0,eax            ; xmm0 low dword = iBeta (eax was loaded from ebp)
+  movdqa      xmm1,xmm8
+  psubw       xmm1,xmm5           ; xmm1 = xmm8 - xmm5   (-2*r8 row minus -r8 row, low half)
+  punpcklwd   xmm0,xmm0
+  movdqa      xmm6,xmm11
+  pshufd      xmm3,xmm0,0         ; xmm3 = iBeta in all 8 words
+  movdqa      xmm0,xmm5
+  psubw       xmm0,xmm9           ; xmm0 = xmm5 - xmm9   (-r8 row minus +0 row, low half)
+  movdqa      xmm2,xmm3
+  pabsw       xmm0,xmm0           ; NOTE(review): pabsw is an SSSE3 instruction although the symbol is suffixed _sse2 - confirm the caller checks for SSSE3
+  pcmpgtw     xmm6,xmm0           ; xmm6 = threshold > |xmm5 - xmm9|
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm3
+  pcmpgtw     xmm2,xmm0           ; xmm2 = iBeta > |xmm8 - xmm5|
+  pand        xmm6,xmm2
+  movdqa      xmm0,xmm7
+  movdqa      xmm2,xmm3
+  psubw       xmm0,xmm9
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm1,xmm0           ; xmm1 = iBeta > |xmm7 - xmm9|
+  pand        xmm6,xmm1           ; xmm6 = per-word filter mask for the low half (all three tests passed)
+  movdqa      xmm0,xmm10
+  movdqa      xmm1,xmm14
+  psubw       xmm0,xmm13          ; same three tests, now on the high-half words
+  psubw       xmm1,xmm10
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm11,xmm0          ; xmm11 = threshold > |xmm10 - xmm13|
+  pabsw       xmm0,xmm1
+  pcmpgtw     xmm2,xmm0           ; xmm2 = iBeta > |xmm14 - xmm10|
+  pand        xmm11,xmm2
+  movdqa      xmm0,xmm12
+  movdqa      xmm4,xmm6           ; xmm4 = copy of the low-half mask, consumed by the blend below
+  movdqa      xmm1,xmm8
+  mov         eax,2               ; rounding constant for the >>2 below
+  cwde                            ; sign-extends ax into eax; eax is still 2
+  paddw       xmm1,xmm8           ; xmm1 = 2*xmm8
+  psubw       xmm0,xmm13
+  paddw       xmm1,xmm5           ; xmm1 = 2*xmm8 + xmm5
+  pabsw       xmm0,xmm0
+  movdqa      xmm2,xmm14
+  paddw       xmm1,xmm7           ; xmm1 = 2*xmm8 + xmm5 + xmm7 (xmm7 is doubled only afterwards)
+  pcmpgtw     xmm3,xmm0           ; xmm3 = iBeta > |xmm12 - xmm13|
+  paddw       xmm2,xmm14          ; xmm2 = 2*xmm14
+  movd        xmm0,eax
+  pand        xmm11,xmm3          ; xmm11 = per-word filter mask for the high half
+  paddw       xmm7,xmm7           ; xmm7 = 2*xmm7
+  paddw       xmm2,xmm10          ; xmm2 = 2*xmm14 + xmm10
+  punpcklwd   xmm0,xmm0
+  paddw       xmm2,xmm12          ; xmm2 = 2*xmm14 + xmm10 + xmm12 (xmm12 is doubled on the next line)
+  paddw       xmm12,xmm12         ; xmm12 = 2*xmm12
+  pshufd      xmm3,xmm0,0         ; xmm3 = 2 in all 8 words
+  paddw       xmm7,xmm9           ; xmm7 = 2*xmm7 + xmm9
+  paddw       xmm12,xmm13         ; xmm12 = 2*xmm12 + xmm13
+  movdqa      xmm0,xmm6
+  paddw       xmm1,xmm3           ; add the rounding 2
+  pandn       xmm0,xmm5           ; xmm0 = original xmm5 words where the mask is clear
+  paddw       xmm7,xmm8           ; xmm7 = 2*xmm7 + xmm9 + xmm8
+  psraw       xmm1,2              ; xmm1 = (2*xmm8 + xmm5 + xmm7 + 2) >> 2
+  paddw       xmm12,xmm14         ; xmm12 = 2*xmm12 + xmm13 + xmm14
+  paddw       xmm7,xmm3
+  ;movaps      xmm14,[rsp]
+  pand        xmm4,xmm1           ; filtered words where the mask is set
+  paddw       xmm12,xmm3
+  psraw       xmm7,2              ; xmm7 = (2*xmm7 + xmm9 + xmm8 + 2) >> 2
+  movdqa      xmm1,xmm11
+  por         xmm4,xmm0           ; xmm4 = blended low half of the new -r8 rows
+  psraw       xmm12,2             ; xmm12 = (2*xmm12 + xmm13 + xmm14 + 2) >> 2
+  paddw       xmm2,xmm3
+  movdqa      xmm0,xmm11
+  pandn       xmm0,xmm10          ; original xmm10 words where the high mask is clear
+  psraw       xmm2,2              ; xmm2 = (2*xmm14 + xmm10 + xmm12 + 2) >> 2
+  pand        xmm1,xmm2
+  por         xmm1,xmm0           ; xmm1 = blended high half of the new -r8 rows
+  packuswb    xmm4,xmm1           ; back to 16 bytes with unsigned saturation
+  movdqa      xmm0,xmm11
+  movdqa      xmm1,xmm6
+  pand        xmm1,xmm7
+  movq        [rcx],xmm4          ; low 8 bytes -> [r11 - r8]
+  pandn       xmm6,xmm9
+  pandn       xmm11,xmm13
+  pand        xmm0,xmm12
+  por         xmm1,xmm6           ; blended low half of the new +0 rows
+  por         xmm0,xmm11          ; blended high half of the new +0 rows
+  psrldq      xmm4,8              ; bring the upper 8 bytes down for the second store
+  packuswb    xmm1,xmm0
+  movq        [r11],xmm1          ; low 8 bytes -> [r11]
+  psrldq      xmm1,8
+  movq        [rdx],xmm4          ; upper 8 bytes -> [rbx - r8]
+  lea         r11,[rsp+90h]       ; r11 = rsp as it was before the sub rsp,90h
+  movq        [rbx],xmm1          ; upper 8 bytes -> [rbx]
+  mov         rsp,r11             ; release the 0x90-byte scratch area
   pop         rbp
-  pop         rbx  
+  pop         rbx
   ret
 
 
@@ -2688,270 +2688,270 @@
 WELS_EXTERN   DeblockChromaEq4H_sse2
 ALIGN  16
 DeblockChromaEq4H_sse2:
-  mov         rax,rsp 
-  push        rbx 
-  push        rbp 
+  mov         rax,rsp
+  push        rbx
+  push        rbp
   push        r12
-  
-  mov         rbp,   r8  
+
+  mov         rbp,   r8
   mov         r8,    rdx
   mov         r9,    rcx
   mov         rcx,   rdi
-  mov         rdx,   rsi  
+  mov         rdx,   rsi
   mov         rdi,   rdx
 
-  sub         rsp,140h     
-  lea         eax,[r8*4] 
-  movsxd      r10,eax 
-  mov         eax,[rcx-2] 
-  mov         [rsp+10h],eax 
-  lea         rbx,[r10+rdx-2] 
-  lea         r11,[r10+rcx-2] 
+  sub         rsp,140h            ; reserve 0x140 bytes of stack scratch
+  lea         eax,[r8*4]          ; eax = 4*r8 (r8 = rdx on entry, used as the row pitch)
+  movsxd      r10,eax             ; r10 = sign-extended 4*r8
+  mov         eax,[rcx-2]         ; 4 bytes starting 2 bytes before rcx (row 0 of the first pointer)
+  mov         [rsp+10h],eax       ; stage the dword in a scratch slot for the 16-byte load below
+  lea         rbx,[r10+rdx-2]     ; rbx = rdx + 4*r8 - 2 (row 4 of the second pointer; rdx == rdi here)
+  lea         r11,[r10+rcx-2]     ; r11 = rcx + 4*r8 - 2 (row 4 of the first pointer)
 
-  movdqa      xmm5,[rsp+10h] 
-  movsxd      r10,r8d 
-  mov         eax,[r10+rcx-2] 
-  lea         rdx,[r10+r10*2] 
-  mov         [rsp+20h],eax 
-  mov         eax,[rcx+r10*2-2] 
-  mov         [rsp+30h],eax 
+  movdqa      xmm5,[rsp+10h]      ; low dword = row 0; the other 12 bytes are stale stack and are dropped by the later low-half unpacks
+  movsxd      r10,r8d             ; r10 = sign-extended r8d, the row pitch from here on
+  mov         eax,[r10+rcx-2]     ; 4 bytes of row 1 (rcx + r8 - 2)
+  lea         rdx,[r10+r10*2]     ; rdx = 3*r8, offset of row 3
+  mov         [rsp+20h],eax       ; stage row 1
+  mov         eax,[rcx+r10*2-2]   ; 4 bytes of row 2 (rcx + 2*r8 - 2)
+  mov         [rsp+30h],eax       ; stage row 2
   mov         eax,[rdx+rcx-2]
-  movdqa      xmm2,[rsp+20h] 
-  mov         [rsp+40h],eax 
-  mov         eax, [rdi-2] 
-  movdqa      xmm4,[rsp+30h] 
-  mov         [rsp+50h],eax 
-  mov         eax,[r10+rdi-2] 
-  movdqa      xmm3,[rsp+40h] 
-  mov         [rsp+60h],eax 
-  mov         eax,[rdi+r10*2-2] 
-  punpckldq   xmm5,[rsp+50h] 
-  mov         [rsp+70h],eax 
-  mov         eax, [rdx+rdi-2] 
-  punpckldq   xmm2, [rsp+60h] 
-  mov          [rsp+80h],eax 
-  mov         eax,[r11] 
-  punpckldq   xmm4, [rsp+70h] 
-  mov         [rsp+50h],eax 
-  mov         eax,[rbx] 
-  punpckldq   xmm3,[rsp+80h] 
-  mov         [rsp+60h],eax 
-  mov         eax,[r10+r11] 
-  movdqa      xmm0, [rsp+50h] 
-  punpckldq   xmm0, [rsp+60h] 
-  punpcklqdq  xmm5,xmm0 
-  movdqa      [rsp+50h],xmm0 
-  mov         [rsp+50h],eax 
-  mov         eax,[r10+rbx] 
-  movdqa      xmm0,[rsp+50h] 
-  movdqa      xmm1,xmm5 
-  mov         [rsp+60h],eax 
-  mov         eax,[r11+r10*2] 
-  punpckldq   xmm0, [rsp+60h] 
-  punpcklqdq  xmm2,xmm0 
-  punpcklbw   xmm1,xmm2 
-  punpckhbw   xmm5,xmm2 
-  movdqa      [rsp+50h],xmm0 
-  mov         [rsp+50h],eax 
-  mov         eax,[rbx+r10*2] 
-  movdqa      xmm0,[rsp+50h] 
-  mov         [rsp+60h],eax 
-  mov         eax, [rdx+r11] 
-  movdqa      xmm15,xmm1 
-  punpckldq   xmm0,[rsp+60h] 
-  punpcklqdq  xmm4,xmm0 
-  movdqa      [rsp+50h],xmm0 
-  mov         [rsp+50h],eax 
-  mov         eax, [rdx+rbx] 
-  movdqa      xmm0,[rsp+50h] 
-  mov         [rsp+60h],eax 
-  punpckldq   xmm0, [rsp+60h] 
-  punpcklqdq  xmm3,xmm0 
-  movdqa      xmm0,xmm4 
-  punpcklbw   xmm0,xmm3 
-  punpckhbw   xmm4,xmm3 
-  punpcklwd   xmm15,xmm0 
-  punpckhwd   xmm1,xmm0 
-  movdqa      xmm0,xmm5 
-  movdqa      xmm12,xmm15 
-  punpcklwd   xmm0,xmm4 
-  punpckhwd   xmm5,xmm4 
-  punpckldq   xmm12,xmm0 
-  punpckhdq   xmm15,xmm0 
-  movdqa      xmm0,xmm1 
-  movdqa      xmm11,xmm12 
-  punpckldq   xmm0,xmm5 
-  punpckhdq   xmm1,xmm5 
-  punpcklqdq  xmm11,xmm0 
-  punpckhqdq  xmm12,xmm0 
-  movsx       eax,r9w 
-  movdqa      xmm14,xmm15 
-  punpcklqdq  xmm14,xmm1 
-  punpckhqdq  xmm15,xmm1 
-  pxor        xmm1,xmm1 
-  movd        xmm0,eax 
-  movdqa      xmm4,xmm12 
-  movdqa      xmm8,xmm11 
-  mov         eax, ebp ; iBeta
-  punpcklwd   xmm0,xmm0 
-  punpcklbw   xmm4,xmm1 
-  punpckhbw   xmm12,xmm1 
-  movdqa      xmm9,xmm14 
-  movdqa      xmm7,xmm15 
-  movdqa      xmm10,xmm15 
-  pshufd      xmm13,xmm0,0 
-  punpcklbw   xmm9,xmm1 
-  punpckhbw   xmm14,xmm1 
-  movdqa      xmm6,xmm13 
-  movd        xmm0,eax 
-  movdqa      [rsp],xmm11 
-  mov         eax,2 
-  cwde             
-  punpckhbw   xmm11,xmm1 
-  punpckhbw   xmm10,xmm1 
-  punpcklbw   xmm7,xmm1 
-  punpcklwd   xmm0,xmm0 
-  punpcklbw   xmm8,xmm1 
-  pshufd      xmm3,xmm0,0 
-  movdqa      xmm1,xmm8 
-  movdqa      xmm0,xmm4 
-  psubw       xmm0,xmm9 
-  psubw       xmm1,xmm4 
-  movdqa      xmm2,xmm3 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm6,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,xmm3 
-  pcmpgtw     xmm2,xmm0 
-  pand        xmm6,xmm2 
-  movdqa      xmm0,xmm7 
-  movdqa      xmm2,xmm3 
-  psubw       xmm0,xmm9 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm1,xmm0 
-  pand        xmm6,xmm1 
-  movdqa      xmm0,xmm12 
-  movdqa      xmm1,xmm11 
-  psubw       xmm0,xmm14 
-  psubw       xmm1,xmm12 
-  movdqa      xmm5,xmm6 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm13,xmm0 
-  pabsw       xmm0,xmm1 
-  movdqa      xmm1,xmm8 
-  pcmpgtw     xmm2,xmm0 
-  paddw       xmm1,xmm8 
-  movdqa      xmm0,xmm10 
-  pand        xmm13,xmm2 
-  psubw       xmm0,xmm14 
-  paddw       xmm1,xmm4 
-  movdqa      xmm2,xmm11 
-  pabsw       xmm0,xmm0 
-  paddw       xmm2,xmm11 
-  paddw       xmm1,xmm7 
-  pcmpgtw     xmm3,xmm0 
-  paddw       xmm2,xmm12 
-  movd        xmm0,eax 
-  pand        xmm13,xmm3 
-  paddw       xmm2,xmm10 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm3,xmm0,0 
-  movdqa      xmm0,xmm6 
-  paddw       xmm1,xmm3 
-  pandn       xmm0,xmm4 
-  paddw       xmm2,xmm3 
-  psraw       xmm1,2 
-  pand        xmm5,xmm1 
-  por         xmm5,xmm0 
-  paddw       xmm7,xmm7 
-  paddw       xmm10,xmm10 
-  psraw       xmm2,2 
-  movdqa      xmm1,xmm13 
-  movdqa      xmm0,xmm13 
-  pandn       xmm0,xmm12 
-  pand        xmm1,xmm2 
-  paddw       xmm7,xmm9 
-  por         xmm1,xmm0 
-  paddw       xmm10,xmm14 
-  paddw       xmm7,xmm8 
-  movdqa      xmm0,xmm13 
-  packuswb    xmm5,xmm1 
-  paddw       xmm7,xmm3 
-  paddw       xmm10,xmm11 
-  movdqa      xmm1,xmm6 
-  paddw       xmm10,xmm3 
-  pandn       xmm6,xmm9 
-  psraw       xmm7,2 
-  pand        xmm1,xmm7 
-  psraw       xmm10,2 
-  pandn       xmm13,xmm14 
-  pand        xmm0,xmm10 
-  por         xmm1,xmm6 
-  movdqa      xmm6,[rsp] 
-  movdqa      xmm4,xmm6 
-  por         xmm0,xmm13 
-  punpcklbw   xmm4,xmm5 
-  punpckhbw   xmm6,xmm5 
-  movdqa      xmm3,xmm4 
-  packuswb    xmm1,xmm0 
-  movdqa      xmm0,xmm1 
-  punpckhbw   xmm1,xmm15 
-  punpcklbw   xmm0,xmm15 
-  punpcklwd   xmm3,xmm0 
-  punpckhwd   xmm4,xmm0 
-  movdqa      xmm0,xmm6 
-  movdqa      xmm2,xmm3 
-  punpcklwd   xmm0,xmm1 
-  punpckhwd   xmm6,xmm1 
-  movdqa      xmm1,xmm4 
-  punpckldq   xmm2,xmm0 
-  punpckhdq   xmm3,xmm0 
-  punpckldq   xmm1,xmm6 
-  movdqa      xmm0,xmm2 
-  punpcklqdq  xmm0,xmm1 
-  punpckhdq   xmm4,xmm6 
-  punpckhqdq  xmm2,xmm1 
-  movdqa      [rsp+10h],xmm0 
-  movdqa      [rsp+60h],xmm2 
-  movdqa      xmm0,xmm3 
-  mov         eax,[rsp+10h] 
-  mov         [rcx-2],eax 
-  mov         eax,[rsp+60h] 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm3,xmm4 
-  mov         [r10+rcx-2],eax 
-  movdqa      [rsp+20h],xmm0 
-  mov         eax, [rsp+20h] 
-  movdqa      [rsp+70h],xmm3 
-  mov         [rcx+r10*2-2],eax 
-  mov         eax,[rsp+70h] 
-  mov         [rdx+rcx-2],eax 
-  mov         eax,[rsp+18h] 
-  mov         [r11],eax 
-  mov         eax,[rsp+68h] 
-  mov         [r10+r11],eax 
-  mov         eax,[rsp+28h] 
-  mov         [r11+r10*2],eax 
-  mov         eax,[rsp+78h] 
-  mov         [rdx+r11],eax 
-  mov         eax,[rsp+14h] 
-  mov         [rdi-2],eax 
-  mov         eax,[rsp+64h] 
-  mov         [r10+rdi-2],eax 
-  mov         eax,[rsp+24h] 
-  mov         [rdi+r10*2-2],eax 
-  mov         eax, [rsp+74h] 
-  mov         [rdx+rdi-2],eax 
-  mov         eax, [rsp+1Ch] 
-  mov         [rbx],eax 
-  mov         eax, [rsp+6Ch] 
-  mov         [r10+rbx],eax 
-  mov         eax,[rsp+2Ch] 
-  mov         [rbx+r10*2],eax 
-  mov         eax,[rsp+7Ch] 
-  mov         [rdx+rbx],eax  
-  lea         r11,[rsp+140h] 
-  mov         rbx, [r11+28h]    
+  movdqa      xmm2,[rsp+20h]      ; xmm2 low dword = row 1 of the rcx block
+  mov         [rsp+40h],eax       ; stage row 3 of the rcx block (loaded just above)
+  mov         eax, [rdi-2]        ; row 0 of the rdi block
+  movdqa      xmm4,[rsp+30h]      ; xmm4 low dword = row 2 of the rcx block
+  mov         [rsp+50h],eax
+  mov         eax,[r10+rdi-2]     ; row 1 of the rdi block
+  movdqa      xmm3,[rsp+40h]      ; xmm3 low dword = row 3 of the rcx block
+  mov         [rsp+60h],eax
+  mov         eax,[rdi+r10*2-2]   ; row 2 of the rdi block
+  punpckldq   xmm5,[rsp+50h]      ; xmm5 low qword = { rcx row 0, rdi row 0 }
+  mov         [rsp+70h],eax
+  mov         eax, [rdx+rdi-2]    ; row 3 of the rdi block (rdx = 3*r8)
+  punpckldq   xmm2, [rsp+60h]     ; xmm2 low qword = { rcx row 1, rdi row 1 }
+  mov          [rsp+80h],eax
+  mov         eax,[r11]           ; row 4 of the rcx block
+  punpckldq   xmm4, [rsp+70h]     ; xmm4 low qword = { rcx row 2, rdi row 2 }
+  mov         [rsp+50h],eax
+  mov         eax,[rbx]           ; row 4 of the rdi block
+  punpckldq   xmm3,[rsp+80h]      ; xmm3 low qword = { rcx row 3, rdi row 3 }
+  mov         [rsp+60h],eax
+  mov         eax,[r10+r11]       ; row 5 of the rcx block
+  movdqa      xmm0, [rsp+50h]
+  punpckldq   xmm0, [rsp+60h]     ; xmm0 low qword = { rcx row 4, rdi row 4 }
+  punpcklqdq  xmm5,xmm0           ; xmm5 = { rcx r0, rdi r0, rcx r4, rdi r4 }
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax,[r10+rbx]       ; row 5 of the rdi block
+  movdqa      xmm0,[rsp+50h]
+  movdqa      xmm1,xmm5
+  mov         [rsp+60h],eax
+  mov         eax,[r11+r10*2]     ; row 6 of the rcx block
+  punpckldq   xmm0, [rsp+60h]     ; xmm0 low qword = { rcx row 5, rdi row 5 }
+  punpcklqdq  xmm2,xmm0           ; xmm2 = { rcx r1, rdi r1, rcx r5, rdi r5 }
+  punpcklbw   xmm1,xmm2           ; start of the transpose: interleave bytes of rows 0 and 1
+  punpckhbw   xmm5,xmm2           ; interleave bytes of rows 4 and 5
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax,[rbx+r10*2]     ; row 6 of the rdi block
+  movdqa      xmm0,[rsp+50h]
+  mov         [rsp+60h],eax
+  mov         eax, [rdx+r11]      ; row 7 of the rcx block
+  movdqa      xmm15,xmm1
+  punpckldq   xmm0,[rsp+60h]      ; xmm0 low qword = { rcx row 6, rdi row 6 }
+  punpcklqdq  xmm4,xmm0           ; xmm4 = { rcx r2, rdi r2, rcx r6, rdi r6 }
+  movdqa      [rsp+50h],xmm0
+  mov         [rsp+50h],eax
+  mov         eax, [rdx+rbx]      ; row 7 of the rdi block
+  movdqa      xmm0,[rsp+50h]
+  mov         [rsp+60h],eax
+  punpckldq   xmm0, [rsp+60h]     ; xmm0 low qword = { rcx row 7, rdi row 7 }
+  punpcklqdq  xmm3,xmm0           ; xmm3 = { rcx r3, rdi r3, rcx r7, rdi r7 }
+  movdqa      xmm0,xmm4
+  punpcklbw   xmm0,xmm3           ; interleave bytes of rows 2 and 3
+  punpckhbw   xmm4,xmm3           ; interleave bytes of rows 6 and 7
+  punpcklwd   xmm15,xmm0          ; rcx rows 0-3, one dword per byte column
+  punpckhwd   xmm1,xmm0           ; rdi rows 0-3, one dword per byte column
+  movdqa      xmm0,xmm5
+  movdqa      xmm12,xmm15
+  punpcklwd   xmm0,xmm4           ; rcx rows 4-7, one dword per byte column
+  punpckhwd   xmm5,xmm4           ; rdi rows 4-7, one dword per byte column
+  punpckldq   xmm12,xmm0          ; rcx block: byte columns 0 and 1, 8 rows each
+  punpckhdq   xmm15,xmm0          ; rcx block: byte columns 2 and 3, 8 rows each
+  movdqa      xmm0,xmm1
+  movdqa      xmm11,xmm12
+  punpckldq   xmm0,xmm5           ; rdi block: byte columns 0 and 1
+  punpckhdq   xmm1,xmm5           ; rdi block: byte columns 2 and 3
+  punpcklqdq  xmm11,xmm0          ; xmm11 = byte column 0 (offset -2): 8 rcx rows, then 8 rdi rows
+  punpckhqdq  xmm12,xmm0          ; xmm12 = byte column 1 (offset -1)
+  movsx       eax,r9w             ; eax = sign-extended low word of r9 (rcx on entry)
+  movdqa      xmm14,xmm15
+  punpcklqdq  xmm14,xmm1          ; xmm14 = byte column 2 (offset 0)
+  punpckhqdq  xmm15,xmm1          ; xmm15 = byte column 3 (offset +1); kept packed for the transpose back
+  pxor        xmm1,xmm1           ; xmm1 = 0, the zero operand for the byte->word unpacks
+  movd        xmm0,eax
+  movdqa      xmm4,xmm12
+  movdqa      xmm8,xmm11
+  mov         eax, ebp ; iBeta
+  punpcklwd   xmm0,xmm0
+  punpcklbw   xmm4,xmm1           ; xmm4  = column 1, low 8 bytes as words
+  punpckhbw   xmm12,xmm1          ; xmm12 = column 1, high 8 bytes as words
+  movdqa      xmm9,xmm14
+  movdqa      xmm7,xmm15
+  movdqa      xmm10,xmm15
+  pshufd      xmm13,xmm0,0        ; xmm13 = r9w threshold in all 8 words
+  punpcklbw   xmm9,xmm1           ; xmm9  = column 2, low 8 bytes as words
+  punpckhbw   xmm14,xmm1          ; xmm14 = column 2, high 8 bytes as words
+  movdqa      xmm6,xmm13
+  movd        xmm0,eax            ; xmm0 low dword = iBeta
+  movdqa      [rsp],xmm11         ; keep packed column 0 for the transpose back
+  mov         eax,2               ; rounding constant for the >>2 below
+  cwde                            ; sign-extends ax into eax; eax is still 2
+  punpckhbw   xmm11,xmm1          ; xmm11 = column 0, high 8 bytes as words
+  punpckhbw   xmm10,xmm1          ; xmm10 = column 3, high 8 bytes as words
+  punpcklbw   xmm7,xmm1           ; xmm7  = column 3, low 8 bytes as words
+  punpcklwd   xmm0,xmm0
+  punpcklbw   xmm8,xmm1           ; xmm8  = column 0, low 8 bytes as words
+  pshufd      xmm3,xmm0,0         ; xmm3 = iBeta in all 8 words
+  movdqa      xmm1,xmm8
+  movdqa      xmm0,xmm4
+  psubw       xmm0,xmm9           ; column 1 - column 2 (low half)
+  psubw       xmm1,xmm4           ; column 0 - column 1 (low half)
+  movdqa      xmm2,xmm3
+  pabsw       xmm0,xmm0           ; NOTE(review): pabsw is an SSSE3 instruction although the symbol is suffixed _sse2 - confirm the caller checks for SSSE3
+  pcmpgtw     xmm6,xmm0           ; xmm6 = threshold > |col1 - col2|
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm3
+  pcmpgtw     xmm2,xmm0           ; xmm2 = iBeta > |col0 - col1|
+  pand        xmm6,xmm2
+  movdqa      xmm0,xmm7
+  movdqa      xmm2,xmm3
+  psubw       xmm0,xmm9
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm1,xmm0           ; xmm1 = iBeta > |col3 - col2|
+  pand        xmm6,xmm1           ; xmm6 = per-word filter mask for the low half
+  movdqa      xmm0,xmm12
+  movdqa      xmm1,xmm11
+  psubw       xmm0,xmm14          ; same three tests, now on the high-half words
+  psubw       xmm1,xmm12
+  movdqa      xmm5,xmm6           ; xmm5 = copy of the low-half mask, consumed by the blend below
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm13,xmm0          ; xmm13 = threshold > |col1 - col2|
+  pabsw       xmm0,xmm1
+  movdqa      xmm1,xmm8
+  pcmpgtw     xmm2,xmm0           ; xmm2 = iBeta > |col0 - col1|
+  paddw       xmm1,xmm8           ; xmm1 = 2*col0 (low half)
+  movdqa      xmm0,xmm10
+  pand        xmm13,xmm2
+  psubw       xmm0,xmm14
+  paddw       xmm1,xmm4           ; xmm1 = 2*col0 + col1
+  movdqa      xmm2,xmm11
+  pabsw       xmm0,xmm0
+  paddw       xmm2,xmm11          ; xmm2 = 2*col0 (high half)
+  paddw       xmm1,xmm7           ; xmm1 = 2*col0 + col1 + col3
+  pcmpgtw     xmm3,xmm0           ; xmm3 = iBeta > |col3 - col2|
+  paddw       xmm2,xmm12
+  movd        xmm0,eax
+  pand        xmm13,xmm3          ; xmm13 = per-word filter mask for the high half
+  paddw       xmm2,xmm10          ; xmm2 = 2*col0 + col1 + col3 (high half)
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm3,xmm0,0         ; xmm3 = 2 in all 8 words
+  movdqa      xmm0,xmm6
+  paddw       xmm1,xmm3           ; add the rounding 2
+  pandn       xmm0,xmm4           ; original col1 words where the mask is clear
+  paddw       xmm2,xmm3
+  psraw       xmm1,2              ; xmm1 = (2*col0 + col1 + col3 + 2) >> 2
+  pand        xmm5,xmm1
+  por         xmm5,xmm0           ; xmm5 = blended new column 1, low half
+  paddw       xmm7,xmm7           ; xmm7 = 2*col3 (low half)
+  paddw       xmm10,xmm10         ; xmm10 = 2*col3 (high half)
+  psraw       xmm2,2
+  movdqa      xmm1,xmm13
+  movdqa      xmm0,xmm13
+  pandn       xmm0,xmm12
+  pand        xmm1,xmm2
+  paddw       xmm7,xmm9           ; xmm7 = 2*col3 + col2
+  por         xmm1,xmm0           ; xmm1 = blended new column 1, high half
+  paddw       xmm10,xmm14
+  paddw       xmm7,xmm8           ; xmm7 = 2*col3 + col2 + col0
+  movdqa      xmm0,xmm13
+  packuswb    xmm5,xmm1           ; xmm5 = new column 1 as 16 bytes (unsigned saturation)
+  paddw       xmm7,xmm3
+  paddw       xmm10,xmm11         ; xmm10 = 2*col3 + col2 + col0 (high half)
+  movdqa      xmm1,xmm6
+  paddw       xmm10,xmm3
+  pandn       xmm6,xmm9           ; original col2 words where the low mask is clear
+  psraw       xmm7,2              ; xmm7 = (2*col3 + col2 + col0 + 2) >> 2
+  pand        xmm1,xmm7
+  psraw       xmm10,2
+  pandn       xmm13,xmm14         ; original col2 words where the high mask is clear
+  pand        xmm0,xmm10
+  por         xmm1,xmm6           ; blended new column 2, low half
+  movdqa      xmm6,[rsp]          ; reload the packed, unmodified column 0
+  movdqa      xmm4,xmm6
+  por         xmm0,xmm13          ; blended new column 2, high half
+  punpcklbw   xmm4,xmm5           ; transpose back: (col0,col1) byte pairs for the rcx rows
+  punpckhbw   xmm6,xmm5           ; (col0,col1) byte pairs for the rdi rows
+  movdqa      xmm3,xmm4
+  packuswb    xmm1,xmm0           ; xmm1 = new column 2 as 16 bytes
+  movdqa      xmm0,xmm1
+  punpckhbw   xmm1,xmm15          ; (col2,col3) byte pairs for the rdi rows
+  punpcklbw   xmm0,xmm15          ; (col2,col3) byte pairs for the rcx rows
+  punpcklwd   xmm3,xmm0           ; rcx rows 0-3 as complete 4-byte rows
+  punpckhwd   xmm4,xmm0           ; rcx rows 4-7
+  movdqa      xmm0,xmm6
+  movdqa      xmm2,xmm3
+  punpcklwd   xmm0,xmm1           ; rdi rows 0-3
+  punpckhwd   xmm6,xmm1           ; rdi rows 4-7
+  movdqa      xmm1,xmm4
+  punpckldq   xmm2,xmm0           ; { rcx r0, rdi r0, rcx r1, rdi r1 }
+  punpckhdq   xmm3,xmm0           ; { rcx r2, rdi r2, rcx r3, rdi r3 }
+  punpckldq   xmm1,xmm6           ; { rcx r4, rdi r4, rcx r5, rdi r5 }
+  movdqa      xmm0,xmm2
+  punpcklqdq  xmm0,xmm1           ; { rcx r0, rdi r0, rcx r4, rdi r4 }
+  punpckhdq   xmm4,xmm6           ; { rcx r6, rdi r6, rcx r7, rdi r7 }
+  punpckhqdq  xmm2,xmm1           ; { rcx r1, rdi r1, rcx r5, rdi r5 }
+  movdqa      [rsp+10h],xmm0      ; spill rows 0/4 so they can be scattered as dwords
+  movdqa      [rsp+60h],xmm2      ; spill rows 1/5
+  movdqa      xmm0,xmm3
+  mov         eax,[rsp+10h]
+  mov         [rcx-2],eax         ; rcx row 0
+  mov         eax,[rsp+60h]
+  punpcklqdq  xmm0,xmm4           ; { rcx r2, rdi r2, rcx r6, rdi r6 }
+  punpckhqdq  xmm3,xmm4           ; { rcx r3, rdi r3, rcx r7, rdi r7 }
+  mov         [r10+rcx-2],eax     ; rcx row 1
+  movdqa      [rsp+20h],xmm0      ; spill rows 2/6
+  mov         eax, [rsp+20h]
+  movdqa      [rsp+70h],xmm3      ; spill rows 3/7
+  mov         [rcx+r10*2-2],eax   ; rcx row 2
+  mov         eax,[rsp+70h]
+  mov         [rdx+rcx-2],eax     ; rcx row 3
+  mov         eax,[rsp+18h]
+  mov         [r11],eax           ; rcx row 4
+  mov         eax,[rsp+68h]
+  mov         [r10+r11],eax       ; rcx row 5
+  mov         eax,[rsp+28h]
+  mov         [r11+r10*2],eax     ; rcx row 6
+  mov         eax,[rsp+78h]
+  mov         [rdx+r11],eax       ; rcx row 7
+  mov         eax,[rsp+14h]
+  mov         [rdi-2],eax         ; rdi row 0
+  mov         eax,[rsp+64h]
+  mov         [r10+rdi-2],eax     ; rdi row 1
+  mov         eax,[rsp+24h]
+  mov         [rdi+r10*2-2],eax   ; rdi row 2
+  mov         eax, [rsp+74h]
+  mov         [rdx+rdi-2],eax     ; rdi row 3
+  mov         eax, [rsp+1Ch]
+  mov         [rbx],eax           ; rdi row 4
+  mov         eax, [rsp+6Ch]
+  mov         [r10+rbx],eax       ; rdi row 5
+  mov         eax,[rsp+2Ch]
+  mov         [rbx+r10*2],eax     ; rdi row 6
+  mov         eax,[rsp+7Ch]
+  mov         [rdx+rbx],eax       ; rdi row 7
+  lea         r11,[rsp+140h]      ; r11 = rsp as it was before the sub rsp,140h
+  mov         rbx, [r11+28h]      ; NOTE(review): reads 0x28 above that point, i.e. past the three pushed registers and the return address; the lines after the final pops are outside this hunk, so confirm rbx is restored by a pop afterwards
   mov         rsp,r11
   pop         r12
   pop         rbp
@@ -2962,14 +2962,14 @@
 WELS_EXTERN DeblockChromaLt4H_sse2
 ALIGN  16
 DeblockChromaLt4H_sse2:
-  mov         rax,rsp 
-  push        rbx  
-  push        rbp  
-  push        r12  
+  mov         rax,rsp
+  push        rbx
+  push        rbp
+  push        r12
   push        r13
   push        r14
-  sub         rsp,170h  
-  
+  sub         rsp,170h
+
   mov         r13,   r8
   mov         r14,   r9
   mov         r8,    rdx
@@ -2977,275 +2977,275 @@
   mov         rdx,   rdi
   mov         rcx,   rsi
 
-  movsxd      rsi,r8d 
-  lea         eax,[r8*4] 
-  mov         r11d,r9d 
-  movsxd      r10,eax 
-  mov         eax, [rcx-2] 
-  mov         r12,rdx 
-  mov         [rsp+40h],eax 
-  mov         eax, [rsi+rcx-2] 
-  lea         rbx,[r10+rcx-2] 
-  movdqa      xmm5,[rsp+40h] 
-  mov         [rsp+50h],eax 
-  mov         eax, [rcx+rsi*2-2] 
-  lea         rbp,[r10+rdx-2] 
-  movdqa      xmm2, [rsp+50h] 
-  mov         [rsp+60h],eax 
-  lea         r10,[rsi+rsi*2] 
-  mov         rdi,rcx 
-  mov         eax,[r10+rcx-2] 
-  movdqa      xmm4,[rsp+60h] 
-  mov         [rsp+70h],eax 
-  mov         eax,[rdx-2] 
-  mov         [rsp+80h],eax 
-  mov         eax, [rsi+rdx-2] 
-  movdqa      xmm3,[rsp+70h] 
-  mov         [rsp+90h],eax 
-  mov         eax,[rdx+rsi*2-2] 
-  punpckldq   xmm5,[rsp+80h] 
-  mov         [rsp+0A0h],eax 
-  mov         eax, [r10+rdx-2] 
-  punpckldq   xmm2,[rsp+90h] 
-  mov         [rsp+0B0h],eax 
-  mov         eax, [rbx] 
-  punpckldq   xmm4,[rsp+0A0h] 
-  mov         [rsp+80h],eax 
-  mov         eax,[rbp] 
-  punpckldq   xmm3,[rsp+0B0h] 
-  mov         [rsp+90h],eax 
-  mov         eax,[rsi+rbx] 
-  movdqa      xmm0,[rsp+80h] 
-  punpckldq   xmm0,[rsp+90h] 
-  punpcklqdq  xmm5,xmm0 
-  movdqa      [rsp+80h],xmm0 
-  mov         [rsp+80h],eax 
-  mov         eax,[rsi+rbp] 
-  movdqa      xmm0,[rsp+80h] 
-  movdqa      xmm1,xmm5 
-  mov         [rsp+90h],eax 
-  mov         eax,[rbx+rsi*2] 
-  punpckldq   xmm0,[rsp+90h] 
-  punpcklqdq  xmm2,xmm0 
-  punpcklbw   xmm1,xmm2 
-  punpckhbw   xmm5,xmm2 
-  movdqa      [rsp+80h],xmm0 
-  mov         [rsp+80h],eax 
-  mov         eax,[rbp+rsi*2] 
-  movdqa      xmm0, [rsp+80h] 
-  mov         [rsp+90h],eax 
-  mov         eax,[r10+rbx] 
-  movdqa      xmm7,xmm1 
-  punpckldq   xmm0,[rsp+90h] 
-  punpcklqdq  xmm4,xmm0 
-  movdqa      [rsp+80h],xmm0 
-  mov         [rsp+80h],eax 
-  mov         eax, [r10+rbp] 
-  movdqa      xmm0,[rsp+80h] 
-  mov         [rsp+90h],eax 
-  punpckldq   xmm0,[rsp+90h] 
-  punpcklqdq  xmm3,xmm0 
-  movdqa      xmm0,xmm4 
-  punpcklbw   xmm0,xmm3 
-  punpckhbw   xmm4,xmm3 
-  punpcklwd   xmm7,xmm0 
-  punpckhwd   xmm1,xmm0 
-  movdqa      xmm0,xmm5 
-  movdqa      xmm6,xmm7 
-  punpcklwd   xmm0,xmm4 
-  punpckhwd   xmm5,xmm4 
-  punpckldq   xmm6,xmm0 
-  punpckhdq   xmm7,xmm0 
-  movdqa      xmm0,xmm1 
-  punpckldq   xmm0,xmm5 
+  movsxd      rsi,r8d
+  lea         eax,[r8*4]
+  mov         r11d,r9d
+  movsxd      r10,eax
+  mov         eax, [rcx-2]
+  mov         r12,rdx
+  mov         [rsp+40h],eax
+  mov         eax, [rsi+rcx-2]
+  lea         rbx,[r10+rcx-2]
+  movdqa      xmm5,[rsp+40h]
+  mov         [rsp+50h],eax
+  mov         eax, [rcx+rsi*2-2]
+  lea         rbp,[r10+rdx-2]
+  movdqa      xmm2, [rsp+50h]
+  mov         [rsp+60h],eax
+  lea         r10,[rsi+rsi*2]
+  mov         rdi,rcx
+  mov         eax,[r10+rcx-2]
+  movdqa      xmm4,[rsp+60h]
+  mov         [rsp+70h],eax
+  mov         eax,[rdx-2]
+  mov         [rsp+80h],eax
+  mov         eax, [rsi+rdx-2]
+  movdqa      xmm3,[rsp+70h]
+  mov         [rsp+90h],eax
+  mov         eax,[rdx+rsi*2-2]
+  punpckldq   xmm5,[rsp+80h]
+  mov         [rsp+0A0h],eax
+  mov         eax, [r10+rdx-2]
+  punpckldq   xmm2,[rsp+90h]
+  mov         [rsp+0B0h],eax
+  mov         eax, [rbx]
+  punpckldq   xmm4,[rsp+0A0h]
+  mov         [rsp+80h],eax
+  mov         eax,[rbp]
+  punpckldq   xmm3,[rsp+0B0h]
+  mov         [rsp+90h],eax
+  mov         eax,[rsi+rbx]
+  movdqa      xmm0,[rsp+80h]
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm5,xmm0
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax,[rsi+rbp]
+  movdqa      xmm0,[rsp+80h]
+  movdqa      xmm1,xmm5
+  mov         [rsp+90h],eax
+  mov         eax,[rbx+rsi*2]
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm2,xmm0
+  punpcklbw   xmm1,xmm2
+  punpckhbw   xmm5,xmm2
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax,[rbp+rsi*2]
+  movdqa      xmm0, [rsp+80h]
+  mov         [rsp+90h],eax
+  mov         eax,[r10+rbx]
+  movdqa      xmm7,xmm1
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm4,xmm0
+  movdqa      [rsp+80h],xmm0
+  mov         [rsp+80h],eax
+  mov         eax, [r10+rbp]
+  movdqa      xmm0,[rsp+80h]
+  mov         [rsp+90h],eax
+  punpckldq   xmm0,[rsp+90h]
+  punpcklqdq  xmm3,xmm0
+  movdqa      xmm0,xmm4
+  punpcklbw   xmm0,xmm3
+  punpckhbw   xmm4,xmm3
+  punpcklwd   xmm7,xmm0
+  punpckhwd   xmm1,xmm0
+  movdqa      xmm0,xmm5
+  movdqa      xmm6,xmm7
+  punpcklwd   xmm0,xmm4
+  punpckhwd   xmm5,xmm4
+  punpckldq   xmm6,xmm0
+  punpckhdq   xmm7,xmm0
+  movdqa      xmm0,xmm1
+  punpckldq   xmm0,xmm5
   mov         rax, r14    ; pTC
-  punpckhdq   xmm1,xmm5 
-  movdqa      xmm9,xmm6 
-  punpckhqdq  xmm6,xmm0 
-  punpcklqdq  xmm9,xmm0 
-  movdqa      xmm2,xmm7 
-  movdqa      xmm13,xmm6 
-  movdqa      xmm4,xmm9 
-  movdqa      [rsp+10h],xmm9 
-  punpcklqdq  xmm2,xmm1 
-  punpckhqdq  xmm7,xmm1 
-  pxor        xmm1,xmm1 
-  movsx       ecx,byte [rax+3] 
-  movsx       edx,byte [rax+2] 
-  movsx       r8d,byte [rax+1] 
-  movsx       r9d,byte [rax] 
-  movdqa      xmm10,xmm1 
-  movdqa      xmm15,xmm2 
-  punpckhbw   xmm2,xmm1 
-  punpckhbw   xmm6,xmm1 
-  punpcklbw   xmm4,xmm1 
-  movsx       eax,r11w 
-  mov         word [rsp+0Eh],cx 
-  mov         word [rsp+0Ch],cx 
-  movdqa      xmm3,xmm7 
-  movdqa      xmm8,xmm7 
-  movdqa      [rsp+20h],xmm7 
-  punpcklbw   xmm15,xmm1 
-  punpcklbw   xmm13,xmm1 
-  punpcklbw   xmm3,xmm1 
-  mov         word [rsp+0Ah],dx 
-  mov         word [rsp+8],dx 
-  mov         word [rsp+6],r8w 
-  movd        xmm0,eax 
-  movdqa      [rsp+30h],xmm6 
-  punpckhbw   xmm9,xmm1 
-  punpckhbw   xmm8,xmm1 
-  punpcklwd   xmm0,xmm0 
+  punpckhdq   xmm1,xmm5
+  movdqa      xmm9,xmm6
+  punpckhqdq  xmm6,xmm0
+  punpcklqdq  xmm9,xmm0
+  movdqa      xmm2,xmm7
+  movdqa      xmm13,xmm6
+  movdqa      xmm4,xmm9
+  movdqa      [rsp+10h],xmm9
+  punpcklqdq  xmm2,xmm1
+  punpckhqdq  xmm7,xmm1
+  pxor        xmm1,xmm1
+  movsx       ecx,byte [rax+3]
+  movsx       edx,byte [rax+2]
+  movsx       r8d,byte [rax+1]
+  movsx       r9d,byte [rax]
+  movdqa      xmm10,xmm1
+  movdqa      xmm15,xmm2
+  punpckhbw   xmm2,xmm1
+  punpckhbw   xmm6,xmm1
+  punpcklbw   xmm4,xmm1
+  movsx       eax,r11w
+  mov         word [rsp+0Eh],cx
+  mov         word [rsp+0Ch],cx
+  movdqa      xmm3,xmm7
+  movdqa      xmm8,xmm7
+  movdqa      [rsp+20h],xmm7
+  punpcklbw   xmm15,xmm1
+  punpcklbw   xmm13,xmm1
+  punpcklbw   xmm3,xmm1
+  mov         word [rsp+0Ah],dx
+  mov         word [rsp+8],dx
+  mov         word [rsp+6],r8w
+  movd        xmm0,eax
+  movdqa      [rsp+30h],xmm6
+  punpckhbw   xmm9,xmm1
+  punpckhbw   xmm8,xmm1
+  punpcklwd   xmm0,xmm0
   mov         eax, r13d   ; iBeta
-  mov         word [rsp+4],r8w 
-  mov         word [rsp+2],r9w 
-  pshufd      xmm12,xmm0,0 
-  mov         word [rsp],r9w 
-  movd        xmm0,eax 
-  mov         eax,4 
-  cwde             
-  movdqa      xmm14, [rsp] 
-  movdqa      [rsp],xmm2 
-  movdqa      xmm2,xmm12 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm11,xmm0,0 
-  psubw       xmm10,xmm14 
-  movd        xmm0,eax 
-  movdqa      xmm7,xmm14 
-  movdqa      xmm6,xmm14 
-  pcmpgtw     xmm7,xmm1 
-  punpcklwd   xmm0,xmm0 
-  pshufd      xmm5,xmm0,0 
-  movdqa      xmm0,xmm4 
-  movdqa      xmm1,xmm15 
-  psubw       xmm4,xmm13 
-  psubw       xmm0,xmm3 
-  psubw       xmm1,xmm13 
-  psubw       xmm3,xmm15 
-  psllw       xmm1,2 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm5 
-  movdqa      xmm0,xmm10 
-  psraw       xmm1,3 
-  pmaxsw      xmm0,xmm1 
-  pminsw      xmm6,xmm0 
-  movdqa      xmm1,xmm11 
-  movdqa      xmm0,xmm13 
-  psubw       xmm0,xmm15 
-  pabsw       xmm0,xmm0 
-  pcmpgtw     xmm2,xmm0 
-  pabsw       xmm0,xmm4 
-  pcmpgtw     xmm1,xmm0 
-  pabsw       xmm0,xmm3 
-  pand        xmm2,xmm1 
-  movdqa      xmm1,xmm11 
-  movdqa      xmm3,[rsp+30h] 
-  pcmpgtw     xmm1,xmm0 
-  movdqa      xmm0,xmm9 
-  pand        xmm2,xmm1 
-  psubw       xmm0,xmm8 
-  psubw       xmm9,xmm3 
-  pand        xmm2,xmm7 
-  pand        xmm6,xmm2 
-  psubw       xmm15,xmm6 
-  paddw       xmm13,xmm6 
-  movdqa      xmm2,[rsp] 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  psubw       xmm8,xmm2 
-  psllw       xmm1,2 
-  paddw       xmm1,xmm0 
-  paddw       xmm1,xmm5 
-  movdqa      xmm0,xmm3 
-  movdqa      xmm5,[rsp+10h] 
-  psubw       xmm0,xmm2 
-  psraw       xmm1,3 
-  movdqa      xmm4,xmm5 
-  pabsw       xmm0,xmm0 
-  pmaxsw      xmm10,xmm1 
-  movdqa      xmm1,xmm11 
-  pcmpgtw     xmm12,xmm0 
-  pabsw       xmm0,xmm9 
-  pminsw      xmm14,xmm10 
-  pcmpgtw     xmm1,xmm0 
-  pabsw       xmm0,xmm8 
-  pcmpgtw     xmm11,xmm0 
-  pand        xmm12,xmm1 
-  movdqa      xmm1,[rsp+20h] 
-  pand        xmm12,xmm11 
-  pand        xmm12,xmm7 
-  pand        xmm14,xmm12 
-  paddw       xmm3,xmm14 
-  psubw       xmm2,xmm14 
-  packuswb    xmm13,xmm3 
-  packuswb    xmm15,xmm2 
-  punpcklbw   xmm4,xmm13 
-  punpckhbw   xmm5,xmm13 
-  movdqa      xmm0,xmm15 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm15,xmm1 
-  movdqa      xmm3,xmm4 
-  punpcklwd   xmm3,xmm0 
-  punpckhwd   xmm4,xmm0 
-  movdqa      xmm0,xmm5 
-  movdqa      xmm2,xmm3 
-  movdqa      xmm1,xmm4 
-  punpcklwd   xmm0,xmm15 
-  punpckhwd   xmm5,xmm15 
-  punpckldq   xmm2,xmm0 
-  punpckhdq   xmm3,xmm0 
-  punpckldq   xmm1,xmm5 
-  movdqa      xmm0,xmm2 
-  punpcklqdq  xmm0,xmm1 
-  punpckhdq   xmm4,xmm5 
-  punpckhqdq  xmm2,xmm1 
-  movdqa      [rsp+40h],xmm0 
-  movdqa      xmm0,xmm3 
-  movdqa      [rsp+90h],xmm2 
-  mov         eax,[rsp+40h] 
-  mov         [rdi-2],eax 
-  mov         eax, [rsp+90h] 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm3,xmm4 
-  mov         [rsi+rdi-2],eax 
-  movdqa      [rsp+50h],xmm0 
-  mov         eax,[rsp+50h] 
-  movdqa      [rsp+0A0h],xmm3 
-  mov         [rdi+rsi*2-2],eax 
-  mov         eax,[rsp+0A0h] 
-  mov         [r10+rdi-2],eax 
-  mov         eax,[rsp+48h] 
-  mov         [rbx],eax 
-  mov         eax,[rsp+98h] 
-  mov         [rsi+rbx],eax 
-  mov         eax,[rsp+58h] 
-  mov         [rbx+rsi*2],eax 
-  mov         eax, [rsp+0A8h] 
-  mov         [r10+rbx],eax 
-  mov         eax, [rsp+44h] 
-  mov         [r12-2],eax 
-  mov         eax,[rsp+94h] 
-  mov         [rsi+r12-2],eax 
-  mov         eax,[rsp+54h] 
-  mov         [r12+rsi*2-2],eax 
-  mov         eax, [rsp+0A4h] 
-  mov         [r10+r12-2],eax 
-  mov         eax,[rsp+4Ch] 
-  mov         [rbp],eax 
-  mov         eax,[rsp+9Ch] 
-  mov         [rsi+rbp],eax 
-  mov         eax, [rsp+5Ch] 
-  mov         [rbp+rsi*2],eax 
-  mov         eax,[rsp+0ACh] 
-  mov         [r10+rbp],eax   
-  lea         r11,[rsp+170h]    
-  mov         rsp,r11 
+  mov         word [rsp+4],r8w
+  mov         word [rsp+2],r9w
+  pshufd      xmm12,xmm0,0
+  mov         word [rsp],r9w
+  movd        xmm0,eax
+  mov         eax,4
+  cwde
+  movdqa      xmm14, [rsp]
+  movdqa      [rsp],xmm2
+  movdqa      xmm2,xmm12
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm11,xmm0,0
+  psubw       xmm10,xmm14
+  movd        xmm0,eax
+  movdqa      xmm7,xmm14
+  movdqa      xmm6,xmm14
+  pcmpgtw     xmm7,xmm1
+  punpcklwd   xmm0,xmm0
+  pshufd      xmm5,xmm0,0
+  movdqa      xmm0,xmm4
+  movdqa      xmm1,xmm15
+  psubw       xmm4,xmm13
+  psubw       xmm0,xmm3
+  psubw       xmm1,xmm13
+  psubw       xmm3,xmm15
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm10
+  psraw       xmm1,3
+  pmaxsw      xmm0,xmm1
+  pminsw      xmm6,xmm0
+  movdqa      xmm1,xmm11
+  movdqa      xmm0,xmm13
+  psubw       xmm0,xmm15
+  pabsw       xmm0,xmm0
+  pcmpgtw     xmm2,xmm0
+  pabsw       xmm0,xmm4
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm3
+  pand        xmm2,xmm1
+  movdqa      xmm1,xmm11
+  movdqa      xmm3,[rsp+30h]
+  pcmpgtw     xmm1,xmm0
+  movdqa      xmm0,xmm9
+  pand        xmm2,xmm1
+  psubw       xmm0,xmm8
+  psubw       xmm9,xmm3
+  pand        xmm2,xmm7
+  pand        xmm6,xmm2
+  psubw       xmm15,xmm6
+  paddw       xmm13,xmm6
+  movdqa      xmm2,[rsp]
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  psubw       xmm8,xmm2
+  psllw       xmm1,2
+  paddw       xmm1,xmm0
+  paddw       xmm1,xmm5
+  movdqa      xmm0,xmm3
+  movdqa      xmm5,[rsp+10h]
+  psubw       xmm0,xmm2
+  psraw       xmm1,3
+  movdqa      xmm4,xmm5
+  pabsw       xmm0,xmm0
+  pmaxsw      xmm10,xmm1
+  movdqa      xmm1,xmm11
+  pcmpgtw     xmm12,xmm0
+  pabsw       xmm0,xmm9
+  pminsw      xmm14,xmm10
+  pcmpgtw     xmm1,xmm0
+  pabsw       xmm0,xmm8
+  pcmpgtw     xmm11,xmm0
+  pand        xmm12,xmm1
+  movdqa      xmm1,[rsp+20h]
+  pand        xmm12,xmm11
+  pand        xmm12,xmm7
+  pand        xmm14,xmm12
+  paddw       xmm3,xmm14
+  psubw       xmm2,xmm14
+  packuswb    xmm13,xmm3
+  packuswb    xmm15,xmm2
+  punpcklbw   xmm4,xmm13
+  punpckhbw   xmm5,xmm13
+  movdqa      xmm0,xmm15
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm15,xmm1
+  movdqa      xmm3,xmm4
+  punpcklwd   xmm3,xmm0
+  punpckhwd   xmm4,xmm0
+  movdqa      xmm0,xmm5
+  movdqa      xmm2,xmm3
+  movdqa      xmm1,xmm4
+  punpcklwd   xmm0,xmm15
+  punpckhwd   xmm5,xmm15
+  punpckldq   xmm2,xmm0
+  punpckhdq   xmm3,xmm0
+  punpckldq   xmm1,xmm5
+  movdqa      xmm0,xmm2
+  punpcklqdq  xmm0,xmm1
+  punpckhdq   xmm4,xmm5
+  punpckhqdq  xmm2,xmm1
+  movdqa      [rsp+40h],xmm0
+  movdqa      xmm0,xmm3
+  movdqa      [rsp+90h],xmm2
+  mov         eax,[rsp+40h]
+  mov         [rdi-2],eax
+  mov         eax, [rsp+90h]
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm3,xmm4
+  mov         [rsi+rdi-2],eax
+  movdqa      [rsp+50h],xmm0
+  mov         eax,[rsp+50h]
+  movdqa      [rsp+0A0h],xmm3
+  mov         [rdi+rsi*2-2],eax
+  mov         eax,[rsp+0A0h]
+  mov         [r10+rdi-2],eax
+  mov         eax,[rsp+48h]
+  mov         [rbx],eax
+  mov         eax,[rsp+98h]
+  mov         [rsi+rbx],eax
+  mov         eax,[rsp+58h]
+  mov         [rbx+rsi*2],eax
+  mov         eax, [rsp+0A8h]
+  mov         [r10+rbx],eax
+  mov         eax, [rsp+44h]
+  mov         [r12-2],eax
+  mov         eax,[rsp+94h]
+  mov         [rsi+r12-2],eax
+  mov         eax,[rsp+54h]
+  mov         [r12+rsi*2-2],eax
+  mov         eax, [rsp+0A4h]
+  mov         [r10+r12-2],eax
+  mov         eax,[rsp+4Ch]
+  mov         [rbp],eax
+  mov         eax,[rsp+9Ch]
+  mov         [rsi+rbp],eax
+  mov         eax, [rsp+5Ch]
+  mov         [rbp+rsi*2],eax
+  mov         eax,[rsp+0ACh]
+  mov         [r10+rbp],eax
+  lea         r11,[rsp+170h]
+  mov         rsp,r11
   pop         r14
   pop         r13
-  pop         r12  
-  pop         rbp  
-  pop         rbx  
-  ret 
+  pop         r12
+  pop         rbp
+  pop         rbx
+  ret
 
 
 
@@ -5162,7 +5162,7 @@
 	mov	esp, ebp
 	pop	ebp
 	ret
-    
+
 %endif
 
 
@@ -5178,16 +5178,16 @@
 ALIGN  16
 
 DeblockLumaTransposeH2V_sse2:
-    push     r3 
-    push     r4  
+    push     r3
+    push     r4
     push     r5
 
-%assign   push_num   3 
-    LOAD_3_PARA    
+%assign   push_num   3
+    LOAD_3_PARA
 
     SIGN_EXTENTION   r1, r1d
 
-    mov      r5,    r7 
+    mov      r5,    r7
     mov      r3,    r7
     and      r3,    0Fh
     sub      r7,    r3
@@ -5229,7 +5229,7 @@
 
     SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r7]
     ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-   
+
     movdqa  [r2],    xmm4
     movdqa  [r2 + 10h],  xmm2
     movdqa  [r2 + 20h],  xmm3
@@ -5258,17 +5258,17 @@
 
 DeblockLumaTransposeV2H_sse2:
     push     r3
-    push     r4 
+    push     r4
 
 %assign  push_num 2
     LOAD_3_PARA
 
-    SIGN_EXTENTION   r1, r1d 
+    SIGN_EXTENTION   r1, r1d
 
     mov      r4,    r7
-    mov      r3,    r7 
+    mov      r3,    r7
     and      r3,    0Fh
-    sub      r7,    r3 
+    sub      r7,    r3
     sub      r7,    10h
 
     movdqa   xmm0,   [r2]
--- a/codec/common/expand_picture.asm
+++ b/codec/common/expand_picture.asm
@@ -244,7 +244,7 @@
 %macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
     ;r6 [height]
     ;r0 [pSrc+0]  r5[pSrc-32] r1[stride]
-    ;r3 [pSrc+(w-1)] r4[pSrc+w] 
+    ;r3 [pSrc+(w-1)] r4[pSrc+w]
 
 %if %1 == 32		; for luma
 .left_right_loops:
@@ -375,13 +375,13 @@
 
     %assign push_num 3
     LOAD_4_PARA
-    
+
     SIGN_EXTENTION r1, r1d
     SIGN_EXTENTION r2, r2d
     SIGN_EXTENTION r3, r3d
 
     ;also prepare for cross border pData top-left:xmm3
-    
+
     movzx r6d,byte[r0]
     SSE2_Copy16Times xmm3,r6d         ;xmm3: pSrc[0]
 
@@ -395,22 +395,22 @@
     dec r3                      ;h-1
     imul r3,r1                  ;(h-1)*stride
     lea  r3,[r0+r3]             ;pSrc[(h-1)*stride]  r3 = src bottom
-    
+
     mov r6,r1                    ;r6 = stride
     sal r6,05h                   ;r6 = 32*stride
     lea r4,[r3+r6]               ;r4 = dst bottom
-    
+
     ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
-    
+
     movzx r6d,byte [r3]             ;bottom-left
     SSE2_Copy16Times xmm5,r6d
-    
+
     lea r6,[r3+r2-1]
     movzx r6d,byte [r6]
     SSE2_Copy16Times xmm6,r6d ;bottom-right
-    
+
     neg r1  ;r1 = -stride
-    
+
     push r0
     push r1
     push r2
@@ -419,13 +419,13 @@
 
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    
+
     pop r2
     pop r1
     pop r0
 
     lea r5,[r0-32]                          ;left border dst  luma =32 chroma = -16
-    
+
     lea r3,[r0+r2-1]                        ;right border src
     lea r4,[r3+1]                           ;right border dst
 
@@ -432,7 +432,7 @@
     ;prepare for cross border data: top-rigth with xmm4
      movzx r6d,byte [r3]                         ;top -rigth
      SSE2_Copy16Times xmm4,r6d
-    
+
     neg r1   ;r1 = stride
 
 
@@ -444,7 +444,7 @@
     push r1
     push r2
     push r6
-    
+
     exp_left_right_sse2  32,a
 
     pop r6
@@ -455,22 +455,22 @@
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
-    
+
     neg r1  ;r1 = -stride
     lea r3,[r0-32]
     lea r3,[r3+r1]    ;last line of top-left border
-    
+
     lea r4,[r0+r2]    ;psrc +width
     lea r4,[r4+r1]    ;psrc +width -stride
-    
-    
+
+
     neg r1  ;r1 = stride
     add r6,32         ;height +32(16) ,luma = 32, chroma = 16
     imul r6,r1
-    
+
     lea r5,[r3+r6]    ;last line of bottom-left border
     lea r6,[r4+r6]    ;last line of botoom-right border
-    
+
     neg r1 ; r1 = -stride
 
     ; for left & right border expanding
@@ -477,11 +477,11 @@
     exp_cross_sse2 32,a
 
     LOAD_4_PARA_POP
-    
+
     pop r6
     pop r5
     pop r4
-    
+
     %assign push_num 0
 
 
@@ -495,7 +495,7 @@
 ;										const int32_t iHeight	);
 ;***********************************************************************----------------
 ExpandPictureChromaAlign_sse2:
-	
+
     push r4
     push r5
     push r6
@@ -508,7 +508,7 @@
     SIGN_EXTENTION r3,r3d
 
     ;also prepare for cross border pData top-left:xmm3
-    
+
     movzx r6d,byte [r0]
     SSE2_Copy16Times xmm3,r6d         ;xmm3: pSrc[0]
 
@@ -522,24 +522,24 @@
     dec r3                      ;h-1
     imul r3,r1                  ;(h-1)*stride
     lea  r3,[r0+r3]             ;pSrc[(h-1)*stride]  r3 = src bottom
-    
+
     mov r6,r1                    ;r6 = stride
     sal r6,04h                   ;r6 = 32*stride
-    lea r4,[r3+r6]               ;r4 = dst bottom 
-    
+    lea r4,[r3+r6]               ;r4 = dst bottom
+
     ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
-    
+
     movzx r6d,byte [r3]             ;bottom-left
     SSE2_Copy16Times xmm5,r6d
-    
+
     lea r6,[r3+r2-1]
     movzx r6d,byte [r6]
     SSE2_Copy16Times xmm6,r6d ;bottom-right
-    
+
     neg r1  ;r1 = -stride
-    
+
     push r0
-    push r1 
+    push r1
     push r2
 
     exp_top_bottom_sse2 16
@@ -546,20 +546,20 @@
 
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    
+
     pop r2
     pop r1
     pop r0
 
     lea r5,[r0-16]                          ;left border dst  luma =32 chroma = -16
-    
-    lea r3,[r0+r2-1]                        ;right border src 
+
+    lea r3,[r0+r2-1]                        ;right border src
     lea r4,[r3+1]                           ;right border dst
 
     ;prepare for cross border data: top-rigth with xmm4
     movzx r6d,byte [r3]                         ;top -rigth
     SSE2_Copy16Times xmm4,r6d
-    
+
     neg r1   ;r1 = stride
 
 
@@ -568,7 +568,7 @@
 
 
     push r0
-    push r1 
+    push r1
     push r2
 	push r6
     exp_left_right_sse2 16,a
@@ -581,22 +581,22 @@
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
-    
+
     neg r1  ;r1 = -stride
     lea r3,[r0-16]
     lea r3,[r3+r1]    ;last line of top-left border
-    
+
     lea r4,[r0+r2]    ;psrc +width
-    lea r4,[r4+r1]    ;psrc +width -stride  
-    
-    
+    lea r4,[r4+r1]    ;psrc +width -stride
+
+
     neg r1  ;r1 = stride
     add r6,16         ;height +32(16) ,luma = 32, chroma = 16
     imul r6,r1
-    
+
     lea r5,[r3+r6]    ;last line of bottom-left border
     lea r6,[r4+r6]    ;last line of botoom-right border
-    
+
     neg r1 ; r1 = -stride
 
     ; for left & right border expanding
@@ -603,11 +603,11 @@
     exp_cross_sse2 16,a
 
     LOAD_4_PARA_POP
-    
+
     pop r6
     pop r5
     pop r4
-    
+
     %assign push_num 0
 
 
@@ -633,7 +633,7 @@
     SIGN_EXTENTION r3,r3d
 
     ;also prepare for cross border pData top-left:xmm3
-    
+
     movzx r6d,byte [r0]
     SSE2_Copy16Times xmm3,r6d         ;xmm3: pSrc[0]
 
@@ -647,24 +647,24 @@
     dec r3                      ;h-1
     imul r3,r1                  ;(h-1)*stride
     lea  r3,[r0+r3]             ;pSrc[(h-1)*stride]  r3 = src bottom
-    
+
     mov r6,r1                    ;r6 = stride
     sal r6,04h                   ;r6 = 32*stride
-    lea r4,[r3+r6]               ;r4 = dst bottom 
-    
+    lea r4,[r3+r6]               ;r4 = dst bottom
+
     ;also prepare for cross border data: bottom-left with xmm5,bottom-right xmm6
-    
+
     movzx r6d,byte [r3]             ;bottom-left
     SSE2_Copy16Times xmm5,r6d
-    
+
     lea r6,[r3+r2-1]
     movzx r6d,byte [r6]
     SSE2_Copy16Times xmm6,r6d ;bottom-right
-    
+
     neg r1  ;r1 = -stride
-    
+
     push r0
-    push r1 
+    push r1
     push r2
 
     exp_top_bottom_sse2 16
@@ -671,20 +671,20 @@
 
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-    
+
     pop r2
     pop r1
     pop r0
 
     lea r5,[r0-16]                          ;left border dst  luma =32 chroma = -16
-    
-    lea r3,[r0+r2-1]                        ;right border src 
+
+    lea r3,[r0+r2-1]                        ;right border src
     lea r4,[r3+1]                           ;right border dst
 
     ;prepare for cross border data: top-rigth with xmm4
     movzx r6d,byte [r3]                         ;top -rigth
     SSE2_Copy16Times xmm4,r6d
-    
+
     neg r1   ;r1 = stride
 
 
@@ -693,7 +693,7 @@
 
 
     push r0
-    push r1 
+    push r1
     push r2
 	push r6
     exp_left_right_sse2 16,u
@@ -706,22 +706,22 @@
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..
-    
+
     neg r1  ;r1 = -stride
     lea r3,[r0-16]
     lea r3,[r3+r1]    ;last line of top-left border
-    
+
     lea r4,[r0+r2]    ;psrc +width
-    lea r4,[r4+r1]    ;psrc +width -stride  
-    
-    
+    lea r4,[r4+r1]    ;psrc +width -stride
+
+
     neg r1  ;r1 = stride
     add r6,16         ;height +32(16) ,luma = 32, chroma = 16
     imul r6,r1
-    
+
     lea r5,[r3+r6]    ;last line of bottom-left border
     lea r6,[r4+r6]    ;last line of botoom-right border
-    
+
     neg r1 ; r1 = -stride
 
     ; for left & right border expanding
@@ -728,13 +728,12 @@
     exp_cross_sse2 16,u
 
     LOAD_4_PARA_POP
-    
+
     pop r6
     pop r5
     pop r4
-    
+
     %assign push_num 0
 
 
 	ret
-    
\ No newline at end of file
--- a/codec/common/mb_copy.asm
+++ b/codec/common/mb_copy.asm
@@ -121,7 +121,7 @@
 	movdqa [r0+r4], xmm7
 	LOAD_4_PARA_POP
 	pop r5
-	pop r4	
+	pop r4
 	ret
 
 ;***********************************************************************
@@ -141,7 +141,7 @@
 	;mov eax, [esp+20]	; iStrideD
 	;mov esi, [esp+24]	; Src
 	;mov ecx, [esp+28]	; iStrideS
-	
+
 	push r4
 	push r5
 	%assign  push_num 2
@@ -213,7 +213,7 @@
 	;mov eax, [esp+20]	; iStrideD
 	;mov esi, [esp+24]	; Src
 	;mov ecx, [esp+28]	; iStrideS
-	
+
 	push r4
 	push r5
 	%assign  push_num 2
@@ -243,7 +243,7 @@
 	movdqa [r0+r4], xmm7
 	LOAD_4_PARA_POP
 	pop r5
-	pop r4	
+	pop r4
 	ret
 
 
@@ -261,7 +261,7 @@
 	;mov ecx, [esp + 12]           ;iStrideD
 	;mov ebx, [esp + 16]           ;Src
 	;mov edx, [esp + 20]           ;iStrideS
-	
+
 	%assign  push_num 0
     LOAD_4_PARA
 
@@ -277,7 +277,7 @@
 	movq mm6, [r2]
 	movq mm7, [r2+r3]
 	lea r2, [r2+2*r3]
-	
+
 	movq [r0], mm0
 	movq [r0+r1], mm1
 	lea r0, [r0+2*r1]
@@ -302,7 +302,7 @@
 	lea r2, [r2+2*r3]
 	movq mm6, [r2]
 	movq mm7, [r2+r3]
-	
+
 	movq [r0], mm0
 	movq [r0+r1], mm1
 	lea r0, [r0+2*r1]
@@ -333,7 +333,7 @@
 	;mov ecx, [esp + 16]           ;iStrideD
 	;mov esi, [esp + 20]           ;Src
 	;mov ebx, [esp + 24]           ;iStrideS
-	
+
 	push r4
 	%assign  push_num 1
     LOAD_4_PARA
@@ -374,7 +374,7 @@
 
 	WELSEMMS
 	;pop esi
-	;pop ebx	
+	;pop ebx
 	LOAD_4_PARA_POP
 	pop r4
 	ret
@@ -388,7 +388,7 @@
 
     %assign  push_num 0
     LOAD_2_PARA
-    
+
 	;mov eax, [esp+4]	; mv_buffer
 	;movd xmm0, [esp+8]	; _mv
 	movd xmm0, r1d	; _mv
@@ -438,7 +438,7 @@
 ;                           int iHeight );
 ;*******************************************************************************
 PixelAvgWidthEq4_mmx:
- 
+
     %assign  push_num 0
     LOAD_7_PARA
 
@@ -487,7 +487,7 @@
     ;mov         ebp, [esp+36]       ; pSrcB
     ;mov         edx, [esp+40]       ; iSrcBStride
     ;mov         ebx, [esp+44]       ; iHeight
-    
+
     %assign  push_num 0
     LOAD_7_PARA
 
@@ -497,7 +497,7 @@
 	movsx	r5, r5d
 	movsx	r6, r6d
 %endif
-    
+
 ALIGN 4
 .height_loop:
 	movq        mm0, [r2]
@@ -528,7 +528,7 @@
 ;                          int iHeight );
 ;*******************************************************************************
 PixelAvgWidthEq16_sse2:
-        
+
     %assign  push_num 0
     LOAD_7_PARA
 %ifndef X86_32
@@ -567,7 +567,7 @@
     lea         r2, [r2+2*r3]
     lea			r4, [r4+2*r5]
     lea			r0, [r0+2*r1]
-    
+
     sub         r6, 4
     jne         .height_loop
 
@@ -591,17 +591,17 @@
     ;mov edi,  [esp+24]
     ;mov ecx,  [esp+28]
     ;mov edx,  [esp+32]
-    
+
     push	r5
     %assign  push_num 1
     LOAD_5_PARA
-   
+
 %ifndef X86_32
 	movsx	r1, r1d
 	movsx	r3, r3d
 	movsx	r4, r4d
 %endif
-    
+
 ALIGN 4
 .height_loop:
 	mov r5d, [r0]
@@ -629,7 +629,7 @@
 	;mov edi, [esp+20]
 	;mov ecx, [esp+24]
 	;mov edx, [esp+28]
-	
+
     %assign  push_num 0
     LOAD_5_PARA
 
--- a/codec/common/mc_chroma.asm
+++ b/codec/common/mc_chroma.asm
@@ -79,17 +79,17 @@
 	;push esi
 	;push edi
 	;push ebx
-	
+
 	%assign  push_num 0
-	LOAD_6_PARA 
+	LOAD_6_PARA
 %ifndef X86_32
 	movsx	r1, r1d
 	movsx	r3, r3d
 	movsx	r5, r5d
 %endif
-	
+
 	;mov eax, [esp +12 + 20]
-	
+
 	movd mm3, [r4];	[eax]
 	WELS_Zero mm7
 	punpcklbw mm3, mm3
@@ -173,7 +173,7 @@
 	;push ebx
 
 	%assign  push_num 0
-	LOAD_6_PARA 	
+	LOAD_6_PARA
 %ifndef X86_32
 	movsx	r1, r1d
 	movsx	r3, r3d
@@ -241,7 +241,7 @@
 
 	dec r5
 	jnz near .xloop
-	
+
 	LOAD_6_PARA_POP
 
 	;pop ebx
@@ -273,7 +273,7 @@
 	movsx	r3, r3d
 	movsx	r5, r5d
 %endif
-	
+
 	;mov eax, [esp + 12 + 20]
 
     pxor      xmm7, xmm7
@@ -333,9 +333,9 @@
 
 	sub r5, 2
 	jnz .hloop_chroma
-	
+
 	LOAD_6_PARA_POP
-	
+
 	;pop edi
 	;pop esi
 	;pop ebx
--- a/codec/common/mc_luma.asm
+++ b/codec/common/mc_luma.asm
@@ -56,7 +56,7 @@
 
 ALIGN 16
 h264_w0x10:
-	dw 16, 16, 16, 16	
+	dw 16, 16, 16, 16
 ALIGN 16
 h264_w0x10_1:
 	dw 16, 16, 16, 16, 16, 16, 16, 16
@@ -91,7 +91,7 @@
 	;mov edi, [esp+20]
 	;mov ecx, [esp+24]
 	;mov edx, [esp+28]
-	
+
     %assign  push_num 0
     LOAD_5_PARA
 %ifndef X86_32
@@ -99,7 +99,7 @@
 	movsx	r3, r3d
 	movsx	r4, r4d
 %endif
-    
+
 	sub r0, 2
 	WELS_Zero mm7
 	movq mm6, [h264_w0x10]
@@ -195,7 +195,7 @@
 	;mov edi, [esp+24]		;pDst
 	;mov edx, [esp+28]	;iDstStride
 	;mov ebx, [esp+32]	;iHeight
-	
+
 	%assign  push_num 0
     LOAD_5_PARA
 %ifndef X86_32
@@ -257,7 +257,7 @@
 	;mov edi, [esp + 20]         ;pDst
 	;mov ecx, [esp + 28]         ;iHeight
 	;mov edx, [esp + 24]			;iDstStride
-	
+
 	%assign  push_num 0
     LOAD_5_PARA
 %ifndef X86_32
@@ -322,7 +322,7 @@
 	;mov edi, [esp + 20]         ;pDst
 	;mov ecx, [esp + 28]         ;iHeight
 	;mov edx, [esp + 24]			;iDstStride
-	
+
 	%assign  push_num 0
     LOAD_5_PARA
 %ifndef X86_32
@@ -392,7 +392,7 @@
 	lea r0, [r0+r1]
 	dec r4
 	jnz near .y_loop
-	
+
 	LOAD_5_PARA_POP
 	ret
 
@@ -523,7 +523,7 @@
 	;mov eax, [esp + 28]
 	;mov ecx, [esp + 36]
 	;mov ebx, [esp + 32]
-	
+
 	%assign  push_num 0
     LOAD_6_PARA
 %ifndef X86_32
@@ -532,7 +532,7 @@
 	movsx	r4, r4d
 	movsx	r5, r5d
 %endif
- 
+
 %ifndef X86_32
 	push r12
 	push r13
@@ -541,7 +541,7 @@
 	mov	 r13, r2
 	mov	 r14, r5
 %endif
-    
+
 	shr r4, 3
 	sub r0, r1
 	sub r0, r1
@@ -639,7 +639,7 @@
 	sub r0, r1
 	sub r0, r1
 	add r0, 8
-	add r2, 8	
+	add r2, 8
 	jmp near .xloop
 
 .xx_exit:
@@ -680,7 +680,7 @@
 	movsx	r3, r3d
 	movsx	r4, r4d
 	movsx	r5, r5d
-%endif    
+%endif
 	sub r0, 2
 	pxor xmm7, xmm7
 
@@ -842,7 +842,7 @@
 	;mov edx, [esp+28]
 	;mov ecx, [esp+32]
 	;mov ebx, [esp+36]
-	
+
 	%assign  push_num 0
     LOAD_6_PARA
 %ifndef X86_32
@@ -850,7 +850,7 @@
 	movsx	r3, r3d
 	movsx	r4, r4d
 	movsx	r5, r5d
-%endif    
+%endif
 	pxor xmm7, xmm7
 	sub r0, r1				;;;;;;;;need more 5 lines.
 	sub r0, r1
@@ -1023,7 +1023,7 @@
 	;mov edx, [esp+32]
 	;mov ebx, [esp+36]
 	;mov ecx, [esp+40]
-	
+
 	%assign  push_num 0
     LOAD_6_PARA
 %ifndef X86_32
--- a/codec/common/vaa.asm
+++ b/codec/common/vaa.asm
@@ -160,7 +160,7 @@
 AnalysisVaaInfoIntra_sse2:
 
     %assign push_num 0
-    LOAD_2_PARA 
+    LOAD_2_PARA
     SIGN_EXTENTION r1,r1d
 
 %ifdef X86_32
@@ -175,16 +175,16 @@
     and  r5,0fh
     sub  r7,r5
     sub  r7,32
-    
-    
-    mov r2,r1    
+
+
+    mov r2,r1
     sal r2,$1   ;r2 = 2*iLineSize
     mov r3,r2
     add r3,r1   ;r3 = 3*iLineSize
-    
+
     mov r4,r2
     sal r4,$1   ;r4 = 4*iLineSize
-    
+
 	pxor xmm7, xmm7
 
 	; loops
@@ -225,8 +225,8 @@
 	pshufd xmm2, xmm1, 0B1h
 	paddd xmm1, xmm2
 
-	
-	
+
+
 	movd r2d, xmm0
 	and r2, 0ffffh		; effective low work truncated
 	mov r3, r2
@@ -234,7 +234,7 @@
 	sar r2, $4
 	movd retrd, xmm1
 	sub retrd, r2d
-	
+
 	add r7,32
 	add r7,r5
 
@@ -244,7 +244,7 @@
 	pop r4
 	pop r3
 %endif
-	
+
 	ret
 
 WELS_EXTERN AnalysisVaaInfoIntra_ssse3
@@ -255,7 +255,7 @@
 AnalysisVaaInfoIntra_ssse3:
 
     %assign push_num 0
-    LOAD_2_PARA 
+    LOAD_2_PARA
     SIGN_EXTENTION r1,r1d
 
 %ifdef X86_32
@@ -265,41 +265,41 @@
     push r6
     %assign push_num push_num+4
 %endif
-   
+
     mov  r5,r7
     and  r5,0fh
     sub  r7,r5
     sub  r7,32
-    
 
-    mov r2,r1    
+
+    mov r2,r1
     sal r2,$1   ;r2 = 2*iLineSize
     mov r3,r2
     add r3,r1   ;r3 = 3*iLineSize
-    
+
     mov r4,r2
     sal r4,$1   ;r4 = 4*iLineSize
-     
+
 	pxor xmm7, xmm7
 
 	; loops
 	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
     movq [r7],xmm0
-    
+
 	lea r0,[r0+r4]
 	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
     movq [r7+8],xmm1
-    
-    
+
+
 	lea r0,[r0+r4]
 	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
     movq [r7+16],xmm0
-    
+
 	lea r0,[r0+r4]
 	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
     movq [r7+24],xmm1
-    
-    
+
+
 	movdqa xmm0,[r7]
 	movdqa xmm1,[r7+16]
 	movdqa xmm2, xmm0
@@ -322,7 +322,7 @@
 	pshufd xmm2, xmm1, 0B1h
 	paddd xmm1, xmm2
 
-    
+
     movd r2d, xmm0
     and r2, 0ffffh          ; effective low work truncated
     mov r3, r2
@@ -339,7 +339,7 @@
 	pop r4
 	pop r3
 %endif
-	
+
 	ret
 
 WELS_EXTERN MdInterAnalysisVaaInfo_sse41
@@ -368,7 +368,7 @@
 	paddd xmm3, xmm4
 	movd r0d, xmm3
 	cmp r0d, 20	; INTER_VARIANCE_SAD_THRESHOLD
-	
+
 	jb near .threshold_exit
 	pshufd xmm0, xmm0, 01Bh
 	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
@@ -412,7 +412,7 @@
 	paddd xmm4, xmm5
 	pshufd xmm5, xmm4, 0B1h
 	paddd xmm5, xmm4
-	
+
 	movd r0d, xmm5
 	cmp r0d, 20	; INTER_VARIANCE_SAD_THRESHOLD
 	jb near .threshold_exit
--- a/codec/decoder/core/asm/intra_pred.asm
+++ b/codec/decoder/core/asm/intra_pred.asm
@@ -477,7 +477,7 @@
 		SSE2_Copy8Times	xmm4, r2d		; mm4 = c,c,c,c,c,c,c,c
 
 		;mov		esi,	[esp + pushsize + 4]
-		mov 	r0, r4 
+		mov 	r0, r4
 		add		r3,	16
 		imul	r2,	-3
 		add		r3,	r2				; s = a + 16 + (-3)*c
--- a/codec/encoder/core/asm/dct.asm
+++ b/codec/encoder/core/asm/dct.asm
@@ -186,7 +186,7 @@
 	movsx r1, r1d
 	movsx r3, r3d
 	%endif
-;	mov     eax, [pDct   ] 
+;	mov     eax, [pDct   ]
     movq    mm0, [r4+ 0]
     movq    mm1, [r4+ 8]
     movq    mm2, [r4+16]
--- a/codec/encoder/core/asm/memzero.asm
+++ b/codec/encoder/core/asm/memzero.asm
@@ -32,8 +32,8 @@
 ;*  memzero.asm
 ;*
 ;*  Abstract
-;*      
 ;*
+;*
 ;*  History
 ;*      9/16/2009 Created
 ;*
@@ -45,8 +45,8 @@
 ; Code
 ;***********************************************************************
 
-SECTION .text			
-		
+SECTION .text
+
 ALIGN 16
 ;***********************************************************************
 ;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
@@ -57,7 +57,7 @@
 	LOAD_1_PARA
 	;mov  eax,[esp+4]
 	prefetchnta [r0]
-	ret 			
+	ret
 
 
 ALIGN 16
@@ -71,7 +71,7 @@
 		LOAD_2_PARA
 		SIGN_EXTENTION r1, r1d
 		neg		r1
-			
+
 		pxor	xmm0,		xmm0
 .memzeroa64_sse2_loops:
 		movdqa	[r0],		xmm0
@@ -79,12 +79,12 @@
 		movdqa	[r0+32],	xmm0
 		movdqa	[r0+48],	xmm0
 		add		r0, 0x40
-		
+
 		add r1, 0x40
 		jnz near .memzeroa64_sse2_loops
-			
-		ret	
 
+		ret
+
 ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -96,7 +96,7 @@
 		LOAD_2_PARA
 		SIGN_EXTENTION r1, r1d
 		neg		r1
-			
+
 		pxor	mm0,		mm0
 .memzero64_mmx_loops:
 		movq	[r0],		mm0
@@ -106,16 +106,16 @@
 		movq	[r0+32],	mm0
 		movq	[r0+40],	mm0
 		movq	[r0+48],	mm0
-		movq	[r0+56],	mm0		
+		movq	[r0+56],	mm0
 		add		r0,		0x40
-		
+
 		add r1, 0x40
 		jnz near .memzero64_mmx_loops
-			
-		WELSEMMS	
-		ret	
-	
-ALIGN 16		
+
+		WELSEMMS
+		ret
+
+ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
 ;***********************************************************************
@@ -125,17 +125,17 @@
 		%assign  push_num 0
 		LOAD_2_PARA
 		SIGN_EXTENTION r1, r1d
-		neg		r1			
+		neg		r1
 		pxor	mm0,		mm0
-		
+
 .memzero8_mmx_loops:
 		movq	[r0],		mm0
 		add		r0,		0x08
-	
+
 		add		r1,		0x08
 		jnz near .memzero8_mmx_loops
-		
-		WELSEMMS	
-		ret	
 
-							
+		WELSEMMS
+		ret
+
+
--- a/codec/encoder/core/asm/satd_sad.asm
+++ b/codec/encoder/core/asm/satd_sad.asm
@@ -163,7 +163,7 @@
 	;mov       ebx,  [esp+12]
 	;mov       ecx,  [esp+16]
 	;mov       edx,  [esp+20]
-	
+
 	%assign  push_num 0
 	LOAD_4_PARA
 	SIGN_EXTENTION r1, r1d
@@ -243,11 +243,11 @@
 	 ;mov    ebx,    [esp+12]
 	 ;mov    ecx,    [esp+16]
 	 ;mov    edx,    [esp+20]
-	 
+
 	%assign  push_num 0
 	LOAD_4_PARA
 	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
+	SIGN_EXTENTION r3, r3d
 	pxor   xmm6,   xmm6
     pxor   xmm7,   xmm7
     SSE2_GetSatd8x8
@@ -270,11 +270,11 @@
 	 ;mov    ebx,    [esp+12]
 	 ;mov    ecx,    [esp+16]
 	 ;mov    edx,    [esp+20]
-	 
+
 	 %assign  push_num 0
 	 LOAD_4_PARA
 	 SIGN_EXTENTION r1, r1d
-	 SIGN_EXTENTION r3, r3d	 
+	 SIGN_EXTENTION r3, r3d
 	 pxor   xmm6,   xmm6
      pxor   xmm7,   xmm7
 
@@ -302,18 +302,18 @@
 	;mov    ebx,    [esp+12]
 	;mov    ecx,    [esp+16]
 	;mov    edx,    [esp+20]
-	
+
 	%assign  push_num 0
 	LOAD_4_PARA
 	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
+	SIGN_EXTENTION r3, r3d
 	push r0
-	push r2	
+	push r2
 	pxor   xmm6,   xmm6
     pxor   xmm7,   xmm7
 
 	SSE2_GetSatd8x8
-	
+
 	pop r2
 	pop r0
 	;mov    eax,    [esp+8]
@@ -341,13 +341,13 @@
 	;mov    ebx,    [esp+12]
 	;mov    ecx,    [esp+16]
 	;mov    edx,    [esp+20]
-	
+
 	%assign  push_num 0
 	LOAD_4_PARA
 	SIGN_EXTENTION r1, r1d
 	SIGN_EXTENTION r3, r3d
 	push r0
-	push r2	
+	push r2
 	pxor   xmm6,   xmm6
     pxor   xmm7,   xmm7
 
@@ -1027,11 +1027,11 @@
 	;mov         ebx,[esp+12]
 	;mov         ecx,[esp+16]
 	;mov         edx,[esp+20]
-	
+
 	%assign  push_num 0
 	LOAD_4_PARA
 	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
+	SIGN_EXTENTION r3, r3d
 	movdqa      xmm4,[HSwapSumSubDB1]
 	movd        xmm2,[r2]
 	movd        xmm5,[r2+r3]
@@ -1089,14 +1089,14 @@
 	;mov    ebx,    [esp+20]
 	;mov    ecx,    [esp+24]
 	;mov    edx,    [esp+28]
-%ifdef X86_32	
+%ifdef X86_32
 	push  r4
 	push  r5
-%endif	
+%endif
 	%assign  push_num 2
 	LOAD_4_PARA
 	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
+	SIGN_EXTENTION r3, r3d
 	movdqa      xmm7, [HSumSubDB1]
 	lea         r4,  [r1+r1*2]
 	lea         r5,  [r3+r3*2]
@@ -1130,15 +1130,15 @@
 	;mov    ebx,    [esp+pushsize+8]
 	;mov    ecx,    [esp+pushsize+12]
 	;mov    edx,    [esp+pushsize+16]
-%ifdef X86_32	
+%ifdef X86_32
 	push  r4
 	push  r5
 	push  r6
-%endif	
+%endif
 	%assign  push_num 3
 	LOAD_4_PARA
 	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
+	SIGN_EXTENTION r3, r3d
 	movdqa      xmm7, [HSumSubDB1]
 	lea         r4,  [r1+r1*2]
 	lea         r5,  [r3+r3*2]
@@ -1175,17 +1175,17 @@
 	;mov    ebx,    [esp+20]
 	;mov    ecx,    [esp+24]
 	;mov    edx,    [esp+28]
-%ifdef X86_32	
+%ifdef X86_32
 	push  r4
 	push  r5
-%endif	
+%endif
 	%assign  push_num 2
 	LOAD_4_PARA
 	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
+	SIGN_EXTENTION r3, r3d
 	push  r0
 	push  r2
-	
+
 	movdqa      xmm7, [HSumSubDB1]
 	lea         r4,  [r1+r1*2]
 	lea         r5,  [r3+r3*2]
@@ -1194,7 +1194,7 @@
 	lea			r0,  [r0+4*r1]
 	lea			r2,  [r2+4*r3]
 	SSE41_GetSatd8x4
-	
+
 	pop  r2
 	pop  r0
 	;mov			eax,    [esp+16]
@@ -1231,19 +1231,19 @@
 	;mov    ebx,    [esp+pushsize+8]
 	;mov    ecx,    [esp+pushsize+12]
 	;mov    edx,    [esp+pushsize+16]
-%ifdef X86_32	
+%ifdef X86_32
 	push  r4
 	push  r5
 	push  r6
-%endif	
+%endif
 	%assign  push_num 3
 	LOAD_4_PARA
 	SIGN_EXTENTION r1, r1d
 	SIGN_EXTENTION r3, r3d
-	
+
 	push  r0
 	push  r2
-	
+
 	movdqa      xmm7, [HSumSubDB1]
 	lea         r4,  [r1+r1*2]
 	lea         r5,  [r3+r3*2]
@@ -1258,7 +1258,7 @@
 	jl          loop_get_satd_16x16_left
 
 	pop  r2
-	pop  r0	
+	pop  r0
 	;mov			eax,    [esp+pushsize+4]
 	;mov			ecx,    [esp+pushsize+12]
 	add			r0,    8
@@ -1360,13 +1360,13 @@
 	;push esi
 	;%define _STACK_SIZE		12
 	;mov eax, [esp+_STACK_SIZE+4 ]
-	;mov	ebx, [esp+_STACK_SIZE+8 ]	
+	;mov	ebx, [esp+_STACK_SIZE+8 ]
 	;mov ecx, [esp+_STACK_SIZE+12]
 	;mov edx, [esp+_STACK_SIZE+16]
 %ifdef X86_32
 	push  r4
 	push  r5
-%endif	
+%endif
 
 	%assign  push_num 2
 	LOAD_4_PARA
@@ -1411,11 +1411,11 @@
 	;mov    ebx,    [esp+12]
 	;mov    ecx,    [esp+16]
 	;mov    edx,    [esp+20]
-	
+
 	%assign  push_num 0
 	LOAD_4_PARA
 	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
+	SIGN_EXTENTION r3, r3d
 	movdqu xmm0,   [r2]
 	MOVDQ  xmm2,   [r0]
 	psadbw xmm0,   xmm2
@@ -1443,7 +1443,7 @@
 	;mov    ebx,    [esp+12]
 	;mov    ecx,    [esp+16]
 	;mov    edx,    [esp+20]
-	
+
 	%assign  push_num 0
 	LOAD_4_PARA
 	SIGN_EXTENTION r1, r1d
@@ -1483,7 +1483,7 @@
 	;push   edi
 	;mov    eax,    [esp+12]
 	;mov    ebx,    [esp+16]
-	
+
 	%assign  push_num 0
 	mov		r2,  arg3
 	push	r2
@@ -1490,7 +1490,7 @@
 	CACHE_SPLIT_CHECK r2, 8, 64
 	jle    near   .pixel_sad_8x8_nsplit
 	pop		r2
-%ifdef X86_32	
+%ifdef X86_32
 	push	r3
 	push	r4
 	push	r5
@@ -1497,10 +1497,10 @@
 %endif
 	%assign  push_num 3
 	mov		r0,  arg1
-	mov		r1,  arg2	
+	mov		r1,  arg2
 	SIGN_EXTENTION r1, r1d
     pxor   xmm7,   xmm7
-    
+
     ;ecx r2, edx r4, edi r5
 
     mov    r5,    r2
@@ -1594,18 +1594,18 @@
 	pop	 r3
 %endif
 	jmp        .return
-	
+
 .pixel_sad_8x8_nsplit:
     ;push   ebx
     ;mov    eax,    [esp+8]
 	;mov    ebx,    [esp+12]
 	;mov    edx,    [esp+20]
-	
+
 	pop r2
 	%assign  push_num 0
 	LOAD_4_PARA
 	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
+	SIGN_EXTENTION r3, r3d
 	pxor   xmm6,   xmm6
 	SSE2_GetSad8x4
     lea    r0,    [r0+2*r1]
@@ -1652,11 +1652,11 @@
 	;mov    ebx,    [esp+12]
 	;mov    ecx,    [esp+16]
 	;mov    edx,    [esp+20]
-	
+
 	%assign  push_num 0
 	LOAD_5_PARA
 	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d		
+	SIGN_EXTENTION r3, r3d
 	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
 	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
 	pxor   xmm6,   xmm6    ;sad pRefMb-1
@@ -1778,11 +1778,11 @@
 	;mov    ebx,    [esp+16]
 	;mov    edi,    [esp+20]
 	;mov    edx,    [esp+24]
-	
+
 	%assign  push_num 0
 	LOAD_5_PARA
 	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d		
+	SIGN_EXTENTION r3, r3d
 	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
 	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
 	pxor   xmm6,   xmm6    ;sad pRefMb-1
@@ -1871,11 +1871,11 @@
 	;mov    ebx,    [esp+16]
 	;mov    edi,    [esp+20]
 	;mov    edx,    [esp+24]
-	
+
 	%assign  push_num 0
 	LOAD_5_PARA
 	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d		
+	SIGN_EXTENTION r3, r3d
 	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
 	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
 	pxor   xmm6,   xmm6    ;sad pRefMb-1
@@ -2091,11 +2091,11 @@
 	;mov    ebx,    [esp+16]
 	;mov    edi,    [esp+20]
 	;mov    edx,    [esp+24]
-	
+
 	%assign  push_num 0
 	LOAD_5_PARA
 	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d		
+	SIGN_EXTENTION r3, r3d
 	pxor   xmm4,   xmm4    ;sad pRefMb-i_stride_ref
 	pxor   xmm5,   xmm5    ;sad pRefMb+i_stride_ref
 	pxor   xmm6,   xmm6    ;sad pRefMb-1
@@ -2219,11 +2219,11 @@
 	;mov    ebx,    [esp+16]
 	;mov    edi,    [esp+20]
 	;mov    edx,    [esp+24]
-	
+
 	%assign  push_num 0
 	LOAD_5_PARA
 	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d		
+	SIGN_EXTENTION r3, r3d
 	movd   xmm0,   [r0]
 	movd   xmm1,   [r0+r1]
 	lea        r0,    [r0+2*r1]
@@ -2310,11 +2310,11 @@
     ;mov		  ebx, [pix1stride ]
     ;mov		  ecx, [pix2address]
     ;mov		  edx, [pix2stride ]
-    
+
     %assign  push_num 0
 	LOAD_4_PARA
 	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
+	SIGN_EXTENTION r3, r3d
 	movd	  mm0, [r0]
 	movd	  mm1, [r0+r1]
 	punpckldq mm0, mm1
--- a/codec/processing/src/asm/denoisefilter.asm
+++ b/codec/processing/src/asm/denoisefilter.asm
@@ -179,15 +179,15 @@
 ;%define         stride r1
 
 BilateralLumaFilter8_sse2:
-       
-        push r3 
+
+        push r3
         %assign push_num 1
         LOAD_2_PARA
 
 		pxor		xmm7,	xmm7
-	
+
 		mov         r3,     r0
-		
+
 		movq        xmm6,   [r0]
 		punpcklbw	xmm6,	xmm7
 		movdqa		xmm3,	[sse2_32]
@@ -218,10 +218,10 @@
 		packuswb	xmm5,	xmm5
 		movq		[r3],	xmm5
 
-       
+
 		pop r3
 		%assign push_num 0
-		
+
 		ret
 
 WELS_EXTERN WaverageChromaFilter8_sse2
@@ -239,11 +239,11 @@
 WaverageChromaFilter8_sse2:
 
         push r3
-       
+
         %assign push_num 1
-        
+
         LOAD_2_PARA
-        
+
         mov		r3,	r1
 		add		r3,	r3
 		sub		r0,	r3			; pixels - 2 * stride
@@ -272,8 +272,8 @@
 		packuswb	xmm3,		xmm3
 		movq		[r0 + 2],		xmm3
 
-              
+
         pop r3
-        
+
         %assign push_num 0
 		ret
--- a/codec/processing/src/asm/sad.asm
+++ b/codec/processing/src/asm/sad.asm
@@ -84,7 +84,7 @@
 	;push   edi
 	;mov    eax,    [esp+12]
 	;mov    ebx,    [esp+16]
-	
+
 	%assign  push_num 0
 	mov		r2,  arg3
 	push	r2
@@ -91,7 +91,7 @@
 	CACHE_SPLIT_CHECK r2, 8, 64
 	jle    near   .pixel_sad_8x8_nsplit
 	pop		r2
-%ifdef X86_32	
+%ifdef X86_32
 	push	r3
 	push	r4
 	push	r5
@@ -98,10 +98,10 @@
 %endif
 	%assign  push_num 3
 	mov		r0,  arg1
-	mov		r1,  arg2	
+	mov		r1,  arg2
 	SIGN_EXTENTION r1, r1d
     pxor   xmm7,   xmm7
-    
+
     ;ecx r2, edx r4, edi r5
 
     mov    r5,    r2
@@ -195,18 +195,18 @@
 	pop	 r3
 %endif
 	jmp        .return
-	
+
 .pixel_sad_8x8_nsplit:
     ;push   ebx
     ;mov    eax,    [esp+8]
 	;mov    ebx,    [esp+12]
 	;mov    edx,    [esp+20]
-	
+
 	pop r2
 	%assign  push_num 0
 	LOAD_4_PARA
 	SIGN_EXTENTION r1, r1d
-	SIGN_EXTENTION r3, r3d	
+	SIGN_EXTENTION r3, r3d
 	pxor   xmm6,   xmm6
 	SSE2_GetSad8x4
     lea    r0,    [r0+2*r1]