shithub: openh264

ref: f9dea467123fbff2c74422a8634b20af4026de49
parent: 8f9a5469beb962c22b6d8bbe78f01ec79fb33a55
author: Martin Storsjö <martin@martin.st>
date: Fri Dec 13 05:06:44 EST 2013

Remove trailing whitespace

Most of it was removed from C++ source files in ff6b669176,
but other files were left unchanged.
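
The commit does not record how the cleanup was performed; below is a
minimal sketch of the kind of script that could do it (the script name
and invocation are assumptions for illustration, not part of the
repository):

    #!/usr/bin/env python
    # strip_ws.py -- hypothetical helper, not part of this repository.
    # Strips trailing spaces/tabs from every line of the files named on
    # the command line, rewriting each file in place.
    import sys

    for path in sys.argv[1:]:
        with open(path) as f:
            lines = f.readlines()
        with open(path, "w") as f:
            for line in lines:
                # rstrip() drops trailing whitespace along with the
                # newline; write the line back with a single newline,
                # so whitespace-only lines become empty lines (matching
                # the hunks in the patch below).
                f.write(line.rstrip() + "\n")

Run over the affected files, e.g.:

    python strip_ws.py README.md build/mktargets.py codec/build/linux/dec/makefile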

--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 OpenH264
 =======
 OpenH264 is a codec library which supports H.264 encoding and decoding. It is suitable for use in real time applications such as WebRTC. See http://www.openh264.org/ for more details.
- 
+
 Encoder Features
 ------------------------
 - Constrained Baseline Profile up to Level 5.2 (4096x2304)
@@ -17,10 +17,10 @@
 - Single reference frame for inter prediction
 - Multiple reference frames when using LTR and/or 3-4 temporal layers
 - Periodic and on-demand Instantaneous Decoder Refresh (IDR) frame insertion
-- Dynamic changes to bit rate, frame rate, and resolution 
+- Dynamic changes to bit rate, frame rate, and resolution
 - Annex B byte stream output
 - YUV 4:2:0 planar input
- 
+
 Decoder Features
 ------------------------
 - Constrained Baseline Profile up to Level 5.2 (4096x2304)
@@ -32,7 +32,7 @@
 - Multiple reference frames when specified in Sequence Parameter Set (SPS)
 - Annex B byte stream input
 - YUV 4:2:0 planar output
- 
+
 OS Support
 ----------------
 - Windows 64-bit and 32-bit (initial release is only 32-bit, 64-bit will follow soon)
@@ -40,7 +40,7 @@
 - Linux 64-bit and 32-bit (initial release is only 32-bit, 64-bit will follow soon)
 - Android 32-bit (initial release does not include this target, will follow soon)
 - iOS 64-bit and 32-bit (not supported yet, may be added in the future)
- 
+
 Processor Support
 -------------------------
 - Intel x86 optionally with MMX/SSE (no AVX yet, help is welcome)
@@ -53,30 +53,30 @@
     : build the decoder library and executable via codec/build/linux/dec/makefile
     : build the encoder library and executable via codec/build/linux/enc/makefile
     : build the encoder shared library via processing/build/linux/makefile
- 
+
 Windows Visual Studio 2008/2010/2012 projects are available:
     : build the decoder via the Visual Studio projects in codec/build/win32/dec
     : build the encoder via the Visual Studio projects in codec/build/win32/enc
     : build the encoder shared library via the Visual Studio projects in processing/build/win32/
- 
+
 NASM needed to be installed for assembly code: workable version 2.07 or above, nasm can downloaded from http://www.nasm.us/
- 
+
 API details to be provided later.
- 
+
 Using the Test App
 -------------------------
 Linux shell scripts to build the test apps:
     : build via testbin/AutoBuild_Linux.sh
     : clean via testbin/AutoClean_Linux.sh
- 
+
 Windows batch files to build the test apps:
     : Visual Studio 2008 use testbin/AutoBuild_Windows_VS2008.bat
     : Visual Studio 2010 use testbin/AutoBuild_Windows_VS2010.bat
     : Visual Studio 2012 use testbin/AutoBuild_Windows_VS2012.bat
- 
+
 Usage information can be found in testbin/CmdLineReadMe
 Command line options and details to be provided later.
- 
+
 Using the Source
 -----------------------
 codec - encoder, decoder, console (test app), build (makefile, vcproj)
@@ -83,7 +83,7 @@
 processing - raw pixel processing (used by encoder)
 testbin - autobuild scripts, test app config files, yuv test files
 bin - binaries for library and test app
- 
+
 Known Issues
 -------------------
 See the issue tracker on https://github.com/cisco/openh264/issues
@@ -91,7 +91,7 @@
 - Encoder errors when compressed frame size exceeds half uncompressed size
 - Encoder console app only support multiple of 16 width/height for now
 - Decoder errors when compressed frame size exceeds 1MB
- 
+
 License
 ----------
 BSD, see LICENSE file for details.
--- a/build/mktargets.py
+++ b/build/mktargets.py
@@ -19,7 +19,7 @@
 def write_cpp_rule(f, x):
     src = "$(%s_SRCDIR)/%s"%(PREFIX, x)
     dst = "$(%s_SRCDIR)/%s"%(PREFIX, make_o(x))
-    
+
     f.write("%s: %s\n"%(dst, src))
     f.write('\t$(CXX) $(CFLAGS) $(CXXFLAGS) $(INCLUDES) $(' + PREFIX + '_CFLAGS) $(' + PREFIX + '_INCLUDES) -c -o ' + dst + ' ' + src + '\n');
     f.write("\n")
@@ -27,7 +27,7 @@
 def write_asm_rule(f, x):
     src = "$(%s_SRCDIR)/%s"%(PREFIX, x)
     dst = "$(%s_SRCDIR)/%s"%(PREFIX, make_o(x))
-    
+
     f.write("%s: %s\n"%(dst, src))
     f.write('\t$(ASM) $(ASMFLAGS) $(ASM_INCLUDES) $(' + PREFIX + '_ASMFLAGS) $(' + PREFIX + '_ASM_INCLUDES) -o ' + dst + ' ' + src + '\n');
     f.write("\n")
@@ -70,7 +70,7 @@
 f.write("%s_CPP_SRCS=\\\n"%(PREFIX))
 for c in cpp:
     f.write("\t$(%s_SRCDIR)/%s\\\n"%(PREFIX, c))
-f.write("\n")    
+f.write("\n")
 f.write("%s_OBJS += $(%s_CPP_SRCS:.cpp=.o)\n"%(PREFIX, PREFIX))
 
 f.write("ifeq ($(USE_ASM), Yes)\n");
--- a/codec/build/linux/dec/makefile
+++ b/codec/build/linux/dec/makefile
@@ -25,7 +25,7 @@
 ASFLAGS= -f elf -DNOPREFIX -I ../../../decoder/core/asm/
 
 LIBS= -lstdc++ -ldl
-#-lm 
+#-lm
 CFLAGS=  $(INCLUDE) -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DHAVE_CACHE_LINE_ALIGN
 
 ifeq ($(DBG),1)
@@ -65,7 +65,7 @@
 $(CORESRCDIR)/utils.cpp \
 $(PLUSSRCDIR)/welsDecoderExt.cpp \
 $(PLUSSRCDIR)/welsCodecTrace.cpp \
-$(COMMONSRCDIR)/logging.cpp 
+$(COMMONSRCDIR)/logging.cpp
 
 ASMSRC= $(ASMSRCDIR)/block_add.asm \
 $(ASMSRCDIR)/cpuid.asm \
@@ -78,7 +78,7 @@
 $(ASMSRCDIR)/mc_luma.asm \
 $(ASMSRCDIR)/memzero.asm \
 $(ASMSRCDIR)/asm_inc.asm \
- 
+
 MAINSRC= $(MAINSRCDIR)/d3d9_utils.cpp \
 $(MAINSRCDIR)/h264dec.cpp \
 $(MAINSRCDIR)/read_config.cpp
@@ -119,7 +119,7 @@
 $(OBJDIR)/mb_copy.o \
 $(OBJDIR)/mc_luma.o \
 $(OBJDIR)/memzero.o \
-$(OBJDIR)/asm_inc.o 
+$(OBJDIR)/asm_inc.o
 endif
 
 OBJBIN=	$(OBJDIR)/d3d9_utils.o \
@@ -134,7 +134,7 @@
 
 dependencies:
 	@echo "" >dependencies
-	
+
 checkdir:
 	@echo 'checkdir..'
 	@if test ! -d $(BINDIR) ; \
@@ -154,7 +154,7 @@
 		mkdir -p $(OBJDIR) ; \
 	fi
 	@echo
-	
+
 release:
 	@echo 'release..'
 	@echo 'cp -f $(SHAREDLIB) $(OUTDIR)'
@@ -169,14 +169,14 @@
 	@rm -f $(OBJBIN)
 	@rm -f $(BINLIB)
 	@rm -f $(SHAREDLIB)
-	@rm -f $(BIN)    
+	@rm -f $(BIN)
 
 tags:
 	@echo update tag table
 	@etags $(CORESRCDIR)/*.c $(CORESRCDIR)/*.cpp $(PLUSSRCDIR)/*.cpp $(MAINSRCDIR)/*.cpp
-	
-	
-lib:   	$(OBJDEC) 
+
+
+lib:   	$(OBJDEC)
 	@echo '$(OBJDEC)'
 	@echo
 	@echo 'ar cr $(BINLIB) $(OBJDEC)'
@@ -197,15 +197,15 @@
 	@$(CXX)  -shared -Wl,-Bsymbolic -o $(SHAREDLIB) $(OBJDEC)  $(LIBS)
 	@echo '... done'
 	@echo
-	
 
+
 exe:	$(OBJBIN)
-	@echo	
+	@echo
 	@echo '$(OBJBIN)'
 	@echo
 	@echo '$(CXX) $(LIBS) $(OBJBIN) $(BINLIB) -o $(BIN)'
 	@echo 'creating binary "$(BIN)"'
-	@$(CXX) $(OBJBIN) $(BINLIB) -o $(BIN) $(LIBS) 
+	@$(CXX) $(OBJBIN) $(BINLIB) -o $(BIN) $(LIBS)
 	@echo '... done'
 	@echo
 
@@ -223,31 +223,31 @@
 
 $(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.c
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
 
 $(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
 	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
-		
+
 $(OBJDIR)/%.o$(SUFFIX): $(PLUSSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
-	
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
 $(OBJDIR)/%.o$(SUFFIX): $(ASMSRCDIR)/%.asm
 	@echo 'compiling object file "$@" ...'
-	@$(AS) $(ASFLAGS) -o $@ $<	
+	@$(AS) $(ASFLAGS) -o $@ $<
 
 #$(OBJDIR)/%.o$(SUFFIX): $(ASMCOMDIR)/%.asm
 #	@echo 'compiling object file "$@" ...'
 #	@$(AS) $(ASFLAGS) -o $@ $<
-	
+
 $(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
 
 $(OBJDIR)/%.o$(SUFFIX): $(COMMONSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
 	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
-	
+
 include $(DEPEND)
 
--- a/codec/build/linux/enc/makefile
+++ b/codec/build/linux/enc/makefile
@@ -26,8 +26,8 @@
 ASFLAGS= -f elf -DNOPREFIX -I ../../../encoder/core/asm/
 
 LIBS= -lstdc++ -ldl -lpthread -lm
-#-lm 
-CFLAGS=  $(INCLUDE) -m32 -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DWELS_SVC -DENCODER_CORE -DHAVE_CACHE_LINE_ALIGN -DWELS_TESTBED -DMT_ENABLED 
+#-lm
+CFLAGS=  $(INCLUDE) -m32 -fPIC -D__GCC__ -DLINUX -D__NO_CTYPE -DWELS_SVC -DENCODER_CORE -DHAVE_CACHE_LINE_ALIGN -DWELS_TESTBED -DMT_ENABLED
 
 ifeq ($(DBG),1)
 #SUFFIX= .dbg
@@ -150,7 +150,7 @@
 $(OBJDIR)/satd_sad.o \
 $(OBJDIR)/score.o \
 $(OBJDIR)/asm_inc.o \
-$(OBJDIR)/vaa.o 
+$(OBJDIR)/vaa.o
 endif
 OBJBIN=	$(OBJDIR)/read_config.o \
 $(OBJDIR)/welsenc.o
@@ -163,7 +163,7 @@
 
 dependencies:
 	@echo "" >dependencies
-	
+
 checkdir:
 	@echo 'checkdir..'
 	@if test ! -d $(OUTDIR) ; \
@@ -195,9 +195,9 @@
 tags:
 	@echo update tag table
 	@etags $(THREADLIBSRCDIR)/*.cpp $(COMMSRCDIR)/*.cpp $(CORESRCDIR)/*.cpp $(PLUSSRCDIR)/*.cpp $(MAINSRCDIR)/*.cpp
-	
-	
-lib:   	$(OBJENC) 
+
+
+lib:   	$(OBJENC)
 	@echo '$(OBJENC)'
 	@echo
 	@echo 'ar cr $(BINLIB) $(OBJENC)'
@@ -218,7 +218,7 @@
 	@$(GCC)  -shared -Wl,-Bsymbolic -m32 -o $(SHAREDLIB) $(OBJENC)  $(LIBS)
 	@echo '... done'
 	@echo
-	
+
 release:
 	@echo 'release..'
 	@echo 'cp -f $(SHAREDLIB) $(OUTDIR)'
@@ -228,7 +228,7 @@
 	@echo
 
 exe:	$(OBJBIN)
-	@echo	
+	@echo
 	@echo '$(OBJBIN)'
 	@echo
 	@echo '$(GCC) $(LIBS) $(OBJBIN) $(BINLIB) -m32 -o $(BIN)'
@@ -251,24 +251,24 @@
 
 $(OBJDIR)/%.o$(SUFFIX): $(CORESRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
-	
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
 $(OBJDIR)/%.o$(SUFFIX): $(PLUSSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<		
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
 
 $(OBJDIR)/%.o$(SUFFIX): $(ASMSRCDIR)/%.asm
 	@echo 'compiling object file "$@" ...'
-	@$(AS) $(ASFLAGS) -o $@ $<	
-	
+	@$(AS) $(ASFLAGS) -o $@ $<
+
 $(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<	
-	
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
 $(OBJDIR)/%.o$(SUFFIX): $(MAINSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
-	@$(CC) -m32 -c $(CFLAGS) -o $@ $<	
-	
+	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
+
 $(OBJDIR)/%.o$(SUFFIX): $(COMMONSRCDIR)/%.cpp
 	@echo 'compiling object file "$@" ...'
 	@$(CC) -m32 -c $(CFLAGS) -o $@ $<
--- a/codec/decoder/core/asm/asm_inc.asm
+++ b/codec/decoder/core/asm/asm_inc.asm
@@ -43,7 +43,7 @@
 ; Options, for DEBUG
 ;***********************************************************************
 
-%if 1 
+%if 1
 	%define MOVDQ movdqa
 %else
 	%define MOVDQ movdqu
@@ -58,7 +58,7 @@
 BITS 32
 
 ;***********************************************************************
-; Macros 
+; Macros
 ;***********************************************************************
 
 %macro WELS_EXTERN 1
@@ -74,7 +74,7 @@
 	pxor        %2, %2
     psubw       %2, %1
     pmaxsw      %1, %2
-%endmacro 	
+%endmacro
 
 %macro MMX_XSwap  4
     movq		%4, %2
@@ -105,7 +105,7 @@
     SSE2_XSawp qdq, %5, %2, %3
 %endmacro
 
-;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4 
+;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4
 %macro SSE2_TransTwo4x4W 5
     SSE2_XSawp wd,  %1, %2, %5
     SSE2_XSawp wd,  %3, %4, %2
@@ -125,26 +125,26 @@
 	movdqa	%6, %9
 	movdqa	%9, %4
 	SSE2_XSawp bw,  %7, %6, %4
-	
-	SSE2_XSawp wd,  %1, %3, %6	
+
+	SSE2_XSawp wd,  %1, %3, %6
 	SSE2_XSawp wd,  %8, %2, %3
 	SSE2_XSawp wd,  %5, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %3	
+	movdqa	%9, %3
 	SSE2_XSawp wd,  %7, %4, %3
-	
-	SSE2_XSawp dq,  %1, %5, %4	
+
+	SSE2_XSawp dq,  %1, %5, %4
 	SSE2_XSawp dq,  %6, %2, %5
 	SSE2_XSawp dq,  %8, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %5		
+	movdqa	%9, %5
 	SSE2_XSawp dq,  %7, %3, %5
-	
+
 	SSE2_XSawp qdq,  %1, %8, %3
 	SSE2_XSawp qdq,  %4, %2, %8
 	SSE2_XSawp qdq,  %6, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %1		
+	movdqa	%9, %1
 	SSE2_XSawp qdq,  %7, %5, %1
 	movdqa	%5, %9
 %endmacro
@@ -170,9 +170,9 @@
 %macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
 	mov %3h, %3l
 	movd %1, e%3x		; i.e, 1% = eax (=b0)
-	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0	
-	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0	
-%endmacro  
+	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
+	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
 
 ;copy a dw into a xmm for 8 times
 %macro  SSE2_Copy8Times 2
--- a/codec/decoder/core/asm/block_add.asm
+++ b/codec/decoder/core/asm/block_add.asm
@@ -48,7 +48,7 @@
 ; Macros and other preprocessor constants
 ;*******************************************************************************
 
-%macro   BLOCK_ADD_16_SSE2   4 
+%macro   BLOCK_ADD_16_SSE2   4
 	movdqa    xmm0,       [%2]
 	movdqa    xmm1,       [%3]
     movdqa    xmm2,       [%3+10h]
@@ -65,7 +65,7 @@
 
 	lea          %2,      [%2+%4]
 	lea          %3,      [%3+%4*2]
-	lea          %1,      [%1+%4] 
+	lea          %1,      [%1+%4]
 %endmacro
 
 %macro    BLOCK_ADD_8_MMXEXT   4
@@ -106,7 +106,7 @@
 
 	lea          %2,      [%2+%4]
 	lea          %3,      [%3+%5*2]
-	lea          %1,      [%1+%4] 
+	lea          %1,      [%1+%4]
 %endmacro
 
 
@@ -130,24 +130,24 @@
 	lea          %1,      [%1+%4]
 %endmacro
 
-%macro    BLOCK_ADD_8_STRIDE_2_LINES_SSE2   5    
+%macro    BLOCK_ADD_8_STRIDE_2_LINES_SSE2   5
 	movdqa xmm1, [%3]
 	movq xmm0, [%2]
 	punpcklbw xmm0, xmm7
 	paddw xmm0, xmm1
 	packuswb xmm0, xmm7
-	movq [%1], xmm0	
-	
+	movq [%1], xmm0
+
 	movdqa xmm3, [%3+%5*2]
 	movq xmm2, [%2+%4]
 	punpcklbw xmm2, xmm7
 	paddw xmm2, xmm3
-	packuswb xmm2, xmm7	
-	movq [%1+%4], xmm2	
-	
+	packuswb xmm2, xmm7
+	movq [%1+%4], xmm2
+
 	lea %1, [%1+%4*2]
 	lea %2, [%2+%4*2]
-	lea %3, [%3+%5*4]	
+	lea %3, [%3+%5*4]
 %endmacro
 
 %macro   CHECK_DATA_16_ZERO_SSE4     3
@@ -159,7 +159,7 @@
 	por		   xmm0,	 xmm1
 	ptest      xmm7,     xmm0
 	cmovae     eax,      %3
-	
+
 	add        %1,       20h
 	add        ecx,      04h
 	mov        byte [%2+ebx],  al
@@ -170,12 +170,12 @@
     movdqa     xmm1,      [%1+%3]
     movdqa     xmm2,      [%1+%3*2]
     movdqa     xmm3,      [%1+%4]
-    
+
     mov        eax,       0h
     mov        ebx,       0h
     movdqa     xmm4,      xmm0
     movdqa     xmm5,      xmm2
-    
+
     punpcklqdq  xmm0,     xmm1
     punpckhqdq  xmm4,     xmm1
     punpcklqdq  xmm2,     xmm3
@@ -183,12 +183,12 @@
 
 	por			xmm0,	  xmm2
 	por			xmm4,	  xmm5
-    
+
     ptest       xmm7,     xmm0
     cmovae      eax,      %5
     ptest       xmm7,     xmm4
-    cmovae      ebx,      %5    
-    
+    cmovae      ebx,      %5
+
     mov     byte [%2],    al
     mov     byte [%2+1],  bl
 %endmacro
@@ -230,45 +230,45 @@
     movdqa     xmm0,      [%1]
     movdqa     xmm1,      [%1+10h]
     mov        ebx,       [ecx]
-    
+
     pcmpeqw    xmm0,      xmm7
     pcmpeqw    xmm1,      xmm7
     packsswb   xmm0,      xmm1
-    pmovmskb   edx,       xmm0    
+    pmovmskb   edx,       xmm0
     sub        edx,       0ffffh
-    
-    cmovb      eax,       ebp   
+
+    cmovb      eax,       ebp
     add        ecx,       4
     add        %1,        20h
     mov      byte [%2+ebx],    al
 %endmacro
-    
 
 
+
 %macro   CHECK_RS_4x4_BLOCK_2_ZERO_SSE2    5
     movdqa    xmm0,      [%1]
     movdqa    xmm1,      [%1 + %3]
     movdqa    xmm2,      [%1 + %3*2]
-    movdqa    xmm3,      [%1 + %4]    
-    
+    movdqa    xmm3,      [%1 + %4]
+
     movdqa    xmm4,       xmm0
     movdqa    xmm5,       xmm2
-    
+
     punpcklqdq   xmm0,    xmm1
     punpckhqdq   xmm4,    xmm1
     punpcklqdq   xmm2,    xmm3
     punpckhqdq   xmm5,    xmm3
-    
+
     pcmpeqw      xmm0,    xmm7
     pcmpeqw      xmm2,    xmm7
     pcmpeqw      xmm4,    xmm7
     pcmpeqw      xmm5,    xmm7
-    
+
     packsswb     xmm0,    xmm2
     packsswb     xmm4,    xmm5
     pmovmskb     eax,     xmm0
     pmovmskb     ebx,     xmm4
-    
+
     sub          eax,     0ffffh
     mov          eax,     0
     cmovb        eax,     %5
@@ -276,7 +276,7 @@
     mov          ebx,     0
     cmovb        ebx,     %5
     mov       byte [%2],    al
-    mov       byte [%2+1],  bl        
+    mov       byte [%2+1],  bl
 %endmacro
 
 ;*******************************************************************************
@@ -291,12 +291,12 @@
 
 ALIGN  16
 SubMbScanIdx:
-     dd    0x0,  0x1,  0x4,  0x5, 
+     dd    0x0,  0x1,  0x4,  0x5,
 	 dd    0x2,  0x3,  0x6,  0x7,
 	 dd    0x8,  0x9,  0xc,  0xd,
 	 dd    0xa,  0xb,  0xe,  0xf,
 	 dd    0x10, 0x11, 0x14, 0x15,
-	 dd    0x12, 0x13, 0x16, 0x17,     
+	 dd    0x12, 0x13, 0x16, 0x17,
 
 ;*******************************************************************************
 ; Code
@@ -312,10 +312,10 @@
 ;  void_t WelsResBlockZero16x16_sse2(int16_t* pBlock,int32_t iStride)
 ;*******************************************************************************
 WelsResBlockZero16x16_sse2:
-    push     esi	
+    push     esi
 
 	mov      esi,        [esp+08h]
-	mov      ecx,        [esp+0ch]	
+	mov      ecx,        [esp+0ch]
 	lea      ecx,        [ecx*2]
 	lea      eax,        [ecx*3]
 
@@ -375,7 +375,7 @@
 
 	movdqa   [esi+eax],     xmm7
 	movdqa   [esi+eax+10h],     xmm7
-    
+
     pop      esi
 	ret
 
@@ -386,7 +386,7 @@
 ;*******************************************************************************
 ;  void_t WelsResBlockZero8x8_sse2(int16_t * pBlock, int32_t iStride)
 ;*******************************************************************************
-WelsResBlockZero8x8_sse2: 
+WelsResBlockZero8x8_sse2:
 	  push      esi
 
       mov       esi,     [esp+08h]
@@ -407,7 +407,7 @@
 	  movdqa    [esi+ecx*2],   xmm7
 	  movdqa    [esi+eax],     xmm7
 
-	  
+
 	  pop       esi
 	  ret
 
--- a/codec/decoder/core/asm/cpuid.asm
+++ b/codec/decoder/core/asm/cpuid.asm
@@ -84,12 +84,12 @@
 ;   void WelsCPUId( int32_t index, int32_t *uiFeatureA, int32_t *uiFeatureB, int32_t *uiFeatureC, int32_t *uiFeatureD )
 ;****************************************************************************************************
 WelsCPUId:
-	push	ebx	
+	push	ebx
 	push	edi
-	
+
 	mov     eax, [esp+12]	; operating index
     cpuid					; cpuid
-	
+
 	; processing various information return
 	mov     edi, [esp+16]
     mov     [edi], eax
@@ -100,10 +100,10 @@
     mov     edi, [esp+28]
     mov     [edi], edx
 
-	pop		edi	
+	pop		edi
     pop     ebx
 	ret
-	
+
 WELS_EXTERN WelsCPUSupportAVX
 ; need call after cpuid=1 and eax, ecx flag got then
 ALIGN 16
@@ -139,7 +139,7 @@
 WelsCPUSupportFMA:
 	mov eax, [esp+4]
 	mov ecx, [esp+8]
-	
+
 	; refer to detection of FMA addressed in INTEL AVX manual document
 	and ecx, 018001000H
 	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
@@ -153,7 +153,7 @@
 	mov eax, 1
 	ret
 fma_not_supported:
-	mov eax, 0	
+	mov eax, 0
 	ret
 
 WELS_EXTERN WelsEmms
--- a/codec/decoder/core/asm/dct.asm
+++ b/codec/decoder/core/asm/dct.asm
@@ -99,9 +99,9 @@
 %define kiStride     esp+pushsize+8
 %define pRs         esp+pushsize+12
 
-	mov     eax, [pRs   ] 
-    mov     edx, [pPred ]   
-    mov     ecx, [kiStride]   
+	mov     eax, [pRs   ]
+    mov     edx, [pPred ]
+    mov     ecx, [kiStride]
     movq    mm0, [eax+ 0]
     movq    mm1, [eax+ 8]
     movq    mm2, [eax+16]
@@ -114,13 +114,13 @@
 
     WELS_Zero			mm7
     WELS_DW32			mm6
-    
+
     MMX_StoreDiff4P    mm3, mm0, mm6, mm7, [edx]
     MMX_StoreDiff4P    mm4, mm0, mm6, mm7, [edx+ecx]
     lea     edx, [edx+2*ecx]
     MMX_StoreDiff4P    mm1, mm0, mm6, mm7, [edx]
     MMX_StoreDiff4P    mm2, mm0, mm6, mm7, [edx+ecx]
-    
+
 %undef	pushsize
 %undef  pPred
 %undef  kiStride
--- a/codec/decoder/core/asm/deblock.asm
+++ b/codec/decoder/core/asm/deblock.asm
@@ -62,169 +62,169 @@
 
 ALIGN  16
 DeblockChromaEq4V_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,68h 
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,68h
   mov         edx,[ebp+10h]      ;  iStride
   mov         eax,[ebp+8]        ;  pPixCb
   mov         ecx,[ebp+0Ch]      ;  pPixCr
-  movq        xmm4,[ecx] 
-  movq        xmm5,[edx+ecx] 
-  push        esi  
-  push        edi  
-  lea         esi,[edx+edx] 
-  mov         edi,eax 
-  sub         edi,esi 
-  movq        xmm1,[edi] 
-  mov         edi,ecx 
-  sub         edi,esi 
-  movq        xmm2,[edi] 
-  punpcklqdq  xmm1,xmm2 
-  mov         esi,eax 
-  sub         esi,edx 
-  movq        xmm2,[esi] 
-  mov         edi,ecx 
-  sub         edi,edx 
-  movq        xmm3,[edi] 
-  punpcklqdq  xmm2,xmm3 
-  movq        xmm3,[eax] 
-  punpcklqdq  xmm3,xmm4 
-  movq        xmm4,[edx+eax] 
-  mov       edx, [ebp + 14h] 
-  punpcklqdq  xmm4,xmm5 
-  movd        xmm5,edx 
-  mov       edx, [ebp + 18h] 
-  pxor        xmm0,xmm0 
-  movdqa      xmm6,xmm5 
-  punpcklwd   xmm6,xmm5 
-  pshufd      xmm5,xmm6,0 
-  movd        xmm6,edx 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      xmm7,xmm1 
-  punpckhbw   xmm1,xmm0 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+40h],xmm1 
-  movdqa      [esp+60h],xmm7 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+10h],xmm7 
-  movdqa      xmm7,xmm3 
-  punpcklbw   xmm7,xmm0 
-  punpckhbw   xmm3,xmm0 
-  movdqa      [esp+50h],xmm7 
-  movdqa      xmm7,xmm4 
-  punpckhbw   xmm4,xmm0 
-  punpckhbw   xmm2,xmm0 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+30h],xmm3 
-  movdqa      xmm3,[esp+10h] 
-  movdqa      xmm1,xmm3 
-  psubw       xmm1,[esp+50h] 
-  pabsw       xmm1,xmm1 
-  movdqa      [esp+20h],xmm4 
-  movdqa      xmm0,xmm5 
-  pcmpgtw     xmm0,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  pand        xmm0,xmm4 
-  movdqa      xmm1,xmm7 
-  psubw       xmm1,[esp+50h] 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,[esp+30h] 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm5,xmm1 
-  movdqa      xmm1,[esp+40h] 
-  pand        xmm0,xmm4 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  movdqa      xmm1,[esp+20h] 
-  psubw       xmm1,[esp+30h] 
-  pand        xmm5,xmm4 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm6,xmm1 
-  pand        xmm5,xmm6 
-  mov         edx,2 
-  movsx       edx,dx 
-  movd        xmm1,edx 
-  movdqa      xmm4,xmm1 
-  punpcklwd   xmm4,xmm1 
-  pshufd      xmm1,xmm4,0 
-  movdqa      xmm4,[esp+60h] 
-  movdqa      xmm6,xmm4 
-  paddw       xmm6,xmm4 
-  paddw       xmm6,xmm3 
-  paddw       xmm6,xmm7 
-  movdqa      [esp+10h],xmm1 
-  paddw       xmm6,[esp+10h] 
-  psraw       xmm6,2 
-  movdqa      xmm4,xmm0 
-  pandn       xmm4,xmm3 
-  movdqa      xmm3,[esp+40h] 
-  movdqa      xmm1,xmm0 
-  pand        xmm1,xmm6 
-  por         xmm1,xmm4 
-  movdqa      xmm6,xmm3 
-  paddw       xmm6,xmm3 
-  movdqa      xmm3,[esp+10h] 
-  paddw       xmm6,xmm2 
-  paddw       xmm6,[esp+20h] 
-  paddw       xmm6,xmm3 
-  psraw       xmm6,2 
-  movdqa      xmm4,xmm5 
-  pand        xmm4,xmm6 
-  movdqa      xmm6,xmm5 
-  pandn       xmm6,xmm2 
-  por         xmm4,xmm6 
-  packuswb    xmm1,xmm4 
-  movdqa      xmm4,[esp+50h] 
-  movdqa      xmm6,xmm7 
-  paddw       xmm6,xmm7 
-  paddw       xmm6,xmm4 
-  paddw       xmm6,[esp+60h] 
-  paddw       xmm6,xmm3 
-  psraw       xmm6,2 
-  movdqa      xmm2,xmm0 
-  pand        xmm2,xmm6 
-  pandn       xmm0,xmm4 
-  por         xmm2,xmm0 
-  movdqa      xmm0,[esp+20h] 
-  movdqa      xmm6,xmm0 
-  paddw       xmm6,xmm0 
-  movdqa      xmm0,[esp+30h] 
-  paddw       xmm6,xmm0 
-  paddw       xmm6,[esp+40h] 
-  movdqa      xmm4,xmm5 
-  paddw       xmm6,xmm3 
-  movq        [esi],xmm1 
-  psraw       xmm6,2 
-  pand        xmm4,xmm6 
-  pandn       xmm5,xmm0 
-  por         xmm4,xmm5 
-  packuswb    xmm2,xmm4 
-  movq        [eax],xmm2 
-  psrldq      xmm1,8 
-  movq        [edi],xmm1 
-  pop         edi  
-  psrldq      xmm2,8 
-  movq        [ecx],xmm2 
-  pop         esi  
-  mov         esp,ebp 
-  pop         ebp  
-  ret              
+  movq        xmm4,[ecx]
+  movq        xmm5,[edx+ecx]
+  push        esi
+  push        edi
+  lea         esi,[edx+edx]
+  mov         edi,eax
+  sub         edi,esi
+  movq        xmm1,[edi]
+  mov         edi,ecx
+  sub         edi,esi
+  movq        xmm2,[edi]
+  punpcklqdq  xmm1,xmm2
+  mov         esi,eax
+  sub         esi,edx
+  movq        xmm2,[esi]
+  mov         edi,ecx
+  sub         edi,edx
+  movq        xmm3,[edi]
+  punpcklqdq  xmm2,xmm3
+  movq        xmm3,[eax]
+  punpcklqdq  xmm3,xmm4
+  movq        xmm4,[edx+eax]
+  mov       edx, [ebp + 14h]
+  punpcklqdq  xmm4,xmm5
+  movd        xmm5,edx
+  mov       edx, [ebp + 18h]
+  pxor        xmm0,xmm0
+  movdqa      xmm6,xmm5
+  punpcklwd   xmm6,xmm5
+  pshufd      xmm5,xmm6,0
+  movd        xmm6,edx
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      xmm7,xmm1
+  punpckhbw   xmm1,xmm0
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+40h],xmm1
+  movdqa      [esp+60h],xmm7
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+10h],xmm7
+  movdqa      xmm7,xmm3
+  punpcklbw   xmm7,xmm0
+  punpckhbw   xmm3,xmm0
+  movdqa      [esp+50h],xmm7
+  movdqa      xmm7,xmm4
+  punpckhbw   xmm4,xmm0
+  punpckhbw   xmm2,xmm0
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+30h],xmm3
+  movdqa      xmm3,[esp+10h]
+  movdqa      xmm1,xmm3
+  psubw       xmm1,[esp+50h]
+  pabsw       xmm1,xmm1
+  movdqa      [esp+20h],xmm4
+  movdqa      xmm0,xmm5
+  pcmpgtw     xmm0,xmm1
+  movdqa      xmm1,[esp+60h]
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  pand        xmm0,xmm4
+  movdqa      xmm1,xmm7
+  psubw       xmm1,[esp+50h]
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  movdqa      xmm1,xmm2
+  psubw       xmm1,[esp+30h]
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm5,xmm1
+  movdqa      xmm1,[esp+40h]
+  pand        xmm0,xmm4
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  movdqa      xmm1,[esp+20h]
+  psubw       xmm1,[esp+30h]
+  pand        xmm5,xmm4
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm6,xmm1
+  pand        xmm5,xmm6
+  mov         edx,2
+  movsx       edx,dx
+  movd        xmm1,edx
+  movdqa      xmm4,xmm1
+  punpcklwd   xmm4,xmm1
+  pshufd      xmm1,xmm4,0
+  movdqa      xmm4,[esp+60h]
+  movdqa      xmm6,xmm4
+  paddw       xmm6,xmm4
+  paddw       xmm6,xmm3
+  paddw       xmm6,xmm7
+  movdqa      [esp+10h],xmm1
+  paddw       xmm6,[esp+10h]
+  psraw       xmm6,2
+  movdqa      xmm4,xmm0
+  pandn       xmm4,xmm3
+  movdqa      xmm3,[esp+40h]
+  movdqa      xmm1,xmm0
+  pand        xmm1,xmm6
+  por         xmm1,xmm4
+  movdqa      xmm6,xmm3
+  paddw       xmm6,xmm3
+  movdqa      xmm3,[esp+10h]
+  paddw       xmm6,xmm2
+  paddw       xmm6,[esp+20h]
+  paddw       xmm6,xmm3
+  psraw       xmm6,2
+  movdqa      xmm4,xmm5
+  pand        xmm4,xmm6
+  movdqa      xmm6,xmm5
+  pandn       xmm6,xmm2
+  por         xmm4,xmm6
+  packuswb    xmm1,xmm4
+  movdqa      xmm4,[esp+50h]
+  movdqa      xmm6,xmm7
+  paddw       xmm6,xmm7
+  paddw       xmm6,xmm4
+  paddw       xmm6,[esp+60h]
+  paddw       xmm6,xmm3
+  psraw       xmm6,2
+  movdqa      xmm2,xmm0
+  pand        xmm2,xmm6
+  pandn       xmm0,xmm4
+  por         xmm2,xmm0
+  movdqa      xmm0,[esp+20h]
+  movdqa      xmm6,xmm0
+  paddw       xmm6,xmm0
+  movdqa      xmm0,[esp+30h]
+  paddw       xmm6,xmm0
+  paddw       xmm6,[esp+40h]
+  movdqa      xmm4,xmm5
+  paddw       xmm6,xmm3
+  movq        [esi],xmm1
+  psraw       xmm6,2
+  pand        xmm4,xmm6
+  pandn       xmm5,xmm0
+  por         xmm4,xmm5
+  packuswb    xmm2,xmm4
+  movq        [eax],xmm2
+  psrldq      xmm1,8
+  movq        [edi],xmm1
+  pop         edi
+  psrldq      xmm2,8
+  movq        [ecx],xmm2
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
 
 ;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
+; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
 ;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
 ;*******************************************************************************
 
@@ -231,203 +231,203 @@
 WELS_EXTERN  DeblockChromaLt4V_sse2
 
 DeblockChromaLt4V_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,0E4h 
-  push        ebx  
-  push        esi  
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,0E4h
+  push        ebx
+  push        esi
   mov         esi, [ebp+1Ch]      ;  pTC
-  movsx       ebx, byte [esi+2] 
-  push        edi  
-  movsx       di,byte [esi+3] 
-  mov         word [esp+0Ch],bx 
-  movsx       bx,byte  [esi+1] 
-  movsx       esi,byte  [esi] 
-  mov         word  [esp+0Eh],si 
-  movzx       esi,di 
-  movd        xmm1,esi 
-  movzx       esi,di 
-  movd        xmm2,esi 
-  mov         si,word  [esp+0Ch] 
-  mov         edx, [ebp + 10h] 
-  mov         eax, [ebp + 08h] 
-  movzx       edi,si 
-  movzx       esi,si 
-  mov         ecx, [ebp + 0Ch] 
-  movd        xmm4,esi 
-  movzx       esi,bx 
-  movd        xmm5,esi 
-  movd        xmm3,edi 
-  movzx       esi,bx 
-  movd        xmm6,esi 
-  mov         si,word [esp+0Eh] 
-  movzx       edi,si 
-  movzx       esi,si 
-  punpcklwd   xmm6,xmm2 
-  pxor        xmm0,xmm0 
-  movdqa      [esp+40h],xmm0 
-  movd        xmm7,edi 
-  movd        xmm0,esi 
-  lea         esi,[edx+edx] 
-  mov         edi,eax 
-  sub         edi,esi 
-  punpcklwd   xmm5,xmm1 
-  movdqa      xmm1,[esp+40h] 
-  punpcklwd   xmm0,xmm4 
-  movq        xmm4,[edx+ecx] 
-  punpcklwd   xmm7,xmm3 
-  movq        xmm3,[eax] 
-  punpcklwd   xmm0,xmm6 
-  movq        xmm6,[edi] 
-  punpcklwd   xmm7,xmm5 
-  punpcklwd   xmm0,xmm7 
-  mov         edi,ecx 
-  sub         edi,esi 
-  movdqa      xmm2,xmm1 
-  psubw       xmm2,xmm0 
-  movdqa      [esp+60h],xmm2 
-  movq        xmm2, [edi] 
-  punpcklqdq  xmm6,xmm2 
-  mov         esi,eax 
-  sub         esi,edx 
-  movq        xmm7,[esi] 
-  mov         edi,ecx 
-  sub         edi,edx 
-  movq        xmm2,[edi] 
-  punpcklqdq  xmm7,xmm2 
-  movq        xmm2,[ecx] 
-  punpcklqdq  xmm3,xmm2 
-  movq        xmm2,[edx+eax] 
-  movsx       edx,word [ebp + 14h] 
-  punpcklqdq  xmm2,xmm4 
-  movdqa      [esp+0E0h],xmm2 
-  movd        xmm2,edx 
-  movsx       edx,word [ebp + 18h] 
-  movdqa      xmm4,xmm2 
-  punpcklwd   xmm4,xmm2 
-  movd        xmm2,edx 
-  movdqa      xmm5,xmm2 
-  punpcklwd   xmm5,xmm2 
-  pshufd      xmm2,xmm5,0 
-  movdqa      [esp+50h],xmm2 
-  movdqa      xmm2,xmm6 
-  punpcklbw   xmm2,xmm1 
-  movdqa      [esp+0D0h],xmm3 
-  pshufd      xmm4,xmm4,0 
-  movdqa      [esp+30h],xmm2 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+80h],xmm6 
-  movdqa      xmm6,[esp+0D0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+70h],xmm6 
-  movdqa      xmm6, [esp+0E0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa     [esp+90h],xmm6 
-  movdqa      xmm5, [esp+0E0h] 
-  movdqa      xmm2,xmm7 
-  punpckhbw   xmm7,xmm1 
-  punpcklbw   xmm5,xmm1 
-  movdqa       [esp+0A0h],xmm7 
-  punpcklbw   xmm3,xmm1 
-  mov         edx,4 
-  punpcklbw   xmm2,xmm1 
-  movsx       edx,dx 
-  movd        xmm6,edx 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      xmm7,[esp+30h] 
-  movdqa      [esp+20h],xmm6 
-  psubw       xmm7,xmm5 
-  movdqa      xmm6,xmm0 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  movdqa      [esp+40h],xmm6 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm2 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm7 
-  paddw       xmm6, [esp+20h] 
-  movdqa      xmm7, [esp+50h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm1,xmm6 
-  movdqa      [esp+10h],xmm0 
-  movdqa      xmm6, [esp+10h] 
-  pminsw      xmm6,xmm1 
-  movdqa      [esp+10h],xmm6 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm6,xmm4 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1, [esp+30h] 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm7,xmm1 
-  movdqa      xmm1,[esp+50h] 
-  pand        xmm6,xmm7 
-  movdqa      xmm7,[esp+50h] 
-  psubw       xmm5,xmm3 
-  pabsw       xmm5,xmm5 
-  pcmpgtw     xmm1,xmm5 
-  movdqa      xmm5,[esp+80h] 
-  psubw       xmm5,[esp+90h] 
-  pand        xmm6,xmm1 
-  pand        xmm6,[esp+40h] 
-  movdqa      xmm1,[esp+10h] 
-  pand        xmm1,xmm6 
-  movdqa      xmm6,[esp+70h] 
-  movdqa      [esp+30h],xmm1 
-  movdqa      xmm1,[esp+0A0h] 
-  psubw       xmm6,xmm1 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm5 
-  paddw       xmm6,[esp+20h] 
-  movdqa      xmm5,[esp+60h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm5,xmm6 
-  pminsw      xmm0,xmm5 
-  movdqa      xmm5,[esp+70h] 
-  movdqa      xmm6,xmm1 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm4,xmm6 
-  movdqa      xmm6,[esp+80h] 
-  psubw       xmm6,xmm1 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+90h] 
-  pand        xmm4,xmm7 
-  movdqa      xmm7,[esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  pand        xmm4,xmm7 
-  pand        xmm4,[esp+40h] 
-  pand        xmm0,xmm4 
-  movdqa      xmm4,[esp+30h] 
-  paddw       xmm2,xmm4 
-  paddw       xmm1,xmm0 
-  packuswb    xmm2,xmm1 
-  movq        [esi],xmm2 
-  psubw       xmm3,xmm4 
-  psubw       xmm5,xmm0 
-  packuswb    xmm3,xmm5 
-  movq        [eax],xmm3 
-  psrldq      xmm2,8 
-  movq        [edi],xmm2 
-  pop         edi  
-  pop         esi  
-  psrldq      xmm3,8 
-  movq        [ecx],xmm3 
-  pop         ebx  
-  mov         esp,ebp 
-  pop         ebp  
-  ret    
-  
+  movsx       ebx, byte [esi+2]
+  push        edi
+  movsx       di,byte [esi+3]
+  mov         word [esp+0Ch],bx
+  movsx       bx,byte  [esi+1]
+  movsx       esi,byte  [esi]
+  mov         word  [esp+0Eh],si
+  movzx       esi,di
+  movd        xmm1,esi
+  movzx       esi,di
+  movd        xmm2,esi
+  mov         si,word  [esp+0Ch]
+  mov         edx, [ebp + 10h]
+  mov         eax, [ebp + 08h]
+  movzx       edi,si
+  movzx       esi,si
+  mov         ecx, [ebp + 0Ch]
+  movd        xmm4,esi
+  movzx       esi,bx
+  movd        xmm5,esi
+  movd        xmm3,edi
+  movzx       esi,bx
+  movd        xmm6,esi
+  mov         si,word [esp+0Eh]
+  movzx       edi,si
+  movzx       esi,si
+  punpcklwd   xmm6,xmm2
+  pxor        xmm0,xmm0
+  movdqa      [esp+40h],xmm0
+  movd        xmm7,edi
+  movd        xmm0,esi
+  lea         esi,[edx+edx]
+  mov         edi,eax
+  sub         edi,esi
+  punpcklwd   xmm5,xmm1
+  movdqa      xmm1,[esp+40h]
+  punpcklwd   xmm0,xmm4
+  movq        xmm4,[edx+ecx]
+  punpcklwd   xmm7,xmm3
+  movq        xmm3,[eax]
+  punpcklwd   xmm0,xmm6
+  movq        xmm6,[edi]
+  punpcklwd   xmm7,xmm5
+  punpcklwd   xmm0,xmm7
+  mov         edi,ecx
+  sub         edi,esi
+  movdqa      xmm2,xmm1
+  psubw       xmm2,xmm0
+  movdqa      [esp+60h],xmm2
+  movq        xmm2, [edi]
+  punpcklqdq  xmm6,xmm2
+  mov         esi,eax
+  sub         esi,edx
+  movq        xmm7,[esi]
+  mov         edi,ecx
+  sub         edi,edx
+  movq        xmm2,[edi]
+  punpcklqdq  xmm7,xmm2
+  movq        xmm2,[ecx]
+  punpcklqdq  xmm3,xmm2
+  movq        xmm2,[edx+eax]
+  movsx       edx,word [ebp + 14h]
+  punpcklqdq  xmm2,xmm4
+  movdqa      [esp+0E0h],xmm2
+  movd        xmm2,edx
+  movsx       edx,word [ebp + 18h]
+  movdqa      xmm4,xmm2
+  punpcklwd   xmm4,xmm2
+  movd        xmm2,edx
+  movdqa      xmm5,xmm2
+  punpcklwd   xmm5,xmm2
+  pshufd      xmm2,xmm5,0
+  movdqa      [esp+50h],xmm2
+  movdqa      xmm2,xmm6
+  punpcklbw   xmm2,xmm1
+  movdqa      [esp+0D0h],xmm3
+  pshufd      xmm4,xmm4,0
+  movdqa      [esp+30h],xmm2
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+80h],xmm6
+  movdqa      xmm6,[esp+0D0h]
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+70h],xmm6
+  movdqa      xmm6, [esp+0E0h]
+  punpckhbw   xmm6,xmm1
+  movdqa     [esp+90h],xmm6
+  movdqa      xmm5, [esp+0E0h]
+  movdqa      xmm2,xmm7
+  punpckhbw   xmm7,xmm1
+  punpcklbw   xmm5,xmm1
+  movdqa       [esp+0A0h],xmm7
+  punpcklbw   xmm3,xmm1
+  mov         edx,4
+  punpcklbw   xmm2,xmm1
+  movsx       edx,dx
+  movd        xmm6,edx
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      xmm7,[esp+30h]
+  movdqa      [esp+20h],xmm6
+  psubw       xmm7,xmm5
+  movdqa      xmm6,xmm0
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1,[esp+60h]
+  movdqa      [esp+40h],xmm6
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm2
+  psllw       xmm6,2
+  paddw       xmm6,xmm7
+  paddw       xmm6, [esp+20h]
+  movdqa      xmm7, [esp+50h]
+  psraw       xmm6,3
+  pmaxsw      xmm1,xmm6
+  movdqa      [esp+10h],xmm0
+  movdqa      xmm6, [esp+10h]
+  pminsw      xmm6,xmm1
+  movdqa      [esp+10h],xmm6
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  movdqa      xmm6,xmm4
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1, [esp+30h]
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm7,xmm1
+  movdqa      xmm1,[esp+50h]
+  pand        xmm6,xmm7
+  movdqa      xmm7,[esp+50h]
+  psubw       xmm5,xmm3
+  pabsw       xmm5,xmm5
+  pcmpgtw     xmm1,xmm5
+  movdqa      xmm5,[esp+80h]
+  psubw       xmm5,[esp+90h]
+  pand        xmm6,xmm1
+  pand        xmm6,[esp+40h]
+  movdqa      xmm1,[esp+10h]
+  pand        xmm1,xmm6
+  movdqa      xmm6,[esp+70h]
+  movdqa      [esp+30h],xmm1
+  movdqa      xmm1,[esp+0A0h]
+  psubw       xmm6,xmm1
+  psllw       xmm6,2
+  paddw       xmm6,xmm5
+  paddw       xmm6,[esp+20h]
+  movdqa      xmm5,[esp+60h]
+  psraw       xmm6,3
+  pmaxsw      xmm5,xmm6
+  pminsw      xmm0,xmm5
+  movdqa      xmm5,[esp+70h]
+  movdqa      xmm6,xmm1
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm4,xmm6
+  movdqa      xmm6,[esp+80h]
+  psubw       xmm6,xmm1
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+90h]
+  pand        xmm4,xmm7
+  movdqa      xmm7,[esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  pand        xmm4,xmm7
+  pand        xmm4,[esp+40h]
+  pand        xmm0,xmm4
+  movdqa      xmm4,[esp+30h]
+  paddw       xmm2,xmm4
+  paddw       xmm1,xmm0
+  packuswb    xmm2,xmm1
+  movq        [esi],xmm2
+  psubw       xmm3,xmm4
+  psubw       xmm5,xmm0
+  packuswb    xmm3,xmm5
+  movq        [eax],xmm3
+  psrldq      xmm2,8
+  movq        [edi],xmm2
+  pop         edi
+  pop         esi
+  psrldq      xmm3,8
+  movq        [ecx],xmm3
+  pop         ebx
+  mov         esp,ebp
+  pop         ebp
+  ret
+
 ;***************************************************************************
-;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
+;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
 ;          int32_t iAlpha, int32_t iBeta)
 ;***************************************************************************
 
@@ -434,606 +434,606 @@
 WELS_EXTERN     DeblockChromaEq4H_sse2
 
 ALIGN  16
-  
+
 DeblockChromaEq4H_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,0C8h  
-  mov         ecx,dword [ebp+8] 
-  mov         edx,dword [ebp+0Ch] 
-  mov         eax,dword [ebp+10h] 
-  sub         ecx,2 
-  sub         edx,2 
-  push        esi  
-  lea         esi,[eax+eax*2] 
-  mov         dword [esp+18h],ecx 
-  mov         dword [esp+4],edx 
-  lea         ecx,[ecx+eax*4] 
-  lea         edx,[edx+eax*4] 
-  lea         eax,[esp+7Ch] 
-  push        edi  
-  mov         dword [esp+14h],esi 
-  mov         dword [esp+18h],ecx 
-  mov         dword [esp+0Ch],edx 
-  mov         dword [esp+10h],eax 
-  mov         esi,dword [esp+1Ch] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+14h] 
-  movd        xmm0,dword [esi] 
-  movd        xmm1,dword [esi+ecx] 
-  movd        xmm2,dword [esi+ecx*2] 
-  movd        xmm3,dword [esi+edx] 
-  mov         esi,dword  [esp+8] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [esi+ecx] 
-  movd        xmm6,dword [esi+ecx*2] 
-  movd        xmm7,dword [esi+edx] 
-  punpckldq   xmm0,xmm4 
-  punpckldq   xmm1,xmm5 
-  punpckldq   xmm2,xmm6 
-  punpckldq   xmm3,xmm7 
-  mov         esi,dword [esp+18h] 
-  mov         edi,dword [esp+0Ch] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [edi] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm0,xmm4 
-  movd        xmm4,dword [esi+ecx] 
-  movd        xmm5,dword [edi+ecx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm1,xmm4 
-  movd        xmm4,dword [esi+ecx*2] 
-  movd        xmm5,dword [edi+ecx*2] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm2,xmm4 
-  movd        xmm4,dword [esi+edx] 
-  movd        xmm5,dword [edi+edx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm3,xmm4 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         edi,dword [esp+10h] 
-  movdqa      [edi],xmm0 
-  movdqa      [edi+10h],xmm5 
-  movdqa      [edi+20h],xmm1 
-  movdqa      [edi+30h],xmm6 
-  movsx       ecx,word [ebp+14h] 
-  movsx       edx,word [ebp+18h] 
-  movdqa      xmm6,[esp+80h] 
-  movdqa      xmm4,[esp+90h] 
-  movdqa      xmm5,[esp+0A0h] 
-  movdqa      xmm7,[esp+0B0h] 
-  pxor        xmm0,xmm0 
-  movd        xmm1,ecx 
-  movdqa      xmm2,xmm1 
-  punpcklwd   xmm2,xmm1 
-  pshufd      xmm1,xmm2,0 
-  movd        xmm2,edx 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm2,xmm3,0 
-  movdqa      xmm3,xmm6 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+60h],xmm6 
-  movdqa      xmm6,[esp+90h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+30h],xmm6 
-  movdqa      xmm6,[esp+0A0h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+40h],xmm6 
-  movdqa      xmm6,[esp+0B0h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+70h],xmm6 
-  punpcklbw   xmm7,xmm0 
-  punpcklbw   xmm4,xmm0 
-  punpcklbw   xmm5,xmm0 
-  punpcklbw   xmm3,xmm0 
-  movdqa      [esp+50h],xmm7 
-  movdqa      xmm6,xmm4 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  movdqa      xmm0,xmm1 
-  pcmpgtw     xmm0,xmm6 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm4 
-  pabsw       xmm6,xmm6 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pand        xmm0,xmm7 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+30h] 
-  psubw       xmm6,[esp+40h] 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm1,xmm6 
-  movdqa      xmm6,[esp+60h] 
-  psubw       xmm6,[esp+30h] 
-  pabsw       xmm6,xmm6 
-  pand        xmm0,xmm7 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+70h] 
-  psubw       xmm6,[esp+40h] 
-  pabsw       xmm6,xmm6 
-  pand        xmm1,xmm7 
-  pcmpgtw     xmm2,xmm6 
-  pand        xmm1,xmm2 
-  mov         eax,2 
-  movsx       ecx,ax 
-  movd        xmm2,ecx 
-  movdqa      xmm6,xmm2 
-  punpcklwd   xmm6,xmm2 
-  pshufd      xmm2,xmm6,0 
-  movdqa      [esp+20h],xmm2 
-  movdqa      xmm2,xmm3 
-  paddw       xmm2,xmm3 
-  paddw       xmm2,xmm4 
-  paddw       xmm2,[esp+50h] 
-  paddw       xmm2,[esp+20h] 
-  psraw       xmm2,2 
-  movdqa      xmm6,xmm0 
-  pand        xmm6,xmm2 
-  movdqa      xmm2,xmm0 
-  pandn       xmm2,xmm4 
-  por         xmm6,xmm2 
-  movdqa      xmm2,[esp+60h] 
-  movdqa      xmm7,xmm2 
-  paddw       xmm7,xmm2 
-  paddw       xmm7,[esp+30h] 
-  paddw       xmm7,[esp+70h] 
-  paddw       xmm7,[esp+20h] 
-  movdqa      xmm4,xmm1 
-  movdqa      xmm2,xmm1 
-  pandn       xmm2,[esp+30h] 
-  psraw       xmm7,2 
-  pand        xmm4,xmm7 
-  por         xmm4,xmm2 
-  movdqa      xmm2,[esp+50h] 
-  packuswb    xmm6,xmm4 
-  movdqa      [esp+90h],xmm6 
-  movdqa      xmm6,xmm2 
-  paddw       xmm6,xmm2 
-  movdqa      xmm2,[esp+20h] 
-  paddw       xmm6,xmm5 
-  paddw       xmm6,xmm3 
-  movdqa      xmm4,xmm0 
-  pandn       xmm0,xmm5 
-  paddw       xmm6,xmm2 
-  psraw       xmm6,2 
-  pand        xmm4,xmm6 
-  por         xmm4,xmm0 
-  movdqa      xmm0,[esp+70h] 
-  movdqa      xmm5,xmm0 
-  paddw       xmm5,xmm0 
-  movdqa      xmm0,[esp+40h] 
-  paddw       xmm5,xmm0 
-  paddw       xmm5,[esp+60h] 
-  movdqa      xmm3,xmm1 
-  paddw       xmm5,xmm2 
-  psraw       xmm5,2 
-  pand        xmm3,xmm5 
-  pandn       xmm1,xmm0 
-  por         xmm3,xmm1 
-  packuswb    xmm4,xmm3 
-  movdqa      [esp+0A0h],xmm4 
-  mov         esi,dword [esp+10h] 
-  movdqa      xmm0,[esi] 
-  movdqa      xmm1,[esi+10h] 
-  movdqa      xmm2,[esi+20h] 
-  movdqa      xmm3,[esi+30h] 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         esi,dword [esp+1Ch] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+14h] 
-  mov         edi,dword [esp+8] 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         esi,dword [esp+18h] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         edi,dword [esp+0Ch] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  pop         edi  
-  pop         esi  
-  mov         esp,ebp 
-  pop         ebp  
-  ret              
-  
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,0C8h
+  mov         ecx,dword [ebp+8]
+  mov         edx,dword [ebp+0Ch]
+  mov         eax,dword [ebp+10h]
+  sub         ecx,2
+  sub         edx,2
+  push        esi
+  lea         esi,[eax+eax*2]
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+4],edx
+  lea         ecx,[ecx+eax*4]
+  lea         edx,[edx+eax*4]
+  lea         eax,[esp+7Ch]
+  push        edi
+  mov         dword [esp+14h],esi
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+0Ch],edx
+  mov         dword [esp+10h],eax
+  mov         esi,dword [esp+1Ch]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+14h]
+  movd        xmm0,dword [esi]
+  movd        xmm1,dword [esi+ecx]
+  movd        xmm2,dword [esi+ecx*2]
+  movd        xmm3,dword [esi+edx]
+  mov         esi,dword  [esp+8]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [esi+ecx]
+  movd        xmm6,dword [esi+ecx*2]
+  movd        xmm7,dword [esi+edx]
+  punpckldq   xmm0,xmm4
+  punpckldq   xmm1,xmm5
+  punpckldq   xmm2,xmm6
+  punpckldq   xmm3,xmm7
+  mov         esi,dword [esp+18h]
+  mov         edi,dword [esp+0Ch]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [edi]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm0,xmm4
+  movd        xmm4,dword [esi+ecx]
+  movd        xmm5,dword [edi+ecx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm1,xmm4
+  movd        xmm4,dword [esi+ecx*2]
+  movd        xmm5,dword [edi+ecx*2]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm2,xmm4
+  movd        xmm4,dword [esi+edx]
+  movd        xmm5,dword [edi+edx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm3,xmm4
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         edi,dword [esp+10h]
+  movdqa      [edi],xmm0
+  movdqa      [edi+10h],xmm5
+  movdqa      [edi+20h],xmm1
+  movdqa      [edi+30h],xmm6
+  movsx       ecx,word [ebp+14h]
+  movsx       edx,word [ebp+18h]
+  movdqa      xmm6,[esp+80h]
+  movdqa      xmm4,[esp+90h]
+  movdqa      xmm5,[esp+0A0h]
+  movdqa      xmm7,[esp+0B0h]
+  pxor        xmm0,xmm0
+  movd        xmm1,ecx
+  movdqa      xmm2,xmm1
+  punpcklwd   xmm2,xmm1
+  pshufd      xmm1,xmm2,0
+  movd        xmm2,edx
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm2,xmm3,0
+  movdqa      xmm3,xmm6
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+60h],xmm6
+  movdqa      xmm6,[esp+90h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+30h],xmm6
+  movdqa      xmm6,[esp+0A0h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+40h],xmm6
+  movdqa      xmm6,[esp+0B0h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+70h],xmm6
+  punpcklbw   xmm7,xmm0
+  punpcklbw   xmm4,xmm0
+  punpcklbw   xmm5,xmm0
+  punpcklbw   xmm3,xmm0
+  movdqa      [esp+50h],xmm7
+  movdqa      xmm6,xmm4
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  movdqa      xmm0,xmm1
+  pcmpgtw     xmm0,xmm6
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm4
+  pabsw       xmm6,xmm6
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pand        xmm0,xmm7
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+30h]
+  psubw       xmm6,[esp+40h]
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm1,xmm6
+  movdqa      xmm6,[esp+60h]
+  psubw       xmm6,[esp+30h]
+  pabsw       xmm6,xmm6
+  pand        xmm0,xmm7
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+70h]
+  psubw       xmm6,[esp+40h]
+  pabsw       xmm6,xmm6
+  pand        xmm1,xmm7
+  pcmpgtw     xmm2,xmm6
+  pand        xmm1,xmm2
+  mov         eax,2
+  movsx       ecx,ax
+  movd        xmm2,ecx
+  movdqa      xmm6,xmm2
+  punpcklwd   xmm6,xmm2
+  pshufd      xmm2,xmm6,0
+  movdqa      [esp+20h],xmm2
+  movdqa      xmm2,xmm3
+  paddw       xmm2,xmm3
+  paddw       xmm2,xmm4
+  paddw       xmm2,[esp+50h]
+  paddw       xmm2,[esp+20h]
+  psraw       xmm2,2
+  movdqa      xmm6,xmm0
+  pand        xmm6,xmm2
+  movdqa      xmm2,xmm0
+  pandn       xmm2,xmm4
+  por         xmm6,xmm2
+  movdqa      xmm2,[esp+60h]
+  movdqa      xmm7,xmm2
+  paddw       xmm7,xmm2
+  paddw       xmm7,[esp+30h]
+  paddw       xmm7,[esp+70h]
+  paddw       xmm7,[esp+20h]
+  movdqa      xmm4,xmm1
+  movdqa      xmm2,xmm1
+  pandn       xmm2,[esp+30h]
+  psraw       xmm7,2
+  pand        xmm4,xmm7
+  por         xmm4,xmm2
+  movdqa      xmm2,[esp+50h]
+  packuswb    xmm6,xmm4
+  movdqa      [esp+90h],xmm6
+  movdqa      xmm6,xmm2
+  paddw       xmm6,xmm2
+  movdqa      xmm2,[esp+20h]
+  paddw       xmm6,xmm5
+  paddw       xmm6,xmm3
+  movdqa      xmm4,xmm0
+  pandn       xmm0,xmm5
+  paddw       xmm6,xmm2
+  psraw       xmm6,2
+  pand        xmm4,xmm6
+  por         xmm4,xmm0
+  movdqa      xmm0,[esp+70h]
+  movdqa      xmm5,xmm0
+  paddw       xmm5,xmm0
+  movdqa      xmm0,[esp+40h]
+  paddw       xmm5,xmm0
+  paddw       xmm5,[esp+60h]
+  movdqa      xmm3,xmm1
+  paddw       xmm5,xmm2
+  psraw       xmm5,2
+  pand        xmm3,xmm5
+  pandn       xmm1,xmm0
+  por         xmm3,xmm1
+  packuswb    xmm4,xmm3
+  movdqa      [esp+0A0h],xmm4
+  mov         esi,dword [esp+10h]
+  movdqa      xmm0,[esi]
+  movdqa      xmm1,[esi+10h]
+  movdqa      xmm2,[esi+20h]
+  movdqa      xmm3,[esi+30h]
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         esi,dword [esp+1Ch]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+14h]
+  mov         edi,dword [esp+8]
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         esi,dword [esp+18h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         edi,dword [esp+0Ch]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  pop         edi
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
 ;*******************************************************************************
-;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
+;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
 ;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
 ;*******************************************************************************
-  
+
 WELS_EXTERN  DeblockChromaLt4H_sse2
-  
+
 ALIGN  16
 
 DeblockChromaLt4H_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,108h   
-  mov         ecx,dword [ebp+8] 
-  mov         edx,dword [ebp+0Ch] 
-  mov         eax,dword [ebp+10h] 
-  sub         ecx,2 
-  sub         edx,2 
-  push        esi  
-  lea         esi,[eax+eax*2] 
-  mov         dword [esp+10h],ecx 
-  mov         dword [esp+4],edx 
-  lea         ecx,[ecx+eax*4] 
-  lea         edx,[edx+eax*4] 
-  lea         eax,[esp+6Ch] 
-  push        edi  
-  mov         dword [esp+0Ch],esi 
-  mov         dword [esp+18h],ecx 
-  mov         dword [esp+10h],edx 
-  mov         dword [esp+1Ch],eax 
-  mov         esi,dword [esp+14h] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+0Ch] 
-  movd        xmm0,dword [esi] 
-  movd        xmm1,dword [esi+ecx] 
-  movd        xmm2,dword [esi+ecx*2] 
-  movd        xmm3,dword [esi+edx] 
-  mov         esi,dword [esp+8] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [esi+ecx] 
-  movd        xmm6,dword [esi+ecx*2] 
-  movd        xmm7,dword [esi+edx] 
-  punpckldq   xmm0,xmm4 
-  punpckldq   xmm1,xmm5 
-  punpckldq   xmm2,xmm6 
-  punpckldq   xmm3,xmm7 
-  mov         esi,dword [esp+18h] 
-  mov         edi,dword [esp+10h] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [edi] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm0,xmm4 
-  movd        xmm4,dword [esi+ecx] 
-  movd        xmm5,dword [edi+ecx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm1,xmm4 
-  movd        xmm4,dword [esi+ecx*2] 
-  movd        xmm5,dword [edi+ecx*2] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm2,xmm4 
-  movd        xmm4,dword [esi+edx] 
-  movd        xmm5,dword [edi+edx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm3,xmm4 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         edi,dword [esp+1Ch] 
-  movdqa      [edi],xmm0 
-  movdqa      [edi+10h],xmm5 
-  movdqa      [edi+20h],xmm1 
-  movdqa      [edi+30h],xmm6 
-  mov         eax,dword [ebp+1Ch] 
-  movsx       cx,byte [eax+3] 
-  movsx       dx,byte [eax+2] 
-  movsx       si,byte [eax+1] 
-  movsx       ax,byte [eax] 
-  movzx       edi,cx 
-  movzx       ecx,cx 
-  movd        xmm2,ecx 
-  movzx       ecx,dx 
-  movzx       edx,dx 
-  movd        xmm3,ecx 
-  movd        xmm4,edx 
-  movzx       ecx,si 
-  movzx       edx,si 
-  movd        xmm5,ecx 
-  pxor        xmm0,xmm0 
-  movd        xmm6,edx 
-  movzx       ecx,ax 
-  movdqa      [esp+60h],xmm0 
-  movzx       edx,ax 
-  movsx       eax,word [ebp+14h] 
-  punpcklwd   xmm6,xmm2 
-  movd        xmm1,edi 
-  movd        xmm7,ecx 
-  movsx       ecx,word [ebp+18h] 
-  movd        xmm0,edx 
-  punpcklwd   xmm7,xmm3 
-  punpcklwd   xmm5,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  punpcklwd   xmm7,xmm5 
-  movdqa      xmm5,[esp+0A0h] 
-  punpcklwd   xmm0,xmm4 
-  punpcklwd   xmm0,xmm6 
-  movdqa      xmm6, [esp+70h] 
-  punpcklwd   xmm0,xmm7 
-  movdqa      xmm7,[esp+80h] 
-  movdqa      xmm2,xmm1 
-  psubw       xmm2,xmm0 
-  movdqa      [esp+0D0h],xmm2 
-  movd        xmm2,eax 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm4,xmm3,0 
-  movd        xmm2,ecx 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm2,xmm3,0 
-  movdqa      xmm3, [esp+90h] 
-  movdqa      [esp+50h],xmm2 
-  movdqa      xmm2,xmm6 
-  punpcklbw   xmm2,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+40h],xmm2 
-  movdqa      [esp+0B0h],xmm6 
-  movdqa      xmm6,[esp+90h] 
-  movdqa      xmm2,xmm7 
-  punpckhbw   xmm7,xmm1 
-  punpckhbw   xmm6,xmm1 
-  punpcklbw   xmm2,xmm1 
-  punpcklbw   xmm3,xmm1 
-  punpcklbw   xmm5,xmm1 
-  movdqa      [esp+0F0h],xmm7 
-  movdqa      [esp+0C0h],xmm6 
-  movdqa      xmm6, [esp+0A0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+0E0h],xmm6 
-  mov         edx,4 
-  movsx       eax,dx 
-  movd        xmm6,eax 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      [esp+30h],xmm6 
-  movdqa      xmm7, [esp+40h] 
-  psubw       xmm7,xmm5 
-  movdqa      xmm6,xmm0 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      [esp+60h],xmm6 
-  movdqa      xmm1, [esp+0D0h] 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm2 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm7 
-  paddw       xmm6,[esp+30h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm1,xmm6 
-  movdqa      xmm7,[esp+50h] 
-  movdqa      [esp+20h],xmm0 
-  movdqa      xmm6, [esp+20h] 
-  pminsw      xmm6,xmm1 
-  movdqa      [esp+20h],xmm6 
-  movdqa      xmm6,xmm4 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1, [esp+40h] 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm7,xmm1 
-  movdqa      xmm1, [esp+50h] 
-  pand        xmm6,xmm7 
-  movdqa      xmm7, [esp+50h] 
-  psubw       xmm5,xmm3 
-  pabsw       xmm5,xmm5 
-  pcmpgtw     xmm1,xmm5 
-  movdqa      xmm5, [esp+0B0h] 
-  psubw       xmm5,[esp+0E0h] 
-  pand        xmm6,xmm1 
-  pand        xmm6, [esp+60h] 
-  movdqa      xmm1, [esp+20h] 
-  pand        xmm1,xmm6 
-  movdqa      xmm6, [esp+0C0h] 
-  movdqa      [esp+40h],xmm1 
-  movdqa      xmm1, [esp+0F0h] 
-  psubw       xmm6,xmm1 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm5 
-  paddw       xmm6, [esp+30h] 
-  movdqa      xmm5, [esp+0D0h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm5,xmm6 
-  pminsw      xmm0,xmm5 
-  movdqa      xmm5,[esp+0C0h] 
-  movdqa      xmm6,xmm1 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm4,xmm6 
-  movdqa      xmm6,[esp+0B0h] 
-  psubw       xmm6,xmm1 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6, [esp+0E0h] 
-  pand        xmm4,xmm7 
-  movdqa      xmm7, [esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  pand        xmm4,xmm7 
-  pand        xmm4,[esp+60h] 
-  pand        xmm0,xmm4 
-  movdqa      xmm4, [esp+40h] 
-  paddw       xmm2,xmm4 
-  paddw       xmm1,xmm0 
-  psubw       xmm3,xmm4 
-  psubw       xmm5,xmm0 
-  packuswb    xmm2,xmm1 
-  packuswb    xmm3,xmm5 
-  movdqa      [esp+80h],xmm2 
-  movdqa      [esp+90h],xmm3 
-  mov         esi,dword [esp+1Ch] 
-  movdqa      xmm0, [esi] 
-  movdqa      xmm1, [esi+10h] 
-  movdqa      xmm2, [esi+20h] 
-  movdqa      xmm3, [esi+30h] 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         esi,dword [esp+14h] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+0Ch] 
-  mov         edi,dword [esp+8] 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         esi,dword [esp+18h] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         edi,dword [esp+10h] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6  
-  pop         edi  
-  pop         esi   
-  mov         esp,ebp 
-  pop         ebp  
-  ret     
-  
-  
-  
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,108h
+  mov         ecx,dword [ebp+8]
+  mov         edx,dword [ebp+0Ch]
+  mov         eax,dword [ebp+10h]
+  sub         ecx,2
+  sub         edx,2
+  push        esi
+  lea         esi,[eax+eax*2]
+  mov         dword [esp+10h],ecx
+  mov         dword [esp+4],edx
+  lea         ecx,[ecx+eax*4]
+  lea         edx,[edx+eax*4]
+  lea         eax,[esp+6Ch]
+  push        edi
+  mov         dword [esp+0Ch],esi
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+10h],edx
+  mov         dword [esp+1Ch],eax
+  mov         esi,dword [esp+14h]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+0Ch]
+  movd        xmm0,dword [esi]
+  movd        xmm1,dword [esi+ecx]
+  movd        xmm2,dword [esi+ecx*2]
+  movd        xmm3,dword [esi+edx]
+  mov         esi,dword [esp+8]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [esi+ecx]
+  movd        xmm6,dword [esi+ecx*2]
+  movd        xmm7,dword [esi+edx]
+  punpckldq   xmm0,xmm4
+  punpckldq   xmm1,xmm5
+  punpckldq   xmm2,xmm6
+  punpckldq   xmm3,xmm7
+  mov         esi,dword [esp+18h]
+  mov         edi,dword [esp+10h]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [edi]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm0,xmm4
+  movd        xmm4,dword [esi+ecx]
+  movd        xmm5,dword [edi+ecx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm1,xmm4
+  movd        xmm4,dword [esi+ecx*2]
+  movd        xmm5,dword [edi+ecx*2]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm2,xmm4
+  movd        xmm4,dword [esi+edx]
+  movd        xmm5,dword [edi+edx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm3,xmm4
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         edi,dword [esp+1Ch]
+  movdqa      [edi],xmm0
+  movdqa      [edi+10h],xmm5
+  movdqa      [edi+20h],xmm1
+  movdqa      [edi+30h],xmm6
+  mov         eax,dword [ebp+1Ch]
+  movsx       cx,byte [eax+3]
+  movsx       dx,byte [eax+2]
+  movsx       si,byte [eax+1]
+  movsx       ax,byte [eax]
+  movzx       edi,cx
+  movzx       ecx,cx
+  movd        xmm2,ecx
+  movzx       ecx,dx
+  movzx       edx,dx
+  movd        xmm3,ecx
+  movd        xmm4,edx
+  movzx       ecx,si
+  movzx       edx,si
+  movd        xmm5,ecx
+  pxor        xmm0,xmm0
+  movd        xmm6,edx
+  movzx       ecx,ax
+  movdqa      [esp+60h],xmm0
+  movzx       edx,ax
+  movsx       eax,word [ebp+14h]
+  punpcklwd   xmm6,xmm2
+  movd        xmm1,edi
+  movd        xmm7,ecx
+  movsx       ecx,word [ebp+18h]
+  movd        xmm0,edx
+  punpcklwd   xmm7,xmm3
+  punpcklwd   xmm5,xmm1
+  movdqa      xmm1,[esp+60h]
+  punpcklwd   xmm7,xmm5
+  movdqa      xmm5,[esp+0A0h]
+  punpcklwd   xmm0,xmm4
+  punpcklwd   xmm0,xmm6
+  movdqa      xmm6, [esp+70h]
+  punpcklwd   xmm0,xmm7
+  movdqa      xmm7,[esp+80h]
+  movdqa      xmm2,xmm1
+  psubw       xmm2,xmm0
+  movdqa      [esp+0D0h],xmm2
+  movd        xmm2,eax
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm4,xmm3,0
+  movd        xmm2,ecx
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm2,xmm3,0
+  movdqa      xmm3, [esp+90h]
+  movdqa      [esp+50h],xmm2
+  movdqa      xmm2,xmm6
+  punpcklbw   xmm2,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+40h],xmm2
+  movdqa      [esp+0B0h],xmm6
+  movdqa      xmm6,[esp+90h]
+  movdqa      xmm2,xmm7
+  punpckhbw   xmm7,xmm1
+  punpckhbw   xmm6,xmm1
+  punpcklbw   xmm2,xmm1
+  punpcklbw   xmm3,xmm1
+  punpcklbw   xmm5,xmm1
+  movdqa      [esp+0F0h],xmm7
+  movdqa      [esp+0C0h],xmm6
+  movdqa      xmm6, [esp+0A0h]
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+0E0h],xmm6
+  mov         edx,4
+  movsx       eax,dx
+  movd        xmm6,eax
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      [esp+30h],xmm6
+  movdqa      xmm7, [esp+40h]
+  psubw       xmm7,xmm5
+  movdqa      xmm6,xmm0
+  pcmpgtw     xmm6,xmm1
+  movdqa      [esp+60h],xmm6
+  movdqa      xmm1, [esp+0D0h]
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm2
+  psllw       xmm6,2
+  paddw       xmm6,xmm7
+  paddw       xmm6,[esp+30h]
+  psraw       xmm6,3
+  pmaxsw      xmm1,xmm6
+  movdqa      xmm7,[esp+50h]
+  movdqa      [esp+20h],xmm0
+  movdqa      xmm6, [esp+20h]
+  pminsw      xmm6,xmm1
+  movdqa      [esp+20h],xmm6
+  movdqa      xmm6,xmm4
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1, [esp+40h]
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm7,xmm1
+  movdqa      xmm1, [esp+50h]
+  pand        xmm6,xmm7
+  movdqa      xmm7, [esp+50h]
+  psubw       xmm5,xmm3
+  pabsw       xmm5,xmm5
+  pcmpgtw     xmm1,xmm5
+  movdqa      xmm5, [esp+0B0h]
+  psubw       xmm5,[esp+0E0h]
+  pand        xmm6,xmm1
+  pand        xmm6, [esp+60h]
+  movdqa      xmm1, [esp+20h]
+  pand        xmm1,xmm6
+  movdqa      xmm6, [esp+0C0h]
+  movdqa      [esp+40h],xmm1
+  movdqa      xmm1, [esp+0F0h]
+  psubw       xmm6,xmm1
+  psllw       xmm6,2
+  paddw       xmm6,xmm5
+  paddw       xmm6, [esp+30h]
+  movdqa      xmm5, [esp+0D0h]
+  psraw       xmm6,3
+  pmaxsw      xmm5,xmm6
+  pminsw      xmm0,xmm5
+  movdqa      xmm5,[esp+0C0h]
+  movdqa      xmm6,xmm1
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm4,xmm6
+  movdqa      xmm6,[esp+0B0h]
+  psubw       xmm6,xmm1
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6, [esp+0E0h]
+  pand        xmm4,xmm7
+  movdqa      xmm7, [esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  pand        xmm4,xmm7
+  pand        xmm4,[esp+60h]
+  pand        xmm0,xmm4
+  movdqa      xmm4, [esp+40h]
+  paddw       xmm2,xmm4
+  paddw       xmm1,xmm0
+  psubw       xmm3,xmm4
+  psubw       xmm5,xmm0
+  packuswb    xmm2,xmm1
+  packuswb    xmm3,xmm5
+  movdqa      [esp+80h],xmm2
+  movdqa      [esp+90h],xmm3
+  mov         esi,dword [esp+1Ch]
+  movdqa      xmm0, [esi]
+  movdqa      xmm1, [esi+10h]
+  movdqa      xmm2, [esi+20h]
+  movdqa      xmm3, [esi+30h]
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         esi,dword [esp+14h]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+0Ch]
+  mov         edi,dword [esp+8]
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         esi,dword [esp+18h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         edi,dword [esp+10h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  pop         edi
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+
+
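The arithmetic above (psubw, psllw 2, paddw with the +4 rounding constant, psraw 3, clamped by pmaxsw/pminsw against -tc/tc) is the standard H.264 chroma bS<4 filter, gated by the alpha/beta pcmpgtw masks. A scalar sketch of the per-pixel math, assuming the usual Clip3 semantics; this is not the project's actual C fallback:

    #include <stdint.h>
    #include <stdlib.h>

    static int clip3(int lo, int hi, int v) {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    /* Filter one p1|p0|q0|q1 line across the edge when tc > 0 and
       the alpha/beta activity checks pass. */
    static void chroma_lt4_px(uint8_t *p1, uint8_t *p0, uint8_t *q0,
                              uint8_t *q1, int alpha, int beta, int tc) {
        if (tc <= 0) return;                          /* pcmpgtw tc, 0 */
        if (abs(*p0 - *q0) >= alpha) return;
        if (abs(*p1 - *p0) >= beta || abs(*q1 - *q0) >= beta) return;
        int d = clip3(-tc, tc, ((*q0 - *p0) * 4 + (*p1 - *q1) + 4) >> 3);
        *p0 = (uint8_t)clip3(0, 255, *p0 + d);        /* packuswb saturates */
        *q0 = (uint8_t)clip3(0, 255, *q0 - d);
    }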
 ;*******************************************************************************
-;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
+;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
 ;                                 int32_t iBeta, int8_t * pTC)
 ;*******************************************************************************
-  
 
+
 WELS_EXTERN  DeblockLumaLt4V_sse2
-  
+
 ALIGN  16
 
 DeblockLumaLt4V_sse2:
@@ -1419,12 +1419,12 @@
 
 
 ;*******************************************************************************
-;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
+;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
 ;                                 int32_t iBeta)
 ;*******************************************************************************
 
 WELS_EXTERN  DeblockLumaEq4V_sse2
-  
+
 ALIGN  16
 
 DeblockLumaEq4V_sse2:
@@ -1965,11 +1965,11 @@
 	mov	esp, ebp
 	pop	ebp
 	ret
-  
-    
+
+
 ;********************************************************************************
 ;
-;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);     
+;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
 ;
 ;********************************************************************************
 
@@ -1982,49 +1982,49 @@
     push    ebx
     mov     ebp,   esp
     and     esp,0FFFFFFF0h
-    sub     esp,   10h    
-    
-    mov     eax,   [ebp + 0Ch]  
+    sub     esp,   10h
+
+    mov     eax,   [ebp + 0Ch]
     mov     ecx,   [ebp + 10h]
     lea     edx,   [eax + ecx * 8]
     lea     ebx,   [ecx*3]
-    
-    movq    xmm0,  [eax] 
+
+    movq    xmm0,  [eax]
     movq    xmm7,  [edx]
-    punpcklqdq   xmm0,  xmm7  
+    punpcklqdq   xmm0,  xmm7
     movq    xmm1,  [eax + ecx]
     movq    xmm7,  [edx + ecx]
     punpcklqdq   xmm1,  xmm7
-    movq    xmm2,  [eax + ecx*2] 
+    movq    xmm2,  [eax + ecx*2]
     movq    xmm7,  [edx + ecx*2]
     punpcklqdq   xmm2,  xmm7
     movq    xmm3,  [eax + ebx]
     movq    xmm7,  [edx + ebx]
     punpcklqdq   xmm3,  xmm7
-    
+
     lea     eax,   [eax + ecx * 4]
     lea     edx,   [edx + ecx * 4]
-    movq    xmm4,  [eax] 
+    movq    xmm4,  [eax]
     movq    xmm7,  [edx]
-    punpcklqdq   xmm4,  xmm7  
+    punpcklqdq   xmm4,  xmm7
     movq    xmm5,  [eax + ecx]
     movq    xmm7,  [edx + ecx]
     punpcklqdq   xmm5,  xmm7
-    movq    xmm6,  [eax + ecx*2] 
+    movq    xmm6,  [eax + ecx*2]
     movq    xmm7,  [edx + ecx*2]
     punpcklqdq   xmm6,  xmm7
-    
+
     movdqa  [esp],   xmm0
     movq    xmm7,  [eax + ebx]
     movq    xmm0,  [edx + ebx]
     punpcklqdq   xmm7,  xmm0
     movdqa  xmm0,   [esp]
-    
+
     SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
     ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-    
+
     mov    eax,   [ebp + 14h]
-    movdqa  [eax],    xmm4 
+    movdqa  [eax],    xmm4
     movdqa  [eax + 10h],  xmm2
     movdqa  [eax + 20h],  xmm3
     movdqa  [eax + 30h],  xmm7
@@ -2031,15 +2031,15 @@
     movdqa  [eax + 40h],  xmm5
     movdqa  [eax + 50h],  xmm1
     movdqa  [eax + 60h],  xmm6
-    movdqa  [eax + 70h],  xmm0   
-    
+    movdqa  [eax + 70h],  xmm0
+
     mov     esp,   ebp
     pop     ebx
     pop     ebp
     ret
-    
-    
-    
+
+
+
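DeblockLumaTransposeH2V_sse2 above gathers sixteen 8-byte rows (the second eight addressed from pPixY + 8*iStride) and emits them transposed through SSE2_TransTwo8x8B. As a reference for what the shuffle network computes, a plain-C sketch under that assumed layout:

    #include <stdint.h>

    /* dst is 8 rows of 16 bytes: dst[x][y] = src[y][x]. */
    static void transpose_h2v_ref(const uint8_t *src, int stride,
                                  uint8_t *dst) {
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 8; x++)
                dst[x * 16 + y] = src[y * stride + x];
    }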
 ;*******************************************************************************************
 ;
 ;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
@@ -2053,14 +2053,14 @@
 DeblockLumaTransposeV2H_sse2:
     push     ebp
     mov      ebp,   esp
-    
+
     and     esp,  0FFFFFFF0h
-    sub     esp,   10h  
-    
-    mov      eax,   [ebp + 10h]  
+    sub     esp,   10h
+
+    mov      eax,   [ebp + 10h]
     mov      ecx,   [ebp + 0Ch]
     mov      edx,   [ebp + 08h]
-      
+
     movdqa   xmm0,  [eax]
     movdqa   xmm1,  [eax + 10h]
     movdqa   xmm2,  [eax + 20h]
@@ -2069,23 +2069,23 @@
     movdqa   xmm5,	[eax + 50h]
     movdqa   xmm6,	[eax + 60h]
     movdqa   xmm7,	[eax + 70h]
-    
+
     SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
     ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-    
+
     lea      eax,   [ecx * 3]
-    
-    movq     [edx],  xmm4 
+
+    movq     [edx],  xmm4
     movq     [edx + ecx],  xmm2
     movq     [edx + ecx*2],  xmm3
     movq     [edx + eax],  xmm7
-    
+
     lea      edx,   [edx + ecx*4]
-    movq     [edx],  xmm5 
+    movq     [edx],  xmm5
     movq     [edx + ecx],  xmm1
     movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0    
-    
+    movq     [edx + eax],  xmm0
+
     psrldq    xmm4,   8
     psrldq    xmm2,   8
     psrldq    xmm3,   8
@@ -2094,20 +2094,20 @@
     psrldq    xmm1,   8
     psrldq    xmm6,   8
     psrldq    xmm0,   8
-    
+
     lea       edx,  [edx + ecx*4]
-    movq     [edx],  xmm4 
+    movq     [edx],  xmm4
     movq     [edx + ecx],  xmm2
     movq     [edx + ecx*2],  xmm3
     movq     [edx + eax],  xmm7
-    
+
     lea      edx,   [edx + ecx*4]
-    movq     [edx],  xmm5 
+    movq     [edx],  xmm5
     movq     [edx + ecx],  xmm1
     movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0   
-    
-    
+    movq     [edx + eax],  xmm0
+
+
     mov      esp,   ebp
     pop      ebp
     ret
\ No newline at end of file
--- a/codec/decoder/core/asm/expand_picture.asm
+++ b/codec/decoder/core/asm/expand_picture.asm
@@ -155,11 +155,11 @@
 	lea %1, [%1+%2]
 %endmacro
 
-%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]		
+%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]
 	; ebx [width/16(8)]
 	; esi [pSrc+0], edi [pSrc-1], ecx [-stride], 32(16)		; top
 	; eax [pSrc+(h-1)*stride], ebp [pSrc+(h+31)*stride], 32(16)	; bottom
-		
+
 %if %1 == 32		; for luma
 	sar ebx, 04h 	; width / 16(8) pixels
 .top_bottom_loops:
@@ -173,7 +173,7 @@
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_end16x4_sse2 edi, ecx, xmm0, a
-	
+
 	; bottom
 	movdqa xmm1, [eax] 		; last line of picture pData
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
@@ -184,15 +184,15 @@
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-		
+
 	lea esi, [esi+16]		; top pSrc
 	lea edi, [edi+16]		; top dst
 	lea eax, [eax+16]		; bottom pSrc
 	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?	
-	
+	neg ecx 			; positive/negative stride need for next loop?
+
 	dec ebx
-	jnz near .top_bottom_loops		
+	jnz near .top_bottom_loops
 %elif %1 == 16	; for chroma ??
 	mov edx, ebx
 	sar ebx, 04h 	; (width / 16) pixels
@@ -202,21 +202,21 @@
 	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_end16x4_sse2 edi, ecx, xmm0, a	
-	
+	mov_line_end16x4_sse2 edi, ecx, xmm0, a
+
 	; bottom
 	movdqa xmm1, [eax] 		; last line of picture pData
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_end16x4_sse2 ebp, ecx, xmm1, a	
-		
+	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
+
 	lea esi, [esi+16]		; top pSrc
 	lea edi, [edi+16]		; top dst
 	lea eax, [eax+16]		; bottom pSrc
 	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?	
-	
+	neg ecx 			; positive/negative stride need for next loop?
+
 	dec ebx
 	jnz near .top_bottom_loops
 
@@ -243,13 +243,13 @@
 %endif
 %endmacro
 
-%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a	
+%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
 	; ecx [height]
 	; esi [pSrc+0], 	   edi [pSrc-32], edx [stride], 32(16)	; left
 	; ebx [pSrc+(w-1)], ebp [pSrc+w], 32(16)			; right
 ;	xor eax, eax 	; for pixel pData (uint8_t)		; make sure eax=0 at least high 24 bits of eax = 0
-	
-%if %1 == 32		; for luma	
+
+%if %1 == 32		; for luma
 .left_right_loops:
 	; left
 	mov al, byte [esi]		; pixel pData for left border
@@ -256,37 +256,37 @@
 	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdqa [edi], xmm0
 	movdqa [edi+16], xmm0
-	
+
 	; right
 	mov al, byte [ebx]
 	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdqa [ebp], xmm1
 	movdqa [ebp+16], xmm1
-	
+
 	lea esi, [esi+edx]		; left pSrc
 	lea edi, [edi+edx]		; left dst
 	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst	
-	
+	lea ebp, [ebp+edx]		; right dst
+
 	dec ecx
-	jnz near .left_right_loops		
-%elif %1 == 16	; for chroma ??	
+	jnz near .left_right_loops
+%elif %1 == 16	; for chroma ??
 .left_right_loops:
 	; left
 	mov al, byte [esi]		; pixel pData for left border
 	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [edi], xmm0	
-	
+	movdqa [edi], xmm0
+
 	; right
 	mov al, byte [ebx]
 	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdq%2 [ebp], xmm1								; may not be 16-byte aligned for the chroma planes
-	
+
 	lea esi, [esi+edx]		; left pSrc
 	lea edi, [edi+edx]		; left dst
 	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst	
-	
+	lea ebp, [ebp+edx]		; right dst
+
 	dec ecx
 	jnz near .left_right_loops
 %endif
@@ -339,25 +339,25 @@
 	; TL
 	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?	
+	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 	mov_line_end16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 
 	; TR
 	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
 	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?	
+	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
 	mov_line_end16x4_sse2 ebp, ecx, xmm4, %2	; dst, stride, xmm?
 
 	; BL
 	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?	
+	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 	mov_line_end16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 
 	; BR
 	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?	
+	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 	mov_line_end16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 %endif
 %endmacro
@@ -375,7 +375,7 @@
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@@ -387,10 +387,10 @@
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; kiStride	
+	mov ecx, edx							; kiStride
 	neg ecx 								; -kiStride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*kiStride
 	lea eax, [esi+eax]						; last line of picture pData
@@ -398,16 +398,16 @@
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 32 * stride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; kiWidth-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; kiWidth
-	exp_top_bottom_sse2	32	
-	
+	exp_top_bottom_sse2	32
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@@ -419,7 +419,7 @@
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
@@ -426,7 +426,7 @@
 	butterfly_1to16_sse xmm4, xmm0, a		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	32, a
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@@ -436,7 +436,7 @@
 	; xmm3,..,xmm6 cross-border pData initialized above; perform the padding below
 	mov eax, -32							; luma=-32, chroma=-16
 	neg ecx										; -stride
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
 	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
@@ -444,19 +444,19 @@
 	mov ecx, [esp+28]					; kiStride
 	imul edx, ecx							; (height+32(16)) * stride
 	lea eax, [edi+edx]						; last line of bottom-left border
-	lea ebx, [ebp+edx]						; last line of bottom-right border	
+	lea ebx, [ebp+edx]						; last line of bottom-right border
 	neg ecx										; -kiStride
 	; for left & right border expanding
-	exp_cross_sse2		32, a	
-	
+	exp_cross_sse2		32, a
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret
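ExpandPictureLuma pads the picture by 32 pixels on every side (16 for the chroma variants below), replicating the nearest edge pixel; the cross-border macros reuse the corner pixels captured in xmm3..xmm6. A scalar sketch of the whole expansion, assuming that replication semantics:

    #include <stdint.h>
    #include <string.h>

    static void expand_ref(uint8_t *p, int stride, int w, int h, int pad) {
        for (int y = 0; y < h; y++) {          /* left/right columns */
            memset(p + y * stride - pad, p[y * stride], pad);
            memset(p + y * stride + w, p[y * stride + w - 1], pad);
        }
        for (int i = 1; i <= pad; i++) {       /* top/bottom rows,   */
            memcpy(p - i * stride - pad,       /* corners included   */
                   p - pad, w + 2 * pad);
            memcpy(p + (h - 1 + i) * stride - pad,
                   p + (h - 1) * stride - pad, w + 2 * pad);
        }
    }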
 
 ALIGN 16
@@ -472,7 +472,7 @@
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@@ -484,10 +484,10 @@
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; kiStride	
+	mov ecx, edx							; kiStride
 	neg ecx 								; -kiStride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*kiStride
 	lea eax, [esi+eax]						; last line of picture pData
@@ -495,16 +495,16 @@
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*kiStride + 16 * kiStride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; kiWidth-1
-	lea ebx, [eax+ebx]						; pDst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; pDst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; kiWidth
-	exp_top_bottom_sse2	16	
-	
+	exp_top_bottom_sse2	16
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst: left border pSrc
@@ -516,7 +516,7 @@
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
@@ -523,7 +523,7 @@
 	butterfly_1to16_sse xmm4, xmm0, a		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	16, a
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@@ -533,9 +533,9 @@
 	; xmm3,..,xmm6 cross-border pData initialized above; perform the padding below
 	mov eax, -16							; chroma=-16
 	neg ecx										; -stride
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]				
+	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
 	mov ecx, [esp+28]						; kiStride
 	add edx, 16							; height+16, luma=32, chroma=16
@@ -545,15 +545,15 @@
 	neg ecx										; -kiStride
 	; for left & right border expanding
 	exp_cross_sse2		16, a
-	
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret
 
 ALIGN 16
@@ -569,7 +569,7 @@
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; pDst
@@ -581,10 +581,10 @@
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; pDst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; kiStride	
+	mov ecx, edx							; kiStride
 	neg ecx 								; -kiStride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*kiStride
 	lea eax, [esi+eax]						; last line of picture pData
@@ -592,16 +592,16 @@
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*kiStride + 16 * kiStride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; kiWidth-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; kiWidth
-	exp_top_bottom_sse2	16	
-	
+	exp_top_bottom_sse2	16
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@@ -613,7 +613,7 @@
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
@@ -620,7 +620,7 @@
 	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	16, u
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -630,9 +630,9 @@
 	; xmm3,..,xmm6 cross-border pData initialized above; perform the padding below
 	neg ecx									; -kiStride
 	mov eax, -16							; chroma=-16
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]						
+	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
 	mov ecx, [esp+28]						; kiStride
 	add edx, 16							; kiHeight+16, luma=32, chroma=16
@@ -642,14 +642,14 @@
 	neg ecx									; -kiStride
 	; for left & right border expanding
 	exp_cross_sse2		16, u
-	
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret
 
--- a/codec/decoder/core/asm/intra_pred.asm
+++ b/codec/decoder/core/asm/intra_pred.asm
@@ -38,7 +38,7 @@
 ;*      18/09/2009 Created
 ;*		19/11/2010 Added
 ;*					WelsI16x16LumaPredDcTop_sse2, WelsI16x16LumaPredDcNA_sse2,
-;*					WelsIChromaPredDcLeft_mmx, WelsIChromaPredDcTop_sse2 
+;*					WelsIChromaPredDcLeft_mmx, WelsIChromaPredDcTop_sse2
 ;*					and WelsIChromaPredDcNA_mmx
 ;*
 ;*
@@ -96,13 +96,13 @@
 	punpcklbw	%1,	%3
 	movdqa		%3,	%1
 	punpcklbw	%1,	%3
-	
+
 	;add			%4,	%5
 	movd		%2,	[%4+%5-1]
 	movdqa		%3,	%2
 	punpcklbw	%2,	%3
 	movdqa		%3,	%2
-	punpcklbw	%2,	%3	
+	punpcklbw	%2,	%3
 	punpckldq	%1,	%2
 %endmacro
 
@@ -116,24 +116,24 @@
 		movd	%2,	[%5+%6]
 		punpcklbw %3,	%2
 		punpcklwd %1,	%3
-		lea		%5,	[%5+2*%6]	
+		lea		%5,	[%5+2*%6]
 		movd	%4,	[%5]
 		movd	%2,	[%5+%6]
 		punpcklbw %4,	%2
-		lea		%5,	[%5+2*%6]	
+		lea		%5,	[%5+2*%6]
 		movd	%3,	[%5]
 		movd	%2,	[%5+%6]
 		lea		%5,	[%5+2*%6]
 		punpcklbw %3,	%2
 		punpcklwd %4,	%3
-		punpckhdq %1,	%4	
-%endmacro	
+		punpckhdq %1,	%4
+%endmacro
 
 %macro  SUMW_HORIZON 3
 	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
 	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04 
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26 
+	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
 	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
 	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
 	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
@@ -162,7 +162,7 @@
 		movd	%2,	[%5+%6]
 		punpcklbw %3,	%2
 		punpckhwd %1,	%3
-		lea		%5,	[%5+2*%6]			
+		lea		%5,	[%5+2*%6]
 %endmacro
 
 %macro LOAD_2_LEFT_AND_ADD 0
@@ -186,7 +186,7 @@
 ALIGN 16
 ;*******************************************************************************
 ;   void_t __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride)
-;   
+;
 ;	pPred must align to 16
 ;*******************************************************************************
 WelsI4x4LumaPredH_sse2:
@@ -196,7 +196,7 @@
 	movzx		edx,	byte [eax-1]
 	movd		xmm0,	edx
 	pmuludq		xmm0,	[mmx_01bytes]
-	
+
 	movzx		edx,	byte [eax+ecx-1]
 	movd		xmm1,	edx
 	pmuludq		xmm1,	[mmx_01bytes]
@@ -205,11 +205,11 @@
 	movzx		edx,	byte [eax+ecx-1]
 	movd		xmm2,	edx
 	pmuludq		xmm2,	[mmx_01bytes]
-	
+
 	movzx		edx,	byte [eax+2*ecx-1]
-	movd		xmm3,	edx	
+	movd		xmm3,	edx
 	pmuludq		xmm3,	[mmx_01bytes]
-	
+
 	sub         eax,    ecx
 	movd        [eax], xmm0
 	movd        [eax+ecx], xmm1
@@ -216,9 +216,9 @@
 	lea         eax, [eax+2*ecx]
 	movd        [eax], xmm2
 	movd        [eax+ecx], xmm3
-	
+
 	ret
-	
+
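The routine above splats each left-neighbour byte across a dword by multiplying with mmx_01bytes (0x01010101). A scalar equivalent, as a sketch:

    #include <stdint.h>
    #include <string.h>

    static void i4x4_pred_h_ref(uint8_t *pred, int stride) {
        for (int y = 0; y < 4; y++)            /* row = left pixel */
            memset(pred + y * stride, pred[y * stride - 1], 4);
    }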
 ;*******************************************************************************
 ; void_t WelsI16x16LumaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
@@ -229,9 +229,9 @@
 		mov		ecx,	[esp + pushsize + 8]
 		sub		esi,	1
 		sub		esi,	ecx
-		
+
 		;for H
-		pxor	xmm7,	xmm7	
+		pxor	xmm7,	xmm7
 		movq	xmm0,	[esi]
 		movdqa	xmm5,	[sse2_plane_dec]
 		punpcklbw xmm0,	xmm7
@@ -241,7 +241,7 @@
 		punpcklbw xmm1,	xmm7
 		pmullw	xmm1,	xmm6
 		psubw	xmm1,	xmm0
-		
+
 		SUMW_HORIZON	xmm1,xmm0,xmm2
 		movd    eax,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
 		movsx	eax,	ax
@@ -249,26 +249,26 @@
 		add		eax,	32
 		sar		eax,	6			; b = (5 * H + 32) >> 6;
 		SSE2_Copy8Times	xmm1, eax	; xmm1 = b,b,b,b,b,b,b,b
-		
-		movzx	edx,	BYTE [esi+16]	
+
+		movzx	edx,	BYTE [esi+16]
 		sub	esi, 3
 		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, esi, ecx
-			
+
 		add		esi,	3
 		movzx	eax,	BYTE [esi+8*ecx]
 		add		edx,	eax
 		shl		edx,	4			;	a = (left[15*kiStride] + top[15]) << 4;
-		
+
 		sub	esi, 3
 		add		esi,	ecx
 		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, esi, ecx
-		pxor	xmm4,	xmm4	
+		pxor	xmm4,	xmm4
 		punpckhbw xmm0,	xmm4
 		pmullw	xmm0,	xmm5
 		punpckhbw xmm7,	xmm4
 		pmullw	xmm7,	xmm6
 		psubw	xmm7,	xmm0
-		
+
 		SUMW_HORIZON   xmm7,xmm0,xmm2
 		movd    eax,   xmm7			; V
 		movsx	eax,	ax
@@ -276,17 +276,17 @@
 		imul	eax,	5
 		add		eax,	32
 		sar		eax,	6				; c = (5 * V + 32) >> 6;
-		SSE2_Copy8Times	xmm4, eax		; xmm4 = c,c,c,c,c,c,c,c		
-		
+		SSE2_Copy8Times	xmm4, eax		; xmm4 = c,c,c,c,c,c,c,c
+
 		mov		esi,	[esp + pushsize + 4]
 		add		edx,	16
 		imul	eax,	-7
-		add		edx,	eax				; s = a + 16 + (-7)*c		
-		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s		
-		
+		add		edx,	eax				; s = a + 16 + (-7)*c
+		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s
+
 		xor		eax,	eax
 		movdqa	xmm5,	[sse2_plane_inc_minus]
-		
+
 get_i16x16_luma_pred_plane_sse2_1:
 		movdqa	xmm2,	xmm1
 		pmullw	xmm2,	xmm5
@@ -295,7 +295,7 @@
 		movdqa	xmm3,	xmm1
 		pmullw	xmm3,	xmm6
 		paddw	xmm3,	xmm0
-		psraw	xmm3,	5	
+		psraw	xmm3,	5
 		packuswb xmm2,	xmm3
 		movdqa	[esi],	xmm2
 		paddw	xmm0,	xmm4
@@ -302,13 +302,13 @@
 		add		esi,	ecx
 		inc		eax
 		cmp		eax,	16
-		jnz get_i16x16_luma_pred_plane_sse2_1					
-		
+		jnz get_i16x16_luma_pred_plane_sse2_1
+
 		pop		esi
 		ret
-		
-		
-		
+
+
+
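The values computed above implement the standard 16x16 plane mode: b = (5*H + 32) >> 6, c = (5*V + 32) >> 6, a = (left[15*kiStride] + top[15]) << 4, with s = a + 16 - 7*c seeding the incremental per-row evaluation (the chroma plane routine later in this file uses (17*H + 16) >> 5 scaling instead). A scalar sketch, assuming the top and left borders are available:

    #include <stdint.h>

    static uint8_t clip255(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

    static void i16x16_plane_ref(uint8_t *pred, int stride) {
        const uint8_t *top = pred - stride, *left = pred - 1;
        int H = 0, V = 0;
        for (int i = 0; i < 8; i++) {
            H += (i + 1) * (top[8 + i] - top[6 - i]);
            V += (i + 1) * (left[(8 + i) * stride] - left[(6 - i) * stride]);
        }
        int a = (left[15 * stride] + top[15]) << 4;
        int b = (5 * H + 32) >> 6, c = (5 * V + 32) >> 6;
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++)
                pred[y * stride + x] =
                    clip255((a + b * (x - 7) + c * (y - 7) + 16) >> 5);
    }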
 ;*******************************************************************************
 ; void_t WelsI16x16LumaPredH_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
@@ -315,7 +315,7 @@
 
 %macro SSE2_PRED_H_16X16_TWO_LINE_DEC 0
     lea     eax,	[eax+ecx*2]
-    
+
     COPY_16_TIMES eax,	xmm0
     movdqa  [eax],	xmm0
     COPY_16_TIMESS eax,	xmm0,	ecx
@@ -326,13 +326,12 @@
 WelsI16x16LumaPredH_sse2:
     mov     eax, [esp+4]    ; pPred
     mov     ecx, [esp+8]    ; kiStride
-    
+
     COPY_16_TIMES eax,	xmm0
     movdqa  [eax],		xmm0
     COPY_16_TIMESS eax,	xmm0,	ecx
     movdqa  [eax+ecx],	xmm0
-    
-	SSE2_PRED_H_16X16_TWO_LINE_DEC 
+
 	SSE2_PRED_H_16X16_TWO_LINE_DEC
 	SSE2_PRED_H_16X16_TWO_LINE_DEC
 	SSE2_PRED_H_16X16_TWO_LINE_DEC
@@ -339,9 +338,10 @@
 	SSE2_PRED_H_16X16_TWO_LINE_DEC
 	SSE2_PRED_H_16X16_TWO_LINE_DEC
 	SSE2_PRED_H_16X16_TWO_LINE_DEC
-   
+	SSE2_PRED_H_16X16_TWO_LINE_DEC
+
     ret
-    
+
 ;*******************************************************************************
 ; void_t WelsI16x16LumaPredV_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
@@ -349,10 +349,10 @@
 WelsI16x16LumaPredV_sse2:
     mov     edx, [esp+4]    ; pPred
     mov     ecx, [esp+8]    ; kiStride
-    
+
     sub     edx, ecx
     movdqa  xmm0, [edx]
-    
+
     movdqa  [edx+ecx], xmm0
     lea     edx, [edx+2*ecx]
     movdqa  [edx],     xmm0
@@ -377,9 +377,9 @@
     movdqa  [edx+ecx], xmm0
     lea     edx, [edx+2*ecx]
     movdqa  [edx],     xmm0
-        
+
     ret
-    
+
 ;*******************************************************************************
 ; void_t WelsIChromaPredPlane_sse2(uint8_t *pPred, const int32_t kiStride);
 ;*******************************************************************************
@@ -391,8 +391,8 @@
 		mov		ecx,	[esp + pushsize + 8]	;kiStride
 		sub		esi,	1
 		sub		esi,	ecx
-		
-		pxor	mm7,	mm7	
+
+		pxor	mm7,	mm7
 		movq	mm0,	[esi]
 		movq	mm5,	[sse2_plane_dec_c]
 		punpcklbw mm0,	mm7
@@ -402,7 +402,7 @@
 		punpcklbw mm1,	mm7
 		pmullw	mm1,	mm6
 		psubw	mm1,	mm0
-		
+
 		movq2dq xmm1,   mm1
 		pxor    xmm2,   xmm2
 		SUMW_HORIZON	xmm1,xmm0,xmm2
@@ -412,7 +412,7 @@
 		add		eax,	16
 		sar		eax,	5			; b = (17 * H + 16) >> 5;
 		SSE2_Copy8Times	xmm1, eax	; mm1 = b,b,b,b,b,b,b,b
-		
+
 		movzx	edx,	BYTE [esi+8]
 		sub	esi, 3
 		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, esi, ecx
@@ -421,17 +421,17 @@
 		movzx	eax,	BYTE [esi+4*ecx]
 		add		edx,	eax
 		shl		edx,	4			; a = (left[7*kiStride] + top[7]) << 4;
-		
+
 		sub	esi, 3
 		add		esi,	ecx
 		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, esi, ecx
-		pxor	mm4,	mm4	
+		pxor	mm4,	mm4
 		punpckhbw mm0,	mm4
 		pmullw	mm0,	mm5
 		punpckhbw mm7,	mm4
 		pmullw	mm7,	mm6
 		psubw	mm7,	mm0
-		
+
 		movq2dq xmm7,   mm7
 		pxor    xmm2,   xmm2
 		SUMW_HORIZON	xmm7,xmm0,xmm2
@@ -441,17 +441,17 @@
 		imul	eax,	17
 		add		eax,	16
 		sar		eax,	5				; c = (17 * V + 16) >> 5;
-		SSE2_Copy8Times	xmm4, eax		; mm4 = c,c,c,c,c,c,c,c		
-		
+		SSE2_Copy8Times	xmm4, eax		; mm4 = c,c,c,c,c,c,c,c
+
 		mov		esi,	[esp + pushsize + 4]
 		add		edx,	16
 		imul	eax,	-3
-		add		edx,	eax				; s = a + 16 + (-3)*c		
-		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s		
-		
+		add		edx,	eax				; s = a + 16 + (-3)*c
+		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s
+
 		xor		eax,	eax
 		movdqa	xmm5,	[sse2_plane_mul_b_c]
-		
+
 get_i_chroma_pred_plane_sse2_1:
 		movdqa	xmm2,	xmm1
 		pmullw	xmm2,	xmm5
@@ -463,12 +463,12 @@
 		add		esi,	ecx
 		inc		eax
 		cmp		eax,	8
-		jnz get_i_chroma_pred_plane_sse2_1					
-		
+		jnz get_i_chroma_pred_plane_sse2_1
+
 		pop		esi
 		WELSEMMS
-		ret	
-		
+		ret
+
 ALIGN 16
 ;*******************************************************************************
 ;	0 |1 |2 |3 |4 |
@@ -480,13 +480,13 @@
 ;	pPred[7] = ([6]+[0]*2+[1]+2)/4
 ;
 ;   void_t __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pPred, const int32_t kiStride)
-;   
+;
 ;*******************************************************************************
-WelsI4x4LumaPredDDR_mmx:	
+WelsI4x4LumaPredDDR_mmx:
 	mov			edx,[esp+4]			;pPred
 	mov         eax,edx
 	mov			ecx,[esp+8]		;kiStride
-	
+
 	movq        mm1,[eax+ecx-8]		;get pixel 11: the -8 offset lets a single movq load it efficiently, mm1[8] = 11
 	movq        mm2,[eax-8]			;get value of 6 mm2[8] = 6
 	sub			eax, ecx			;mov eax to above line of current block(postion of 1)
@@ -513,19 +513,19 @@
 	pand        mm1,[mmx_01bytes]	;set the odd bit
 	psubusb     mm3,mm1				;decrease 1 from odd bytes
 	pavgb       mm2,mm3				;mm2=(([11]+[21]+1)/2+1+[16])/2
-	
+
 	lea         edx,[edx+ecx]
-	movd        [edx+2*ecx],mm2 
+	movd        [edx+2*ecx],mm2
 	sub         edx,ecx
-	psrlq       mm2,8 
-	movd        [edx+2*ecx],mm2 
-	psrlq       mm2,8 
-	movd        [edx+ecx],mm2 
-	psrlq       mm2,8 
+	psrlq       mm2,8
+	movd        [edx+2*ecx],mm2
+	psrlq       mm2,8
+	movd        [edx+ecx],mm2
+	psrlq       mm2,8
 	movd        [edx],mm2
 	WELSEMMS
 	ret
-	
+
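Restating the comment block's grid formulas in scalar C: DDR smooths the nine border pixels l3..l0, lt, t0..t3 into seven diagonal values and replicates each one down-right. A hedged reference, not the project's C path:

    #include <stdint.h>

    static void i4x4_pred_ddr_ref(uint8_t *pred, int stride) {
        const uint8_t *t = pred - stride;
        int b[9], d[7];
        for (int i = 0; i < 4; i++)
            b[3 - i] = pred[i * stride - 1];   /* l3 l2 l1 l0 */
        b[4] = t[-1];                          /* lt          */
        for (int i = 0; i < 4; i++)
            b[5 + i] = t[i];                   /* t0 t1 t2 t3 */
        for (int i = 0; i < 7; i++)
            d[i] = (b[i] + 2 * b[i + 1] + b[i + 2] + 2) >> 2;
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                pred[y * stride + x] = (uint8_t)d[3 + x - y];
    }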
 ALIGN 16
 ;*******************************************************************************
 ;	0 |1 |2 |3 |4 |
@@ -537,36 +537,36 @@
 ;	pPred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
 ;
 ;   void_t __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
-;   
+;
 ;*******************************************************************************
-WelsI4x4LumaPredDc_sse2:	
+WelsI4x4LumaPredDc_sse2:
 	mov         eax,[esp+4]			;pPred
 	mov			ecx,[esp+8]			;kiStride
 	push		ebx
-		
+
 	movzx		edx,	byte [eax-1h]
-	
+
 	sub			eax,	ecx
 	movd		xmm0,	[eax]
 	pxor		xmm1,	xmm1
 	psadbw		xmm0,	xmm1
-	
+
 	movd		ebx,	xmm0
 	add			ebx,	edx
-	
+
 	movzx		edx,	byte [eax+ecx*2-1h]
 	add			ebx,	edx
-	
+
 	lea			eax,	[eax+ecx*2-1]
 	movzx		edx,	byte [eax+ecx]
 	add			ebx,	edx
-	
+
 	movzx		edx,	byte [eax+ecx*2]
 	add			ebx,	edx
 	add			ebx,	4
 	sar			ebx,	3
 	imul		ebx,	0x01010101
-	
+
 	mov			edx,	[esp+8]			;pPred
 	mov         [edx],       ebx
 	mov         [edx+ecx],   ebx
@@ -575,8 +575,8 @@
 	mov         [edx+ecx],   ebx
 
 	pop ebx
-	ret	
-	
+	ret
+
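The DC routine sums the four top neighbours via psadbw and the four left neighbours with scalar adds, rounds with +4, shifts by 3, then broadcasts the byte with imul 0x01010101 before the four dword stores. Scalar sketch:

    #include <stdint.h>
    #include <string.h>

    static void i4x4_pred_dc_ref(uint8_t *pred, int stride) {
        int sum = 4;                           /* rounding term */
        for (int i = 0; i < 4; i++)
            sum += pred[i - stride] + pred[i * stride - 1];
        memset(pred, sum >> 3, 4);             /* (top+left+4)>>3 */
        for (int y = 1; y < 4; y++)
            memcpy(pred + y * stride, pred, 4);
    }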
 ALIGN 16
 ;*******************************************************************************
 ;	void_t __cdecl WelsIChromaPredH_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -585,7 +585,7 @@
 %macro MMX_PRED_H_8X8_ONE_LINE 4
 	movq		%1,		[%3-8]
 	psrlq		%1,		38h
-	
+
 	pmullw		%1,		[mmx_01bytes]
 	pshufw		%1,		%1,	0
 	movq		[%4],	%1
@@ -594,7 +594,7 @@
 %macro MMX_PRED_H_8X8_ONE_LINEE 4
 	movq		%1,		[%3+ecx-8]
 	psrlq		%1,		38h
-	
+
 	pmullw		%1,		[mmx_01bytes]
 	pshufw		%1,		%1,	0
 	movq		[%4],	%1
@@ -605,37 +605,37 @@
 	mov			edx,	[esp+4]			;pPred
 	mov         eax,	edx
 	mov			ecx,	[esp+8]			;kiStride
-	
+
 	movq		mm0,	[eax-8]
 	psrlq		mm0,	38h
-	
+
 	pmullw		mm0,		[mmx_01bytes]
 	pshufw		mm0,	mm0,	0
 	movq		[edx],	mm0
-	
+
 	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-	
+
 	lea			eax, [eax+ecx*2]
 	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax, edx+2*ecx
-	
+
 	lea         edx, [edx+2*ecx]
 	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-	
+
 	lea			eax, [eax+ecx*2]
 	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax, edx+2*ecx
-	
+
 	lea         edx, [edx+2*ecx]
 	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-	
+
 	lea			eax, [eax+ecx*2]
 	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax, edx+2*ecx
 
     lea         edx, [edx+2*ecx]
 	MMX_PRED_H_8X8_ONE_LINEE mm0, mm1, eax, edx+ecx
-		
+
 	WELSEMMS
-	ret	
-	
+	ret
+
 ALIGN 16
 ;*******************************************************************************
 ;	void_t __cdecl get_i4x4_luma_pred_v_asm(uint8_t *pPred, const int32_t kiStride)
@@ -645,7 +645,7 @@
 get_i4x4_luma_pred_v_asm:
 	mov			eax,	[esp+4]        ;pPred
 	mov			ecx,	[esp+8]        ;kiStride
-	
+
 	sub			eax,	ecx
 	mov         edx,    [eax]
 	mov		    [eax+ecx],	 edx
@@ -653,9 +653,9 @@
 	lea         eax, [eax+2*ecx]
 	mov			[eax+ecx],	 edx
 	mov			[eax+2*ecx], edx
-	
-	ret	
 
+	ret
+
 ALIGN 16
 ;*******************************************************************************
 ;	void_t __cdecl WelsIChromaPredV_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -665,7 +665,7 @@
 WelsIChromaPredV_mmx:
 	mov			eax,		[esp+4]    ;pPred
 	mov			ecx,		[esp+8]    ;kiStride
-	
+
 	sub			eax,		ecx
 	movq		mm0,		[eax]
 
@@ -680,11 +680,11 @@
 	lea         eax, [eax+2*ecx]
 	movq		[eax+ecx],      mm0
 	movq		[eax+2*ecx],    mm0
-	
+
 	WELSEMMS
 	ret
-	
-	
+
+
 	ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -710,13 +710,13 @@
 
 ;   f = (2 + l1 + (l0<<1) + lt)>>2
 ;   h = (2 + l2 + (l1<<1) + l0)>>2
-;   j = (2 + l3 + (l2<<1) + l1)>>2   
+;   j = (2 + l3 + (l2<<1) + l1)>>2
 ;   [b a f e h g j i] + [d c b a] --> mov to memory
-;   
+;
 ;   void_t WelsI4x4LumaPredHD_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsI4x4LumaPredHD_mmx
-WelsI4x4LumaPredHD_mmx:	
+WelsI4x4LumaPredHD_mmx:
 	mov			edx, [esp+4]			; pPred
 	mov         eax, edx
 	mov			ecx, [esp+8]            ; kiStride
@@ -723,16 +723,16 @@
 	sub         eax, ecx
 	movd        mm0, [eax-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
 	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
-	
-	movd        mm1, [eax+2*ecx-4]        
-	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1	
+
+	movd        mm1, [eax+2*ecx-4]
+	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1
 	lea         eax, [eax+2*ecx]
-	movd        mm2, [eax+2*ecx-4]        
+	movd        mm2, [eax+2*ecx-4]
 	punpcklbw   mm2, [eax+ecx-4]        ; mm2[7] = l2, mm2[6] = l3
 	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
 	psrlq       mm2, 20h
 	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
-	
+
 	movq        mm1, mm0
 	psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
 	movq        mm2, mm0
@@ -740,17 +740,17 @@
 	movq        mm3, mm2
 	movq        mm4, mm1
 	pavgb       mm1, mm0
-	
+
 	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
 	pand        mm4, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm1, mm4				; decrease 1 from odd bytes
-	
+
 	pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
-	
+
 	movq        mm4, mm0
 	pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
 	punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
-	
+
 	psrlq       mm2, 20h
 	psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
 	movq        mm4, mm3
@@ -757,7 +757,7 @@
 	psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  j]
 	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
 	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
-	
+
 	movd        [edx], mm2
 	lea         edx, [edx+ecx]
 	movd        [edx+2*ecx], mm3
@@ -768,9 +768,9 @@
 	movd        [edx+ecx], mm3
 	WELSEMMS
 	ret
-	
-	
-	
+
+
+
 ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -793,17 +793,17 @@
 ;   b = (2 + l0 + (l1<<1) + l2)>>2
 ;   d = (2 + l1 + (l2<<1) + l3)>>2
 ;   f = (2 + l2 + (l3<<1) + l3)>>2
- 
+
 ;   [g g f e d c b a] + [g g g g] --> mov to memory
-;   
+;
 ;   void_t WelsI4x4LumaPredHU_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsI4x4LumaPredHU_mmx
-WelsI4x4LumaPredHU_mmx:	
+WelsI4x4LumaPredHU_mmx:
 	mov			edx, [esp+4]			; pPred
 	mov         eax, edx
 	mov			ecx, [esp+8]            ; kiStride
-	
+
 	movd        mm0, [eax-4]            ; mm0[3] = l0
 	punpcklbw   mm0, [eax+ecx-4]        ; mm0[7] = l1, mm0[6] = l0
 	lea         eax, [eax+2*ecx]
@@ -811,39 +811,39 @@
 	movd        mm4, [eax+ecx-4]        ; mm4[3] = l3
 	punpcklbw   mm2, mm4
 	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
-	
+
 	psrlq       mm4, 18h
 	psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
 	psrlq       mm0, 8h
 	pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
-	
+
 	movq        mm1, mm0
 	psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
 	movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
 	pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
-	
+
 	movq        mm2, mm0
 	psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
 	movq        mm5, mm2
 	pavgb       mm2, mm0
-	
+
 	pxor        mm5, mm0				; find odd value in the lowest bit of each byte
 	pand        mm5, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm2, mm5				; decrease 1 from odd bytes
-	
+
 	pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
-	
+
 	psrlq       mm2, 8h
 	pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
-	
+
 	punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
 	punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
 	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
-	
+
 	psrlq       mm4, 20h
 	lea         edx, [edx+ecx]
 	movd        [edx+2*ecx], mm4
-	
+
 	sub         edx, ecx
 	movd        [edx], mm1
 	psrlq       mm1, 10h
@@ -852,9 +852,9 @@
 	movd        [edx+2*ecx], mm1
 	WELSEMMS
 	ret
-	
-	
-	
+
+
+
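The a..g values listed before the routine map onto the standard horizontal-up interpolation, with each output row reading the sequence at offset 2*y. A scalar sketch of those formulas:

    #include <stdint.h>

    static void i4x4_pred_hu_ref(uint8_t *pred, int stride) {
        int l[4], z[10];
        for (int i = 0; i < 4; i++)
            l[i] = pred[i * stride - 1];
        z[0] = (l[0] + l[1] + 1) >> 1;              /* a */
        z[1] = (l[0] + 2 * l[1] + l[2] + 2) >> 2;   /* b */
        z[2] = (l[1] + l[2] + 1) >> 1;              /* c */
        z[3] = (l[1] + 2 * l[2] + l[3] + 2) >> 2;   /* d */
        z[4] = (l[2] + l[3] + 1) >> 1;              /* e */
        z[5] = (l[2] + 3 * l[3] + 2) >> 2;          /* f */
        z[6] = z[7] = z[8] = z[9] = l[3];           /* g */
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                pred[y * stride + x] = (uint8_t)z[2 * y + x];
    }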
 ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -880,12 +880,12 @@
 
 ;   h = (2 + t1 + (t2<<1) + t3)>>2
 ;   i = (2 + lt + (l0<<1) + l1)>>2
-;   j = (2 + l0 + (l1<<1) + l2)>>2   
-;   
+;   j = (2 + l0 + (l1<<1) + l2)>>2
+;
 ;   void_t WelsI4x4LumaPredVR_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsI4x4LumaPredVR_mmx
-WelsI4x4LumaPredVR_mmx:	
+WelsI4x4LumaPredVR_mmx:
 	mov			edx, [esp+4]			; pPred
 	mov         eax, edx
 	mov			ecx, [esp+8]            ; kiStride
@@ -892,51 +892,51 @@
 	sub         eax, ecx
 	movq        mm0, [eax-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
 	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
-	
-	movd        mm1, [eax+2*ecx-4]        
-	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1	
+
+	movd        mm1, [eax+2*ecx-4]
+	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1
 	lea         eax, [eax+2*ecx]
 	movq        mm2, [eax+ecx-8]        ; mm2[7] = l2
 	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
 	psrlq       mm2, 28h
 	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
-	
+
 	movq        mm1, mm0
 	psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
 	pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
-	
+
 	movq        mm2, mm0
 	psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
 	movq        mm3, mm2
 	pavgb       mm2, mm0
-	
+
 	pxor        mm3, mm0				; find odd value in the lowest bit of each byte
 	pand        mm3, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm2, mm3				; decrease 1 from odd bytes
-	
+
 	movq        mm3, mm0
 	psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
 	pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
 	movq        mm2, mm3
-	
+
 	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
 	movd        [edx], mm1
-	
+
 	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
 	movd        [edx+ecx], mm2
-	
+
 	movq        mm4, mm3
 	psllq       mm4, 20h
 	psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
-	
+
 	movq        mm5, mm3
 	psllq       mm5, 28h
 	psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
-	
+
 	psllq       mm1, 8h
 	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
 	movd        [edx+2*ecx], mm4
-	
+
 	psllq       mm2, 8h
 	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
 	lea         edx, [edx+2*ecx]
@@ -943,7 +943,7 @@
 	movd        [edx+ecx], mm5
 	WELSEMMS
 	ret
-	
+
 ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -966,13 +966,13 @@
 ;   e = (2 + t4 + t6 + (t5<<1))>>2
 ;   f = (2 + t5 + t7 + (t6<<1))>>2
 ;   g = (2 + t6 + t7 + (t7<<1))>>2
- 
+
 ;   [g f e d c b a] --> mov to memory
-;   
+;
 ;   void_t WelsI4x4LumaPredDDL_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsI4x4LumaPredDDL_mmx
-WelsI4x4LumaPredDDL_mmx:	
+WelsI4x4LumaPredDDL_mmx:
 	mov			edx, [esp+4]			; pPred
 	mov         eax, edx
 	mov			ecx, [esp+8]            ; kiStride
@@ -980,11 +980,11 @@
 	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
 	movq        mm1, mm0
 	movq        mm2, mm0
-	
+
 	movq        mm3, mm0
 	psrlq       mm3, 38h
 	psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
-	
+
 	psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
 	psrlq       mm2, 8h
 	pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
@@ -994,9 +994,9 @@
 	pxor        mm3, mm2				; find bytes whose low bits differ (odd sums)
 	pand        mm3, [mmx_01bytes]	    ; keep only that rounding bit
 	psubusb     mm1, mm3				; subtract 1 from odd-sum bytes to undo pavgb's round-up
-	
+
 	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
-	
+
 	psrlq       mm0, 8h
 	movd        [edx], mm0
 	psrlq       mm0, 8h
@@ -1008,8 +1008,8 @@
 	movd        [edx+ecx], mm0
 	WELSEMMS
 	ret
-	
-	
+
+
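
The a..g values listed above are the generic down-left three-tap rule with t7 duplicated past the edge. A scalar reference for the whole 4x4 DDL predictor, consistent with those formulas (function name is illustrative):

    /* 4x4 diagonal-down-left: pred[y][x] = (t[x+y] + 2*t[x+y+1] + t[x+y+2] + 2) >> 2,
     * where t[0..7] is the row above the block and t[8] repeats t[7] */
    static void pred4x4_ddl_ref(unsigned char *pPred, int kiStride) {
        const unsigned char *top = pPred - kiStride;
        unsigned char t[9];
        for (int i = 0; i < 8; i++) t[i] = top[i];
        t[8] = t[7];                                  /* edge sample duplicated */
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                pPred[y * kiStride + x] = (unsigned char)
                    ((t[x + y] + 2 * t[x + y + 1] + t[x + y + 2] + 2) >> 2);
    }
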
 ALIGN 16
 ;*******************************************************************************
 ;	lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -1035,40 +1035,40 @@
 ;   g = (2 + t2 + (t3<<1) + t4)>>2
 ;   h = (2 + t3 + (t4<<1) + t5)>>2
 ;   j = (2 + t4 + (t5<<1) + t6)>>2
- 
+
 ;   [i d c b a] + [j h g f e] --> mov to memory
-;   
+;
 ;   void_t WelsI4x4LumaPredVL_mmx(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsI4x4LumaPredVL_mmx
-WelsI4x4LumaPredVL_mmx:	
+WelsI4x4LumaPredVL_mmx:
 	mov			edx, [esp+4]			; pPred
 	mov         eax, edx
 	mov			ecx, [esp+8]            ; kiStride
-	
+
 	sub         eax, ecx
 	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
 	movq        mm1, mm0
 	movq        mm2, mm0
-	
+
 	psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
 	psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
 
 	movq        mm3, mm1
 	pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
-	
+
 	movq        mm4, mm2
-	pavgb       mm2, mm0	
+	pavgb       mm2, mm0
 	pxor        mm4, mm0				; find bytes whose low bits differ (odd sums)
 	pand        mm4, [mmx_01bytes]	    ; keep only that rounding bit
 	psubusb     mm2, mm4				; subtract 1 from odd-sum bytes to undo pavgb's round-up
-	
+
 	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
-	
+
 	movd        [edx], mm3
 	psrlq       mm3, 8h
 	movd        [edx+2*ecx], mm3
-	
+
 	movd        [edx+ecx], mm2
 	psrlq       mm2, 8h
 	lea         edx, [edx+2*ecx]
@@ -1075,7 +1075,7 @@
 	movd        [edx+ecx], mm2
 	WELSEMMS
 	ret
-	
+
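
Here [i d c b a] are the two-tap averages for rows 0 and 2 and [j h g f e] the three-tap averages for rows 1 and 3, with the taps sliding one sample to the right for the lower row pair. The equivalent scalar predictor (illustrative name):

    /* 4x4 vertical-left from t[0..7], the row above the block */
    static void pred4x4_vl_ref(unsigned char *pPred, int kiStride) {
        const unsigned char *t = pPred - kiStride;
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++) {
                int i = x + (y >> 1);                 /* taps shift every 2 rows */
                pPred[y * kiStride + x] = (unsigned char)((y & 1)
                    ? (t[i] + 2 * t[i + 1] + t[i + 2] + 2) >> 2   /* odd rows  */
                    : (t[i] + t[i + 1] + 1) >> 1);                /* even rows */
            }
    }
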
 ALIGN 16
 ;*******************************************************************************
 ;
@@ -1082,11 +1082,11 @@
 ;   void_t WelsIChromaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsIChromaPredDc_sse2
-WelsIChromaPredDc_sse2:	
+WelsIChromaPredDc_sse2:
 	push        ebx
 	mov         eax, [esp+8]			; pPred
 	mov			ecx, [esp+12]           ; kiStride
-	
+
 	sub         eax, ecx
 	movq        mm0, [eax]
 
@@ -1100,7 +1100,7 @@
 	movzx		edx, byte [eax-0x01]     ; l4
 	add			ebx, edx
 	movd        mm1, ebx                 ; mm1 = l1+l2+l3+l4
-	
+
 	movzx		ebx, byte [eax+ecx-0x01] ; l5
 	lea         eax, [eax+2*ecx]
 	movzx		edx, byte [eax-0x01]     ; l6
@@ -1111,7 +1111,7 @@
 	movzx		edx, byte [eax-0x01]     ; l8
 	add			ebx, edx
 	movd        mm2, ebx                 ; mm2 = l5+l6+l7+l8
-	
+
 	movq        mm3, mm0
 	psrlq       mm0, 0x20
 	psllq       mm3, 0x20
@@ -1118,46 +1118,46 @@
 	psrlq       mm3, 0x20
 	pxor		mm4, mm4
 	psadbw		mm0, mm4
-	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2	
-	
+	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+
 	paddq       mm3, mm1
 	movq        mm1, mm2
 	paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-	
+
 	movq        mm4, [mmx_0x02]
-	
+
 	paddq       mm0, mm4
 	psrlq       mm0, 0x02
-	
+
 	paddq       mm2, mm4
 	psrlq       mm2, 0x02
-	
+
 	paddq       mm3, mm4
 	paddq       mm3, mm4
 	psrlq       mm3, 0x03
-	
+
 	paddq       mm1, mm4
 	paddq       mm1, mm4
 	psrlq       mm1, 0x03
-	
+
 	pmuludq     mm0, [mmx_01bytes]
 	pmuludq     mm3, [mmx_01bytes]
 	psllq       mm0, 0x20
 	pxor        mm0, mm3                 ; mm0 = m_up
-	
+
 	pmuludq     mm2, [mmx_01bytes]
 	pmuludq     mm1, [mmx_01bytes]
 	psllq       mm1, 0x20
 	pxor        mm1, mm2                 ; mm2 = m_down
-	
+
 	mov         edx, [esp+8]			 ; pPred
-	
+
 	movq        [edx],       mm0
 	movq        [edx+ecx],   mm0
 	movq        [edx+2*ecx], mm0
 	lea         edx, [edx+2*ecx]
 	movq        [edx+ecx],   mm0
-	
+
 	movq        [edx+2*ecx], mm1
 	lea         edx, [edx+2*ecx]
 	movq        [edx+ecx],   mm1
@@ -1164,13 +1164,13 @@
 	movq        [edx+2*ecx], mm1
 	lea         edx, [edx+2*ecx]
 	movq        [edx+ecx],   mm1
-	
+
 	pop         ebx
 	WELSEMMS
 	ret
-	
-	
-	
+
+
+
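
The sums assembled above (mm3+mm1, mm0, mm2, mm1) are the four quadrant DCs of the 8x8 chroma block: corner quadrants with both neighbours average all eight samples with (sum+4)>>3, while the other two use only their four samples with (sum+2)>>2. A scalar sketch under the usual neighbour layout (name illustrative):

    /* 8x8 chroma DC prediction, four 4x4 quadrants */
    static void pred_chroma_dc_ref(unsigned char *pPred, int kiStride) {
        int sumT0 = 0, sumT1 = 0, sumL0 = 0, sumL1 = 0;
        for (int i = 0; i < 4; i++) {
            sumT0 += pPred[i - kiStride];            /* top row, left half    */
            sumT1 += pPred[i + 4 - kiStride];        /* top row, right half   */
            sumL0 += pPred[i * kiStride - 1];        /* left column, top half */
            sumL1 += pPred[(i + 4) * kiStride - 1];  /* left column, bottom   */
        }
        int dc[2][2] = {
            { (sumT0 + sumL0 + 4) >> 3, (sumT1 + 2) >> 2 },
            { (sumL1 + 2) >> 2,         (sumT1 + sumL1 + 4) >> 3 },
        };
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
                pPred[y * kiStride + x] = (unsigned char)dc[y >> 2][x >> 2];
    }
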
 ALIGN 16
 ;*******************************************************************************
 ;
@@ -1177,11 +1177,11 @@
 ;   void_t WelsI16x16LumaPredDc_sse2(uint8_t *pPred, const int32_t kiStride)
 ;*******************************************************************************
 WELS_EXTERN WelsI16x16LumaPredDc_sse2
-WelsI16x16LumaPredDc_sse2:	
+WelsI16x16LumaPredDc_sse2:
 	push        ebx
 	mov         eax, [esp+8]			; pPred
 	mov			ecx, [esp+12]           ; kiStride
-	
+
 	sub         eax, ecx
 	movdqa      xmm0, [eax]             ; read one row
 	pxor		xmm1, xmm1
@@ -1191,7 +1191,7 @@
 	pslldq      xmm0, 0x08
 	psrldq      xmm0, 0x08
 	paddw       xmm0, xmm1
-	
+
 	movzx		ebx, byte [eax+ecx-0x01]
 	movzx		edx, byte [eax+2*ecx-0x01]
 	add			ebx, edx
@@ -1209,44 +1209,44 @@
 	psrld       xmm0, 0x05
 	pmuludq     xmm0, [mmx_01bytes]
 	pshufd      xmm0, xmm0, 0
-	
+
 	mov         edx, [esp+8]			; pPred
-	
+
 	movdqa      [edx],       xmm0
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 	movdqa      [edx+2*ecx], xmm0
 	lea         edx,         [edx+2*ecx]
-	
+
 	movdqa      [edx+ecx],   xmm0
 
 	pop         ebx
 
 	ret
-	
+
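
This routine reduces the 16 top samples with psadbw, adds the 16 left samples one movzx at a time, then rounds and splats the byte across the block. Together with the DcTop and DcNA variants below, the scalar rule is (a sketch; the availability flags are illustrative parameters):

    /* 16x16 luma DC prediction, all three availability cases */
    static void pred16x16_dc_ref(unsigned char *p, int stride,
                                 int has_top, int has_left) {
        int sum = 0, dc;
        if (has_top)  for (int i = 0; i < 16; i++) sum += p[i - stride];
        if (has_left) for (int i = 0; i < 16; i++) sum += p[i * stride - 1];
        if (has_top && has_left)      dc = (sum + 16) >> 5;  /* PredDc    */
        else if (has_top || has_left) dc = (sum + 8) >> 4;   /* PredDcTop */
        else                          dc = 0x80;             /* PredDcNA  */
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++)
                p[y * stride + x] = (unsigned char)dc;
    }
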
 ;*******************************************************************************
 ; for intra prediction as follows, 11/19/2010
 ;*******************************************************************************
@@ -1258,12 +1258,12 @@
 WELS_EXTERN WelsI16x16LumaPredDcTop_sse2
 WelsI16x16LumaPredDcTop_sse2:
 	push ebx
-	
+
 	%define PUSH_SIZE 4
-	
+
 	mov eax, [esp+PUSH_SIZE+4]	; pPred
 	mov ebx, [esp+PUSH_SIZE+8]	; kiStride
-	
+
 	mov ecx, ebx
 	neg ecx
 	movdqa xmm0, [eax+ecx]		; pPred-kiStride, top line
@@ -1278,10 +1278,10 @@
 	pshufd xmm1, xmm0, 0b1h		; 10110001, w1+5 w0+4 w3+7 w2+6 w1+5 w0+4 w3+7 w2+6
 	paddw xmm0, xmm1			; w_o w_e w_o w_e w_o w_e w_o w_e (w_o=1+3+5+7, w_e=0+2+4+6)
 	pshuflw xmm1, xmm0, 0b1h	; 10110001
-	paddw xmm0, xmm1			; sum in word unit (x8)	
+	paddw xmm0, xmm1			; sum in word unit (x8)
 	movd edx, xmm0
 	and edx, 0ffffh
-	
+
 	add edx, 08h
 	sar edx, 04h
 	mov dh, dl
@@ -1288,35 +1288,35 @@
 	mov ecx, edx
 	shl ecx, 010h
 	or edx, ecx
-	movd xmm1, edx	
+	movd xmm1, edx
 	pshufd xmm0, xmm1, 00h
 	movdqa xmm1, xmm0
-	
+
 	lea ecx, [2*ebx+ebx]		; 3*kiStride
-	
+
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
 	movdqa [eax+ecx], xmm1
-	
+
 	lea eax, [eax+4*ebx]
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
 	movdqa [eax+ecx], xmm1
-	
+
 	lea eax, [eax+4*ebx]
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
 	movdqa [eax+ecx], xmm1
-	
+
 	lea eax, [eax+4*ebx]
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
 	movdqa [eax+ecx], xmm1
-	
+
 	%undef PUSH_SIZE
 	pop ebx
 	ret
@@ -1328,41 +1328,41 @@
 WELS_EXTERN WelsI16x16LumaPredDcNA_sse2
 WelsI16x16LumaPredDcNA_sse2:
 	push ebx
-	
+
 	%define PUSH_SIZE	4
-	
+
 	mov eax, [esp+PUSH_SIZE+4]	; pPred
-	mov ebx, [esp+PUSH_SIZE+8]	; kiStride	
-	
+	mov ebx, [esp+PUSH_SIZE+8]	; kiStride
+
 	lea ecx, [2*ebx+ebx]		; 3*kiStride
-	
+
 	movdqa xmm0, [sse2_dc_0x80]
-	movdqa xmm1, xmm0	
+	movdqa xmm1, xmm0
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
-	movdqa [eax+ecx], xmm1	
+	movdqa [eax+ecx], xmm1
 	lea eax, [eax+4*ebx]
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
-	movdqa [eax+ecx], xmm1	
+	movdqa [eax+ecx], xmm1
 	lea eax, [eax+4*ebx]
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
-	movdqa [eax+ecx], xmm1	
+	movdqa [eax+ecx], xmm1
 	lea eax, [eax+4*ebx]
 	movdqa [eax], xmm0
 	movdqa [eax+ebx], xmm1
 	movdqa [eax+2*ebx], xmm0
 	movdqa [eax+ecx], xmm1
-	
+
 	%undef PUSH_SIZE
-	
+
 	pop ebx
 	ret
-	
+
 ALIGN 16
 ;*******************************************************************************
 ;	void_t WelsIChromaPredDcLeft_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -1370,12 +1370,12 @@
 WELS_EXTERN WelsIChromaPredDcLeft_mmx
 WelsIChromaPredDcLeft_mmx:
 	push ebx
-	push esi	
+	push esi
 	%define PUSH_SIZE 8
 	mov esi, [esp+PUSH_SIZE+4]	; pPred
 	mov ecx, [esp+PUSH_SIZE+8]	; kiStride
 	mov eax, esi
-	; for left	
+	; for left
 	dec eax
 	xor ebx, ebx
 	xor edx, edx
@@ -1384,7 +1384,7 @@
 	add ebx, edx
 	lea eax, [eax+2*ecx]
 	mov dl, [eax]
-	add ebx, edx	
+	add ebx, edx
 	mov dl, [eax+ecx]
 	add ebx, edx
 	add ebx, 02h
@@ -1451,7 +1451,7 @@
 	movdqa xmm6, [sse2_wd_0x02]
 	paddw xmm0, xmm6
 	psraw xmm0, 02h
-	packuswb xmm0, xmm7	
+	packuswb xmm0, xmm7
 	lea ebx, [2*ecx+ecx]
 	movq [eax], xmm0
 	movq [eax+ecx], xmm0
@@ -1463,10 +1463,10 @@
 	movq [eax+2*ecx], xmm0
 	movq [eax+ebx], xmm0
 	%undef PUSH_SIZE
-	pop ebx	
+	pop ebx
 	ret
 
-	
+
 ALIGN 16
 ;*******************************************************************************
 ;	void_t WelsIChromaPredDcNA_mmx(uint8_t *pPred, const int32_t kiStride)
@@ -1495,4 +1495,4 @@
 	ret
 
 
-	
+
--- a/codec/decoder/core/asm/mb_copy.asm
+++ b/codec/decoder/core/asm/mb_copy.asm
@@ -37,7 +37,7 @@
 ;*  History
 ;*      15/09/2009 Created
 ;*		12/28/2009 Modified with larger throughput
-;*		12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2, 
+;*		12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2, WelsCopy16x8NotAligned_sse2,
 ;*				   WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
 ;*
 ;*
@@ -84,7 +84,7 @@
 ;                           int iHeight );
 ;*******************************************************************************
 PixelAvgWidthEq4_mmx:
-   
+
     push        esi
     push        edi
     push        ebp
@@ -102,7 +102,7 @@
 	movd        mm0, [ebp]
     pavgb       mm0, [esi]
     movd        [edi], mm0
-   
+
     dec         ebx
     lea         edi, [edi+eax]
     lea         esi, [esi+ecx]
@@ -115,7 +115,7 @@
     pop         edi
     pop         esi
     ret
-                          
+
 ALIGN 16
 ;*******************************************************************************
 ; void_t PixelAvgWidthEq8_mmx( uint8_t *pDst,  int iDstStride,
@@ -124,7 +124,7 @@
 ;                           int iHeight );
 ;*******************************************************************************
 PixelAvgWidthEq8_mmx:
-    
+
     push        esi
     push        edi
     push        ebp
@@ -145,14 +145,14 @@
     movq        mm0, [esi+ecx]
     pavgb       mm0, [ebp+edx]
     movq		[edi+eax], mm0
-    
+
     lea			esi,  [esi+2*ecx]
     lea			ebp, [ebp+2*edx]
     lea			edi,  [edi+2*eax]
-    
+
     sub           ebx, 2
     jnz         .height_loop
-	
+
 	WELSEMMS
     pop         ebx
     pop         ebp
@@ -174,8 +174,8 @@
     push        edi
     push        ebp
     push        ebx
-    
 
+
     mov         edi, [esp+20]       ; pDst
     mov         eax, [esp+24]       ; iDstStride
     mov         esi, [esp+28]       ; pSrcA
@@ -188,28 +188,28 @@
 	movdqu      xmm0, [esi]
 	pavgb         xmm0, [ebp]
     movdqu      [edi], xmm0
-    
+
 	movdqu      xmm0, [esi+ecx]
 	pavgb         xmm0, [ebp+edx]
     movdqu      [edi+eax], xmm0
-	
+
 	movdqu      xmm0, [esi+2*ecx]
 	pavgb         xmm0, [ebp+2*edx]
     movdqu      [edi+2*eax], xmm0
-    
+
     lea              esi,  [esi+2*ecx]
     lea			   ebp, [ebp+2*edx]
     lea			   edi,  [edi+2*eax]
-     
+
 	movdqu      xmm0, [esi+ecx]
 	pavgb         xmm0, [ebp+edx]
     movdqu      [edi+eax], xmm0
-    
+
     lea              esi,  [esi+2*ecx]
     lea			   ebp, [ebp+2*edx]
     lea			   edi,  [edi+2*eax]
-	    
-    
+
+
     sub         ebx, 4
     jne         .height_loop
 
@@ -232,7 +232,7 @@
     push    edi
     push    ebx
 
-    
+
     mov esi,  [esp+16]
     mov eax, [esp+20]
     mov edi,  [esp+24]
@@ -242,12 +242,12 @@
 .height_loop:
 	mov ebx, [esi]
 	mov [edi], ebx
-	
+
 	add esi, eax
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	WELSEMMS   
+	WELSEMMS
 	pop	   ebx
     pop     edi
     pop     esi
@@ -275,12 +275,11 @@
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	
-	WELSEMMS   
+
+	WELSEMMS
     pop     edi
     pop     esi
     ret
-	
 
 
 
@@ -288,6 +287,7 @@
 
 
 
+
 ALIGN 16
 ;*******************************************************************************
 ;   void_t McCopyWidthEq16_sse2( uint8_t *pSrc, int iSrcStride, uint8_t *pDst, int iDstStride, int iHeight )
@@ -308,7 +308,7 @@
     push    edi
 
     mov     esi, [esp+12]       ; pSrc
-    mov     eax, [esp+16]       ; iSrcStride    
+    mov     eax, [esp+16]       ; iSrcStride
     mov     edi, [esp+20]       ; pDst
     mov     edx, [esp+24]       ; iDstStride
     mov     ecx, [esp+28]       ; iHeight
@@ -324,7 +324,7 @@
     lea     esi, [esi+eax*2]
     lea     edi, [edi+edx*2]
     jnz     .height_loop
-  
+
     pop     edi
     pop     esi
     ret
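
The PixelAvgWidthEq4/8/16 routines above are the bi-prediction merge: each output pixel is the rounded average of the two reference blocks, which is exactly what pavgb computes lane-wise. Scalar sketch (widths other than 4/8/16 are not handled by the asm):

    /* dst = rounded average of two prediction blocks */
    static void pixel_avg_ref(unsigned char *pDst, int iDstStride,
                              const unsigned char *pSrcA, int iSrcAStride,
                              const unsigned char *pSrcB, int iSrcBStride,
                              int iWidth, int iHeight) {
        for (int y = 0; y < iHeight; y++) {
            for (int x = 0; x < iWidth; x++)
                pDst[x] = (unsigned char)((pSrcA[x] + pSrcB[x] + 1) >> 1);
            pDst += iDstStride; pSrcA += iSrcAStride; pSrcB += iSrcBStride;
        }
    }
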
--- a/codec/decoder/core/asm/mc_chroma.asm
+++ b/codec/decoder/core/asm/mc_chroma.asm
@@ -69,11 +69,11 @@
 
 ALIGN 16
 ;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src, 
-;							int32_t iSrcStride, 
-;							uint8_t *pDst, 
-;							int32_t iDstStride, 
-;							uint8_t *pABCD, 
+; void McChromaWidthEq4_mmx( uint8_t *src,
+;							int32_t iSrcStride,
+;							uint8_t *pDst,
+;							int32_t iDstStride,
+;							uint8_t *pABCD,
 ;							int32_t iHeigh );
 ;*******************************************************************************
 WELS_EXTERN McChromaWidthEq4_mmx
@@ -81,29 +81,29 @@
 	push esi
 	push edi
 	push ebx
-	
+
 	mov eax, [esp +12 + 20]
 	movd mm3, [eax]
 	WELS_Zero mm7
 	punpcklbw mm3, mm3
 	movq      mm4, mm3
-	punpcklwd mm3, mm3       
-	punpckhwd mm4, mm4		 
-	
+	punpcklwd mm3, mm3
+	punpckhwd mm4, mm4
+
 	movq	  mm5, mm3
 	punpcklbw mm3, mm7
 	punpckhbw mm5, mm7
-	
+
 	movq	  mm6, mm4
 	punpcklbw mm4, mm7
 	punpckhbw mm6, mm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
+
+	mov esi, [esp +12+ 4]
+	mov eax, [esp + 12 + 8]
+	mov edi, [esp + 12 + 12]
+	mov edx, [esp + 12 + 16]
+    mov ecx, [esp + 12 + 24]
+
 	lea ebx, [esi + eax]
 	movd mm0, [esi]
 	movd mm1, [esi+1]
@@ -110,17 +110,17 @@
 	punpcklbw mm0, mm7
 	punpcklbw mm1, mm7
 .xloop:
-	
+
 	pmullw mm0, mm3
 	pmullw mm1, mm5
 	paddw  mm0, mm1
-	
+
 	movd  mm1, [ebx]
 	punpcklbw mm1, mm7
 	movq mm2, mm1
 	pmullw mm1, mm4
 	paddw mm0, mm1
-	
+
 	movd mm1, [ebx+1]
 	punpcklbw mm1, mm7
 	movq mm7, mm1
@@ -130,13 +130,13 @@
 
 	paddw mm0, [h264_d0x20_mmx]
 	psrlw mm0, 6
-	
+
 	WELS_Zero mm7
 	packuswb mm0, mm7
-	movd [edi], mm0	
+	movd [edi], mm0
 
 	movq mm0, mm2
-	
+
 	lea edi, [edi +edx  ]
 	lea ebx, [ebx + eax]
 
@@ -151,11 +151,11 @@
 
 ALIGN 16
 ;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc, 
-;						int32_t iSrcStride, 
-;						uint8_t *pDst, 
-;						int32_t iDstStride, 
-;						uint8_t *pABCD, 
+; void McChromaWidthEq8_sse2( uint8_t *pSrc,
+;						int32_t iSrcStride,
+;						uint8_t *pDst,
+;						int32_t iDstStride,
+;						uint8_t *pABCD,
 ;						int32_t iheigh );
 ;*******************************************************************************
 WELS_EXTERN McChromaWidthEq8_sse2
@@ -163,30 +163,30 @@
 	push esi
 	push edi
 	push ebx
-	
+
 	mov eax, [esp +12 + 20]
 	movd xmm3, [eax]
 	WELS_Zero xmm7
 	punpcklbw  xmm3, xmm3
 	punpcklwd  xmm3, xmm3
-	
+
 	movdqa	   xmm4, xmm3
 	punpckldq  xmm3, xmm3
 	punpckhdq  xmm4, xmm4
 	movdqa     xmm5, xmm3
 	movdqa	   xmm6, xmm4
-	
+
 	punpcklbw  xmm3, xmm7
 	punpckhbw  xmm5, xmm7
 	punpcklbw  xmm4, xmm7
 	punpckhbw  xmm6, xmm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
+
+	mov esi, [esp +12+ 4]
+	mov eax, [esp + 12 + 8]
+	mov edi, [esp + 12 + 12]
+	mov edx, [esp + 12 + 16]
+    mov ecx, [esp + 12 + 24]
+
 	lea ebx, [esi + eax]
 	movq xmm0, [esi]
 	movq xmm1, [esi+1]
@@ -193,17 +193,17 @@
 	punpcklbw xmm0, xmm7
 	punpcklbw xmm1, xmm7
 .xloop:
-	
+
 	pmullw xmm0, xmm3
 	pmullw xmm1, xmm5
 	paddw  xmm0, xmm1
-	
+
 	movq  xmm1, [ebx]
 	punpcklbw xmm1, xmm7
 	movdqa xmm2, xmm1
 	pmullw xmm1, xmm4
 	paddw xmm0, xmm1
-	
+
 	movq xmm1, [ebx+1]
 	punpcklbw xmm1, xmm7
 	movdqa xmm7, xmm1
@@ -213,19 +213,19 @@
 
 	paddw xmm0, [h264_d0x20_sse2]
 	psrlw xmm0, 6
-	
+
 	WELS_Zero xmm7
 	packuswb xmm0, xmm7
-	movq [edi], xmm0	
+	movq [edi], xmm0
 
 	movdqa xmm0, xmm2
-	
+
 	lea edi, [edi +edx  ]
 	lea ebx, [ebx + eax]
 
 	dec ecx
 	jnz near .xloop
-	
+
 	pop ebx
 	pop edi
 	pop esi
@@ -237,8 +237,8 @@
 ALIGN 16
 ;***********************************************************************
 ; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-;						 int32_t iSrcStride, 
-;                        uint8_t *pDst,  
+;						 int32_t iSrcStride,
+;                        uint8_t *pDst,
 ;                        int32_t iDstStride,
 ;                        uint8_t *pABCD,
 ;					     int32_t iHeigh);
@@ -248,23 +248,23 @@
 	push ebx
 	push esi
 	push edi
-		
+
 	mov eax, [esp + 12 + 20]
 
     pxor      xmm7, xmm7
-    movd   xmm5, [eax]   
-    punpcklwd xmm5, xmm5  
-    punpckldq xmm5, xmm5 
+    movd   xmm5, [eax]
+    punpcklwd xmm5, xmm5
+    punpckldq xmm5, xmm5
     movdqa    xmm6, xmm5
     punpcklqdq xmm5, xmm5
-    punpckhqdq xmm6, xmm6    
-    
-	mov eax, [esp + 12 + 4]   
-	mov edx, [esp + 12 + 8]   
-	mov esi, [esp + 12 + 12]  
-	mov edi, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-    
+    punpckhqdq xmm6, xmm6
+
+	mov eax, [esp + 12 + 4]
+	mov edx, [esp + 12 + 8]
+	mov esi, [esp + 12 + 12]
+	mov edi, [esp + 12 + 16]
+    mov ecx, [esp + 12 + 24]
+
     sub esi, edi
     sub esi, edi
 	movdqa xmm7, [h264_d0x20_sse2]
@@ -273,16 +273,16 @@
 	movdqa xmm1, xmm0
 	psrldq xmm1, 1
 	punpcklbw xmm0, xmm1
-	
-.hloop_chroma:	
+
+.hloop_chroma:
 	lea	esi, [esi+2*edi]
-	
+
 	movdqu xmm2, [eax+edx]
 	movdqa xmm3, xmm2
 	psrldq xmm3, 1
 	punpcklbw xmm2, xmm3
 	movdqa      xmm4, xmm2
-	
+
     pmaddubsw  xmm0, xmm5
     pmaddubsw  xmm2, xmm6
     paddw      xmm0, xmm2
@@ -289,8 +289,8 @@
     paddw      xmm0, xmm7
 	psrlw      xmm0, 6
     packuswb   xmm0, xmm0
-    movq       [esi],xmm0	
-    
+    movq       [esi],xmm0
+
     lea eax, [eax+2*edx]
     movdqu xmm2, [eax]
     movdqa xmm3, xmm2
@@ -297,7 +297,7 @@
     psrldq xmm3, 1
     punpcklbw xmm2, xmm3
     movdqa      xmm0, xmm2
-    
+
     pmaddubsw  xmm4, xmm5
     pmaddubsw  xmm2, xmm6
     paddw      xmm4, xmm2
@@ -304,8 +304,8 @@
     paddw      xmm4, xmm7
 	psrlw      xmm4, 6
     packuswb   xmm4, xmm4
-    movq       [esi+edi],xmm4	
-	
+    movq       [esi+edi],xmm4
+
 	sub ecx, 2
 	jnz .hloop_chroma
 	pop edi
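
All three McChroma variants compute the H.264 bilinear chroma interpolation; pABCD carries the four pre-computed weights (A+B+C+D = 64) and h264_d0x20 the +32 rounding term before the >>6. A scalar sketch, assuming pABCD is laid out A,B,C,D in memory (the splat order in the asm suggests this, but it is an assumption):

    /* bilinear chroma MC: out = (A*s00 + B*s01 + C*s10 + D*s11 + 32) >> 6 */
    static void mc_chroma_ref(const unsigned char *pSrc, int iSrcStride,
                              unsigned char *pDst, int iDstStride,
                              const unsigned char *pABCD, int iWidth, int iHeight) {
        int A = pABCD[0], B = pABCD[1], C = pABCD[2], D = pABCD[3];
        for (int y = 0; y < iHeight; y++) {
            for (int x = 0; x < iWidth; x++)
                pDst[x] = (unsigned char)((A * pSrc[x] + B * pSrc[x + 1]
                        + C * pSrc[x + iSrcStride] + D * pSrc[x + iSrcStride + 1]
                        + 32) >> 6);      /* no clip needed: weights sum to 64 */
            pSrc += iSrcStride; pDst += iDstStride;
        }
    }
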
--- a/codec/decoder/core/asm/mc_luma.asm
+++ b/codec/decoder/core/asm/mc_luma.asm
@@ -69,16 +69,16 @@
 
 ALIGN 16
 ;*******************************************************************************
-; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc, 
-;                       int iSrcStride, 
-;						uint8_t *pDst, 
-;						int iDstStride, 
+; void_t McHorVer20WidthEq4_mmx( uint8_t *pSrc,
+;                       int iSrcStride,
+;						uint8_t *pDst,
+;						int iDstStride,
 ;						int iHeight)
 ;*******************************************************************************
 McHorVer20WidthEq4_mmx:
 	push esi
 	push edi
-	
+
 	mov  esi, [esp+12]
 	mov eax, [esp+16]
 	mov edi, [esp+20]
@@ -100,7 +100,7 @@
 	punpcklbw mm4, mm7
 	movd mm5, [esi+3]
 	punpcklbw mm5, mm7
-	
+
 	paddw mm2, mm3
 	paddw mm4, mm5
 	psllw mm4, 2
@@ -113,12 +113,12 @@
 	psraw mm0, 5
 	packuswb mm0, mm7
 	movd [edi], mm0
-	
+
 	add esi, eax
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	
+
 	WELSEMMS
 	pop edi
 	pop esi
@@ -181,8 +181,8 @@
 
 ALIGN 16
 ;***********************************************************************
-; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc, 
-;                       int16_t iSrcStride, 
+; void_t McHorVer22Width8HorFirst_sse2(int16_t *pSrc,
+;                       int16_t iSrcStride,
 ;						uint8_t *pDst,
 ;						int32_t iDstStride
 ;						int32_t iHeight
@@ -197,11 +197,11 @@
 	mov edi, [esp+24]		;pDst
 	mov edx, [esp+28]	;iDstStride
 	mov ebx, [esp+32]	;iHeight
-	pxor xmm7, xmm7	
-	
+	pxor xmm7, xmm7
+
 	sub esi, eax				; the vertical filter needs 5 extra source lines; back up 2 rows
 	sub esi, eax
-		
+
 .yloop_width_8:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
@@ -215,7 +215,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -225,7 +225,7 @@
 	psllw xmm4, 2
 	paddw xmm0, xmm4
 	movdqa [edi], xmm0
-		
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@@ -238,8 +238,8 @@
 ALIGN 16
 ;***********************************************************************
 ;void_t McHorVer22VerLast_sse2(
-;											uint8_t *pSrc, 
-;											int32_t pSrcStride, 
+;											uint8_t *pSrc,
+;											int32_t pSrcStride,
 ;											uint8_t * pDst,
 ;											int32_t iDstStride,
 ;											int32_t iWidth,
@@ -250,17 +250,17 @@
 	paddw  %1, %6
 	movdqa %7, %2
 	movdqa %8, %3
-	
-	
+
+
 	paddw %7, %5
 	paddw %8, %4
-	
-	psubw  %1, %7   
-	psraw   %1, 2	  
-	paddw  %1, %8   
-	psubw  %1, %7 
-	psraw   %1, 2	
-	paddw  %8, %1   
+
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %1, %8
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %8, %1
 	paddw  %8, [h264_mc_hc_32]
 	psraw   %8, 6
 	packuswb %8, %8
@@ -272,15 +272,15 @@
 	push edi
 	push ebx
 	push ebp
-	
+
 	mov esi, [esp+20]
 	mov eax, [esp+24]
 	mov edi, [esp+28]
 	mov edx, [esp+32]
 	mov ebx, [esp+36]
-	mov ecx, [esp+40]	
-	shr ebx, 3	
-	
+	mov ecx, [esp+40]
+	shr ebx, 3
+
 .width_loop:
 	movdqa xmm0, [esi]
 	movdqa xmm1, [esi+eax]
@@ -290,12 +290,12 @@
 	lea esi, [esi+2*eax]
 	movdqa xmm4, [esi]
 	movdqa xmm5, [esi+eax]
-	
+
 	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	lea esi, [esi+2*eax]
 	movdqa xmm6, [esi]
-	
+
 	movdqa xmm0, xmm1
 	movdqa xmm1, xmm2
 	movdqa xmm2, xmm3
@@ -302,61 +302,61 @@
 	movdqa xmm3, xmm4
 	movdqa xmm4, xmm5
 	movdqa xmm5, xmm6
-	
+
 	add edi, edx
-	sub esi, eax		
-	
+	sub esi, eax
+
 .start:
 	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm6, [esi]
 	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm7, [esi+eax]
 	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm0, [esi]
 	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm1, [esi+eax]
 	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm2, [esi]
 	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm3, [esi+eax]
 	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm4, [esi]
 	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm5, [esi+eax]
 	jmp near .start
-	
+
 .x_loop_dec:
 	dec ebx
 	jz near .exit
@@ -366,9 +366,9 @@
 	add esi, 16
 	add edi, 8
 	jmp .width_loop
-	
-	
-	
+
+
+
 .exit:
 	pop ebp
 	pop ebx
@@ -379,10 +379,10 @@
 
 ALIGN 16
 ;*******************************************************************************
-; void_t McHorVer20WidthEq8_sse2(  uint8_t *pSrc, 
-;                       int iSrcStride, 
-;												uint8_t *pDst, 
-;												int iDstStride, 
+; void_t McHorVer20WidthEq8_sse2(  uint8_t *pSrc,
+;                       int iSrcStride,
+;												uint8_t *pDst,
+;												int iDstStride,
 ;												int iHeight,
 ;                      );
 ;*******************************************************************************
@@ -389,18 +389,18 @@
 McHorVer20WidthEq8_sse2:
 	push	esi
 	push	edi
-	
+
 	mov esi, [esp + 12]         ;pSrc
 	mov eax, [esp + 16]         ;iSrcStride
 	mov edi, [esp + 20]         ;pDst
 	mov ecx, [esp + 28]         ;iHeight
 	mov edx, [esp + 24]			;iDstStride
-	
+
 	lea esi, [esi-2]            ;pSrc -= 2;
-	
+
 	pxor xmm7, xmm7
 	movdqa xmm6, [h264_w0x10_1]
-.y_loop:	
+.y_loop:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@@ -413,7 +413,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -424,7 +424,7 @@
 	paddw xmm0, xmm4
 	paddw xmm0, xmm6
 	psraw xmm0, 5
-	
+
 	packuswb xmm0, xmm7
 	movq [edi], xmm0
 
@@ -432,17 +432,17 @@
 	lea esi, [esi+eax]
 	dec ecx
 	jnz near .y_loop
-	
+
 	pop edi
 	pop esi
 	ret
-	
+
 ALIGN 16
 ;*******************************************************************************
-; void_t McHorVer20WidthEq16_sse2(  uint8_t *pSrc, 
-;                       int iSrcStride, 
-;												uint8_t *pDst, 
-;												int iDstStride, 
+; void_t McHorVer20WidthEq16_sse2(  uint8_t *pSrc,
+;                       int iSrcStride,
+;												uint8_t *pDst,
+;												int iDstStride,
 ;												int iHeight,
 ;                      );
 ;*******************************************************************************
@@ -449,20 +449,20 @@
 McHorVer20WidthEq16_sse2:
 	push	esi
 	push	edi
-	
 
+
 	mov esi, [esp + 12]         ;pSrc
 	mov eax, [esp + 16]         ;iSrcStride
 	mov edi, [esp + 20]         ;pDst
 	mov ecx, [esp + 28]         ;iHeight
 	mov edx, [esp + 24]			;iDstStride
-	
+
 	lea esi, [esi-2]            ;pSrc -= 2;
-	
+
 	pxor xmm7, xmm7
 	movdqa xmm6, [h264_w0x10_1]
 .y_loop:
-	
+
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@@ -475,7 +475,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -501,7 +501,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3+8]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -514,9 +514,9 @@
 	psraw xmm0, 5
 	packuswb xmm0, xmm7
 	movq [edi+8], xmm0
-	
-	lea edi, [edi+edx]	
-	lea esi, [esi+eax]	
+
+	lea edi, [edi+edx]
+	lea esi, [esi+eax]
 	dec ecx
 	jnz near .y_loop
 	pop edi
@@ -525,10 +525,10 @@
 
 
 ;*******************************************************************************
-; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc, 
-;                       int iSrcStride, 
-;                       uint8_t *pDst, 
-;                       int iDstStride, 
+; void_t McHorVer02WidthEq8_sse2( uint8_t *pSrc,
+;                       int iSrcStride,
+;                       uint8_t *pDst,
+;                       int iDstStride,
 ;                       int iHeight )
 ;*******************************************************************************
 ALIGN 16
@@ -535,7 +535,7 @@
 McHorVer02WidthEq8_sse2:
 	push esi
 	push edi
-	
+
 	mov esi, [esp + 12]           ;pSrc
 	mov edx, [esp + 16]	          ;iSrcStride
 	mov edi, [esp + 20]           ;pDst
@@ -546,7 +546,7 @@
 	sub esi, edx
 
 	WELS_Zero xmm7
-			
+
 	SSE_LOAD_8P xmm0, xmm7, [esi]
 	SSE_LOAD_8P xmm1, xmm7, [esi+edx]
 	lea esi, [esi+2*edx]
@@ -555,8 +555,8 @@
 	lea esi, [esi+2*edx]
 	SSE_LOAD_8P xmm4, xmm7, [esi]
 	SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-	
-.start:	
+
+.start:
 	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .xx_exit
@@ -566,7 +566,7 @@
 	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
 	dec ecx
 	jz near .xx_exit
-	
+
 	lea edi, [edi+2*eax]
 	SSE_LOAD_8P xmm7, xmm0, [esi+edx]
 	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
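
Every McHorVer20/02 variant in this file is the six-tap half-pel filter (1,-5,20,20,-5,1); the paddw/psllw chains build a+f+20(c+d)-5(b+e) without multiplies, and h264_w0x10 supplies the +16 rounding before the >>5 (the two-pass HorVer22 keeps the horizontal result at 16 bits and applies the final rounding at the vertical stage). Scalar sketch of the horizontal case:

    static unsigned char clip255(int v) { return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v); }
    /* H.264 half-pel luma MC, horizontal: taps (1,-5,20,20,-5,1), then (+16)>>5 */
    static void mc_hor_ver20_ref(const unsigned char *pSrc, int iSrcStride,
                                 unsigned char *pDst, int iDstStride,
                                 int iWidth, int iHeight) {
        for (int y = 0; y < iHeight; y++) {
            for (int x = 0; x < iWidth; x++) {
                const unsigned char *s = pSrc + x - 2;   /* the lea esi,[esi-2] above */
                int v = s[0] + s[5] - 5 * (s[1] + s[4]) + 20 * (s[2] + s[3]);
                pDst[x] = clip255((v + 16) >> 5);        /* packuswb does the clip */
            }
            pSrc += iSrcStride; pDst += iDstStride;
        }
    }
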
--- a/codec/decoder/core/asm/memzero.asm
+++ b/codec/decoder/core/asm/memzero.asm
@@ -32,8 +32,8 @@
 ;*  memzero.asm
 ;*
 ;*  Abstract
-;*      
 ;*
+;*
 ;*  History
 ;*      9/16/2009 Created
 ;*
@@ -47,8 +47,8 @@
 ; Code
 ;***********************************************************************
 
-SECTION .text			
-		
+SECTION .text
+
 ALIGN 16
 ;***********************************************************************
 ;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
@@ -57,7 +57,7 @@
 WelsPrefetchZero_mmx:
 	mov  eax,[esp+4]
 	prefetchnta [eax]
-	ret 			
+	ret
 
 
 ALIGN 16
@@ -69,7 +69,7 @@
 		mov		eax,	[esp + 4]          ; dst
 		mov		ecx,	[esp + 8]
 		neg		ecx
-			
+
 		pxor	xmm0,		xmm0
 .memzeroa64_sse2_loops:
 		movdqa	[eax],		xmm0
@@ -77,12 +77,12 @@
 		movdqa	[eax+32],	xmm0
 		movdqa	[eax+48],	xmm0
 		add		eax, 0x40
-		
+
 		add ecx, 0x40
 		jnz near .memzeroa64_sse2_loops
-			
-		ret	
 
+		ret
+
 ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -92,7 +92,7 @@
 		mov		eax,	[esp + 4]          ; dst
 		mov		ecx,	[esp + 8]
 		neg		ecx
-			
+
 		pxor	mm0,		mm0
 .memzero64_mmx_loops:
 		movq	[eax],		mm0
@@ -102,16 +102,16 @@
 		movq	[eax+32],	mm0
 		movq	[eax+40],	mm0
 		movq	[eax+48],	mm0
-		movq	[eax+56],	mm0		
+		movq	[eax+56],	mm0
 		add		eax,		0x40
-		
+
 		add ecx, 0x40
 		jnz near .memzero64_mmx_loops
-			
-		WELSEMMS	
-		ret	
-	
-ALIGN 16		
+
+		WELSEMMS
+		ret
+
+ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
 ;***********************************************************************
@@ -119,17 +119,17 @@
 WelsSetMemZeroSize8_mmx:
 		mov		eax,	[esp + 4]		; dst
 		mov		ecx,	[esp + 8]		; size
-		neg		ecx			
+		neg		ecx
 		pxor	mm0,		mm0
-		
+
 .memzero8_mmx_loops:
 		movq	[eax],		mm0
 		add		eax,		0x08
-	
+
 		add		ecx,		0x08
 		jnz near .memzero8_mmx_loops
-		
-		WELSEMMS	
-		ret	
 
-							
+		WELSEMMS
+		ret
+
+
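
These zeroing loops negate the byte count up front so that the single add of 0x40 both advances the counter and sets the zero flag when the region is done, saving a separate cmp. The same control flow in C (assuming, as the asm does, that the size is a multiple of the chunk width):

    /* zero `size` bytes, 64 at a time; size must be a multiple of 64 */
    static void mem_zero64_ref(unsigned char *dst, int size) {
        int i = -size;                        /* the `neg ecx` above */
        do {
            for (int k = 0; k < 64; k++) dst[k] = 0;
            dst += 64;
            i += 64;                          /* in asm, this add also sets ZF */
        } while (i != 0);
    }
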
--- a/codec/decoder/plus/res/welsdec.rc
+++ b/codec/decoder/plus/res/welsdec.rc
@@ -27,18 +27,18 @@
 // TEXTINCLUDE
 //
 
-1 TEXTINCLUDE 
+1 TEXTINCLUDE
 BEGIN
     "resource.h\0"
 END
 
-2 TEXTINCLUDE 
+2 TEXTINCLUDE
 BEGIN
     "#include ""afxres.h""\r\n"
     "\0"
 END
 
-3 TEXTINCLUDE 
+3 TEXTINCLUDE
 BEGIN
     "\r\n"
     "\0"
--- a/codec/encoder/core/asm/asm_inc.asm
+++ b/codec/encoder/core/asm/asm_inc.asm
@@ -43,7 +43,7 @@
 ; Options, for DEBUG
 ;***********************************************************************
 
-%if 1 
+%if 1
 	%define MOVDQ movdqa
 %else
 	%define MOVDQ movdqu
@@ -58,7 +58,7 @@
 BITS 32
 
 ;***********************************************************************
-; Macros 
+; Macros
 ;***********************************************************************
 
 %macro WELS_EXTERN 1
@@ -74,7 +74,7 @@
 	pxor        %2, %2
     psubw       %2, %1
     pmaxsw      %1, %2
-%endmacro 	
+%endmacro
 
 %macro MMX_XSwap  4
     movq		%4, %2
@@ -105,7 +105,7 @@
     SSE2_XSawp qdq, %5, %2, %3
 %endmacro
 
-;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4 
+;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4
 %macro SSE2_TransTwo4x4W 5
     SSE2_XSawp wd,  %1, %2, %5
     SSE2_XSawp wd,  %3, %4, %2
@@ -125,26 +125,26 @@
 	movdqa	%6, %9
 	movdqa	%9, %4
 	SSE2_XSawp bw,  %7, %6, %4
-	
-	SSE2_XSawp wd,  %1, %3, %6	
+
+	SSE2_XSawp wd,  %1, %3, %6
 	SSE2_XSawp wd,  %8, %2, %3
 	SSE2_XSawp wd,  %5, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %3	
+	movdqa	%9, %3
 	SSE2_XSawp wd,  %7, %4, %3
-	
-	SSE2_XSawp dq,  %1, %5, %4	
+
+	SSE2_XSawp dq,  %1, %5, %4
 	SSE2_XSawp dq,  %6, %2, %5
 	SSE2_XSawp dq,  %8, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %5		
+	movdqa	%9, %5
 	SSE2_XSawp dq,  %7, %3, %5
-	
+
 	SSE2_XSawp qdq,  %1, %8, %3
 	SSE2_XSawp qdq,  %4, %2, %8
 	SSE2_XSawp qdq,  %6, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %1		
+	movdqa	%9, %1
 	SSE2_XSawp qdq,  %7, %5, %1
 	movdqa	%5, %9
 %endmacro
@@ -170,9 +170,9 @@
 %macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
 	mov %3h, %3l
 	movd %1, e%3x		; i.e, 1% = eax (=b0)
-	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0	
-	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0	
-%endmacro  
+	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
+	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
 
 ;copy a dw into a xmm for 8 times
 %macro  SSE2_Copy8Times 2
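
The WELS_ABS macro above is a branchless per-lane absolute value: negate into a scratch register, then take the signed maximum, i.e. abs(x) = max(x, -x). Scalar form, including the one corner case the SIMD version shares:

    /* branchless 16-bit abs as in WELS_ABS (pxor/psubw/pmaxsw);
     * like pmaxsw, this maps -32768 to -32768 since +32768 is unrepresentable */
    static short abs16_ref(short x) {
        short neg = (short)-x;
        return x > neg ? x : neg;
    }
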
--- a/codec/encoder/core/asm/coeff.asm
+++ b/codec/encoder/core/asm/coeff.asm
@@ -318,9 +318,9 @@
 SECTION .text
 
 
-	
+
 ;***********************************************************************
-;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx); 
+;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
 ;***********************************************************************
 WELS_EXTERN CavlcParamCal_sse2
 CavlcParamCal_sse2:
@@ -327,16 +327,16 @@
 	push ebx
 	push edi
 	push esi
-	
+
 	mov			eax,	[esp+16]	;coffLevel
 	mov			edi,	[esp+24]	;Level
 	mov			ebx,	[esp+32]	;endIdx
 	cmp			ebx,	3
-	jne			.Level16	
+	jne			.Level16
 	pxor		xmm1,	xmm1
 	movq		xmm0,	[eax]	; removed QWORD
-	jmp			.Cal_begin		
-.Level16:	
+	jmp			.Cal_begin
+.Level16:
 	movdqa		xmm0,	[eax]
 	movdqa		xmm1,	[eax+16]
 .Cal_begin:
@@ -354,7 +354,7 @@
 	pcmpeqw		xmm7,	xmm7	;generate -1
     mov			ebx,	0xff
     ;pinsrw		xmm6,	ebx,	3
-   
+
     mov       bl,   dh
 
 	lea       ebx,  [byte_1pos_table+8*ebx]
@@ -362,7 +362,7 @@
 	pextrw    ecx,  xmm0, 3
 	shr       ecx,  8
     mov       dh,   cl
- 
+
 .loopHighFind0:
     cmp       ecx,   0
     je        .loopHighFind0End
@@ -372,7 +372,7 @@
     add       esi, 8
     mov       esi, [eax+2*esi]
     mov       [edi], si
-    add       edi,   2 
+    add       edi,   2
     ;add       ebx,   1
     inc		  ebx
     dec       ecx
@@ -403,8 +403,8 @@
 	;and       edx, 0xff
 	movzx	  edx,	byte [ebx]
 	mov       edx, [eax+2*edx]
-	mov       [edi], dx 
-	add       edi,   2 
+	mov       [edi], dx
+	add       edi,   2
 	;add       ebx,   1
 	inc		  ebx
     dec       esi
@@ -436,8 +436,8 @@
     psllq    xmm0, xmm3
     psrlq    xmm0, xmm3
     movdqa   xmm4, xmm1
-    psllq    xmm1, xmm2 
-    psrlq    xmm4, xmm3 
+    psllq    xmm1, xmm2
+    psrlq    xmm4, xmm3
     punpcklqdq xmm1, xmm4
     por      xmm0,  xmm1
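
CavlcParamCal_sse2 locates the nonzero coefficients with pcmpeqw/pmovmskb and the byte_1pos_table lookup (8 bytes per possible bit mask); in scalar terms its job is the CAVLC run/level scan. A hypothetical C rendering of that contract — the exact run[] convention is an assumption read off the prototype comment, not verified against callers:

    /* hypothetical scalar equivalent; run[] semantics are assumed */
    static int cavlc_param_cal_ref(const short *coffLevel, unsigned char *run,
                                   short *Level, int *total_coeffs, int endIdx) {
        int n = 0, zeros = 0;
        for (int i = endIdx; i >= 0; i--) {
            if (coffLevel[i] != 0) {
                Level[n] = coffLevel[i];          /* levels, high index first   */
                run[n] = (unsigned char)zeros;    /* zeros since previous level */
                zeros = 0;
                n++;
            } else if (n > 0) {
                zeros++;
            }
        }
        *total_coeffs = n;
        return n;
    }
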
 
--- a/codec/encoder/core/asm/cpuid.asm
+++ b/codec/encoder/core/asm/cpuid.asm
@@ -84,12 +84,12 @@
 ;   void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
 ;****************************************************************************************************
 WelsCPUId:
-	push	ebx	
+	push	ebx
 	push	edi
-	
+
 	mov     eax, [esp+12]	; operating index
     cpuid					; cpuid
-	
+
 	; processing various information return
 	mov     edi, [esp+16]
     mov     [edi], eax
@@ -100,10 +100,10 @@
     mov     edi, [esp+28]
     mov     [edi], edx
 
-	pop		edi	
+	pop		edi
     pop     ebx
 	ret
-	
+
 WELS_EXTERN WelsCPUSupportAVX
 ; must be called after CPUID leaf 1, once the eax/ecx feature flags are available
 ALIGN 16
@@ -139,7 +139,7 @@
 WelsCPUSupportFMA:
 	mov eax, [esp+4]
 	mov ecx, [esp+8]
-	
+
 	; see the FMA detection procedure described in Intel's AVX manual
 	and ecx, 018001000H
 	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
@@ -153,7 +153,7 @@
 	mov eax, 1
 	ret
 fma_not_supported:
-	mov eax, 0	
+	mov eax, 0
 	ret
 
 WELS_EXTERN WelsEmms
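
WelsCPUSupportFMA checks ECX of CPUID leaf 1 against 018001000H, i.e. OSXSAVE (bit 27), AVX (bit 28) and FMA (bit 12) together. The same test with GCC/Clang's <cpuid.h> (a sketch; a complete check would also query XGETBV for OS-enabled YMM state, which is what the OSXSAVE bit gates):

    #include <cpuid.h>

    /* 1 if CPUID.1:ECX reports OSXSAVE + AVX + FMA, matching the asm's mask */
    static int cpu_supports_fma_ref(void) {
        unsigned int eax, ebx, ecx, edx;
        if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
            return 0;
        return (ecx & 0x18001000u) == 0x18001000u;
    }
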
--- a/codec/encoder/core/asm/dct.asm
+++ b/codec/encoder/core/asm/dct.asm
@@ -48,26 +48,26 @@
 
 ;***********************************************************************
 ; Constant
-;***********************************************************************		
-			
+;***********************************************************************
+
 align 16
-SSE2_DeQuant8 dw  10, 13, 10, 13, 13, 16, 13, 16, 
+SSE2_DeQuant8 dw  10, 13, 10, 13, 13, 16, 13, 16,
 			dw	10, 13, 10, 13, 13, 16, 13, 16,
-            dw  11, 14, 11, 14, 14, 18, 14, 18, 
+            dw  11, 14, 11, 14, 14, 18, 14, 18,
 			dw  11, 14, 11, 14, 14, 18, 14, 18,
-			dw  13, 16, 13, 16, 16, 20, 16, 20, 
 			dw  13, 16, 13, 16, 16, 20, 16, 20,
-            dw  14, 18, 14, 18, 18, 23, 18, 23, 
+			dw  13, 16, 13, 16, 16, 20, 16, 20,
+            dw  14, 18, 14, 18, 18, 23, 18, 23,
 			dw  14, 18, 14, 18, 18, 23, 18, 23,
-			dw  16, 20, 16, 20, 20, 25, 20, 25, 
 			dw  16, 20, 16, 20, 20, 25, 20, 25,
-            dw  18, 23, 18, 23, 23, 29, 23, 29, 
+			dw  16, 20, 16, 20, 20, 25, 20, 25,
+            dw  18, 23, 18, 23, 23, 29, 23, 29,
 			dw  18, 23, 18, 23, 23, 29, 23, 29
-			
 
+
 ;***********************************************************************
 ; MMX functions
-;***********************************************************************			
+;***********************************************************************
 
 %macro MMX_LoadDiff4P 5
 	movd        %1, [%3]
@@ -112,7 +112,7 @@
     MMX_SumSub		%4, %1, %6
     MMX_SumSub		%3, %2, %6
     MMX_SumSub		%3, %4, %6
-    MMX_SumSubMul2  %1, %2, %5  
+    MMX_SumSubMul2  %1, %2, %5
 %endmacro
 
 %macro MMX_IDCT 6
@@ -145,13 +145,13 @@
     mov     edx, [esp+24]   ; i_pix2
 
     WELS_Zero    mm7
-    
+
     MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, eax, ebx, ecx, edx, mm0, mm7
 
-    MMX_DCT			mm1, mm2, mm3 ,mm4, mm5, mm6           
+    MMX_DCT			mm1, mm2, mm3 ,mm4, mm5, mm6
     MMX_Trans4x4W	mm3, mm1, mm4, mm5, mm2
-    
-    MMX_DCT			mm3, mm5, mm2 ,mm4, mm1, mm6                    
+
+    MMX_DCT			mm3, mm5, mm2 ,mm4, mm1, mm6
     MMX_Trans4x4W	mm2, mm3, mm4, mm1, mm5
 
     mov     eax, [esp+ 8]   ; pDct
@@ -178,15 +178,15 @@
 %define     i_pred      esp+pushsize+16
 %define     pDct        esp+pushsize+20
 
-	mov     eax, [pDct   ] 
+	mov     eax, [pDct   ]
     movq    mm0, [eax+ 0]
     movq    mm1, [eax+ 8]
     movq    mm2, [eax+16]
     movq    mm3, [eax+24]
-    mov     edx, [p_dst ]   
-    mov     ecx, [i_dst ]   
+    mov     edx, [p_dst ]
+    mov     ecx, [i_dst ]
     mov     eax, [p_pred]
-    mov     ebx, [i_pred]     
+    mov     ebx, [i_pred]
 
 	MMX_Trans4x4W		mm0, mm1, mm2, mm3, mm4
 	MMX_IDCT			mm1, mm2, mm3, mm4, mm0, mm6
@@ -195,7 +195,7 @@
 
     WELS_Zero			mm7
     WELS_DW32			mm6
-    
+
     MMX_StoreDiff4P		mm3, mm0, mm6, mm7, [edx], [eax]
     MMX_StoreDiff4P		mm4, mm0, mm6, mm7, [edx+ecx], [eax+ebx]
     lea     edx, [edx+2*ecx]
@@ -202,7 +202,7 @@
     lea     eax, [eax+2*ebx]
     MMX_StoreDiff4P		mm1, mm0, mm6, mm7, [edx], [eax]
     MMX_StoreDiff4P		mm2, mm0, mm6, mm7, [edx+ecx], [eax+ebx]
-    
+
 	WELSEMMS
 %undef	pushsize
 %undef  p_dst
@@ -220,17 +220,17 @@
 %macro SSE2_Store4x8p 6
 	SSE2_XSawp qdq, %2, %3, %6
 	SSE2_XSawp qdq, %4, %5, %3
-	MOVDQ    [%1+0x00], %2 
-	MOVDQ    [%1+0x10], %4 
-	MOVDQ    [%1+0x20], %6 
-	MOVDQ    [%1+0x30], %3 
+	MOVDQ    [%1+0x00], %2
+	MOVDQ    [%1+0x10], %4
+	MOVDQ    [%1+0x20], %6
+	MOVDQ    [%1+0x30], %3
 %endmacro
 
 %macro SSE2_Load4x8p 6
 	MOVDQ    %2,	[%1+0x00]
-	MOVDQ    %4,	[%1+0x10]  
-	MOVDQ    %6,	[%1+0x20]  
-	MOVDQ    %3,	[%1+0x30]  
+	MOVDQ    %4,	[%1+0x10]
+	MOVDQ    %6,	[%1+0x20]
+	MOVDQ    %3,	[%1+0x30]
 	SSE2_XSawp qdq, %4, %3, %5
 	SSE2_XSawp qdq, %2, %6, %3
 %endmacro
@@ -271,40 +271,40 @@
 %endmacro
 
 %macro SSE2_Load8DC	6
-	movdqa		%1,		%6		; %1 = dc0 dc1	
+	movdqa		%1,		%6		; %1 = dc0 dc1
 	paddw       %1,		%5
-    psraw       %1,		$6		; (dc + 32) >> 6	
-    
+    psraw       %1,		$6		; (dc + 32) >> 6
+
     movdqa		%2,		%1
     psrldq		%2,		4
  	punpcklwd	%2,		%2
-	punpckldq	%2,		%2		; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3	   
+	punpckldq	%2,		%2		; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3
 
     movdqa		%3,		%1
     psrldq		%3,		8
  	punpcklwd	%3,		%3
 	punpckldq	%3,		%3		; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5
-	
+
 	movdqa		%4,		%1
     psrldq		%4,		12
  	punpcklwd	%4,		%4
 	punpckldq	%4,		%4		; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7
-	    	
+
 	punpcklwd	%1,		%1
-	punpckldq	%1,		%1		; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1	
+	punpckldq	%1,		%1		; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
 %endmacro
 
 %macro SSE2_DCT 6
-    SSE2_SumSub		%6, %3,	%5						
-	SSE2_SumSub		%1, %2, %5																		
-	SSE2_SumSub		%3, %2, %5					
-	SSE2_SumSubMul2		%6, %1, %4               	
+    SSE2_SumSub		%6, %3,	%5
+	SSE2_SumSub		%1, %2, %5
+	SSE2_SumSub		%3, %2, %5
+	SSE2_SumSubMul2		%6, %1, %4
 %endmacro
 
 %macro SSE2_IDCT 7
-    SSE2_SumSub       %7, %2, %6					
-    SSE2_SumSubDiv2     %1, %3, %5, %4              
-    SSE2_SumSub	     %2, %1, %5 
+    SSE2_SumSub       %7, %2, %6
+    SSE2_SumSubDiv2     %1, %3, %5, %4
+    SSE2_SumSub	     %2, %1, %5
     SSE2_SumSub		 %7, %4, %5
 %endmacro
 
@@ -316,12 +316,12 @@
 WelsDctFourT4_sse2:
     push    ebx
     push	esi
-    mov		esi, [esp+12] 
+    mov		esi, [esp+12]
     mov     eax, [esp+16]   ; pix1
     mov     ebx, [esp+20]   ; i_pix1
     mov     ecx, [esp+24]   ; pix2
-    mov     edx, [esp+28]   ; i_pix2    
-    
+    mov     edx, [esp+28]   ; i_pix2
+
     pxor    xmm7, xmm7
 
 	;Load 4x8
@@ -331,33 +331,33 @@
 	lea		ecx, [ecx + 2 * edx]
 	SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [eax], [ecx]
     SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]
-	
+
 	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
 	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1
-	SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2             		
+	SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
 	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
-	
-	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5  
-	
+
+	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
+
 	lea		eax, [eax + 2 * ebx]
 	lea		ecx, [ecx + 2 * edx]
-    
+
 	;Load 4x8
 	SSE2_LoadDiff8P    xmm0, xmm6, xmm7, [eax      ], [ecx    ]
     SSE2_LoadDiff8P    xmm1, xmm6, xmm7, [eax+ebx  ], [ecx+edx]
 	lea		eax, [eax + 2 * ebx]
-	lea		ecx, [ecx + 2 * edx]	
+	lea		ecx, [ecx + 2 * edx]
     SSE2_LoadDiff8P    xmm2, xmm6, xmm7, [eax], [ecx]
     SSE2_LoadDiff8P    xmm3, xmm6, xmm7, [eax+ebx], [ecx+edx]
-	
+
 	SSE2_DCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
-	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1		
-    SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2              		
+	SSE2_TransTwo4x4W	xmm2, xmm0, xmm3, xmm4, xmm1
+    SSE2_DCT			xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
 	SSE2_TransTwo4x4W	xmm4, xmm2, xmm1, xmm3, xmm0
-	
+
 	lea		esi, [esi+64]
-	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5 
-	
+	SSE2_Store4x8p esi, xmm4, xmm2, xmm3, xmm0, xmm5
+
     pop esi
     pop ebx
     ret
@@ -377,21 +377,21 @@
 %define	pushsize	8
     push		ebx
     push		esi
-    
-    mov			eax,		[rec]   
-    mov			ebx,		[stride]   
-    mov			ecx,		[pred]  
-    mov			edx,		[pred_stride]   
-    mov			esi,		[rs]  
 
+    mov			eax,		[rec]
+    mov			ebx,		[stride]
+    mov			ecx,		[pred]
+    mov			edx,		[pred_stride]
+    mov			esi,		[rs]
+
 	;Load 4x8
-	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5 	
-	
+	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5
+
 	SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
   	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
     SSE2_TransTwo4x4W	xmm1, xmm4, xmm0, xmm2, xmm3
     SSE2_IDCT			xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
-    
+
 	WELS_Zero			xmm7
     WELS_DW32			xmm6
 
@@ -398,41 +398,41 @@
 	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [eax		],	[ecx]
 	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [eax + ebx	],	[ecx + edx]
 	lea		eax, [eax + 2 * ebx]
-	lea		ecx, [ecx + 2 * edx]	
+	lea		ecx, [ecx + 2 * edx]
 	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [eax],			[ecx]
 	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [eax + ebx	],	[ecx + edx]
-   
+
     add		esi, 64
 	lea		eax, [eax + 2 * ebx]
 	lea		ecx, [ecx + 2 * edx]
-   	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5 	
-	
+   	SSE2_Load4x8p  esi, xmm0, xmm1, xmm4, xmm2, xmm5
+
 	SSE2_TransTwo4x4W   xmm0, xmm1, xmm4, xmm2, xmm3
-	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0           
+	SSE2_IDCT			xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
     SSE2_TransTwo4x4W   xmm1, xmm4, xmm0, xmm2, xmm3
 	SSE2_IDCT			xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1
 
 	WELS_Zero			xmm7
     WELS_DW32			xmm6
-    
+
 	SSE2_StoreDiff8p   xmm4, xmm5, xmm6, xmm7, [eax		],	[ecx]
 	SSE2_StoreDiff8p   xmm0, xmm5, xmm6, xmm7, [eax + ebx	],	[ecx + edx]
 	lea		eax, [eax + 2 * ebx]
-	lea		ecx, [ecx + 2 * edx]	
+	lea		ecx, [ecx + 2 * edx]
 	SSE2_StoreDiff8p   xmm1, xmm5, xmm6, xmm7, [eax],			[ecx]
-	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [eax + ebx],	[ecx + edx] 
+	SSE2_StoreDiff8p   xmm2, xmm5, xmm6, xmm7, [eax + ebx],	[ecx + edx]
 
     pop		esi
     pop		ebx
     ret
-    
+
   %macro SSE2_StoreDiff4x8p 8
    	SSE2_StoreDiff8p    %1, %3, %4, [%5],			[%6]
-	SSE2_StoreDiff8p    %1, %3, %4, [%5 + %7],		[%6 + %8]	
+	SSE2_StoreDiff8p    %1, %3, %4, [%5 + %7],		[%6 + %8]
 	SSE2_StoreDiff8p    %2, %3, %4, [%5 + 8],		[%6 + 8]
-	SSE2_StoreDiff8p    %2, %3, %4, [%5 + %7 + 8],	[%6 + %8 + 8]	
+	SSE2_StoreDiff8p    %2, %3, %4, [%5 + %7 + 8],	[%6 + %8 + 8]
  %endmacro
- 
+
  ;***********************************************************************
 ; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
 ;***********************************************************************
@@ -443,47 +443,47 @@
 WelsIDctRecI16x16Dc_sse2:
     push		esi
     push		edi
-    
+
 	mov			ecx,		[luma_dc]
-    mov			eax,		[rec]	
-    mov			edx,		[stride]	
-    mov			esi,		[pred]	
-    mov			edi,		[pred_stride]	    	
+    mov			eax,		[rec]
+    mov			edx,		[stride]
+    mov			esi,		[pred]
+    mov			edi,		[pred_stride]
 	pxor		xmm7,		xmm7
     WELS_DW32	xmm6
-    
+
 	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [ecx]
 	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
-	
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]	
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi	
-	  
+	lea			esi,		[esi + 2 * edi]
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]	
+	lea			esi,		[esi + 2 * edi]
 	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-	
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]		
+	lea			esi,		[esi + 2 * edi]
 	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-	
-	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [ecx + 16]	
+
+	SSE2_Load8DC			xmm0, xmm1, xmm2, xmm3, xmm6, [ecx + 16]
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]		
+	lea			esi,		[esi + 2 * edi]
 	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
-	
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]	
-	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi	
-	  
+	lea			esi,		[esi + 2 * edi]
+	SSE2_StoreDiff4x8p		xmm0, xmm1, xmm5, xmm7, eax, esi, edx, edi
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]	 
+	lea			esi,		[esi + 2 * edi]
 	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-	
+
 	lea			eax,		[eax + 2 * edx]
-	lea			esi,		[esi + 2 * edi]		
+	lea			esi,		[esi + 2 * edi]
 	SSE2_StoreDiff4x8p		xmm2, xmm3, xmm5, xmm7, eax, esi, edx, edi
-		
+
     pop		edi
     pop		esi
     ret
@@ -517,7 +517,7 @@
 	punpckldq	%3,			%4
 	punpcklqdq	%1,			%3
  %endmacro
- 
+
 ;***********************************************************************
 ;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
 ;***********************************************************************
@@ -525,23 +525,23 @@
 WelsHadamardT4Dc_sse2:
 		mov			eax,		[esp + 4]	; luma_dc
 		mov			ecx,		[esp + 8]	; pDct
-		
+
 		SSE2_Load4Col	    xmm1, xmm5, xmm6, xmm0, ecx
 		SSE2_Load4Col	    xmm2, xmm5, xmm6, xmm0, ecx + 0x40
 		SSE2_Load4Col	    xmm3, xmm5, xmm6, xmm0, ecx + 0x100
 		SSE2_Load4Col	    xmm4, xmm5, xmm6, xmm0, ecx + 0x140
-		
+
 		SSE2_SumSubD		xmm1, xmm2, xmm7
 		SSE2_SumSubD		xmm3, xmm4, xmm7
 		SSE2_SumSubD		xmm2, xmm4, xmm7
-		SSE2_SumSubD		xmm1, xmm3, xmm7	
+		SSE2_SumSubD		xmm1, xmm3, xmm7
 
 		SSE2_Trans4x4D		xmm4, xmm2, xmm1, xmm3, xmm5	; pOut: xmm4,xmm3,xmm5,xmm1
-	
+
 		SSE2_SumSubD		xmm4, xmm3, xmm7
 		SSE2_SumSubD		xmm5, xmm1, xmm7
 
-		WELS_DD1 xmm6      
+		WELS_DD1 xmm6
 		SSE2_SumSubDiv2D	xmm3, xmm1, xmm6, xmm0			; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
 		SSE2_SumSubDiv2D	xmm4, xmm5, xmm6, xmm1			; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
         SSE2_Trans4x4D		xmm3, xmm0, xmm1, xmm4, xmm2	; pOut: xmm3,xmm4,xmm2,xmm1
@@ -550,7 +550,7 @@
 		packssdw	xmm2,	xmm1
 		movdqa	[eax+ 0],   xmm3
 		movdqa	[eax+16],   xmm2
-		
-		ret	
+
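
The SumSub/SumSubMul2 and SumSub/SumSubDiv2 macro chains in this file realize the H.264 4x4 integer transform butterflies, applied to rows and again after the 4x4 transpose (the Hadamard DC routine above runs the same add/subtract pattern at dword width). One 4-point column of each in scalar form; the mapping of macro operands to d[0..3] is implicit in the asm and assumed here:

    /* forward core-transform butterfly (the WelsDctFourT4 paths) */
    static void dct4_butterfly(short d[4]) {
        short s0 = d[0] + d[3], s3 = d[0] - d[3];
        short s1 = d[1] + d[2], s2 = d[1] - d[2];
        d[0] = s0 + s1;              d[2] = s0 - s1;
        d[1] = (short)(2 * s3 + s2); d[3] = (short)(s3 - 2 * s2);
    }
    /* matching inverse butterfly (the IDCT reconstruction paths) */
    static void idct4_butterfly(short d[4]) {
        short t0 = d[0] + d[2], t1 = d[0] - d[2];
        short t2 = (short)((d[1] >> 1) - d[3]), t3 = (short)(d[1] + (d[3] >> 1));
        d[0] = t0 + t3; d[3] = t0 - t3;
        d[1] = t1 + t2; d[2] = t1 - t2;
    }
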
+		ret
 
 
--- a/codec/encoder/core/asm/deblock.asm
+++ b/codec/encoder/core/asm/deblock.asm
@@ -62,169 +62,169 @@
 
 ALIGN  16
 DeblockChromaEq4V_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,68h 
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,68h
   mov         edx,[ebp+10h]      ;  iStride
   mov         eax,[ebp+8]        ;  pPixCb
   mov         ecx,[ebp+0Ch]      ;  pPixCr
-  movq        xmm4,[ecx] 
-  movq        xmm5,[edx+ecx] 
-  push        esi  
-  push        edi  
-  lea         esi,[edx+edx] 
-  mov         edi,eax 
-  sub         edi,esi 
-  movq        xmm1,[edi] 
-  mov         edi,ecx 
-  sub         edi,esi 
-  movq        xmm2,[edi] 
-  punpcklqdq  xmm1,xmm2 
-  mov         esi,eax 
-  sub         esi,edx 
-  movq        xmm2,[esi] 
-  mov         edi,ecx 
-  sub         edi,edx 
-  movq        xmm3,[edi] 
-  punpcklqdq  xmm2,xmm3 
-  movq        xmm3,[eax] 
-  punpcklqdq  xmm3,xmm4 
-  movq        xmm4,[edx+eax] 
-  mov       edx, [ebp + 14h] 
-  punpcklqdq  xmm4,xmm5 
-  movd        xmm5,edx 
-  mov       edx, [ebp + 18h] 
-  pxor        xmm0,xmm0 
-  movdqa      xmm6,xmm5 
-  punpcklwd   xmm6,xmm5 
-  pshufd      xmm5,xmm6,0 
-  movd        xmm6,edx 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      xmm7,xmm1 
-  punpckhbw   xmm1,xmm0 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+40h],xmm1 
-  movdqa      [esp+60h],xmm7 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+10h],xmm7 
-  movdqa      xmm7,xmm3 
-  punpcklbw   xmm7,xmm0 
-  punpckhbw   xmm3,xmm0 
-  movdqa      [esp+50h],xmm7 
-  movdqa      xmm7,xmm4 
-  punpckhbw   xmm4,xmm0 
-  punpckhbw   xmm2,xmm0 
-  punpcklbw   xmm7,xmm0 
-  movdqa      [esp+30h],xmm3 
-  movdqa      xmm3,[esp+10h] 
-  movdqa      xmm1,xmm3 
-  psubw       xmm1,[esp+50h] 
-  pabsw       xmm1,xmm1 
-  movdqa      [esp+20h],xmm4 
-  movdqa      xmm0,xmm5 
-  pcmpgtw     xmm0,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  pand        xmm0,xmm4 
-  movdqa      xmm1,xmm7 
-  psubw       xmm1,[esp+50h] 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,[esp+30h] 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm5,xmm1 
-  movdqa      xmm1,[esp+40h] 
-  pand        xmm0,xmm4 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm4,xmm6 
-  pcmpgtw     xmm4,xmm1 
-  movdqa      xmm1,[esp+20h] 
-  psubw       xmm1,[esp+30h] 
-  pand        xmm5,xmm4 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm6,xmm1 
-  pand        xmm5,xmm6 
-  mov         edx,2 
-  movsx       edx,dx 
-  movd        xmm1,edx 
-  movdqa      xmm4,xmm1 
-  punpcklwd   xmm4,xmm1 
-  pshufd      xmm1,xmm4,0 
-  movdqa      xmm4,[esp+60h] 
-  movdqa      xmm6,xmm4 
-  paddw       xmm6,xmm4 
-  paddw       xmm6,xmm3 
-  paddw       xmm6,xmm7 
-  movdqa      [esp+10h],xmm1 
-  paddw       xmm6,[esp+10h] 
-  psraw       xmm6,2 
-  movdqa      xmm4,xmm0 
-  pandn       xmm4,xmm3 
-  movdqa      xmm3,[esp+40h] 
-  movdqa      xmm1,xmm0 
-  pand        xmm1,xmm6 
-  por         xmm1,xmm4 
-  movdqa      xmm6,xmm3 
-  paddw       xmm6,xmm3 
-  movdqa      xmm3,[esp+10h] 
-  paddw       xmm6,xmm2 
-  paddw       xmm6,[esp+20h] 
-  paddw       xmm6,xmm3 
-  psraw       xmm6,2 
-  movdqa      xmm4,xmm5 
-  pand        xmm4,xmm6 
-  movdqa      xmm6,xmm5 
-  pandn       xmm6,xmm2 
-  por         xmm4,xmm6 
-  packuswb    xmm1,xmm4 
-  movdqa      xmm4,[esp+50h] 
-  movdqa      xmm6,xmm7 
-  paddw       xmm6,xmm7 
-  paddw       xmm6,xmm4 
-  paddw       xmm6,[esp+60h] 
-  paddw       xmm6,xmm3 
-  psraw       xmm6,2 
-  movdqa      xmm2,xmm0 
-  pand        xmm2,xmm6 
-  pandn       xmm0,xmm4 
-  por         xmm2,xmm0 
-  movdqa      xmm0,[esp+20h] 
-  movdqa      xmm6,xmm0 
-  paddw       xmm6,xmm0 
-  movdqa      xmm0,[esp+30h] 
-  paddw       xmm6,xmm0 
-  paddw       xmm6,[esp+40h] 
-  movdqa      xmm4,xmm5 
-  paddw       xmm6,xmm3 
-  movq        [esi],xmm1 
-  psraw       xmm6,2 
-  pand        xmm4,xmm6 
-  pandn       xmm5,xmm0 
-  por         xmm4,xmm5 
-  packuswb    xmm2,xmm4 
-  movq        [eax],xmm2 
-  psrldq      xmm1,8 
-  movq        [edi],xmm1 
-  pop         edi  
-  psrldq      xmm2,8 
-  movq        [ecx],xmm2 
-  pop         esi  
-  mov         esp,ebp 
-  pop         ebp  
-  ret              
+  movq        xmm4,[ecx]
+  movq        xmm5,[edx+ecx]
+  push        esi
+  push        edi
+  lea         esi,[edx+edx]
+  mov         edi,eax
+  sub         edi,esi
+  movq        xmm1,[edi]
+  mov         edi,ecx
+  sub         edi,esi
+  movq        xmm2,[edi]
+  punpcklqdq  xmm1,xmm2
+  mov         esi,eax
+  sub         esi,edx
+  movq        xmm2,[esi]
+  mov         edi,ecx
+  sub         edi,edx
+  movq        xmm3,[edi]
+  punpcklqdq  xmm2,xmm3
+  movq        xmm3,[eax]
+  punpcklqdq  xmm3,xmm4
+  movq        xmm4,[edx+eax]
+  mov       edx, [ebp + 14h]
+  punpcklqdq  xmm4,xmm5
+  movd        xmm5,edx
+  mov       edx, [ebp + 18h]
+  pxor        xmm0,xmm0
+  movdqa      xmm6,xmm5
+  punpcklwd   xmm6,xmm5
+  pshufd      xmm5,xmm6,0
+  movd        xmm6,edx
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      xmm7,xmm1
+  punpckhbw   xmm1,xmm0
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+40h],xmm1
+  movdqa      [esp+60h],xmm7
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+10h],xmm7
+  movdqa      xmm7,xmm3
+  punpcklbw   xmm7,xmm0
+  punpckhbw   xmm3,xmm0
+  movdqa      [esp+50h],xmm7
+  movdqa      xmm7,xmm4
+  punpckhbw   xmm4,xmm0
+  punpckhbw   xmm2,xmm0
+  punpcklbw   xmm7,xmm0
+  movdqa      [esp+30h],xmm3
+  movdqa      xmm3,[esp+10h]
+  movdqa      xmm1,xmm3
+  psubw       xmm1,[esp+50h]
+  pabsw       xmm1,xmm1
+  movdqa      [esp+20h],xmm4
+  movdqa      xmm0,xmm5
+  pcmpgtw     xmm0,xmm1
+  movdqa      xmm1,[esp+60h]
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  pand        xmm0,xmm4
+  movdqa      xmm1,xmm7
+  psubw       xmm1,[esp+50h]
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  movdqa      xmm1,xmm2
+  psubw       xmm1,[esp+30h]
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm5,xmm1
+  movdqa      xmm1,[esp+40h]
+  pand        xmm0,xmm4
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  movdqa      xmm4,xmm6
+  pcmpgtw     xmm4,xmm1
+  movdqa      xmm1,[esp+20h]
+  psubw       xmm1,[esp+30h]
+  pand        xmm5,xmm4
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm6,xmm1
+  pand        xmm5,xmm6
+  mov         edx,2
+  movsx       edx,dx
+  movd        xmm1,edx
+  movdqa      xmm4,xmm1
+  punpcklwd   xmm4,xmm1
+  pshufd      xmm1,xmm4,0
+  movdqa      xmm4,[esp+60h]
+  movdqa      xmm6,xmm4
+  paddw       xmm6,xmm4
+  paddw       xmm6,xmm3
+  paddw       xmm6,xmm7
+  movdqa      [esp+10h],xmm1
+  paddw       xmm6,[esp+10h]
+  psraw       xmm6,2
+  movdqa      xmm4,xmm0
+  pandn       xmm4,xmm3
+  movdqa      xmm3,[esp+40h]
+  movdqa      xmm1,xmm0
+  pand        xmm1,xmm6
+  por         xmm1,xmm4
+  movdqa      xmm6,xmm3
+  paddw       xmm6,xmm3
+  movdqa      xmm3,[esp+10h]
+  paddw       xmm6,xmm2
+  paddw       xmm6,[esp+20h]
+  paddw       xmm6,xmm3
+  psraw       xmm6,2
+  movdqa      xmm4,xmm5
+  pand        xmm4,xmm6
+  movdqa      xmm6,xmm5
+  pandn       xmm6,xmm2
+  por         xmm4,xmm6
+  packuswb    xmm1,xmm4
+  movdqa      xmm4,[esp+50h]
+  movdqa      xmm6,xmm7
+  paddw       xmm6,xmm7
+  paddw       xmm6,xmm4
+  paddw       xmm6,[esp+60h]
+  paddw       xmm6,xmm3
+  psraw       xmm6,2
+  movdqa      xmm2,xmm0
+  pand        xmm2,xmm6
+  pandn       xmm0,xmm4
+  por         xmm2,xmm0
+  movdqa      xmm0,[esp+20h]
+  movdqa      xmm6,xmm0
+  paddw       xmm6,xmm0
+  movdqa      xmm0,[esp+30h]
+  paddw       xmm6,xmm0
+  paddw       xmm6,[esp+40h]
+  movdqa      xmm4,xmm5
+  paddw       xmm6,xmm3
+  movq        [esi],xmm1
+  psraw       xmm6,2
+  pand        xmm4,xmm6
+  pandn       xmm5,xmm0
+  por         xmm4,xmm5
+  packuswb    xmm2,xmm4
+  movq        [eax],xmm2
+  psrldq      xmm1,8
+  movq        [edi],xmm1
+  pop         edi
+  psrldq      xmm2,8
+  movq        [ecx],xmm2
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
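
[Editor's note: the hunk above is evidently the tail of the chroma Eq4 (bS==4) vertical deblocking filter (its Lt4 sibling follows next). The pand/pandn/por triplets are the classic branchless select — result = (mask AND filtered) OR (NOT mask AND original) — applied to the spec's strong chroma formula. A minimal scalar sketch of that formula, assuming <stdint.h>/<stdlib.h>; the helper name is illustrative, not an OpenH264 API:

    #include <stdint.h>
    #include <stdlib.h>

    /* Strong (bS==4) chroma filter for one sample pair across the edge.
       The SIMD code computes both candidates and blends them with
       pand/pandn/por; a scalar branch is equivalent. */
    static void chroma_eq4(uint8_t *p1, uint8_t *p0, uint8_t *q0, uint8_t *q1,
                           int alpha, int beta) {
        if (abs(*p0 - *q0) < alpha &&              /* pcmpgtw lane masks */
            abs(*p1 - *p0) < beta &&
            abs(*q1 - *q0) < beta) {
            uint8_t np0 = (uint8_t)((2 * (*p1) + *p0 + *q1 + 2) >> 2); /* paddw, +2, psraw 2 */
            uint8_t nq0 = (uint8_t)((2 * (*q1) + *q0 + *p1 + 2) >> 2);
            *p0 = np0;
            *q0 = nq0;
        }
    }
]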
 
 ;******************************************************************************
-; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
+; void DeblockChromaLt4V_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
 ;                           int32_t iAlpha, int32_t iBeta, int8_t * pTC);
 ;*******************************************************************************
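
[Editor's note: this routine is the bS<4 chroma case — only p0/q0 move, by a delta clipped to [-tc, tc]. A scalar sketch per the spec formula, assuming pTC already carries the per-edge tc values; clip3 is the usual three-way clamp, and the names are illustrative:

    static int clip3(int lo, int hi, int v) {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    static void chroma_lt4(uint8_t *p1, uint8_t *p0, uint8_t *q0, uint8_t *q1,
                           int alpha, int beta, int tc) {
        if (tc <= 0) return;                   /* pcmpgtw-vs-zero mask drops such lanes */
        if (abs(*p0 - *q0) >= alpha || abs(*p1 - *p0) >= beta || abs(*q1 - *q0) >= beta)
            return;
        /* psllw 2 / paddw / +4 / psraw 3, then pmaxsw(-tc) and pminsw(tc) */
        int d = clip3(-tc, tc, (((*q0 - *p0) << 2) + (*p1 - *q1) + 4) >> 3);
        *p0 = (uint8_t)clip3(0, 255, *p0 + d); /* packuswb saturates in the asm */
        *q0 = (uint8_t)clip3(0, 255, *q0 - d);
    }
]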
 
@@ -231,203 +231,203 @@
 WELS_EXTERN  DeblockChromaLt4V_sse2
 
 DeblockChromaLt4V_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,0E4h 
-  push        ebx  
-  push        esi  
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,0E4h
+  push        ebx
+  push        esi
   mov         esi, [ebp+1Ch]      ;  pTC
-  movsx       ebx, byte [esi+2] 
-  push        edi  
-  movsx       di,byte [esi+3] 
-  mov         word [esp+0Ch],bx 
-  movsx       bx,byte  [esi+1] 
-  movsx       esi,byte  [esi] 
-  mov         word  [esp+0Eh],si 
-  movzx       esi,di 
-  movd        xmm1,esi 
-  movzx       esi,di 
-  movd        xmm2,esi 
-  mov         si,word  [esp+0Ch] 
-  mov         edx, [ebp + 10h] 
-  mov         eax, [ebp + 08h] 
-  movzx       edi,si 
-  movzx       esi,si 
-  mov         ecx, [ebp + 0Ch] 
-  movd        xmm4,esi 
-  movzx       esi,bx 
-  movd        xmm5,esi 
-  movd        xmm3,edi 
-  movzx       esi,bx 
-  movd        xmm6,esi 
-  mov         si,word [esp+0Eh] 
-  movzx       edi,si 
-  movzx       esi,si 
-  punpcklwd   xmm6,xmm2 
-  pxor        xmm0,xmm0 
-  movdqa      [esp+40h],xmm0 
-  movd        xmm7,edi 
-  movd        xmm0,esi 
-  lea         esi,[edx+edx] 
-  mov         edi,eax 
-  sub         edi,esi 
-  punpcklwd   xmm5,xmm1 
-  movdqa      xmm1,[esp+40h] 
-  punpcklwd   xmm0,xmm4 
-  movq        xmm4,[edx+ecx] 
-  punpcklwd   xmm7,xmm3 
-  movq        xmm3,[eax] 
-  punpcklwd   xmm0,xmm6 
-  movq        xmm6,[edi] 
-  punpcklwd   xmm7,xmm5 
-  punpcklwd   xmm0,xmm7 
-  mov         edi,ecx 
-  sub         edi,esi 
-  movdqa      xmm2,xmm1 
-  psubw       xmm2,xmm0 
-  movdqa      [esp+60h],xmm2 
-  movq        xmm2, [edi] 
-  punpcklqdq  xmm6,xmm2 
-  mov         esi,eax 
-  sub         esi,edx 
-  movq        xmm7,[esi] 
-  mov         edi,ecx 
-  sub         edi,edx 
-  movq        xmm2,[edi] 
-  punpcklqdq  xmm7,xmm2 
-  movq        xmm2,[ecx] 
-  punpcklqdq  xmm3,xmm2 
-  movq        xmm2,[edx+eax] 
-  movsx       edx,word [ebp + 14h] 
-  punpcklqdq  xmm2,xmm4 
-  movdqa      [esp+0E0h],xmm2 
-  movd        xmm2,edx 
-  movsx       edx,word [ebp + 18h] 
-  movdqa      xmm4,xmm2 
-  punpcklwd   xmm4,xmm2 
-  movd        xmm2,edx 
-  movdqa      xmm5,xmm2 
-  punpcklwd   xmm5,xmm2 
-  pshufd      xmm2,xmm5,0 
-  movdqa      [esp+50h],xmm2 
-  movdqa      xmm2,xmm6 
-  punpcklbw   xmm2,xmm1 
-  movdqa      [esp+0D0h],xmm3 
-  pshufd      xmm4,xmm4,0 
-  movdqa      [esp+30h],xmm2 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+80h],xmm6 
-  movdqa      xmm6,[esp+0D0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+70h],xmm6 
-  movdqa      xmm6, [esp+0E0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa     [esp+90h],xmm6 
-  movdqa      xmm5, [esp+0E0h] 
-  movdqa      xmm2,xmm7 
-  punpckhbw   xmm7,xmm1 
-  punpcklbw   xmm5,xmm1 
-  movdqa       [esp+0A0h],xmm7 
-  punpcklbw   xmm3,xmm1 
-  mov         edx,4 
-  punpcklbw   xmm2,xmm1 
-  movsx       edx,dx 
-  movd        xmm6,edx 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      xmm7,[esp+30h] 
-  movdqa      [esp+20h],xmm6 
-  psubw       xmm7,xmm5 
-  movdqa      xmm6,xmm0 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  movdqa      [esp+40h],xmm6 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm2 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm7 
-  paddw       xmm6, [esp+20h] 
-  movdqa      xmm7, [esp+50h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm1,xmm6 
-  movdqa      [esp+10h],xmm0 
-  movdqa      xmm6, [esp+10h] 
-  pminsw      xmm6,xmm1 
-  movdqa      [esp+10h],xmm6 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  movdqa      xmm6,xmm4 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1, [esp+30h] 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm7,xmm1 
-  movdqa      xmm1,[esp+50h] 
-  pand        xmm6,xmm7 
-  movdqa      xmm7,[esp+50h] 
-  psubw       xmm5,xmm3 
-  pabsw       xmm5,xmm5 
-  pcmpgtw     xmm1,xmm5 
-  movdqa      xmm5,[esp+80h] 
-  psubw       xmm5,[esp+90h] 
-  pand        xmm6,xmm1 
-  pand        xmm6,[esp+40h] 
-  movdqa      xmm1,[esp+10h] 
-  pand        xmm1,xmm6 
-  movdqa      xmm6,[esp+70h] 
-  movdqa      [esp+30h],xmm1 
-  movdqa      xmm1,[esp+0A0h] 
-  psubw       xmm6,xmm1 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm5 
-  paddw       xmm6,[esp+20h] 
-  movdqa      xmm5,[esp+60h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm5,xmm6 
-  pminsw      xmm0,xmm5 
-  movdqa      xmm5,[esp+70h] 
-  movdqa      xmm6,xmm1 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm4,xmm6 
-  movdqa      xmm6,[esp+80h] 
-  psubw       xmm6,xmm1 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+90h] 
-  pand        xmm4,xmm7 
-  movdqa      xmm7,[esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  pand        xmm4,xmm7 
-  pand        xmm4,[esp+40h] 
-  pand        xmm0,xmm4 
-  movdqa      xmm4,[esp+30h] 
-  paddw       xmm2,xmm4 
-  paddw       xmm1,xmm0 
-  packuswb    xmm2,xmm1 
-  movq        [esi],xmm2 
-  psubw       xmm3,xmm4 
-  psubw       xmm5,xmm0 
-  packuswb    xmm3,xmm5 
-  movq        [eax],xmm3 
-  psrldq      xmm2,8 
-  movq        [edi],xmm2 
-  pop         edi  
-  pop         esi  
-  psrldq      xmm3,8 
-  movq        [ecx],xmm3 
-  pop         ebx  
-  mov         esp,ebp 
-  pop         ebp  
-  ret    
-  
+  movsx       ebx, byte [esi+2]
+  push        edi
+  movsx       di,byte [esi+3]
+  mov         word [esp+0Ch],bx
+  movsx       bx,byte  [esi+1]
+  movsx       esi,byte  [esi]
+  mov         word  [esp+0Eh],si
+  movzx       esi,di
+  movd        xmm1,esi
+  movzx       esi,di
+  movd        xmm2,esi
+  mov         si,word  [esp+0Ch]
+  mov         edx, [ebp + 10h]
+  mov         eax, [ebp + 08h]
+  movzx       edi,si
+  movzx       esi,si
+  mov         ecx, [ebp + 0Ch]
+  movd        xmm4,esi
+  movzx       esi,bx
+  movd        xmm5,esi
+  movd        xmm3,edi
+  movzx       esi,bx
+  movd        xmm6,esi
+  mov         si,word [esp+0Eh]
+  movzx       edi,si
+  movzx       esi,si
+  punpcklwd   xmm6,xmm2
+  pxor        xmm0,xmm0
+  movdqa      [esp+40h],xmm0
+  movd        xmm7,edi
+  movd        xmm0,esi
+  lea         esi,[edx+edx]
+  mov         edi,eax
+  sub         edi,esi
+  punpcklwd   xmm5,xmm1
+  movdqa      xmm1,[esp+40h]
+  punpcklwd   xmm0,xmm4
+  movq        xmm4,[edx+ecx]
+  punpcklwd   xmm7,xmm3
+  movq        xmm3,[eax]
+  punpcklwd   xmm0,xmm6
+  movq        xmm6,[edi]
+  punpcklwd   xmm7,xmm5
+  punpcklwd   xmm0,xmm7
+  mov         edi,ecx
+  sub         edi,esi
+  movdqa      xmm2,xmm1
+  psubw       xmm2,xmm0
+  movdqa      [esp+60h],xmm2
+  movq        xmm2, [edi]
+  punpcklqdq  xmm6,xmm2
+  mov         esi,eax
+  sub         esi,edx
+  movq        xmm7,[esi]
+  mov         edi,ecx
+  sub         edi,edx
+  movq        xmm2,[edi]
+  punpcklqdq  xmm7,xmm2
+  movq        xmm2,[ecx]
+  punpcklqdq  xmm3,xmm2
+  movq        xmm2,[edx+eax]
+  movsx       edx,word [ebp + 14h]
+  punpcklqdq  xmm2,xmm4
+  movdqa      [esp+0E0h],xmm2
+  movd        xmm2,edx
+  movsx       edx,word [ebp + 18h]
+  movdqa      xmm4,xmm2
+  punpcklwd   xmm4,xmm2
+  movd        xmm2,edx
+  movdqa      xmm5,xmm2
+  punpcklwd   xmm5,xmm2
+  pshufd      xmm2,xmm5,0
+  movdqa      [esp+50h],xmm2
+  movdqa      xmm2,xmm6
+  punpcklbw   xmm2,xmm1
+  movdqa      [esp+0D0h],xmm3
+  pshufd      xmm4,xmm4,0
+  movdqa      [esp+30h],xmm2
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+80h],xmm6
+  movdqa      xmm6,[esp+0D0h]
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+70h],xmm6
+  movdqa      xmm6, [esp+0E0h]
+  punpckhbw   xmm6,xmm1
+  movdqa     [esp+90h],xmm6
+  movdqa      xmm5, [esp+0E0h]
+  movdqa      xmm2,xmm7
+  punpckhbw   xmm7,xmm1
+  punpcklbw   xmm5,xmm1
+  movdqa       [esp+0A0h],xmm7
+  punpcklbw   xmm3,xmm1
+  mov         edx,4
+  punpcklbw   xmm2,xmm1
+  movsx       edx,dx
+  movd        xmm6,edx
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      xmm7,[esp+30h]
+  movdqa      [esp+20h],xmm6
+  psubw       xmm7,xmm5
+  movdqa      xmm6,xmm0
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1,[esp+60h]
+  movdqa      [esp+40h],xmm6
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm2
+  psllw       xmm6,2
+  paddw       xmm6,xmm7
+  paddw       xmm6, [esp+20h]
+  movdqa      xmm7, [esp+50h]
+  psraw       xmm6,3
+  pmaxsw      xmm1,xmm6
+  movdqa      [esp+10h],xmm0
+  movdqa      xmm6, [esp+10h]
+  pminsw      xmm6,xmm1
+  movdqa      [esp+10h],xmm6
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  movdqa      xmm6,xmm4
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1, [esp+30h]
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm7,xmm1
+  movdqa      xmm1,[esp+50h]
+  pand        xmm6,xmm7
+  movdqa      xmm7,[esp+50h]
+  psubw       xmm5,xmm3
+  pabsw       xmm5,xmm5
+  pcmpgtw     xmm1,xmm5
+  movdqa      xmm5,[esp+80h]
+  psubw       xmm5,[esp+90h]
+  pand        xmm6,xmm1
+  pand        xmm6,[esp+40h]
+  movdqa      xmm1,[esp+10h]
+  pand        xmm1,xmm6
+  movdqa      xmm6,[esp+70h]
+  movdqa      [esp+30h],xmm1
+  movdqa      xmm1,[esp+0A0h]
+  psubw       xmm6,xmm1
+  psllw       xmm6,2
+  paddw       xmm6,xmm5
+  paddw       xmm6,[esp+20h]
+  movdqa      xmm5,[esp+60h]
+  psraw       xmm6,3
+  pmaxsw      xmm5,xmm6
+  pminsw      xmm0,xmm5
+  movdqa      xmm5,[esp+70h]
+  movdqa      xmm6,xmm1
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm4,xmm6
+  movdqa      xmm6,[esp+80h]
+  psubw       xmm6,xmm1
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+90h]
+  pand        xmm4,xmm7
+  movdqa      xmm7,[esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  pand        xmm4,xmm7
+  pand        xmm4,[esp+40h]
+  pand        xmm0,xmm4
+  movdqa      xmm4,[esp+30h]
+  paddw       xmm2,xmm4
+  paddw       xmm1,xmm0
+  packuswb    xmm2,xmm1
+  movq        [esi],xmm2
+  psubw       xmm3,xmm4
+  psubw       xmm5,xmm0
+  packuswb    xmm3,xmm5
+  movq        [eax],xmm3
+  psrldq      xmm2,8
+  movq        [edi],xmm2
+  pop         edi
+  pop         esi
+  psrldq      xmm3,8
+  movq        [ecx],xmm3
+  pop         ebx
+  mov         esp,ebp
+  pop         ebp
+  ret
+
 ;***************************************************************************
-;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
+;  void DeblockChromaEq4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
 ;          int32_t iAlpha, int32_t iBeta)
 ;***************************************************************************
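
[Editor's note: the _H variants filter across a vertical edge, so instead of row-wide loads they gather four bytes around the edge from every row (the movd/punpckldq/punpcklqdq cascade), transpose them into p1/p0/q0/q1 vectors, apply the same math as the _V case, then transpose and scatter back. A scalar equivalent, reusing chroma_eq4 from the sketch above (eight rows per chroma plane; illustrative):

    static void chroma_eq4_v_edge(uint8_t *pix, int stride, int alpha, int beta) {
        for (int y = 0; y < 8; y++, pix += stride) {
            uint8_t *r = pix - 2;              /* r[0..3] = p1 p0 q0 q1 */
            chroma_eq4(&r[0], &r[1], &r[2], &r[3], alpha, beta);
        }
    }
]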
 
@@ -434,606 +434,606 @@
 WELS_EXTERN     DeblockChromaEq4H_sse2
 
 ALIGN  16
-  
+
 DeblockChromaEq4H_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,0C8h  
-  mov         ecx,dword [ebp+8] 
-  mov         edx,dword [ebp+0Ch] 
-  mov         eax,dword [ebp+10h] 
-  sub         ecx,2 
-  sub         edx,2 
-  push        esi  
-  lea         esi,[eax+eax*2] 
-  mov         dword [esp+18h],ecx 
-  mov         dword [esp+4],edx 
-  lea         ecx,[ecx+eax*4] 
-  lea         edx,[edx+eax*4] 
-  lea         eax,[esp+7Ch] 
-  push        edi  
-  mov         dword [esp+14h],esi 
-  mov         dword [esp+18h],ecx 
-  mov         dword [esp+0Ch],edx 
-  mov         dword [esp+10h],eax 
-  mov         esi,dword [esp+1Ch] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+14h] 
-  movd        xmm0,dword [esi] 
-  movd        xmm1,dword [esi+ecx] 
-  movd        xmm2,dword [esi+ecx*2] 
-  movd        xmm3,dword [esi+edx] 
-  mov         esi,dword  [esp+8] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [esi+ecx] 
-  movd        xmm6,dword [esi+ecx*2] 
-  movd        xmm7,dword [esi+edx] 
-  punpckldq   xmm0,xmm4 
-  punpckldq   xmm1,xmm5 
-  punpckldq   xmm2,xmm6 
-  punpckldq   xmm3,xmm7 
-  mov         esi,dword [esp+18h] 
-  mov         edi,dword [esp+0Ch] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [edi] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm0,xmm4 
-  movd        xmm4,dword [esi+ecx] 
-  movd        xmm5,dword [edi+ecx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm1,xmm4 
-  movd        xmm4,dword [esi+ecx*2] 
-  movd        xmm5,dword [edi+ecx*2] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm2,xmm4 
-  movd        xmm4,dword [esi+edx] 
-  movd        xmm5,dword [edi+edx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm3,xmm4 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         edi,dword [esp+10h] 
-  movdqa      [edi],xmm0 
-  movdqa      [edi+10h],xmm5 
-  movdqa      [edi+20h],xmm1 
-  movdqa      [edi+30h],xmm6 
-  movsx       ecx,word [ebp+14h] 
-  movsx       edx,word [ebp+18h] 
-  movdqa      xmm6,[esp+80h] 
-  movdqa      xmm4,[esp+90h] 
-  movdqa      xmm5,[esp+0A0h] 
-  movdqa      xmm7,[esp+0B0h] 
-  pxor        xmm0,xmm0 
-  movd        xmm1,ecx 
-  movdqa      xmm2,xmm1 
-  punpcklwd   xmm2,xmm1 
-  pshufd      xmm1,xmm2,0 
-  movd        xmm2,edx 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm2,xmm3,0 
-  movdqa      xmm3,xmm6 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+60h],xmm6 
-  movdqa      xmm6,[esp+90h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+30h],xmm6 
-  movdqa      xmm6,[esp+0A0h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+40h],xmm6 
-  movdqa      xmm6,[esp+0B0h] 
-  punpckhbw   xmm6,xmm0 
-  movdqa      [esp+70h],xmm6 
-  punpcklbw   xmm7,xmm0 
-  punpcklbw   xmm4,xmm0 
-  punpcklbw   xmm5,xmm0 
-  punpcklbw   xmm3,xmm0 
-  movdqa      [esp+50h],xmm7 
-  movdqa      xmm6,xmm4 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  movdqa      xmm0,xmm1 
-  pcmpgtw     xmm0,xmm6 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm4 
-  pabsw       xmm6,xmm6 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pand        xmm0,xmm7 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+30h] 
-  psubw       xmm6,[esp+40h] 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm1,xmm6 
-  movdqa      xmm6,[esp+60h] 
-  psubw       xmm6,[esp+30h] 
-  pabsw       xmm6,xmm6 
-  pand        xmm0,xmm7 
-  movdqa      xmm7,xmm2 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6,[esp+70h] 
-  psubw       xmm6,[esp+40h] 
-  pabsw       xmm6,xmm6 
-  pand        xmm1,xmm7 
-  pcmpgtw     xmm2,xmm6 
-  pand        xmm1,xmm2 
-  mov         eax,2 
-  movsx       ecx,ax 
-  movd        xmm2,ecx 
-  movdqa      xmm6,xmm2 
-  punpcklwd   xmm6,xmm2 
-  pshufd      xmm2,xmm6,0 
-  movdqa      [esp+20h],xmm2 
-  movdqa      xmm2,xmm3 
-  paddw       xmm2,xmm3 
-  paddw       xmm2,xmm4 
-  paddw       xmm2,[esp+50h] 
-  paddw       xmm2,[esp+20h] 
-  psraw       xmm2,2 
-  movdqa      xmm6,xmm0 
-  pand        xmm6,xmm2 
-  movdqa      xmm2,xmm0 
-  pandn       xmm2,xmm4 
-  por         xmm6,xmm2 
-  movdqa      xmm2,[esp+60h] 
-  movdqa      xmm7,xmm2 
-  paddw       xmm7,xmm2 
-  paddw       xmm7,[esp+30h] 
-  paddw       xmm7,[esp+70h] 
-  paddw       xmm7,[esp+20h] 
-  movdqa      xmm4,xmm1 
-  movdqa      xmm2,xmm1 
-  pandn       xmm2,[esp+30h] 
-  psraw       xmm7,2 
-  pand        xmm4,xmm7 
-  por         xmm4,xmm2 
-  movdqa      xmm2,[esp+50h] 
-  packuswb    xmm6,xmm4 
-  movdqa      [esp+90h],xmm6 
-  movdqa      xmm6,xmm2 
-  paddw       xmm6,xmm2 
-  movdqa      xmm2,[esp+20h] 
-  paddw       xmm6,xmm5 
-  paddw       xmm6,xmm3 
-  movdqa      xmm4,xmm0 
-  pandn       xmm0,xmm5 
-  paddw       xmm6,xmm2 
-  psraw       xmm6,2 
-  pand        xmm4,xmm6 
-  por         xmm4,xmm0 
-  movdqa      xmm0,[esp+70h] 
-  movdqa      xmm5,xmm0 
-  paddw       xmm5,xmm0 
-  movdqa      xmm0,[esp+40h] 
-  paddw       xmm5,xmm0 
-  paddw       xmm5,[esp+60h] 
-  movdqa      xmm3,xmm1 
-  paddw       xmm5,xmm2 
-  psraw       xmm5,2 
-  pand        xmm3,xmm5 
-  pandn       xmm1,xmm0 
-  por         xmm3,xmm1 
-  packuswb    xmm4,xmm3 
-  movdqa      [esp+0A0h],xmm4 
-  mov         esi,dword [esp+10h] 
-  movdqa      xmm0,[esi] 
-  movdqa      xmm1,[esi+10h] 
-  movdqa      xmm2,[esi+20h] 
-  movdqa      xmm3,[esi+30h] 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         esi,dword [esp+1Ch] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+14h] 
-  mov         edi,dword [esp+8] 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         esi,dword [esp+18h] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         edi,dword [esp+0Ch] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  pop         edi  
-  pop         esi  
-  mov         esp,ebp 
-  pop         ebp  
-  ret              
-  
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,0C8h
+  mov         ecx,dword [ebp+8]
+  mov         edx,dword [ebp+0Ch]
+  mov         eax,dword [ebp+10h]
+  sub         ecx,2
+  sub         edx,2
+  push        esi
+  lea         esi,[eax+eax*2]
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+4],edx
+  lea         ecx,[ecx+eax*4]
+  lea         edx,[edx+eax*4]
+  lea         eax,[esp+7Ch]
+  push        edi
+  mov         dword [esp+14h],esi
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+0Ch],edx
+  mov         dword [esp+10h],eax
+  mov         esi,dword [esp+1Ch]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+14h]
+  movd        xmm0,dword [esi]
+  movd        xmm1,dword [esi+ecx]
+  movd        xmm2,dword [esi+ecx*2]
+  movd        xmm3,dword [esi+edx]
+  mov         esi,dword  [esp+8]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [esi+ecx]
+  movd        xmm6,dword [esi+ecx*2]
+  movd        xmm7,dword [esi+edx]
+  punpckldq   xmm0,xmm4
+  punpckldq   xmm1,xmm5
+  punpckldq   xmm2,xmm6
+  punpckldq   xmm3,xmm7
+  mov         esi,dword [esp+18h]
+  mov         edi,dword [esp+0Ch]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [edi]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm0,xmm4
+  movd        xmm4,dword [esi+ecx]
+  movd        xmm5,dword [edi+ecx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm1,xmm4
+  movd        xmm4,dword [esi+ecx*2]
+  movd        xmm5,dword [edi+ecx*2]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm2,xmm4
+  movd        xmm4,dword [esi+edx]
+  movd        xmm5,dword [edi+edx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm3,xmm4
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         edi,dword [esp+10h]
+  movdqa      [edi],xmm0
+  movdqa      [edi+10h],xmm5
+  movdqa      [edi+20h],xmm1
+  movdqa      [edi+30h],xmm6
+  movsx       ecx,word [ebp+14h]
+  movsx       edx,word [ebp+18h]
+  movdqa      xmm6,[esp+80h]
+  movdqa      xmm4,[esp+90h]
+  movdqa      xmm5,[esp+0A0h]
+  movdqa      xmm7,[esp+0B0h]
+  pxor        xmm0,xmm0
+  movd        xmm1,ecx
+  movdqa      xmm2,xmm1
+  punpcklwd   xmm2,xmm1
+  pshufd      xmm1,xmm2,0
+  movd        xmm2,edx
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm2,xmm3,0
+  movdqa      xmm3,xmm6
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+60h],xmm6
+  movdqa      xmm6,[esp+90h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+30h],xmm6
+  movdqa      xmm6,[esp+0A0h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+40h],xmm6
+  movdqa      xmm6,[esp+0B0h]
+  punpckhbw   xmm6,xmm0
+  movdqa      [esp+70h],xmm6
+  punpcklbw   xmm7,xmm0
+  punpcklbw   xmm4,xmm0
+  punpcklbw   xmm5,xmm0
+  punpcklbw   xmm3,xmm0
+  movdqa      [esp+50h],xmm7
+  movdqa      xmm6,xmm4
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  movdqa      xmm0,xmm1
+  pcmpgtw     xmm0,xmm6
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm4
+  pabsw       xmm6,xmm6
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pand        xmm0,xmm7
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+30h]
+  psubw       xmm6,[esp+40h]
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm1,xmm6
+  movdqa      xmm6,[esp+60h]
+  psubw       xmm6,[esp+30h]
+  pabsw       xmm6,xmm6
+  pand        xmm0,xmm7
+  movdqa      xmm7,xmm2
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6,[esp+70h]
+  psubw       xmm6,[esp+40h]
+  pabsw       xmm6,xmm6
+  pand        xmm1,xmm7
+  pcmpgtw     xmm2,xmm6
+  pand        xmm1,xmm2
+  mov         eax,2
+  movsx       ecx,ax
+  movd        xmm2,ecx
+  movdqa      xmm6,xmm2
+  punpcklwd   xmm6,xmm2
+  pshufd      xmm2,xmm6,0
+  movdqa      [esp+20h],xmm2
+  movdqa      xmm2,xmm3
+  paddw       xmm2,xmm3
+  paddw       xmm2,xmm4
+  paddw       xmm2,[esp+50h]
+  paddw       xmm2,[esp+20h]
+  psraw       xmm2,2
+  movdqa      xmm6,xmm0
+  pand        xmm6,xmm2
+  movdqa      xmm2,xmm0
+  pandn       xmm2,xmm4
+  por         xmm6,xmm2
+  movdqa      xmm2,[esp+60h]
+  movdqa      xmm7,xmm2
+  paddw       xmm7,xmm2
+  paddw       xmm7,[esp+30h]
+  paddw       xmm7,[esp+70h]
+  paddw       xmm7,[esp+20h]
+  movdqa      xmm4,xmm1
+  movdqa      xmm2,xmm1
+  pandn       xmm2,[esp+30h]
+  psraw       xmm7,2
+  pand        xmm4,xmm7
+  por         xmm4,xmm2
+  movdqa      xmm2,[esp+50h]
+  packuswb    xmm6,xmm4
+  movdqa      [esp+90h],xmm6
+  movdqa      xmm6,xmm2
+  paddw       xmm6,xmm2
+  movdqa      xmm2,[esp+20h]
+  paddw       xmm6,xmm5
+  paddw       xmm6,xmm3
+  movdqa      xmm4,xmm0
+  pandn       xmm0,xmm5
+  paddw       xmm6,xmm2
+  psraw       xmm6,2
+  pand        xmm4,xmm6
+  por         xmm4,xmm0
+  movdqa      xmm0,[esp+70h]
+  movdqa      xmm5,xmm0
+  paddw       xmm5,xmm0
+  movdqa      xmm0,[esp+40h]
+  paddw       xmm5,xmm0
+  paddw       xmm5,[esp+60h]
+  movdqa      xmm3,xmm1
+  paddw       xmm5,xmm2
+  psraw       xmm5,2
+  pand        xmm3,xmm5
+  pandn       xmm1,xmm0
+  por         xmm3,xmm1
+  packuswb    xmm4,xmm3
+  movdqa      [esp+0A0h],xmm4
+  mov         esi,dword [esp+10h]
+  movdqa      xmm0,[esi]
+  movdqa      xmm1,[esi+10h]
+  movdqa      xmm2,[esi+20h]
+  movdqa      xmm3,[esi+30h]
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         esi,dword [esp+1Ch]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+14h]
+  mov         edi,dword [esp+8]
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         esi,dword [esp+18h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         edi,dword [esp+0Ch]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  pop         edi
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
 ;*******************************************************************************
-;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride, 
+;    void DeblockChromaLt4H_sse2(uint8_t * pPixCb, uint8_t * pPixCr, int32_t iStride,
 ;                                int32_t iAlpha, int32_t iBeta, int8_t * pTC);
 ;*******************************************************************************
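
[Editor's note: worth noticing in the prologue below — the four int8 values in pTC are sign-extended and each duplicated into a pair of 16-bit lanes (the movsx/movzx/punpcklwd chain), so one tc appears to cover two rows and the same vector serves both the Cb and Cr halves. Roughly, with a hypothetical helper name:

    /* Mirrors the punpcklwd chain: tc0 tc0 tc1 tc1 tc2 tc2 tc3 tc3. */
    static void expand_tc(const int8_t tc[4], int16_t lanes[8]) {
        for (int i = 0; i < 8; i++)
            lanes[i] = tc[i >> 1];
    }
]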
-  
+
 WELS_EXTERN  DeblockChromaLt4H_sse2
-  
+
 ALIGN  16
 
 DeblockChromaLt4H_sse2:
-  push        ebp  
-  mov         ebp,esp 
-  and         esp,0FFFFFFF0h 
-  sub         esp,108h   
-  mov         ecx,dword [ebp+8] 
-  mov         edx,dword [ebp+0Ch] 
-  mov         eax,dword [ebp+10h] 
-  sub         ecx,2 
-  sub         edx,2 
-  push        esi  
-  lea         esi,[eax+eax*2] 
-  mov         dword [esp+10h],ecx 
-  mov         dword [esp+4],edx 
-  lea         ecx,[ecx+eax*4] 
-  lea         edx,[edx+eax*4] 
-  lea         eax,[esp+6Ch] 
-  push        edi  
-  mov         dword [esp+0Ch],esi 
-  mov         dword [esp+18h],ecx 
-  mov         dword [esp+10h],edx 
-  mov         dword [esp+1Ch],eax 
-  mov         esi,dword [esp+14h] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+0Ch] 
-  movd        xmm0,dword [esi] 
-  movd        xmm1,dword [esi+ecx] 
-  movd        xmm2,dword [esi+ecx*2] 
-  movd        xmm3,dword [esi+edx] 
-  mov         esi,dword [esp+8] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [esi+ecx] 
-  movd        xmm6,dword [esi+ecx*2] 
-  movd        xmm7,dword [esi+edx] 
-  punpckldq   xmm0,xmm4 
-  punpckldq   xmm1,xmm5 
-  punpckldq   xmm2,xmm6 
-  punpckldq   xmm3,xmm7 
-  mov         esi,dword [esp+18h] 
-  mov         edi,dword [esp+10h] 
-  movd        xmm4,dword [esi] 
-  movd        xmm5,dword [edi] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm0,xmm4 
-  movd        xmm4,dword [esi+ecx] 
-  movd        xmm5,dword [edi+ecx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm1,xmm4 
-  movd        xmm4,dword [esi+ecx*2] 
-  movd        xmm5,dword [edi+ecx*2] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm2,xmm4 
-  movd        xmm4,dword [esi+edx] 
-  movd        xmm5,dword [edi+edx] 
-  punpckldq   xmm4,xmm5 
-  punpcklqdq  xmm3,xmm4 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         edi,dword [esp+1Ch] 
-  movdqa      [edi],xmm0 
-  movdqa      [edi+10h],xmm5 
-  movdqa      [edi+20h],xmm1 
-  movdqa      [edi+30h],xmm6 
-  mov         eax,dword [ebp+1Ch] 
-  movsx       cx,byte [eax+3] 
-  movsx       dx,byte [eax+2] 
-  movsx       si,byte [eax+1] 
-  movsx       ax,byte [eax] 
-  movzx       edi,cx 
-  movzx       ecx,cx 
-  movd        xmm2,ecx 
-  movzx       ecx,dx 
-  movzx       edx,dx 
-  movd        xmm3,ecx 
-  movd        xmm4,edx 
-  movzx       ecx,si 
-  movzx       edx,si 
-  movd        xmm5,ecx 
-  pxor        xmm0,xmm0 
-  movd        xmm6,edx 
-  movzx       ecx,ax 
-  movdqa      [esp+60h],xmm0 
-  movzx       edx,ax 
-  movsx       eax,word [ebp+14h] 
-  punpcklwd   xmm6,xmm2 
-  movd        xmm1,edi 
-  movd        xmm7,ecx 
-  movsx       ecx,word [ebp+18h] 
-  movd        xmm0,edx 
-  punpcklwd   xmm7,xmm3 
-  punpcklwd   xmm5,xmm1 
-  movdqa      xmm1,[esp+60h] 
-  punpcklwd   xmm7,xmm5 
-  movdqa      xmm5,[esp+0A0h] 
-  punpcklwd   xmm0,xmm4 
-  punpcklwd   xmm0,xmm6 
-  movdqa      xmm6, [esp+70h] 
-  punpcklwd   xmm0,xmm7 
-  movdqa      xmm7,[esp+80h] 
-  movdqa      xmm2,xmm1 
-  psubw       xmm2,xmm0 
-  movdqa      [esp+0D0h],xmm2 
-  movd        xmm2,eax 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm4,xmm3,0 
-  movd        xmm2,ecx 
-  movdqa      xmm3,xmm2 
-  punpcklwd   xmm3,xmm2 
-  pshufd      xmm2,xmm3,0 
-  movdqa      xmm3, [esp+90h] 
-  movdqa      [esp+50h],xmm2 
-  movdqa      xmm2,xmm6 
-  punpcklbw   xmm2,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+40h],xmm2 
-  movdqa      [esp+0B0h],xmm6 
-  movdqa      xmm6,[esp+90h] 
-  movdqa      xmm2,xmm7 
-  punpckhbw   xmm7,xmm1 
-  punpckhbw   xmm6,xmm1 
-  punpcklbw   xmm2,xmm1 
-  punpcklbw   xmm3,xmm1 
-  punpcklbw   xmm5,xmm1 
-  movdqa      [esp+0F0h],xmm7 
-  movdqa      [esp+0C0h],xmm6 
-  movdqa      xmm6, [esp+0A0h] 
-  punpckhbw   xmm6,xmm1 
-  movdqa      [esp+0E0h],xmm6 
-  mov         edx,4 
-  movsx       eax,dx 
-  movd        xmm6,eax 
-  movdqa      xmm7,xmm6 
-  punpcklwd   xmm7,xmm6 
-  pshufd      xmm6,xmm7,0 
-  movdqa      [esp+30h],xmm6 
-  movdqa      xmm7, [esp+40h] 
-  psubw       xmm7,xmm5 
-  movdqa      xmm6,xmm0 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      [esp+60h],xmm6 
-  movdqa      xmm1, [esp+0D0h] 
-  movdqa      xmm6,xmm3 
-  psubw       xmm6,xmm2 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm7 
-  paddw       xmm6,[esp+30h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm1,xmm6 
-  movdqa      xmm7,[esp+50h] 
-  movdqa      [esp+20h],xmm0 
-  movdqa      xmm6, [esp+20h] 
-  pminsw      xmm6,xmm1 
-  movdqa      [esp+20h],xmm6 
-  movdqa      xmm6,xmm4 
-  movdqa      xmm1,xmm2 
-  psubw       xmm1,xmm3 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm6,xmm1 
-  movdqa      xmm1, [esp+40h] 
-  psubw       xmm1,xmm2 
-  pabsw       xmm1,xmm1 
-  pcmpgtw     xmm7,xmm1 
-  movdqa      xmm1, [esp+50h] 
-  pand        xmm6,xmm7 
-  movdqa      xmm7, [esp+50h] 
-  psubw       xmm5,xmm3 
-  pabsw       xmm5,xmm5 
-  pcmpgtw     xmm1,xmm5 
-  movdqa      xmm5, [esp+0B0h] 
-  psubw       xmm5,[esp+0E0h] 
-  pand        xmm6,xmm1 
-  pand        xmm6, [esp+60h] 
-  movdqa      xmm1, [esp+20h] 
-  pand        xmm1,xmm6 
-  movdqa      xmm6, [esp+0C0h] 
-  movdqa      [esp+40h],xmm1 
-  movdqa      xmm1, [esp+0F0h] 
-  psubw       xmm6,xmm1 
-  psllw       xmm6,2 
-  paddw       xmm6,xmm5 
-  paddw       xmm6, [esp+30h] 
-  movdqa      xmm5, [esp+0D0h] 
-  psraw       xmm6,3 
-  pmaxsw      xmm5,xmm6 
-  pminsw      xmm0,xmm5 
-  movdqa      xmm5,[esp+0C0h] 
-  movdqa      xmm6,xmm1 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm4,xmm6 
-  movdqa      xmm6,[esp+0B0h] 
-  psubw       xmm6,xmm1 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  movdqa      xmm6, [esp+0E0h] 
-  pand        xmm4,xmm7 
-  movdqa      xmm7, [esp+50h] 
-  psubw       xmm6,xmm5 
-  pabsw       xmm6,xmm6 
-  pcmpgtw     xmm7,xmm6 
-  pand        xmm4,xmm7 
-  pand        xmm4,[esp+60h] 
-  pand        xmm0,xmm4 
-  movdqa      xmm4, [esp+40h] 
-  paddw       xmm2,xmm4 
-  paddw       xmm1,xmm0 
-  psubw       xmm3,xmm4 
-  psubw       xmm5,xmm0 
-  packuswb    xmm2,xmm1 
-  packuswb    xmm3,xmm5 
-  movdqa      [esp+80h],xmm2 
-  movdqa      [esp+90h],xmm3 
-  mov         esi,dword [esp+1Ch] 
-  movdqa      xmm0, [esi] 
-  movdqa      xmm1, [esi+10h] 
-  movdqa      xmm2, [esi+20h] 
-  movdqa      xmm3, [esi+30h] 
-  movdqa      xmm6,xmm0 
-  punpcklbw   xmm0,xmm1 
-  punpckhbw   xmm6,xmm1 
-  movdqa      xmm7,xmm2 
-  punpcklbw   xmm2,xmm3 
-  punpckhbw   xmm7,xmm3 
-  movdqa      xmm4,xmm0 
-  movdqa      xmm5,xmm6 
-  punpcklwd   xmm0,xmm2 
-  punpckhwd   xmm4,xmm2 
-  punpcklwd   xmm6,xmm7 
-  punpckhwd   xmm5,xmm7 
-  movdqa      xmm1,xmm0 
-  movdqa      xmm2,xmm4 
-  punpckldq   xmm0,xmm6 
-  punpckhdq   xmm1,xmm6 
-  punpckldq   xmm4,xmm5 
-  punpckhdq   xmm2,xmm5 
-  movdqa      xmm5,xmm0 
-  movdqa      xmm6,xmm1 
-  punpcklqdq  xmm0,xmm4 
-  punpckhqdq  xmm5,xmm4 
-  punpcklqdq  xmm1,xmm2 
-  punpckhqdq  xmm6,xmm2 
-  mov         esi,dword [esp+14h] 
-  mov         ecx,dword [ebp+10h] 
-  mov         edx,dword [esp+0Ch] 
-  mov         edi,dword [esp+8] 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         esi,dword [esp+18h] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  movd        dword [esi],xmm0 
-  movd        dword [esi+ecx],xmm5 
-  movd        dword [esi+ecx*2],xmm1 
-  movd        dword [esi+edx],xmm6 
-  psrldq      xmm0,4 
-  psrldq      xmm5,4 
-  psrldq      xmm1,4 
-  psrldq      xmm6,4 
-  mov         edi,dword [esp+10h] 
-  movd        dword [edi],xmm0 
-  movd        dword [edi+ecx],xmm5 
-  movd        dword [edi+ecx*2],xmm1 
-  movd        dword [edi+edx],xmm6  
-  pop         edi  
-  pop         esi   
-  mov         esp,ebp 
-  pop         ebp  
-  ret     
-  
-  
-  
+  push        ebp
+  mov         ebp,esp
+  and         esp,0FFFFFFF0h
+  sub         esp,108h
+  mov         ecx,dword [ebp+8]
+  mov         edx,dword [ebp+0Ch]
+  mov         eax,dword [ebp+10h]
+  sub         ecx,2
+  sub         edx,2
+  push        esi
+  lea         esi,[eax+eax*2]
+  mov         dword [esp+10h],ecx
+  mov         dword [esp+4],edx
+  lea         ecx,[ecx+eax*4]
+  lea         edx,[edx+eax*4]
+  lea         eax,[esp+6Ch]
+  push        edi
+  mov         dword [esp+0Ch],esi
+  mov         dword [esp+18h],ecx
+  mov         dword [esp+10h],edx
+  mov         dword [esp+1Ch],eax
+  mov         esi,dword [esp+14h]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+0Ch]
+  movd        xmm0,dword [esi]
+  movd        xmm1,dword [esi+ecx]
+  movd        xmm2,dword [esi+ecx*2]
+  movd        xmm3,dword [esi+edx]
+  mov         esi,dword [esp+8]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [esi+ecx]
+  movd        xmm6,dword [esi+ecx*2]
+  movd        xmm7,dword [esi+edx]
+  punpckldq   xmm0,xmm4
+  punpckldq   xmm1,xmm5
+  punpckldq   xmm2,xmm6
+  punpckldq   xmm3,xmm7
+  mov         esi,dword [esp+18h]
+  mov         edi,dword [esp+10h]
+  movd        xmm4,dword [esi]
+  movd        xmm5,dword [edi]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm0,xmm4
+  movd        xmm4,dword [esi+ecx]
+  movd        xmm5,dword [edi+ecx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm1,xmm4
+  movd        xmm4,dword [esi+ecx*2]
+  movd        xmm5,dword [edi+ecx*2]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm2,xmm4
+  movd        xmm4,dword [esi+edx]
+  movd        xmm5,dword [edi+edx]
+  punpckldq   xmm4,xmm5
+  punpcklqdq  xmm3,xmm4
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         edi,dword [esp+1Ch]
+  movdqa      [edi],xmm0
+  movdqa      [edi+10h],xmm5
+  movdqa      [edi+20h],xmm1
+  movdqa      [edi+30h],xmm6
+  mov         eax,dword [ebp+1Ch]
+  movsx       cx,byte [eax+3]
+  movsx       dx,byte [eax+2]
+  movsx       si,byte [eax+1]
+  movsx       ax,byte [eax]
+  movzx       edi,cx
+  movzx       ecx,cx
+  movd        xmm2,ecx
+  movzx       ecx,dx
+  movzx       edx,dx
+  movd        xmm3,ecx
+  movd        xmm4,edx
+  movzx       ecx,si
+  movzx       edx,si
+  movd        xmm5,ecx
+  pxor        xmm0,xmm0
+  movd        xmm6,edx
+  movzx       ecx,ax
+  movdqa      [esp+60h],xmm0
+  movzx       edx,ax
+  movsx       eax,word [ebp+14h]
+  punpcklwd   xmm6,xmm2
+  movd        xmm1,edi
+  movd        xmm7,ecx
+  movsx       ecx,word [ebp+18h]
+  movd        xmm0,edx
+  punpcklwd   xmm7,xmm3
+  punpcklwd   xmm5,xmm1
+  movdqa      xmm1,[esp+60h]
+  punpcklwd   xmm7,xmm5
+  movdqa      xmm5,[esp+0A0h]
+  punpcklwd   xmm0,xmm4
+  punpcklwd   xmm0,xmm6
+  movdqa      xmm6, [esp+70h]
+  punpcklwd   xmm0,xmm7
+  movdqa      xmm7,[esp+80h]
+  movdqa      xmm2,xmm1
+  psubw       xmm2,xmm0
+  movdqa      [esp+0D0h],xmm2
+  movd        xmm2,eax
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm4,xmm3,0
+  movd        xmm2,ecx
+  movdqa      xmm3,xmm2
+  punpcklwd   xmm3,xmm2
+  pshufd      xmm2,xmm3,0
+  movdqa      xmm3, [esp+90h]
+  movdqa      [esp+50h],xmm2
+  movdqa      xmm2,xmm6
+  punpcklbw   xmm2,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+40h],xmm2
+  movdqa      [esp+0B0h],xmm6
+  movdqa      xmm6,[esp+90h]
+  movdqa      xmm2,xmm7
+  punpckhbw   xmm7,xmm1
+  punpckhbw   xmm6,xmm1
+  punpcklbw   xmm2,xmm1
+  punpcklbw   xmm3,xmm1
+  punpcklbw   xmm5,xmm1
+  movdqa      [esp+0F0h],xmm7
+  movdqa      [esp+0C0h],xmm6
+  movdqa      xmm6, [esp+0A0h]
+  punpckhbw   xmm6,xmm1
+  movdqa      [esp+0E0h],xmm6
+  mov         edx,4
+  movsx       eax,dx
+  movd        xmm6,eax
+  movdqa      xmm7,xmm6
+  punpcklwd   xmm7,xmm6
+  pshufd      xmm6,xmm7,0
+  movdqa      [esp+30h],xmm6
+  movdqa      xmm7, [esp+40h]
+  psubw       xmm7,xmm5
+  movdqa      xmm6,xmm0
+  pcmpgtw     xmm6,xmm1
+  movdqa      [esp+60h],xmm6
+  movdqa      xmm1, [esp+0D0h]
+  movdqa      xmm6,xmm3
+  psubw       xmm6,xmm2
+  psllw       xmm6,2
+  paddw       xmm6,xmm7
+  paddw       xmm6,[esp+30h]
+  psraw       xmm6,3
+  pmaxsw      xmm1,xmm6
+  movdqa      xmm7,[esp+50h]
+  movdqa      [esp+20h],xmm0
+  movdqa      xmm6, [esp+20h]
+  pminsw      xmm6,xmm1
+  movdqa      [esp+20h],xmm6
+  movdqa      xmm6,xmm4
+  movdqa      xmm1,xmm2
+  psubw       xmm1,xmm3
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm6,xmm1
+  movdqa      xmm1, [esp+40h]
+  psubw       xmm1,xmm2
+  pabsw       xmm1,xmm1
+  pcmpgtw     xmm7,xmm1
+  movdqa      xmm1, [esp+50h]
+  pand        xmm6,xmm7
+  movdqa      xmm7, [esp+50h]
+  psubw       xmm5,xmm3
+  pabsw       xmm5,xmm5
+  pcmpgtw     xmm1,xmm5
+  movdqa      xmm5, [esp+0B0h]
+  psubw       xmm5,[esp+0E0h]
+  pand        xmm6,xmm1
+  pand        xmm6, [esp+60h]
+  movdqa      xmm1, [esp+20h]
+  pand        xmm1,xmm6
+  movdqa      xmm6, [esp+0C0h]
+  movdqa      [esp+40h],xmm1
+  movdqa      xmm1, [esp+0F0h]
+  psubw       xmm6,xmm1
+  psllw       xmm6,2
+  paddw       xmm6,xmm5
+  paddw       xmm6, [esp+30h]
+  movdqa      xmm5, [esp+0D0h]
+  psraw       xmm6,3
+  pmaxsw      xmm5,xmm6
+  pminsw      xmm0,xmm5
+  movdqa      xmm5,[esp+0C0h]
+  movdqa      xmm6,xmm1
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm4,xmm6
+  movdqa      xmm6,[esp+0B0h]
+  psubw       xmm6,xmm1
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  movdqa      xmm6, [esp+0E0h]
+  pand        xmm4,xmm7
+  movdqa      xmm7, [esp+50h]
+  psubw       xmm6,xmm5
+  pabsw       xmm6,xmm6
+  pcmpgtw     xmm7,xmm6
+  pand        xmm4,xmm7
+  pand        xmm4,[esp+60h]
+  pand        xmm0,xmm4
+  movdqa      xmm4, [esp+40h]
+  paddw       xmm2,xmm4
+  paddw       xmm1,xmm0
+  psubw       xmm3,xmm4
+  psubw       xmm5,xmm0
+  packuswb    xmm2,xmm1
+  packuswb    xmm3,xmm5
+  movdqa      [esp+80h],xmm2
+  movdqa      [esp+90h],xmm3
+  mov         esi,dword [esp+1Ch]
+  movdqa      xmm0, [esi]
+  movdqa      xmm1, [esi+10h]
+  movdqa      xmm2, [esi+20h]
+  movdqa      xmm3, [esi+30h]
+  movdqa      xmm6,xmm0
+  punpcklbw   xmm0,xmm1
+  punpckhbw   xmm6,xmm1
+  movdqa      xmm7,xmm2
+  punpcklbw   xmm2,xmm3
+  punpckhbw   xmm7,xmm3
+  movdqa      xmm4,xmm0
+  movdqa      xmm5,xmm6
+  punpcklwd   xmm0,xmm2
+  punpckhwd   xmm4,xmm2
+  punpcklwd   xmm6,xmm7
+  punpckhwd   xmm5,xmm7
+  movdqa      xmm1,xmm0
+  movdqa      xmm2,xmm4
+  punpckldq   xmm0,xmm6
+  punpckhdq   xmm1,xmm6
+  punpckldq   xmm4,xmm5
+  punpckhdq   xmm2,xmm5
+  movdqa      xmm5,xmm0
+  movdqa      xmm6,xmm1
+  punpcklqdq  xmm0,xmm4
+  punpckhqdq  xmm5,xmm4
+  punpcklqdq  xmm1,xmm2
+  punpckhqdq  xmm6,xmm2
+  mov         esi,dword [esp+14h]
+  mov         ecx,dword [ebp+10h]
+  mov         edx,dword [esp+0Ch]
+  mov         edi,dword [esp+8]
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         esi,dword [esp+18h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  movd        dword [esi],xmm0
+  movd        dword [esi+ecx],xmm5
+  movd        dword [esi+ecx*2],xmm1
+  movd        dword [esi+edx],xmm6
+  psrldq      xmm0,4
+  psrldq      xmm5,4
+  psrldq      xmm1,4
+  psrldq      xmm6,4
+  mov         edi,dword [esp+10h]
+  movd        dword [edi],xmm0
+  movd        dword [edi+ecx],xmm5
+  movd        dword [edi+ecx*2],xmm1
+  movd        dword [edi+edx],xmm6
+  pop         edi
+  pop         esi
+  mov         esp,ebp
+  pop         ebp
+  ret
+
+
+
 ;*******************************************************************************
-;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
+;    void DeblockLumaLt4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
 ;                                 int32_t iBeta, int8_t * pTC)
 ;*******************************************************************************
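
[Editor's note: the bS<4 luma filter adds two things over the chroma case — the clipping bound grows with the inner-sample activity (ap/aq), and p1/q1 may move too. A scalar sketch per the spec formulas, reusing clip3 from the chroma sketch; xstride is the distance across the edge, names illustrative:

    static void luma_lt4(uint8_t *pix, int xstride, int alpha, int beta, int tc0) {
        uint8_t *p2 = pix - 3 * xstride, *p1 = pix - 2 * xstride, *p0 = pix - xstride;
        uint8_t *q0 = pix, *q1 = pix + xstride, *q2 = pix + 2 * xstride;
        if (tc0 < 0) return;
        if (abs(*p0 - *q0) >= alpha || abs(*p1 - *p0) >= beta || abs(*q1 - *q0) >= beta)
            return;
        int ap = abs(*p2 - *p0), aq = abs(*q2 - *q0);
        int tc = tc0 + (ap < beta) + (aq < beta);
        int d  = clip3(-tc, tc, (((*q0 - *p0) << 2) + (*p1 - *q1) + 4) >> 3);
        int np0 = clip3(0, 255, *p0 + d), nq0 = clip3(0, 255, *q0 - d);
        if (ap < beta)                          /* p1 moves by at most tc0 */
            *p1 = (uint8_t)(*p1 + clip3(-tc0, tc0,
                            (*p2 + ((*p0 + *q0 + 1) >> 1) - 2 * (*p1)) >> 1));
        if (aq < beta)
            *q1 = (uint8_t)(*q1 + clip3(-tc0, tc0,
                            (*q2 + ((*p0 + *q0 + 1) >> 1) - 2 * (*q1)) >> 1));
        *p0 = (uint8_t)np0;
        *q0 = (uint8_t)nq0;
    }
]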
-  
 
+
 WELS_EXTERN  DeblockLumaLt4V_sse2
-  
+
 ALIGN  16
 
 DeblockLumaLt4V_sse2:
@@ -1419,12 +1419,12 @@
 
 
 ;*******************************************************************************
-;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha, 
+;    void DeblockLumaEq4V_sse2(uint8_t * pPix, int32_t iStride, int32_t iAlpha,
 ;                                 int32_t iBeta)
 ;*******************************************************************************
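
[Editor's note: the Eq4 luma path takes no tc at all — per side it chooses between a strong smoothing and a weak 3-tap fallback, gated by the extra test |p0-q0| < (alpha>>2)+2. A sketch of the p side per the spec formulas (q mirrors it, and the real code derives both sides from the original samples before storing):

    static void luma_eq4_p_side(uint8_t *pix, int xstride, int alpha, int beta) {
        uint8_t *p3 = pix - 4 * xstride, *p2 = pix - 3 * xstride;
        uint8_t *p1 = pix - 2 * xstride, *p0 = pix - xstride;
        uint8_t *q0 = pix, *q1 = pix + xstride;
        if (abs(*p0 - *q0) >= alpha || abs(*p1 - *p0) >= beta || abs(*q1 - *q0) >= beta)
            return;
        if (abs(*p2 - *p0) < beta && abs(*p0 - *q0) < (alpha >> 2) + 2) {
            int np0 = (*p2 + 2 * (*p1) + 2 * (*p0) + 2 * (*q0) + *q1 + 4) >> 3;
            int np1 = (*p2 + *p1 + *p0 + *q0 + 2) >> 2;
            int np2 = (2 * (*p3) + 3 * (*p2) + *p1 + *p0 + *q0 + 4) >> 3;
            *p2 = (uint8_t)np2; *p1 = (uint8_t)np1; *p0 = (uint8_t)np0;
        } else {
            *p0 = (uint8_t)((2 * (*p1) + *p0 + *q1 + 2) >> 2);
        }
    }
]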
 
 WELS_EXTERN  DeblockLumaEq4V_sse2
-  
+
 ALIGN  16
 
 DeblockLumaEq4V_sse2:
@@ -1965,11 +1965,11 @@
 	mov	esp, ebp
 	pop	ebp
 	ret
-  
-    
+
+
 ;********************************************************************************
 ;
-;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);     
+;   void DeblockLumaTransposeH2V_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pDst);
 ;
 ;********************************************************************************
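
[Editor's note: the punpck cascade below gathers a 16-row by 8-column block straddling a vertical luma edge and stores it transposed — 8 rows of 16 bytes — so the vertical-edge filters above can run on it unchanged. Scalar equivalent, as a sketch:

    static void transpose_h2v(const uint8_t *pix, int stride, uint8_t dst[8][16]) {
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 8; x++)
                dst[x][y] = pix[y * stride + x];
    }
]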
 
@@ -1982,49 +1982,49 @@
     push    ebx
     mov     ebp,   esp
     and     esp,0FFFFFFF0h
-    sub     esp,   10h    
-    
-    mov     eax,   [ebp + 0Ch]  
+    sub     esp,   10h
+
+    mov     eax,   [ebp + 0Ch]
     mov     ecx,   [ebp + 10h]
     lea     edx,   [eax + ecx * 8]
     lea     ebx,   [ecx*3]
-    
-    movq    xmm0,  [eax] 
+
+    movq    xmm0,  [eax]
     movq    xmm7,  [edx]
-    punpcklqdq   xmm0,  xmm7  
+    punpcklqdq   xmm0,  xmm7
     movq    xmm1,  [eax + ecx]
     movq    xmm7,  [edx + ecx]
     punpcklqdq   xmm1,  xmm7
-    movq    xmm2,  [eax + ecx*2] 
+    movq    xmm2,  [eax + ecx*2]
     movq    xmm7,  [edx + ecx*2]
     punpcklqdq   xmm2,  xmm7
     movq    xmm3,  [eax + ebx]
     movq    xmm7,  [edx + ebx]
     punpcklqdq   xmm3,  xmm7
-    
+
     lea     eax,   [eax + ecx * 4]
     lea     edx,   [edx + ecx * 4]
-    movq    xmm4,  [eax] 
+    movq    xmm4,  [eax]
     movq    xmm7,  [edx]
-    punpcklqdq   xmm4,  xmm7  
+    punpcklqdq   xmm4,  xmm7
     movq    xmm5,  [eax + ecx]
     movq    xmm7,  [edx + ecx]
     punpcklqdq   xmm5,  xmm7
-    movq    xmm6,  [eax + ecx*2] 
+    movq    xmm6,  [eax + ecx*2]
     movq    xmm7,  [edx + ecx*2]
     punpcklqdq   xmm6,  xmm7
-    
+
     movdqa  [esp],   xmm0
     movq    xmm7,  [eax + ebx]
     movq    xmm0,  [edx + ebx]
     punpcklqdq   xmm7,  xmm0
     movdqa  xmm0,   [esp]
-    
+
     SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
     ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-    
+
     mov    eax,   [ebp + 14h]
-    movdqa  [eax],    xmm4 
+    movdqa  [eax],    xmm4
     movdqa  [eax + 10h],  xmm2
     movdqa  [eax + 20h],  xmm3
     movdqa  [eax + 30h],  xmm7
@@ -2031,15 +2031,15 @@
     movdqa  [eax + 40h],  xmm5
     movdqa  [eax + 50h],  xmm1
     movdqa  [eax + 60h],  xmm6
-    movdqa  [eax + 70h],  xmm0   
-    
+    movdqa  [eax + 70h],  xmm0
+
     mov     esp,   ebp
     pop     ebx
     pop     ebp
     ret
-    
-    
-    
+
+
+
 ;*******************************************************************************************
 ;
 ;   void DeblockLumaTransposeV2H_sse2(uint8_t * pPixY, int32_t iStride, uint8_t * pSrc);
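
[Editor's note: this is the inverse of the H2V gather — after filtering, the 8x16 buffer is scattered back into the picture as 16 rows of 8 bytes (movq the low halves, psrldq 8, movq the high halves). Sketch:

    static void transpose_v2h(uint8_t *pix, int stride, const uint8_t src[8][16]) {
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 8; x++)
                pix[y * stride + x] = src[x][y];
    }
]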
@@ -2053,14 +2053,14 @@
 DeblockLumaTransposeV2H_sse2:
     push     ebp
     mov      ebp,   esp
-    
+
     and     esp,  0FFFFFFF0h
-    sub     esp,   10h  
-    
-    mov      eax,   [ebp + 10h]  
+    sub     esp,   10h
+
+    mov      eax,   [ebp + 10h]
     mov      ecx,   [ebp + 0Ch]
     mov      edx,   [ebp + 08h]
-      
+
     movdqa   xmm0,  [eax]
     movdqa   xmm1,  [eax + 10h]
     movdqa   xmm2,  [eax + 20h]
@@ -2069,23 +2069,23 @@
     movdqa   xmm5,	[eax + 50h]
     movdqa   xmm6,	[eax + 60h]
     movdqa   xmm7,	[eax + 70h]
-    
+
     SSE2_TransTwo8x8B  xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [esp]
     ;pOut: m5, m3, m4, m8, m6, m2, m7, m1
-    
+
     lea      eax,   [ecx * 3]
-    
-    movq     [edx],  xmm4 
+
+    movq     [edx],  xmm4
     movq     [edx + ecx],  xmm2
     movq     [edx + ecx*2],  xmm3
     movq     [edx + eax],  xmm7
-    
+
     lea      edx,   [edx + ecx*4]
-    movq     [edx],  xmm5 
+    movq     [edx],  xmm5
     movq     [edx + ecx],  xmm1
     movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0    
-    
+    movq     [edx + eax],  xmm0
+
     psrldq    xmm4,   8
     psrldq    xmm2,   8
     psrldq    xmm3,   8
@@ -2094,20 +2094,20 @@
     psrldq    xmm1,   8
     psrldq    xmm6,   8
     psrldq    xmm0,   8
-    
+
     lea       edx,  [edx + ecx*4]
-    movq     [edx],  xmm4 
+    movq     [edx],  xmm4
     movq     [edx + ecx],  xmm2
     movq     [edx + ecx*2],  xmm3
     movq     [edx + eax],  xmm7
-    
+
     lea      edx,   [edx + ecx*4]
-    movq     [edx],  xmm5 
+    movq     [edx],  xmm5
     movq     [edx + ecx],  xmm1
     movq     [edx + ecx*2],  xmm6
-    movq     [edx + eax],  xmm0   
-    
-    
+    movq     [edx + eax],  xmm0
+
+
     mov      esp,   ebp
     pop      ebp
     ret
\ No newline at end of file
--- a/codec/encoder/core/asm/expand_picture.asm
+++ b/codec/encoder/core/asm/expand_picture.asm
@@ -153,11 +153,11 @@
 	lea %1, [%1+%2]
 %endmacro
 
-%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]		
+%macro exp_top_bottom_sse2	1	; iPaddingSize [luma(32)/chroma(16)]
 	; ebx [width/16(8)]
 	; esi [pSrc+0], edi [pSrc-1], ecx [-stride], 32(16)		; top
 	; eax [pSrc+(h-1)*stride], ebp [pSrc+(h+31)*stride], 32(16)	; bottom
-		
+
 %if %1 == 32		; for luma
 	sar ebx, 04h 	; width / 16(8) pixels
 .top_bottom_loops:
@@ -171,7 +171,7 @@
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_end16x4_sse2 edi, ecx, xmm0, a
-	
+
 	; bottom
 	movdqa xmm1, [eax] 		; last line of picture pData
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
@@ -182,15 +182,15 @@
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
-		
+
 	lea esi, [esi+16]		; top pSrc
 	lea edi, [edi+16]		; top dst
 	lea eax, [eax+16]		; bottom pSrc
 	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?	
-	
+	neg ecx 			; positive/negative stride need for next loop?
+
 	dec ebx
-	jnz near .top_bottom_loops		
+	jnz near .top_bottom_loops
 %elif %1 == 16	; for chroma ??
 	mov edx, ebx
 	sar ebx, 04h 	; (width / 16) pixels
@@ -200,21 +200,21 @@
 	mov_line_16x4_sse2 edi, ecx, xmm0, a	; dst, stride, xmm?
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
 	mov_line_16x4_sse2 edi, ecx, xmm0, a
-	mov_line_end16x4_sse2 edi, ecx, xmm0, a	
-	
+	mov_line_end16x4_sse2 edi, ecx, xmm0, a
+
 	; bottom
 	movdqa xmm1, [eax] 		; last line of picture pData
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a	; dst, stride, xmm?
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
 	mov_line_16x4_sse2 ebp, ecx, xmm1, a
-	mov_line_end16x4_sse2 ebp, ecx, xmm1, a	
-		
+	mov_line_end16x4_sse2 ebp, ecx, xmm1, a
+
 	lea esi, [esi+16]		; top pSrc
 	lea edi, [edi+16]		; top dst
 	lea eax, [eax+16]		; bottom pSrc
 	lea ebp, [ebp+16]		; bottom dst
-	neg ecx 			; positive/negative stride need for next loop?	
-	
+	neg ecx 			; positive/negative stride need for next loop?
+
 	dec ebx
 	jnz near .top_bottom_loops
 
@@ -241,13 +241,13 @@
 %endif
 %endmacro
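
[Editor's note: a scalar equivalent of exp_top_bottom_sse2 — replicate the first and last picture rows into the padding bands above and below, 32 rows for luma and 16 for chroma per the macro parameter. A sketch, assuming pic points at the first pixel of the first row:

    #include <string.h>

    static void expand_top_bottom(uint8_t *pic, int stride, int width,
                                  int height, int pad) { /* pad = 32 or 16 */
        const uint8_t *last = pic + (height - 1) * stride;
        for (int i = 1; i <= pad; i++) {
            memcpy(pic - i * stride, pic, width);    /* top band */
            memcpy(last + i * stride, last, width);  /* bottom band */
        }
    }
]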
 
-%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a	
+%macro exp_left_right_sse2	2	; iPaddingSize [luma(32)/chroma(16)], u/a
 	; ecx [height]
 	; esi [pSrc+0], 	   edi [pSrc-32], edx [stride], 32(16)	; left
 	; ebx [pSrc+(w-1)], ebp [pSrc+w], 32(16)			; right
 ;	xor eax, eax 	; for pixel pData (uint8_t)		; make sure eax=0 at least high 24 bits of eax = 0
-	
-%if %1 == 32		; for luma	
+
+%if %1 == 32		; for luma
 .left_right_loops:
 	; left
 	mov al, byte [esi]		; pixel pData for left border
@@ -254,37 +254,37 @@
 	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdqa [edi], xmm0
 	movdqa [edi+16], xmm0
-	
+
 	; right
 	mov al, byte [ebx]
 	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdqa [ebp], xmm1
 	movdqa [ebp+16], xmm1
-	
+
 	lea esi, [esi+edx]		; left pSrc
 	lea edi, [edi+edx]		; left dst
 	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst	
-	
+	lea ebp, [ebp+edx]		; right dst
+
 	dec ecx
-	jnz near .left_right_loops		
-%elif %1 == 16	; for chroma ??	
+	jnz near .left_right_loops
+%elif %1 == 16	; for chroma ??
 .left_right_loops:
 	; left
 	mov al, byte [esi]		; pixel pData for left border
 	butterfly_1to16_sse	xmm0, xmm1, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
-	movdqa [edi], xmm0	
-	
+	movdqa [edi], xmm0
+
 	; right
 	mov al, byte [ebx]
 	butterfly_1to16_sse	xmm1, xmm2, a				; dst, tmp, pSrc [generic register name: a/b/c/d]
 	movdq%2 [ebp], xmm1								; might not be aligned 16 bytes in case chroma planes
-	
+
 	lea esi, [esi+edx]		; left pSrc
 	lea edi, [edi+edx]		; left dst
 	lea ebx, [ebx+edx]		; right pSrc
-	lea ebp, [ebp+edx]		; right dst	
-	
+	lea ebp, [ebp+edx]		; right dst
+
 	dec ecx
 	jnz near .left_right_loops
 %endif
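
[Editor's note: the left/right counterpart — every row's edge pixel is smeared across the padding columns (butterfly_1to16_sse broadcasts the byte; the luma path issues two 16-byte stores per side). Scalar sketch, same assumptions as above:

    static void expand_left_right(uint8_t *pic, int stride, int width,
                                  int height, int pad) {
        for (int y = 0; y < height; y++, pic += stride) {
            memset(pic - pad, pic[0], pad);            /* left band */
            memset(pic + width, pic[width - 1], pad);  /* right band */
        }
    }
]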
@@ -337,25 +337,25 @@
 	; TL
 	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?	
+	mov_line_16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 	mov_line_end16x4_sse2	edi, ecx, xmm3, a	; dst, stride, xmm?
 
 	; TR
 	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
 	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?	
+	mov_line_16x4_sse2	ebp, ecx, xmm4, %2	; dst, stride, xmm?
 	mov_line_end16x4_sse2 ebp, ecx, xmm4, %2	; dst, stride, xmm?
 
 	; BL
 	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
-	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?	
+	mov_line_16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 	mov_line_end16x4_sse2	eax, ecx, xmm5, a	; dst, stride, xmm?
 
 	; BR
 	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
-	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?	
+	mov_line_16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 	mov_line_end16x4_sse2	ebx, ecx, xmm6, %2	; dst, stride, xmm?
 %endif
 %endmacro
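
[Editor's note: exp_cross_sse2 finishes the job — the four pad-by-pad corner blocks are filled with the nearest corner pixel (top-left/top-right from the first row, bottom-left/bottom-right from the last, matching the xmm3..xmm6 setup in the callers below). A sketch for one corner, names illustrative:

    /* dst = first pixel of the corner block, v = nearest corner sample. */
    static void expand_corner(uint8_t *dst, int stride, int pad, uint8_t v) {
        for (int y = 0; y < pad; y++)
            memset(dst + y * stride, v, pad);
    }
]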
@@ -373,7 +373,7 @@
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -385,10 +385,10 @@
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; stride	
+	mov ecx, edx							; stride
 	neg ecx 								; -stride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*stride
 	lea eax, [esi+eax]						; last line of picture pData
@@ -396,16 +396,16 @@
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 32 * stride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; width-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; width
-	exp_top_bottom_sse2	32	
-	
+	exp_top_bottom_sse2	32
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@@ -417,7 +417,7 @@
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
@@ -424,7 +424,7 @@
 	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	32, a
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -434,7 +434,7 @@
 	; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, to be continued..
 	mov eax, -32							; luma=-32, chroma=-16
 	neg ecx										; -stride
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
 	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
@@ -442,19 +442,19 @@
 	mov ecx, [esp+28]					; stride
 	imul edx, ecx							; (height+32(16)) * stride
 	lea eax, [edi+edx]						; last line of bottom-left border
-	lea ebx, [ebp+edx]						; last line of bottom-right border	
+	lea ebx, [ebp+edx]						; last line of bottom-right border
 	neg ecx										; -stride
 	; for left & right border expanding
-	exp_cross_sse2		32, a	
-	
+	exp_cross_sse2		32, a
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret
 
 ALIGN 16
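The three expand routines touched by this hunk series (luma with a 32-pixel border, chroma with 16) all perform the same edge replication: top and bottom rows are copied outward, edge columns are smeared sideways, and the four corners are filled from the corner pixels prepared in xmm3..xmm6. A minimal C restatement for orientation only — ExpandPictureC and its parameter names are invented for this sketch and are not part of this patch or the library:

#include <stdint.h>

/* pPic points at the top-left picture sample inside a frame buffer
 * that has at least iPad samples of margin on every side; iStride is
 * the frame buffer stride. iPad is 32 for luma, 16 for chroma. */
static void ExpandPictureC (uint8_t* pPic, int32_t iStride,
                            int32_t iWidth, int32_t iHeight, int32_t iPad) {
  int32_t i, j;
  /* left and right borders: replicate the edge sample of every row */
  for (j = 0; j < iHeight; j++) {
    uint8_t* pRow = pPic + j * iStride;
    for (i = 1; i <= iPad; i++) {
      pRow[-i]             = pRow[0];
      pRow[iWidth - 1 + i] = pRow[iWidth - 1];
    }
  }
  /* top and bottom borders (corners included, since the rows are
   * already horizontally padded): replicate whole first/last rows */
  for (j = 1; j <= iPad; j++) {
    uint8_t* pTop = pPic - iPad;
    uint8_t* pBot = pPic + (iHeight - 1) * iStride - iPad;
    for (i = 0; i < iWidth + 2 * iPad; i++) {
      pTop[-j * iStride + i] = pTop[i];
      pBot[ j * iStride + i] = pBot[i];
    }
  }
}

The assembly orders the passes differently (top/bottom first, then left/right, then the corner blocks), but the resulting padded frame is the same.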
@@ -470,7 +470,7 @@
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -482,10 +482,10 @@
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; stride	
+	mov ecx, edx							; stride
 	neg ecx 								; -stride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*stride
 	lea eax, [esi+eax]						; last line of picture pData
@@ -493,16 +493,16 @@
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 16 * stride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; width-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; width
-	exp_top_bottom_sse2	16	
-	
+	exp_top_bottom_sse2	16
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@@ -514,7 +514,7 @@
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
@@ -521,7 +521,7 @@
 	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	16, a
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -531,9 +531,9 @@
 	; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, to be continued..
 	mov eax, -16							; chroma=-16
 	neg ecx										; -stride
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]				
+	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
 	mov ecx, [esp+28]						; stride
 	add edx, 16							; height+16, luma=32, chroma=16
@@ -543,15 +543,15 @@
 	neg ecx										; -stride
 	; for left & right border expanding
 	exp_cross_sse2		16, a
-	
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret
 
 ALIGN 16
@@ -567,7 +567,7 @@
 	push esi
 	push edi
 	push ebp
-	
+
 	; for both top and bottom border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -579,10 +579,10 @@
 	mov cl, byte [esi]
 	butterfly_1to16_sse xmm3, xmm4, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; load top border
-	mov ecx, edx							; stride	
+	mov ecx, edx							; stride
 	neg ecx 								; -stride
 	lea edi, [esi+ecx]						; last line of top border
-	; load bottom border 
+	; load bottom border
 	dec eax									; h-1
 	imul eax, edx 							; (h-1)*stride
 	lea eax, [esi+eax]						; last line of picture pData
@@ -590,16 +590,16 @@
 	lea ebp, [eax+edx]						; last line of bottom border, (h-1)*stride + 16 * stride
 	; also prepare for cross border pData: bottom-left with xmm5, bottom-right xmm6
 	dec ebx									; width-1
-	lea ebx, [eax+ebx]						; dst[w-1][h-1]	
+	lea ebx, [eax+ebx]						; dst[w-1][h-1]
 ;	xor edx, edx
 	mov dl, byte [eax]						; bottom-left
 	butterfly_1to16_sse xmm5, xmm6, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	mov dl, byte [ebx]						; bottom-right
 	butterfly_1to16_sse xmm6, xmm4, d		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	; for top & bottom expanding	
+	; for top & bottom expanding
 	mov ebx, [esp+32]						; width
-	exp_top_bottom_sse2	16	
-	
+	exp_top_bottom_sse2	16
+
 	; for both left and right border
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst: left border pSrc
@@ -611,7 +611,7 @@
 	lea edi, [esi+eax]						; left border dst
 	dec ebx
 	lea ebx, [esi+ebx]						; right border pSrc, (p_dst + width - 1)
-	lea ebp, [ebx+1]						; right border dst	
+	lea ebp, [ebx+1]						; right border dst
 	; prepare for cross border pData: top-right with xmm4
 ;	xor eax, eax
 	mov al, byte [ebx]						; top-right
@@ -618,7 +618,7 @@
 	butterfly_1to16_sse xmm4, xmm0, a		; dst, tmp, pSrc [generic register name: a/b/c/d]
 	; for left & right border expanding
 	exp_left_right_sse2	16, u
-	
+
 	; for cross border [top-left, top-right, bottom-left, bottom-right]
 	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	mov esi, [esp+24]						; p_dst
@@ -628,9 +628,9 @@
 	; have done xmm3,..,xmm6 cross pData initialization above, perform padding as below, to be continued..
 	neg ecx									; -stride
 	mov eax, -16							; chroma=-16
-	lea edi, [esi+eax]						
+	lea edi, [esi+eax]
 	lea edi, [edi+ecx]				; last line of top-left border
-	lea ebp, [esi+ebx]						
+	lea ebp, [esi+ebx]
 	lea ebp, [ebp+ecx]				; last line of top-right border
 	mov ecx, [esp+28]						; stride
 	add edx, 16							; height+16, luma=32, chroma=16
@@ -640,14 +640,14 @@
 	neg ecx									; -stride
 	; for left & right border expanding
 	exp_cross_sse2		16, u
-	
+
 ;	sfence									; commit cache write back memory
-	
+
 	pop ebp
 	pop edi
 	pop esi
 	pop edx
 	pop ebx
-	
+
 	ret
 
--- a/codec/encoder/core/asm/intra_pred.asm
+++ b/codec/encoder/core/asm/intra_pred.asm
@@ -95,13 +95,13 @@
 	punpcklbw	%1,	%3
 	movdqa		%3,	%1
 	punpcklbw	%1,	%3
-	
+
 	;add			%4,	%5
 	movd		%2,	[%4+%5-1]
 	movdqa		%3,	%2
 	punpcklbw	%2,	%3
 	movdqa		%3,	%2
-	punpcklbw	%2,	%3	
+	punpcklbw	%2,	%3
 	punpckldq	%1,	%2
 %endmacro
 
@@ -126,24 +126,24 @@
 		movd	%2,	[%5+%6]
 		punpcklbw %3,	%2
 		punpcklwd %1,	%3
-		lea		%5,	[%5+2*%6]	
+		lea		%5,	[%5+2*%6]
 		movd	%4,	[%5]
 		movd	%2,	[%5+%6]
 		punpcklbw %4,	%2
-		lea		%5,	[%5+2*%6]	
+		lea		%5,	[%5+2*%6]
 		movd	%3,	[%5]
 		movd	%2,	[%5+%6]
 		lea		%5,	[%5+2*%6]
 		punpcklbw %3,	%2
 		punpcklwd %4,	%3
-		punpckhdq %1,	%4	
-%endmacro	
+		punpckhdq %1,	%4
+%endmacro
 
 %macro  SUMW_HORIZON 3
 	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
 	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04 
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26 
+	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
 	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
 	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
 	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
@@ -173,7 +173,7 @@
 		movd	%2,	[%5+%6]
 		punpcklbw %3,	%2
 		punpckhwd %1,	%3
-		lea		%5,	[%5+2*%6]			
+		lea		%5,	[%5+2*%6]
 %endmacro
 
 %macro LOAD_2_LEFT_AND_ADD 0
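The SUMW_HORIZON macro whose whitespace is fixed above folds eight 16-bit lanes into one scalar, halving the number of live lanes per step (movhlps/paddw, then two dword folds). In scalar C it is nothing more than the following; the helper name is invented for this sketch:

#include <stdint.h>

/* Scalar equivalent of SUMW_HORIZON: the horizontal sum d0+..+d7,
 * which the SIMD version leaves in the lowest dword. */
static int32_t SumWHorizonC (const int16_t v[8]) {
  int32_t s = 0;
  for (int i = 0; i < 8; i++)
    s += v[i];
  return s;
}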
@@ -197,7 +197,7 @@
 ALIGN 16
 ;***********************************************************************
 ;   void __cdecl WelsI4x4LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
-;   
+;
 ;	pred must align to 16
 ;***********************************************************************
 WelsI4x4LumaPredH_sse2:
@@ -207,11 +207,11 @@
 	movzx		edx,	byte [eax-1]
 	movd		xmm0,	edx
 	pmuludq		xmm0,	[mmx_01bytes]
-	
+
 	movzx		edx,	byte [eax+ecx-1]
 	movd		xmm1,	edx
 	pmuludq		xmm1,	[mmx_01bytes]
-	
+
 	unpcklps	xmm0,	xmm1
 
 	lea			eax,	[eax+ecx*2]
@@ -218,19 +218,19 @@
 	movzx		edx,	byte [eax-1]
 	movd		xmm2,	edx
 	pmuludq		xmm2,	[mmx_01bytes]
-	
+
 	movzx		edx,	byte [eax+ecx-1]
-	movd		xmm3,	edx	
+	movd		xmm3,	edx
 	pmuludq		xmm3,	[mmx_01bytes]
-	
+
 	unpcklps	xmm2,	xmm3
 	unpcklpd	xmm0,	xmm2
-	
+
 	mov			edx,	[esp+4]			;pred
 	movdqa		[edx],	xmm0
-	
+
 	ret
-	
+
 ;***********************************************************************
 ; void WelsI16x16LumaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
@@ -241,9 +241,9 @@
 		mov		ecx,	[esp + pushsize + 12]
 		sub		esi,	1
 		sub		esi,	ecx
-		
+
 		;for H
-		pxor	xmm7,	xmm7	
+		pxor	xmm7,	xmm7
 		movq	xmm0,	[esi]
 		movdqa	xmm5,	[sse2_plane_dec]
 		punpcklbw xmm0,	xmm7
@@ -253,7 +253,7 @@
 		punpcklbw xmm1,	xmm7
 		pmullw	xmm1,	xmm6
 		psubw	xmm1,	xmm0
-		
+
 		SUMW_HORIZON	xmm1,xmm0,xmm2
 		movd    eax,	xmm1		; H += (i + 1) * (top[8 + i] - top[6 - i]);
 		movsx	eax,	ax
@@ -261,26 +261,26 @@
 		add		eax,	32
 		sar		eax,	6			; b = (5 * H + 32) >> 6;
 		SSE2_Copy8Times	xmm1, eax	; xmm1 = b,b,b,b,b,b,b,b
-		
-		movzx	edx,	BYTE [esi+16]	
+
+		movzx	edx,	BYTE [esi+16]
 		sub	esi, 3
 		LOAD_COLUMN		xmm0, xmm2, xmm3, xmm4, esi, ecx
-			
+
 		add		esi,	3
 		movzx	eax,	BYTE [esi+8*ecx]
 		add		edx,	eax
 		shl		edx,	4			;	a = (left[15*stride] + top[15]) << 4;
-		
+
 		sub	esi, 3
 		add		esi,	ecx
 		LOAD_COLUMN		xmm7, xmm2, xmm3, xmm4, esi, ecx
-		pxor	xmm4,	xmm4	
+		pxor	xmm4,	xmm4
 		punpckhbw xmm0,	xmm4
 		pmullw	xmm0,	xmm5
 		punpckhbw xmm7,	xmm4
 		pmullw	xmm7,	xmm6
 		psubw	xmm7,	xmm0
-		
+
 		SUMW_HORIZON   xmm7,xmm0,xmm2
 		movd    eax,   xmm7			; V
 		movsx	eax,	ax
@@ -288,17 +288,17 @@
 		imul	eax,	5
 		add		eax,	32
 		sar		eax,	6				; c = (5 * V + 32) >> 6;
-		SSE2_Copy8Times	xmm4, eax		; xmm4 = c,c,c,c,c,c,c,c		
-		
+		SSE2_Copy8Times	xmm4, eax		; xmm4 = c,c,c,c,c,c,c,c
+
 		mov		esi,	[esp + pushsize + 4]
 		add		edx,	16
 		imul	eax,	-7
-		add		edx,	eax				; s = a + 16 + (-7)*c		
-		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s		
-		
+		add		edx,	eax				; s = a + 16 + (-7)*c
+		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s
+
 		xor		eax,	eax
 		movdqa	xmm5,	[sse2_plane_inc_minus]
-		
+
 get_i16x16_luma_pred_plane_sse2_1:
 		movdqa	xmm2,	xmm1
 		pmullw	xmm2,	xmm5
@@ -307,7 +307,7 @@
 		movdqa	xmm3,	xmm1
 		pmullw	xmm3,	xmm6
 		paddw	xmm3,	xmm0
-		psraw	xmm3,	5	
+		psraw	xmm3,	5
 		packuswb xmm2,	xmm3
 		movdqa	[esi],	xmm2
 		paddw	xmm0,	xmm4
@@ -314,13 +314,13 @@
 		add		esi,	16
 		inc		eax
 		cmp		eax,	16
-		jnz get_i16x16_luma_pred_plane_sse2_1					
-		
+		jnz get_i16x16_luma_pred_plane_sse2_1
+
 		pop		esi
 		ret
-		
-		
-		
+
+
+
 ;***********************************************************************
 ; void WelsI16x16LumaPredH_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
@@ -327,7 +327,7 @@
 
 %macro SSE2_PRED_H_16X16_TWO_LINE 1
     lea     eax,	[eax+ecx*2]
-    
+
     COPY_16_TIMES	eax,	xmm0
     movdqa			[edx+%1],	xmm0
    COPY_16_TIMESS eax,	xmm0,	ecx
@@ -340,13 +340,13 @@
     mov     edx, [esp+4]    ; pred
     mov     eax, [esp+8]	; pRef
     mov     ecx, [esp+12]   ; stride
-    
+
     COPY_16_TIMES eax,	xmm0
     movdqa  [edx],		xmm0
     COPY_16_TIMESS eax,	xmm0,	ecx
     movdqa  [edx+0x10],	xmm0
-    
-	SSE2_PRED_H_16X16_TWO_LINE   0x20 
+
+	SSE2_PRED_H_16X16_TWO_LINE   0x20
 	SSE2_PRED_H_16X16_TWO_LINE   0x40
 	SSE2_PRED_H_16X16_TWO_LINE   0x60
 	SSE2_PRED_H_16X16_TWO_LINE   0x80
@@ -353,9 +353,9 @@
 	SSE2_PRED_H_16X16_TWO_LINE   0xa0
 	SSE2_PRED_H_16X16_TWO_LINE   0xc0
 	SSE2_PRED_H_16X16_TWO_LINE   0xe0
-   
+
     ret
-    
+
 ;***********************************************************************
 ; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
@@ -364,10 +364,10 @@
     mov     edx, [esp+4]    ; pred
     mov     eax, [esp+8]	; pRef
     mov     ecx, [esp+12]   ; stride
-    
+
     sub     eax, ecx
     movdqa  xmm0, [eax]
-    
+
     movdqa  [edx], xmm0
     movdqa  [edx+10h], xmm0
     movdqa  [edx+20h], xmm0
@@ -378,15 +378,15 @@
     movdqa  [edx+70h], xmm0
     movdqa  [edx+80h], xmm0
     movdqa  [edx+90h], xmm0
-    movdqa  [edx+160], xmm0 
+    movdqa  [edx+160], xmm0
 	movdqa  [edx+176], xmm0
     movdqa  [edx+192], xmm0
     movdqa  [edx+208], xmm0
     movdqa  [edx+224], xmm0
     movdqa  [edx+240], xmm0
-    
+
     ret
-    
+
 ;***********************************************************************
 ; void WelsIChromaPredPlane_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
@@ -398,8 +398,8 @@
 		mov		ecx,	[esp + pushsize + 12]	;stride
 		sub		esi,	1
 		sub		esi,	ecx
-		
-		pxor	mm7,	mm7	
+
+		pxor	mm7,	mm7
 		movq	mm0,	[esi]
 		movq	mm5,	[sse2_plane_dec_c]
 		punpcklbw mm0,	mm7
@@ -409,7 +409,7 @@
 		punpcklbw mm1,	mm7
 		pmullw	mm1,	mm6
 		psubw	mm1,	mm0
-		
+
 		movq2dq xmm1,   mm1
 		pxor    xmm2,   xmm2
 		SUMW_HORIZON	xmm1,xmm0,xmm2
@@ -419,7 +419,7 @@
 		add		eax,	16
 		sar		eax,	5			; b = (17 * H + 16) >> 5;
 		SSE2_Copy8Times	xmm1, eax	; mm1 = b,b,b,b,b,b,b,b
-		
+
 		movzx	edx,	BYTE [esi+8]
 		sub	esi, 3
 		LOAD_COLUMN_C	mm0, mm2, mm3, mm4, esi, ecx
@@ -428,17 +428,17 @@
 		movzx	eax,	BYTE [esi+4*ecx]
 		add		edx,	eax
 		shl		edx,	4			; a = (left[7*stride] + top[7]) << 4;
-		
+
 		sub	esi, 3
 		add		esi,	ecx
 		LOAD_COLUMN_C	mm7, mm2, mm3, mm4, esi, ecx
-		pxor	mm4,	mm4	
+		pxor	mm4,	mm4
 		punpckhbw mm0,	mm4
 		pmullw	mm0,	mm5
 		punpckhbw mm7,	mm4
 		pmullw	mm7,	mm6
 		psubw	mm7,	mm0
-		
+
 		movq2dq xmm7,   mm7
 		pxor    xmm2,   xmm2
 		SUMW_HORIZON	xmm7,xmm0,xmm2
@@ -448,17 +448,17 @@
 		imul	eax,	17
 		add		eax,	16
 		sar		eax,	5				; c = (17 * V + 16) >> 5;
-		SSE2_Copy8Times	xmm4, eax		; mm4 = c,c,c,c,c,c,c,c		
-		
+		SSE2_Copy8Times	xmm4, eax		; mm4 = c,c,c,c,c,c,c,c
+
 		mov		esi,	[esp + pushsize + 4]
 		add		edx,	16
 		imul	eax,	-3
-		add		edx,	eax				; s = a + 16 + (-3)*c		
-		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s		
-		
+		add		edx,	eax				; s = a + 16 + (-3)*c
+		SSE2_Copy8Times	xmm0, edx		; xmm0 = s,s,s,s,s,s,s,s
+
 		xor		eax,	eax
 		movdqa	xmm5,	[sse2_plane_mul_b_c]
-		
+
 get_i_chroma_pred_plane_sse2_1:
 		movdqa	xmm2,	xmm1
 		pmullw	xmm2,	xmm5
@@ -470,12 +470,12 @@
 		add		esi,	8
 		inc		eax
 		cmp		eax,	8
-		jnz get_i_chroma_pred_plane_sse2_1					
-		
+		jnz get_i_chroma_pred_plane_sse2_1
+
 		pop		esi
 		WELSEMMS
-		ret	
-		
+		ret
+
 ALIGN 16
 ;***********************************************************************
 ;	0 |1 |2 |3 |4 |
@@ -487,13 +487,13 @@
 ;	pred[7] = ([6]+[0]*2+[1]+2)/4
 ;
 ;   void __cdecl WelsI4x4LumaPredDDR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;   
+;
 ;***********************************************************************
-WelsI4x4LumaPredDDR_mmx:	
+WelsI4x4LumaPredDDR_mmx:
 	mov			edx,[esp+4]			;pred
 	mov         eax,[esp+8]			;pRef
 	mov			ecx,[esp+12]		;stride
-	
+
 	movq        mm1,[eax+ecx-8]		;get value of 11; the -8 offset is intended to speed up the movq, mm1[8] = 11
 	movq        mm2,[eax-8]			;get value of 6, mm2[8] = 6
 	sub			eax, ecx			;move eax to the line above the current block (position of 1)
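The diagram and formulas in the comment block above (pred[7] = ([6]+[0]*2+[1]+2)/4 and so on) describe down-right prediction. Written out in C — a sketch with invented names — the routine is a 3-tap smoothing of the nine boundary samples followed by diagonal replication:

#include <stdint.h>

/* 4x4 down-right (DDR) prediction; pPred is the 16-byte buffer with
 * row stride 4, pRef addresses the block's top-left sample. */
static void I4x4DdrC (uint8_t* pPred, const uint8_t* pRef, int32_t iStride) {
  uint8_t s[9], f[8];
  int32_t i, x, y;
  for (i = 0; i < 4; i++)
    s[i] = pRef[(3 - i) * iStride - 1];   /* l3 l2 l1 l0 */
  s[4] = pRef[-iStride - 1];              /* top-left corner */
  for (i = 0; i < 4; i++)
    s[5 + i] = pRef[-iStride + i];        /* t0 t1 t2 t3 */
  for (i = 1; i < 8; i++)                 /* 3-tap smoothing */
    f[i] = (uint8_t) ((s[i - 1] + 2 * s[i] + s[i + 1] + 2) >> 2);
  for (y = 0; y < 4; y++)
    for (x = 0; x < 4; x++)
      pPred[4 * y + x] = f[4 + x - y];    /* replicate along the diagonal */
}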
@@ -520,17 +520,17 @@
 	pand        mm1,[mmx_01bytes]	;set the odd bit
 	psubusb     mm3,mm1				;decrease 1 from odd bytes
 	pavgb       mm2,mm3				;mm2=(([11]+[21]+1)/2+1+[16])/2
-	
-	movd        [edx+12],mm2 
-	psrlq       mm2,8 
-	movd        [edx+8],mm2 
-	psrlq       mm2,8 
-	movd        [edx+4],mm2 
-	psrlq       mm2,8 
+
+	movd        [edx+12],mm2
+	psrlq       mm2,8
+	movd        [edx+8],mm2
+	psrlq       mm2,8
+	movd        [edx+4],mm2
+	psrlq       mm2,8
 	movd        [edx],mm2
 	WELSEMMS
 	ret
-	
+
 ALIGN 16
 ;***********************************************************************
 ;	0 |1 |2 |3 |4 |
@@ -542,44 +542,44 @@
 ;	pred[6] = ([1]+[2]+[3]+[4]+[5]+[10]+[15]+[20]+4)/8
 ;
 ;   void __cdecl WelsI4x4LumaPredDc_sse2(uint8_t *pred,uint8_t *pRef,int32_t stride)
-;   
+;
 ;***********************************************************************
-WelsI4x4LumaPredDc_sse2:	
+WelsI4x4LumaPredDc_sse2:
 	mov         eax,[esp+8]			;pRef
 	mov			ecx,[esp+12]		;stride
 	push		ebx
-		
+
 	movzx		edx,	byte [eax-1h]
-	
+
 	sub			eax,	ecx
 	movd		xmm0,	[eax]
 	pxor		xmm1,	xmm1
 	psadbw		xmm0,	xmm1
-	
+
 	movd		ebx,	xmm0
 	add			ebx,	edx
-	
+
 	movzx		edx,	byte [eax+ecx*2-1h]
 	add			ebx,	edx
-	
+
 	lea			eax,	[eax+ecx*2-1]
 	movzx		edx,	byte [eax+ecx]
 	add			ebx,	edx
-	
+
 	movzx		edx,	byte [eax+ecx*2]
 	add			ebx,	edx
 	add			ebx,	4
 	sar			ebx,	3
 	imul		ebx,	0x01010101
-	
+
 	mov			edx,	[esp+8]			;pred
 	movd		xmm0,	ebx
 	pshufd		xmm0,	xmm0,	0
 	movdqa		[edx],	xmm0
-				
+
 	pop ebx
-	ret	
-	
+	ret
+
 ALIGN 16
 ;***********************************************************************
 ;	void __cdecl WelsIChromaPredH_mmx(uint8_t *pred, uint8_t *pRef, int32_t stride)
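For reference, the whole of WelsI4x4LumaPredDc_sse2 above boils down to a couple of lines of C (the broadcast that the assembly does with imul 0x01010101 + pshufd becomes a plain store loop; names are illustrative):

#include <stdint.h>

/* 4x4 DC prediction: average the four top and four left neighbours
 * with rounding and replicate the result over the block. */
static void I4x4DcC (uint8_t* pPred, const uint8_t* pRef, int32_t iStride) {
  int32_t iSum = 4, i;
  for (i = 0; i < 4; i++)
    iSum += pRef[-iStride + i] + pRef[i * iStride - 1];
  for (i = 0; i < 16; i++)
    pPred[i] = (uint8_t) (iSum >> 3);
}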
@@ -588,7 +588,7 @@
 %macro MMX_PRED_H_8X8_ONE_LINE 4
 	movq		%1,		[%3-8]
 	psrlq		%1,		38h
-	
+
 	;pmuludq		%1,		[mmx_01bytes]		;extend to 4 bytes
 	pmullw		%1,		[mmx_01bytes]
 	pshufw		%1,		%1,	0
@@ -598,7 +598,7 @@
 %macro MMX_PRED_H_8X8_ONE_LINEE 4
 	movq		%1,		[%3+ecx-8]
 	psrlq		%1,		38h
-	
+
 	;pmuludq		%1,		[mmx_01bytes]		;extend to 4 bytes
 	pmullw		%1,		[mmx_01bytes]
 	pshufw		%1,		%1,	0
@@ -610,34 +610,34 @@
 	mov			edx,	[esp+4]			;pred
 	mov         eax,	[esp+8]			;pRef
 	mov			ecx,	[esp+12]		;stride
-	
+
 	movq		mm0,	[eax-8]
 	psrlq		mm0,	38h
-	
+
 	;pmuludq		mm0,	[mmx_01bytes]		;extend to 4 bytes
 	pmullw		mm0,		[mmx_01bytes]
 	pshufw		mm0,	mm0,	0
 	movq		[edx],	mm0
-	
+
 	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+8
-	
+
 	lea			eax,[eax+ecx*2]
 	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax,edx+16
-	
+
 	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+24
-	
+
 	lea			eax,[eax+ecx*2]
 	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax,edx+32
-	
+
 	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+40
-	
+
 	lea			eax,[eax+ecx*2]
 	MMX_PRED_H_8X8_ONE_LINE	mm0, mm1, eax,edx+48
 
-	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+56		
+	MMX_PRED_H_8X8_ONE_LINEE	mm0, mm1, eax,edx+56
 	WELSEMMS
-	ret	
-	
+	ret
+
 ALIGN 16
 ;***********************************************************************
 ;	void __cdecl WelsI4x4LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
@@ -648,12 +648,12 @@
 	mov			edx,	[esp+4]			;pred
 	mov         eax,	[esp+8]			;pRef
 	mov			ecx,	[esp+12]		;stride
-	
+
 	sub			eax,	ecx
 	movd		xmm0,	[eax]
 	pshufd		xmm0,	xmm0,	0
 	movdqa		[edx],	xmm0
-	ret	
+	ret
 
 ALIGN 16
 ;***********************************************************************
@@ -665,7 +665,7 @@
 	mov			edx,		[esp+4]			;pred
 	mov         eax,		[esp+8]			;pRef
 	mov			ecx,		[esp+12]		;stride
-	
+
 	sub			eax,		ecx
 	movq		xmm0,		[eax]
 	movdqa		xmm1,		xmm0
@@ -676,8 +676,8 @@
 	movdqa		[edx+32],	xmm0
 	movdqa		[edx+48],	xmm0
 	ret
-	
-	
+
+
 	ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -703,13 +703,13 @@
 
 ;   f = (2 + l1 + (l0<<1) + lt)>>2
 ;   h = (2 + l2 + (l1<<1) + l0)>>2
-;   j = (2 + l3 + (l2<<1) + l1)>>2   
+;   j = (2 + l3 + (l2<<1) + l1)>>2
 ;   [b a f e h g j i] + [d c b a] --> mov to memory
-;   
+;
 ;   void WelsI4x4LumaPredHD_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredHD_mmx
-WelsI4x4LumaPredHD_mmx:	
+WelsI4x4LumaPredHD_mmx:
 	mov			edx, [esp+4]			; pred
 	mov         eax, [esp+8]			; pRef
 	mov			ecx, [esp+12]           ; stride
@@ -716,16 +716,16 @@
 	sub         eax, ecx
 	movd        mm0, [eax-1]            ; mm0 = [xx xx xx xx t2 t1 t0 lt]
 	psllq       mm0, 20h                ; mm0 = [t2 t1 t0 lt xx xx xx xx]
-	
-	movd        mm1, [eax+2*ecx-4]        
-	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1	
+
+	movd        mm1, [eax+2*ecx-4]
+	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1
 	lea         eax, [eax+2*ecx]
-	movd        mm2, [eax+2*ecx-4]        
+	movd        mm2, [eax+2*ecx-4]
 	punpcklbw   mm2, [eax+ecx-4]        ; mm2[7] = l2, mm2[6] = l3
 	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 l3 xx xx xx xx]
 	psrlq       mm2, 20h
 	pxor        mm0, mm2                ; mm0 = [t2 t1 t0 lt l0 l1 l2 l3]
-	
+
 	movq        mm1, mm0
 	psrlq       mm1, 10h                ; mm1 = [xx xx t2 t1 t0 lt l0 l1]
 	movq        mm2, mm0
@@ -733,17 +733,17 @@
 	movq        mm3, mm2
 	movq        mm4, mm1
 	pavgb       mm1, mm0
-	
+
 	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
 	pand        mm4, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm1, mm4				; decrease 1 from odd bytes
-	
+
 	pavgb       mm2, mm1                ; mm2 = [xx xx d  c  b  f  h  j]
-	
+
 	movq        mm4, mm0
 	pavgb       mm3, mm4                ; mm3 = [xx xx xx xx a  e  g  i]
 	punpcklbw   mm3, mm2                ; mm3 = [b  a  f  e  h  g  j  i]
-	
+
 	psrlq       mm2, 20h
 	psllq       mm2, 30h                ; mm2 = [d  c  0  0  0  0  0  0]
 	movq        mm4, mm3
@@ -750,7 +750,7 @@
 	psrlq       mm4, 10h                ; mm4 = [0  0  b  a  f  e  h  j]
 	pxor        mm2, mm4                ; mm2 = [d  c  b  a  xx xx xx xx]
 	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx  d  c  b  a]
-	
+
 	movd        [edx], mm2
 	movd        [edx+12], mm3
 	psrlq       mm3, 10h
@@ -759,9 +759,9 @@
 	movd        [edx+4], mm3
 	WELSEMMS
 	ret
-	
-	
-	
+
+
+
 ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -784,17 +784,17 @@
 ;   b = (2 + l0 + (l1<<1) + l2)>>2
 ;   d = (2 + l1 + (l2<<1) + l3)>>2
 ;   f = (2 + l2 + (l3<<1) + l3)>>2
- 
+
 ;   [g g f e d c b a] + [g g g g] --> mov to memory
-;   
+;
 ;   void WelsI4x4LumaPredHU_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredHU_mmx
-WelsI4x4LumaPredHU_mmx:	
+WelsI4x4LumaPredHU_mmx:
 	mov			edx, [esp+4]			; pred
 	mov         eax, [esp+8]			; pRef
 	mov			ecx, [esp+12]           ; stride
-	
+
 	movd        mm0, [eax-4]            ; mm0[3] = l0
 	punpcklbw   mm0, [eax+ecx-4]        ; mm0[7] = l1, mm0[6] = l0
 	lea         eax, [eax+2*ecx]
@@ -802,38 +802,38 @@
 	movd        mm4, [eax+ecx-4]        ; mm4[3] = l3
 	punpcklbw   mm2, mm4
 	punpckhwd   mm0, mm2                ; mm0 = [l3 l2 l1 l0 xx xx xx xx]
-	
+
 	psrlq       mm4, 18h
 	psllq       mm4, 38h                ; mm4 = [l3 xx xx xx xx xx xx xx]
 	psrlq       mm0, 8h
 	pxor        mm0, mm4                ; mm0 = [l3 l3 l2 l1 l0 xx xx xx]
-	
+
 	movq        mm1, mm0
 	psllq       mm1, 8h                 ; mm1 = [l3 l2 l1 l0 xx xx xx xx]
 	movq        mm3, mm1                ; mm3 = [l3 l2 l1 l0 xx xx xx xx]
 	pavgb       mm1, mm0                ; mm1 = [g  e  c  a  xx xx xx xx]
-	
+
 	movq        mm2, mm0
 	psllq       mm2, 10h                ; mm2 = [l2 l1 l0 xx xx xx xx xx]
 	movq        mm5, mm2
 	pavgb       mm2, mm0
-	
+
 	pxor        mm5, mm0				; find odd value in the lowest bit of each byte
 	pand        mm5, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm2, mm5				; decrease 1 from odd bytes
-	
+
 	pavgb       mm2, mm3                ; mm2 = [f  d  b  xx xx xx xx xx]
-	
+
 	psrlq       mm2, 8h
 	pxor        mm2, mm4                ; mm2 = [g  f  d  b  xx xx xx xx]
-	
+
 	punpckhbw   mm1, mm2                ; mm1 = [g  g  f  e  d  c  b  a]
 	punpckhbw   mm4, mm4                ; mm4 = [g  g  xx xx xx xx xx xx]
 	punpckhbw   mm4, mm4                ; mm4 = [g  g  g  g  xx xx xx xx]
-	
+
 	psrlq       mm4, 20h
 	movd        [edx+12], mm4
-	
+
 	movd        [edx], mm1
 	psrlq       mm1, 10h
 	movd        [edx+4], mm1
@@ -841,9 +841,9 @@
 	movd        [edx+8], mm1
 	WELSEMMS
 	ret
-	
-	
-	
+
+
+
 ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|
@@ -869,12 +869,12 @@
 
 ;   h = (2 + t1 + (t2<<1) + t3)>>2
 ;   i = (2 + lt + (l0<<1) + l1)>>2
-;   j = (2 + l0 + (l1<<1) + l2)>>2   
-;   
+;   j = (2 + l0 + (l1<<1) + l2)>>2
+;
 ;   void WelsI4x4LumaPredVR_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredVR_mmx
-WelsI4x4LumaPredVR_mmx:	
+WelsI4x4LumaPredVR_mmx:
 	mov			edx, [esp+4]			; pred
 	mov         eax, [esp+8]			; pRef
 	mov			ecx, [esp+12]           ; stride
@@ -881,57 +881,57 @@
 	sub         eax, ecx
 	movq        mm0, [eax-1]            ; mm0 = [xx xx xx t3 t2 t1 t0 lt]
 	psllq       mm0, 18h                ; mm0 = [t3 t2 t1 t0 lt xx xx xx]
-	
-	movd        mm1, [eax+2*ecx-4]        
-	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1	
+
+	movd        mm1, [eax+2*ecx-4]
+	punpcklbw   mm1, [eax+ecx-4]        ; mm1[7] = l0, mm1[6] = l1
 	lea         eax, [eax+2*ecx]
 	movq        mm2, [eax+ecx-8]        ; mm2[7] = l2
 	punpckhwd   mm2, mm1                ; mm2 = [l0 l1 l2 xx xx xx xx xx]
 	psrlq       mm2, 28h
 	pxor        mm0, mm2                ; mm0 = [t3 t2 t1 t0 lt l0 l1 l2]
-	
+
 	movq        mm1, mm0
 	psllq       mm1, 8h                 ; mm1 = [t2 t1 t0 lt l0 l1 l2 xx]
 	pavgb       mm1, mm0                ; mm1 = [d  c  b  a  xx xx xx xx]
-	
+
 	movq        mm2, mm0
 	psllq       mm2, 10h                ; mm2 = [t1 t0 lt l0 l1 l2 xx xx]
 	movq        mm3, mm2
 	pavgb       mm2, mm0
-	
+
 	pxor        mm3, mm0				; find odd value in the lowest bit of each byte
 	pand        mm3, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm2, mm3				; decrease 1 from odd bytes
-	
+
 	movq        mm3, mm0
 	psllq       mm3, 8h                 ; mm3 = [t2 t1 t0 lt l0 l1 l2 xx]
 	pavgb       mm3, mm2                ; mm3 = [h  g  f  e  i  j  xx xx]
 	movq        mm2, mm3
-	
+
 	psrlq       mm1, 20h                ; mm1 = [xx xx xx xx d  c  b  a]
 	movd        [edx], mm1
-	
+
 	psrlq       mm2, 20h                ; mm2 = [xx xx xx xx h  g  f  e]
 	movd        [edx+4], mm2
-	
+
 	movq        mm4, mm3
 	psllq       mm4, 20h
 	psrlq       mm4, 38h                ; mm4 = [xx xx xx xx xx xx xx i]
-	
+
 	movq        mm5, mm3
 	psllq       mm5, 28h
 	psrlq       mm5, 38h                ; mm5 = [xx xx xx xx xx xx xx j]
-	
+
 	psllq       mm1, 8h
 	pxor        mm4, mm1                ; mm4 = [xx xx xx xx c  b  a  i]
 	movd        [edx+8], mm4
-	
+
 	psllq       mm2, 8h
 	pxor        mm5, mm2                ; mm5 = [xx xx xx xx g  f  e  j]
 	movd        [edx+12], mm5
 	WELSEMMS
 	ret
-	
+
 ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -954,13 +954,13 @@
 ;   e = (2 + t4 + t6 + (t5<<1))>>2
 ;   f = (2 + t5 + t7 + (t6<<1))>>2
 ;   g = (2 + t6 + t7 + (t7<<1))>>2
- 
+
 ;   [g f e d c b a] --> mov to memory
-;   
+;
 ;   void WelsI4x4LumaPredDDL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredDDL_mmx
-WelsI4x4LumaPredDDL_mmx:	
+WelsI4x4LumaPredDDL_mmx:
 	mov			edx, [esp+4]			; pred
 	mov         eax, [esp+8]			; pRef
 	mov			ecx, [esp+12]           ; stride
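For comparison with the formula list above (a = (2 + t0 + t2 + (t1<<1))>>2 through g = (2 + t6 + t7 + (t7<<1))>>2), here is the down-left mode in C; a sketch only, with invented names. Note how t7 is duplicated past the edge, exactly as the psrlq/psllq pair isolates it in mm3:

#include <stdint.h>

/* 4x4 down-left (DDL) prediction from the eight top neighbours. */
static void I4x4DdlC (uint8_t* pPred, const uint8_t* pRef, int32_t iStride) {
  const uint8_t* t = pRef - iStride;      /* t0..t7 */
  uint8_t f[7];
  int32_t i, x, y;
  for (i = 0; i < 6; i++)
    f[i] = (uint8_t) ((t[i] + 2 * t[i + 1] + t[i + 2] + 2) >> 2);
  f[6] = (uint8_t) ((t[6] + 3 * t[7] + 2) >> 2);  /* t7 repeated */
  for (y = 0; y < 4; y++)
    for (x = 0; x < 4; x++)
      pPred[4 * y + x] = f[x + y];
}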
@@ -968,11 +968,11 @@
 	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
 	movq        mm1, mm0
 	movq        mm2, mm0
-	
+
 	movq        mm3, mm0
 	psrlq       mm3, 38h
 	psllq       mm3, 38h                ; mm3 = [t7 xx xx xx xx xx xx xx]
-	
+
 	psllq       mm1, 8h                 ; mm1 = [t6 t5 t4 t3 t2 t1 t0 xx]
 	psrlq       mm2, 8h
 	pxor        mm2, mm3                ; mm2 = [t7 t7 t6 t5 t4 t3 t2 t1]
@@ -982,9 +982,9 @@
 	pxor        mm3, mm2				; find odd value in the lowest bit of each byte
 	pand        mm3, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm1, mm3				; decrease 1 from odd bytes
-	
+
 	pavgb       mm0, mm1                ; mm0 = [g f e d c b a xx]
-	
+
 	psrlq       mm0, 8h
 	movd        [edx], mm0
 	psrlq       mm0, 8h
@@ -995,8 +995,8 @@
 	movd        [edx+12], mm0
 	WELSEMMS
 	ret
-	
-	
+
+
 ALIGN 16
 ;***********************************************************************
 ;	lt|t0|t1|t2|t3|t4|t5|t6|t7
@@ -1022,46 +1022,46 @@
 ;   g = (2 + t2 + (t3<<1) + t4)>>2
 ;   h = (2 + t3 + (t4<<1) + t5)>>2
 ;   j = (2 + t4 + (t5<<1) + t6)>>2
- 
+
 ;   [i d c b a] + [j h g f e] --> mov to memory
-;   
+;
 ;   void WelsI4x4LumaPredVL_mmx(uint8_t *pred,uint8_t *pRef,int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI4x4LumaPredVL_mmx
-WelsI4x4LumaPredVL_mmx:	
+WelsI4x4LumaPredVL_mmx:
 	mov			edx, [esp+4]			; pred
 	mov         eax, [esp+8]			; pRef
 	mov			ecx, [esp+12]           ; stride
-	
+
 	sub         eax, ecx
 	movq        mm0, [eax]              ; mm0 = [t7 t6 t5 t4 t3 t2 t1 t0]
 	movq        mm1, mm0
 	movq        mm2, mm0
-	
+
 	psrlq       mm1, 8h                 ; mm1 = [xx t7 t6 t5 t4 t3 t2 t1]
 	psrlq       mm2, 10h                ; mm2 = [xx xx t7 t6 t5 t4 t3 t2]
 
 	movq        mm3, mm1
 	pavgb       mm3, mm0                ; mm3 = [xx xx xx i  d  c  b  a]
-	
+
 	movq        mm4, mm2
-	pavgb       mm2, mm0	
+	pavgb       mm2, mm0
 	pxor        mm4, mm0				; find odd value in the lowest bit of each byte
 	pand        mm4, [mmx_01bytes]	    ; set the odd bit
 	psubusb     mm2, mm4				; decrease 1 from odd bytes
-	
+
 	pavgb       mm2, mm1                ; mm2 = [xx xx xx j  h  g  f  e]
-	
+
 	movd        [edx], mm3
 	psrlq       mm3, 8h
 	movd        [edx+8], mm3
-	
+
 	movd        [edx+4], mm2
 	psrlq       mm2, 8h
 	movd        [edx+12], mm2
 	WELSEMMS
 	ret
-	
+
 ALIGN 16
 ;***********************************************************************
 ;
@@ -1068,14 +1068,14 @@
 ;   void WelsIChromaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsIChromaPredDc_sse2
-WelsIChromaPredDc_sse2:	
+WelsIChromaPredDc_sse2:
 	push        ebx
 	mov         eax, [esp+12]			; pRef
 	mov			ecx, [esp+16]           ; stride
-	
+
 	sub         eax, ecx
 	movq        mm0, [eax]
-	
+
 	;xor         ebx, ebx
 	;movzx		edx, byte [eax+ecx-0x01] ; l1
 	movzx		ebx, byte [eax+ecx-0x01] ; l1
@@ -1089,7 +1089,7 @@
 	movzx		edx, byte [eax-0x01]     ; l4
 	add			ebx, edx
 	movd        mm1, ebx                 ; mm1 = l1+l2+l3+l4
-	
+
 	;xor         ebx, ebx
 	;movzx		edx, byte [eax+ecx-0x01] ; l5
 	movzx		ebx, byte [eax+ecx-0x01] ; l5
@@ -1103,7 +1103,7 @@
 	movzx		edx, byte [eax-0x01]     ; l8
 	add			ebx, edx
 	movd        mm2, ebx                 ; mm2 = l5+l6+l7+l8
-	
+
 	movq        mm3, mm0
 	psrlq       mm0, 0x20
 	psllq       mm3, 0x20
@@ -1110,56 +1110,56 @@
 	psrlq       mm3, 0x20
 	pxor		mm4, mm4
 	psadbw		mm0, mm4
-	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2	
-	
+	psadbw		mm3, mm4                 ; sum1 = mm3+mm1, sum2 = mm0, sum3 = mm2
+
 	paddq       mm3, mm1
 	movq        mm1, mm2
 	paddq       mm1, mm0;                ; sum1 = mm3, sum2 = mm0, sum3 = mm2, sum4 = mm1
-	
+
 	movq        mm4, [mmx_0x02]
-	
+
 	paddq       mm0, mm4
 	psrlq       mm0, 0x02
-	
+
 	paddq       mm2, mm4
 	psrlq       mm2, 0x02
-	
+
 	paddq       mm3, mm4
 	paddq       mm3, mm4
 	psrlq       mm3, 0x03
-	
+
 	paddq       mm1, mm4
 	paddq       mm1, mm4
 	psrlq       mm1, 0x03
-	
+
 	pmuludq     mm0, [mmx_01bytes]
 	pmuludq     mm3, [mmx_01bytes]
 	psllq       mm0, 0x20
 	pxor        mm0, mm3                 ; mm0 = m_up
-	
+
 	pmuludq     mm2, [mmx_01bytes]
 	pmuludq     mm1, [mmx_01bytes]
 	psllq       mm1, 0x20
 	pxor        mm1, mm2                 ; mm2 = m_down
-	
+
 	mov         edx, [esp+8]			 ; pRef
-	
+
 	movq        [edx], mm0
 	movq        [edx+0x08], mm0
 	movq        [edx+0x10], mm0
 	movq        [edx+0x18], mm0
-	
+
 	movq        [edx+0x20], mm1
 	movq        [edx+0x28], mm1
 	movq        [edx+0x30], mm1
 	movq        [edx+0x38], mm1
-	
+
 	pop         ebx
 	WELSEMMS
 	ret
-	
-	
-	
+
+
+
 ALIGN 16
 ;***********************************************************************
 ;
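The register juggling in WelsIChromaPredDc_sse2 above (sum1..sum4, then m_up/m_down) produces four per-quadrant DC values: the two quadrants that touch both edges average both, the other two use only the edge they touch. A C restatement, sketch only with hypothetical names:

#include <stdint.h>

/* Chroma 8x8 DC prediction; pPred has row stride 8. */
static void IChromaDcC (uint8_t* pPred, const uint8_t* pRef, int32_t iStride) {
  int32_t iTopL = 0, iTopR = 0, iLeftT = 0, iLeftB = 0, i, x, y;
  for (i = 0; i < 4; i++) {
    iTopL  += pRef[-iStride + i];
    iTopR  += pRef[-iStride + 4 + i];
    iLeftT += pRef[i * iStride - 1];
    iLeftB += pRef[(4 + i) * iStride - 1];
  }
  {
    uint8_t iDc[2][2];
    iDc[0][0] = (uint8_t) ((iTopL + iLeftT + 4) >> 3);  /* both edges */
    iDc[0][1] = (uint8_t) ((iTopR + 2) >> 2);           /* top only   */
    iDc[1][0] = (uint8_t) ((iLeftB + 2) >> 2);          /* left only  */
    iDc[1][1] = (uint8_t) ((iTopR + iLeftB + 4) >> 3);  /* both edges */
    for (y = 0; y < 8; y++)
      for (x = 0; x < 8; x++)
        pPred[8 * y + x] = iDc[y >> 2][x >> 2];
  }
}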
@@ -1166,11 +1166,11 @@
 ;   void WelsI16x16LumaPredDc_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride)
 ;***********************************************************************
 WELS_EXTERN WelsI16x16LumaPredDc_sse2
-WelsI16x16LumaPredDc_sse2:	
+WelsI16x16LumaPredDc_sse2:
 	push        ebx
 	mov         eax, [esp+12]			; pRef
 	mov			ecx, [esp+16]           ; stride
-	
+
 	sub         eax, ecx
 	movdqa      xmm0, [eax]             ; read one row
 	pxor		xmm1, xmm1
@@ -1180,7 +1180,7 @@
 	pslldq      xmm0, 0x08
 	psrldq      xmm0, 0x08
 	paddw       xmm0, xmm1
-	
+
 	;xor         ebx, ebx
 	;movzx		edx, byte [eax+ecx-0x01]
 	movzx		ebx, byte [eax+ecx-0x01]
@@ -1201,7 +1201,7 @@
 	psrld       xmm0, 0x05
 	pmuludq     xmm0, [mmx_01bytes]
 	pshufd      xmm0, xmm0, 0
-	
+
 	mov         edx, [esp+8]			; pred
 	movdqa      [edx], xmm0
 	movdqa      [edx+0x10], xmm0
@@ -1219,7 +1219,7 @@
 	movdqa      [edx+0xd0], xmm0
 	movdqa      [edx+0xe0], xmm0
 	movdqa      [edx+0xf0], xmm0
-	
+
 	pop         ebx
 
 	ret
@@ -1226,7 +1226,7 @@
 
 ;***********************************************************************
 ;
-;int32_t WelsSmpleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc, 
+;int32_t WelsSmpleSatdThree4x4_sse2( uint8_t *pDec, int32_t iLineSizeDec, uint8_t *pEnc, int32_t iLinesizeEnc,
 ;                             uint8_t* pRed, int32_t* pBestMode, int32_t, int32_t, int32_t);
 ;
 ;***********************************************************************
@@ -1238,7 +1238,7 @@
 	push      edi
 	mov       eax,  [esp+24];p_enc
 	mov       ebx,  [esp+28];linesize_enc
-	
+
 	; load source 4x4 samples and Hadamard transform
     movd      xmm0, [eax]
     movd      xmm1, [eax+ebx]
@@ -1247,16 +1247,16 @@
     movd      xmm3, [eax+ebx]
     punpckldq xmm0, xmm2
     punpckldq xmm1, xmm3
-       
+
     pxor      xmm6, xmm6
     punpcklbw xmm0, xmm6
     punpcklbw xmm1, xmm6
-    
+
     movdqa    xmm2, xmm0
     paddw     xmm0, xmm1
     psubw     xmm2, xmm1
     SSE2_XSawp  qdq, xmm0, xmm2, xmm3
-    
+
     movdqa    xmm4, xmm0
     paddw     xmm0, xmm3
     psubw     xmm4, xmm3
@@ -1264,7 +1264,7 @@
     movdqa    xmm2, xmm0
     punpcklwd xmm0, xmm4
     punpckhwd xmm4, xmm2
-    
+
 	SSE2_XSawp  dq,  xmm0, xmm4, xmm3
 	SSE2_XSawp  qdq, xmm0, xmm3, xmm5
 
@@ -1271,14 +1271,14 @@
     movdqa    xmm7, xmm0
     paddw     xmm0, xmm5
     psubw     xmm7, xmm5
-    
+
 	SSE2_XSawp  qdq,  xmm0, xmm7, xmm1
-    
+
     ; Hadamard transform results are saved in xmm0 and xmm2
     movdqa    xmm2, xmm0
     paddw     xmm0, xmm1
     psubw     xmm2, xmm1
-  	
+
 	; load top boundary samples: [a b c d]
     mov       eax,  [esp+16];p_dec
 	sub		  eax,	[esp+20];linesize_dec
@@ -1286,7 +1286,7 @@
 	movzx     edx,  byte [eax+1]
 	movzx     esi,  byte [eax+2]
 	movzx     edi,  byte [eax+3]
-	
+
 	; get the transform results of top boundary samples: [a b c d]
 	add       edx, ecx ; edx = a + b
 	add       edi, esi ; edi = c + d
@@ -1300,7 +1300,7 @@
 	add       esi, ecx ; esi = (a - b) + (c - d)
 	add       ecx, ecx
 	sub       ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]
-	
+
 	movdqa    xmm6, xmm0
 	movdqa    xmm7, xmm2
 	movd      xmm5, edi ; store the edi for DC mode
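The key idea in this routine is that SATD can be taken in the Hadamard domain: the source block is transformed once, and each candidate prediction (V, H, DC) is compared against it using only the few coefficients that are nonzero for that prediction — which is why the boundary samples are transformed analytically into [edi edx ecx esi] and scaled by psllw 2. A sketch of the 4x4 Hadamard transform being used, in C; names are invented and the row ordering here is one of several equivalent choices:

#include <stdint.h>

/* Forward 4x4 Hadamard transform of an 8-bit block into d[4][4]. */
static void Hadamard4x4C (int16_t d[4][4], const uint8_t* pSrc, int32_t iStride) {
  int32_t t[4][4], x, y;
  for (y = 0; y < 4; y++) {               /* horizontal butterflies */
    const uint8_t* p = pSrc + y * iStride;
    int32_t a = p[0] + p[3], b = p[1] + p[2];
    int32_t c = p[0] - p[3], e = p[1] - p[2];
    t[y][0] = a + b; t[y][1] = c + e; t[y][2] = a - b; t[y][3] = c - e;
  }
  for (x = 0; x < 4; x++) {               /* vertical butterflies */
    int32_t a = t[0][x] + t[3][x], b = t[1][x] + t[2][x];
    int32_t c = t[0][x] - t[3][x], e = t[1][x] - t[2][x];
    d[0][x] = (int16_t) (a + b); d[1][x] = (int16_t) (c + e);
    d[2][x] = (int16_t) (a - b); d[3][x] = (int16_t) (c - e);
  }
}

The SATD of a candidate mode is then the sum of absolute differences between this transform and the (mostly zero) transform of the candidate's prediction, which is what the WELS_AbsW / paddusw / SUMW_HORIZON1 sequences compute for V, H and DC before the three costs are compared.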
@@ -1312,16 +1312,16 @@
 	pinsrw    xmm4, edx, 0
 	pinsrw    xmm4, ecx, 4
 	psllw     xmm4, 2
-	
+
 	; get the satd of H
 	psubw     xmm0, xmm3
 	psubw     xmm2, xmm4
-	
+
 	WELS_AbsW  xmm0, xmm1
 	WELS_AbsW  xmm2, xmm1
     paddusw        xmm0, xmm2
     SUMW_HORIZON1  xmm0, xmm1 ; satd of V is stored in xmm0
-	
+
 	; load left boundary samples: [a b c d]'
     mov       eax,  [esp+16]
 	mov       ebx,  [esp+20]
@@ -1330,7 +1330,7 @@
 	lea       eax , [eax+2*ebx]
 	movzx     esi,  byte [eax-1]
 	movzx     edi,  byte [eax+ebx-1]
-	
+
 	; get the transform results of left boundary samples: [a b c d]'
 	add       edx, ecx ; edx = a + b
 	add       edi, esi ; edi = c + d
@@ -1344,14 +1344,14 @@
 	add       esi, ecx ; esi = (a - b) + (c - d)
 	add       ecx, ecx
 	sub       ecx, esi ; ecx = (a - b) - (c - d) ; [edi edx ecx esi]'
-	
-	; store the transform results in xmm3	
+
+	; store the transform results in xmm3
     movd      xmm3, edi
 	pinsrw    xmm3, edx, 1
 	pinsrw    xmm3, ecx, 2
 	pinsrw    xmm3, esi, 3
 	psllw     xmm3, 2
-	
+
 	; get the satd of V
 	movdqa    xmm2, xmm6
 	movdqa    xmm4, xmm7
@@ -1368,7 +1368,7 @@
 	psrlw     xmm1, 3
 	movdqa    xmm5, xmm1
 	psllw     xmm1, 4
-	
+
     ; get the satd of DC
     psubw          xmm6, xmm1
     WELS_AbsW  xmm6, xmm1
@@ -1375,7 +1375,7 @@
 	WELS_AbsW  xmm7, xmm1
     paddusw        xmm6, xmm7
     SUMW_HORIZON1  xmm6, xmm1 ; satd of DC is stored in xmm6
-    
+
     ; comparing order: DC H V
     mov       edx, [esp+32]
     movd      eax, xmm6
@@ -1394,9 +1394,9 @@
     jg near   not_dc
     cmp       ax, si
     jg near   not_dc_h
-    
+
     ; for DC mode
-    movd      ebx, xmm5 
+    movd      ebx, xmm5
     imul      ebx, 0x01010101
     movd	  xmm5, ebx
 	pshufd    xmm5, xmm5, 0
@@ -1407,11 +1407,11 @@
     pop       esi
     pop       ebx
     ret
-    
+
 not_dc:
     cmp       di, si
     jg near   not_dc_h
-    
+
     ; for H mode
     SSE_DB_1_2REG  xmm6, xmm7
     mov       eax,  [esp+16]
@@ -1422,20 +1422,20 @@
 
 	movzx     ecx,  byte [eax+ebx-1]
 	movd      xmm1, ecx
-    pmuludq   xmm1, xmm6 
+    pmuludq   xmm1, xmm6
 %if 1
     punpckldq xmm0, xmm1
-%else    
+%else
 	unpcklps  xmm0,	xmm1
 %endif
 	lea       eax,	[eax+ebx*2]
 	movzx	  ecx,	byte [eax-1]
 	movd	  xmm2,	ecx
-    pmuludq   xmm2, xmm6  
+    pmuludq   xmm2, xmm6
 
 	movzx	  ecx,	byte [eax+ebx-1]
-	movd	  xmm3,	ecx	
-    pmuludq   xmm3, xmm6  
+	movd	  xmm3,	ecx
+    pmuludq   xmm3, xmm6
 %if 1
     punpckldq  xmm2, xmm3
     punpcklqdq xmm0, xmm2
@@ -1442,13 +1442,13 @@
 %else
 	unpcklps  xmm2,	xmm3
 	unpcklpd  xmm0,	xmm2
-%endif	
+%endif
 	movdqa	  [edx],xmm0
-	
+
 	mov       eax, edi
     mov       ebx, [esp+36]
 	mov       dword [ebx], 0x01
-    
+
     pop       edi
     pop       esi
     pop       ebx
@@ -1460,14 +1460,14 @@
 	movd	  xmm0,	[eax]
 	pshufd	  xmm0,	xmm0, 0
 	movdqa	  [edx],xmm0
-	
+
 	mov       eax, esi
     mov       ebx, [esp+36]
 	mov       dword [ebx], 0x00
-    
+
     pop       edi
     pop       esi
     pop       ebx
     ret
-    
+
 
--- a/codec/encoder/core/asm/intra_pred_util.asm
+++ b/codec/encoder/core/asm/intra_pred_util.asm
@@ -32,7 +32,7 @@
 ;*  intra_pred_util.asm
 ;*
 ;*  Abstract
-;*      mmxext/sse for WelsFillingPred8to16, WelsFillingPred8x2to16 and 
+;*      mmxext/sse for WelsFillingPred8to16, WelsFillingPred8x2to16 and
 ;*		WelsFillingPred1to16 etc.
 ;*
 ;*  History
@@ -84,7 +84,7 @@
 	movq mm0, [ecx]
 	movq [eax  ], mm0
 	movq [eax+8], mm0
-	
+
 	WELSEMMS
 	ret
 
@@ -100,16 +100,16 @@
 	movq mm1, [ecx+8]
 	movq [eax  ], mm0
 	movq [eax+8], mm1
-	
+
 	WELSEMMS
 
 	ret
 
 %macro butterfly_1to8_mmx	3	; mm? for dst, mm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
-	mov %3h, %3l	
-	movd %2, e%3x		; i.e, 1% = eax (=b0)	
-	pshufw %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0	
-%endmacro 
+	mov %3h, %3l
+	movd %2, e%3x		; i.e, 1% = eax (=b0)
+	pshufw %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
 
 ALIGN 16
 ;***********************************************************************----------------
@@ -120,10 +120,10 @@
 
 	mov cl, byte [esp+8]	; v
 	butterfly_1to8_mmx	mm0, mm1, c	; mm? for dst, mm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
-	
+
 	movq [eax  ], mm0
 	movq [eax+8], mm0
-	
+
 	WELSEMMS
 
 	ret
@@ -136,9 +136,9 @@
 	mov eax, [esp+4]	; pred
 	mov ecx, [esp+8]	; v
 
-	movdqa xmm0, [ecx]	
-	movdqa [eax], xmm0	
-	
+	movdqa xmm0, [ecx]
+	movdqa [eax], xmm0
+
 	ret
 
 ALIGN 16
@@ -150,7 +150,7 @@
 
 	mov cl, byte [esp+8]	; v
 	butterfly_1to16_sse	xmm0, xmm1, c		; dst, tmp, pSrc [generic register name: a/b/c/d]
-	
+
 	movdqa [eax], xmm0
-	
+
 	ret
--- a/codec/encoder/core/asm/mb_copy.asm
+++ b/codec/encoder/core/asm/mb_copy.asm
@@ -32,7 +32,7 @@
 ;*  mb_copy.asm
 ;*
 ;*  Abstract
-;*      mb_copy 
+;*      mb_copy
 ;*
 ;*
 ;*********************************************************************************************/
@@ -52,9 +52,9 @@
 WELS_EXTERN WelsCopy16x16_sse2
 WELS_EXTERN WelsCopy16x16NotAligned_sse2
 WELS_EXTERN WelsCopy8x8_mmx
-WELS_EXTERN WelsCopy16x8NotAligned_sse2	; 
-WELS_EXTERN WelsCopy8x16_mmx		; 
-WELS_EXTERN UpdateMbMv_sse2		; 
+WELS_EXTERN WelsCopy16x8NotAligned_sse2	;
+WELS_EXTERN WelsCopy8x16_mmx		;
+WELS_EXTERN UpdateMbMv_sse2		;
 
 ;***********************************************************************
 ; void WelsCopy16x16_sse2(	uint8_t* Dst,
@@ -66,7 +66,7 @@
 WelsCopy16x16_sse2:
 	push esi
 	push edi
-	push ebx	
+	push ebx
 
 	mov edi, [esp+16]	; Dst
 	mov eax, [esp+20]	; iStrideD
@@ -107,7 +107,7 @@
 	movdqa xmm5, [esi+ecx]
 	movdqa xmm6, [esi+2*ecx]
 	movdqa xmm7, [esi+edx]
-	
+
 	movdqa [edi], xmm0
 	movdqa [edi+eax], xmm1
 	movdqa [edi+2*eax], xmm2
@@ -116,7 +116,7 @@
 	movdqa [edi], xmm4
 	movdqa [edi+eax], xmm5
 	movdqa [edi+2*eax], xmm6
-	movdqa [edi+ebx], xmm7	
+	movdqa [edi+ebx], xmm7
 
 	pop ebx
 	pop edi
@@ -134,7 +134,7 @@
 WelsCopy16x16NotAligned_sse2:
 	push esi
 	push edi
-	push ebx	
+	push ebx
 
 	mov edi, [esp+16]	; Dst
 	mov eax, [esp+20]	; iStrideD
@@ -175,7 +175,7 @@
 	movdqu xmm5, [esi+ecx]
 	movdqu xmm6, [esi+2*ecx]
 	movdqu xmm7, [esi+edx]
-	
+
 	movdqa [edi], xmm0
 	movdqa [edi+eax], xmm1
 	movdqa [edi+2*eax], xmm2
@@ -184,8 +184,8 @@
 	movdqa [edi], xmm4
 	movdqa [edi+eax], xmm5
 	movdqa [edi+2*eax], xmm6
-	movdqa [edi+ebx], xmm7	
-	
+	movdqa [edi+ebx], xmm7
+
 	pop ebx
 	pop edi
 	pop esi
@@ -202,7 +202,7 @@
 WelsCopy16x8NotAligned_sse2:
 	push esi
 	push edi
-	push ebx	
+	push ebx
 
 	mov edi, [esp+16]	; Dst
 	mov eax, [esp+20]	; iStrideD
@@ -220,7 +220,7 @@
 	movdqu xmm4, [esi]
 	movdqu xmm5, [esi+ecx]
 	movdqu xmm6, [esi+2*ecx]
-	movdqu xmm7, [esi+edx]	
+	movdqu xmm7, [esi+edx]
 
 	movdqa [edi], xmm0
 	movdqa [edi+eax], xmm1
@@ -231,7 +231,7 @@
 	movdqa [edi+eax], xmm5
 	movdqa [edi+2*eax], xmm6
 	movdqa [edi+ebx], xmm7
-	
+
 	pop ebx
 	pop edi
 	pop esi
@@ -245,7 +245,7 @@
 ;                       int32_t  iStrideS )
 ;***********************************************************************
 ALIGN 16
-WelsCopy8x16_mmx:	
+WelsCopy8x16_mmx:
 	push ebx
 
 	mov eax, [esp + 8 ]           ;Dst
@@ -253,60 +253,60 @@
 	mov ebx, [esp + 16]           ;Src
 	mov edx, [esp + 20]           ;iStrideS
 
-	movq mm0, [ebx]	
-	movq mm1, [ebx+edx]	
+	movq mm0, [ebx]
+	movq mm1, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm2, [ebx]	
-	movq mm3, [ebx+edx]	
+	movq mm2, [ebx]
+	movq mm3, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm4, [ebx]	
-	movq mm5, [ebx+edx]	
+	movq mm4, [ebx]
+	movq mm5, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm6, [ebx]	
-	movq mm7, [ebx+edx]	
+	movq mm6, [ebx]
+	movq mm7, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	
-	movq [eax], mm0	
-	movq [eax+ecx], mm1	
+
+	movq [eax], mm0
+	movq [eax+ecx], mm1
 	lea eax, [eax+2*ecx]
-	movq [eax], mm2	
+	movq [eax], mm2
 	movq [eax+ecx], mm3
 	lea eax, [eax+2*ecx]
-	movq [eax], mm4	
+	movq [eax], mm4
 	movq [eax+ecx], mm5
 	lea eax, [eax+2*ecx]
-	movq [eax], mm6	
+	movq [eax], mm6
 	movq [eax+ecx], mm7
 	lea eax, [eax+2*ecx]
 
-	movq mm0, [ebx]	
-	movq mm1, [ebx+edx]	
+	movq mm0, [ebx]
+	movq mm1, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm2, [ebx]	
-	movq mm3, [ebx+edx]	
+	movq mm2, [ebx]
+	movq mm3, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm4, [ebx]	
-	movq mm5, [ebx+edx]	
+	movq mm4, [ebx]
+	movq mm5, [ebx+edx]
 	lea ebx, [ebx+2*edx]
-	movq mm6, [ebx]	
-	movq mm7, [ebx+edx]		
-	
-	movq [eax], mm0	
-	movq [eax+ecx], mm1	
+	movq mm6, [ebx]
+	movq mm7, [ebx+edx]
+
+	movq [eax], mm0
+	movq [eax+ecx], mm1
 	lea eax, [eax+2*ecx]
-	movq [eax], mm2	
+	movq [eax], mm2
 	movq [eax+ecx], mm3
 	lea eax, [eax+2*ecx]
-	movq [eax], mm4	
+	movq [eax], mm4
 	movq [eax+ecx], mm5
 	lea eax, [eax+2*ecx]
-	movq [eax], mm6	
-	movq [eax+ecx], mm7	
+	movq [eax], mm6
+	movq [eax+ecx], mm7
 
 	WELSEMMS
-	pop ebx	
+	pop ebx
 	ret
-	
+
 ;***********************************************************************
 ; void WelsCopy8x8_mmx(  uint8_t* Dst,
 ;                        int32_t  iStrideD,
@@ -314,7 +314,7 @@
 ;                        int32_t  iStrideS )
 ;***********************************************************************
 ALIGN 16
-WelsCopy8x8_mmx:	
+WelsCopy8x8_mmx:
 	push ebx
 	push esi
 	mov eax, [esp + 12]           ;Dst
@@ -343,7 +343,7 @@
 	lea esi, [esi+2*ebx]
 	movq mm6, [esi]
 	movq mm7, [esi+ebx]
-	
+
 	movq [eax], mm0
 	movq [eax+ecx], mm1
 	lea eax, [eax+2*ecx]
@@ -355,12 +355,12 @@
 	lea eax, [eax+2*ecx]
 	movq [eax], mm6
 	movq [eax+ecx], mm7
-		
+
 	WELSEMMS
-	pop esi	
+	pop esi
 	pop ebx
 	ret
-	
+
 ; (dunhuang@cisco), 12/21/2011
 ;***********************************************************************
 ; void UpdateMbMv_sse2( SMVUnitXY *pMvBuffer, const SMVUnitXY sMv )
@@ -417,8 +417,8 @@
 WELS_EXTERN McCopyWidthEq4_mmx
 WELS_EXTERN McCopyWidthEq8_mmx
 WELS_EXTERN McCopyWidthEq16_sse2
-                          
 
+
 ALIGN 16
 ;***********************************************************************
 ; void PixelAvgWidthEq8_mmx( uint8_t *dst,  int32_t iDstStride,
@@ -432,19 +432,19 @@
     push        esi
     push        edi
 
-    mov         edi, [esp+20]       
-    mov         esi, [esp+28]       
-    mov         edx, [esp+36]       
-    mov         ebp, [esp+24]       
-    mov         eax, [esp+32]       
-    mov         ebx, [esp+40]       
-    mov         ecx, [esp+44]       
+    mov         edi, [esp+20]
+    mov         esi, [esp+28]
+    mov         edx, [esp+36]
+    mov         ebp, [esp+24]
+    mov         eax, [esp+32]
+    mov         ebx, [esp+40]
+    mov         ecx, [esp+44]
 	sar			ecx, 2
 .height_loop:
-	movq        mm0, [esi]	
+	movq        mm0, [esi]
     pavgb       mm0, [edx]
     movq        [edi], mm0
-	movq		mm1, [esi+eax]		
+	movq		mm1, [esi+eax]
 	pavgb		mm1, [edx+ebx]
 	movq		[edi+ebp], mm1
 	lea         edi, [edi+2*ebp]
@@ -451,19 +451,19 @@
 	lea         esi, [esi+2*eax]
 	lea         edx, [edx+2*ebx]
 
-	movq        mm2, [esi]	
+	movq        mm2, [esi]
 	pavgb       mm2, [edx]
     movq        [edi], mm2
-	movq		mm3, [esi+eax]	
+	movq		mm3, [esi+eax]
 	pavgb		mm3, [edx+ebx]
 	movq		[edi+ebp], mm3
 	lea         edi, [edi+2*ebp]
 	lea         esi, [esi+2*eax]
 	lea         edx, [edx+2*ebx]
-	
+
 	dec         ecx
     jne         .height_loop
-	
+
 	WELSEMMS
     pop         edi
     pop         esi
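What the unrolled pavgb loop above does per sample, in C (a sketch with invented names; iWidth is fixed at 8 or 16 in these routines):

#include <stdint.h>

/* Rounding average of two reference blocks, as used for half-pel
 * and bi-predictive averaging: dst = (a + b + 1) >> 1. */
static void PixelAvgC (uint8_t* pDst, int32_t iDstStride,
                       const uint8_t* pSrcA, int32_t iStrideA,
                       const uint8_t* pSrcB, int32_t iStrideB,
                       int32_t iWidth, int32_t iHeight) {
  int32_t x, y;
  for (y = 0; y < iHeight; y++) {
    for (x = 0; x < iWidth; x++)
      pDst[x] = (uint8_t) ((pSrcA[x] + pSrcB[x] + 1) >> 1);
    pDst += iDstStride; pSrcA += iStrideA; pSrcB += iStrideB;
  }
}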
@@ -485,19 +485,19 @@
     push        esi
     push        edi
 
-    mov         edi, [esp+20]       
-    mov         esi, [esp+28]       
-    mov         edx, [esp+36]       
-    mov         ebp, [esp+24]       
-    mov         eax, [esp+32]       
-    mov         ebx, [esp+40]       
-    mov         ecx, [esp+44]       
+    mov         edi, [esp+20]
+    mov         esi, [esp+28]
+    mov         edx, [esp+36]
+    mov         ebp, [esp+24]
+    mov         eax, [esp+32]
+    mov         ebx, [esp+40]
+    mov         ecx, [esp+44]
 	sar			ecx, 2
 .height_loop:
 	movdqu      xmm0, [esi]
 	movdqu      xmm1, [edx]
 	movdqu      xmm2, [esi+eax]
-	movdqu      xmm3, [edx+ebx]	
+	movdqu      xmm3, [edx+ebx]
 	pavgb       xmm0, xmm1
 	pavgb       xmm2, xmm3
 	movdqu      [edi], xmm0
@@ -504,12 +504,12 @@
 	movdqu      [edi+ebp], xmm2
 	lea			edi, [edi+2*ebp]
 	lea			esi, [esi+2*eax]
-	lea			edx, [edx+2*ebx]	
+	lea			edx, [edx+2*ebx]
 
 	movdqu      xmm4, [esi]
 	movdqu      xmm5, [edx]
 	movdqu      xmm6, [esi+eax]
-	movdqu      xmm7, [edx+ebx]	
+	movdqu      xmm7, [edx+ebx]
 	pavgb       xmm4, xmm5
 	pavgb       xmm6, xmm7
 	movdqu      [edi], xmm4
@@ -516,11 +516,11 @@
 	movdqu      [edi+ebp], xmm6
 	lea         edi, [edi+2*ebp]
 	lea         esi, [esi+2*eax]
-    lea         edx, [edx+2*ebx]	
-    
+    lea         edx, [edx+2*ebx]
+
 	dec         ecx
 	jne         .height_loop
-	
+
     pop         edi
     pop         esi
     pop         ebx
@@ -540,7 +540,7 @@
     dec    dword [esp+4]
     jg     avg_w16_align_0_ssse3
     ret
-    
+
     ALIGN 64
 avg_w16_align_1_ssse3:
     movdqa  xmm1, [ebx+16]
@@ -555,7 +555,7 @@
     jg     avg_w16_align_1_ssse3
     ret
 
-  
+
 ALIGN 16
 ;***********************************************************************
 ; void PixelAvgWidthEq16_ssse3(uint8_t *pDst,  int32_t iDstStride,
@@ -574,7 +574,7 @@
     mov         ebx, [esp+28]       ; src1
     mov         ecx, [esp+36]       ; src2
     mov         esi, [esp+24]       ; i_dst_stride
-    
+
      %define avg_w16_offset (avg_w16_align_1_ssse3-avg_w16_align_0_ssse3)
     mov edx, ebx
     and edx, 0x01
@@ -582,11 +582,11 @@
     lea ebp, [avg_w16_offset]
     imul ebp, edx
     lea edx, [ebp+eax]
-    
-    mov eax, [esp+32]  
-    mov ebp, [esp+44] 
+
+    mov eax, [esp+32]
+    mov ebp, [esp+44]
     push ebp
-    mov ebp, [esp+44]	
+    mov ebp, [esp+44]
     and ebx, 0xfffffff0
     call edx
 	pop		   ebp
@@ -607,7 +607,7 @@
     push    edi
     push    ebx
 
-    
+
     mov esi,  [esp+16]
     mov eax, [esp+20]
     mov edi,  [esp+24]
@@ -617,12 +617,12 @@
 .height_loop:
 	mov ebx, [esi]
 	mov [edi], ebx
-	
+
 	add esi, eax
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	WELSEMMS   
+	WELSEMMS
 	pop	   ebx
     pop     edi
     pop     esi
@@ -650,12 +650,12 @@
 	add edi, ecx
 	dec edx
 	jnz .height_loop
-	
-	WELSEMMS   
+
+	WELSEMMS
     pop     edi
     pop     esi
     ret
-	
+
 ALIGN 16
 ;***********************************************************************
 ;   void McCopyWidthEq16_sse2( uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride, int32_t iHeight )
@@ -664,11 +664,11 @@
     push    esi
     push    edi
 
-    mov     esi, [esp+12]       
-    mov     eax, [esp+16]       
-    mov     edi, [esp+20]       
-    mov     edx, [esp+24]       
-    mov     ecx, [esp+28]       
+    mov     esi, [esp+12]
+    mov     eax, [esp+16]
+    mov     edi, [esp+20]
+    mov     edx, [esp+24]
+    mov     ecx, [esp+28]
 
 ALIGN 4
 .height_loop:
@@ -681,7 +681,7 @@
     lea     esi, [esi+eax*2]
     lea     edi, [edi+edx*2]
     jnz     .height_loop
-  
+
     pop     edi
     pop     esi
     ret
--- a/codec/encoder/core/asm/mc_chroma.asm
+++ b/codec/encoder/core/asm/mc_chroma.asm
@@ -69,11 +69,11 @@
 
 ALIGN 16
 ;*******************************************************************************
-; void McChromaWidthEq4_mmx( uint8_t *src, 
-;							int32_t iSrcStride, 
-;							uint8_t *pDst, 
-;							int32_t iDstStride, 
-;							uint8_t *pABCD, 
+; void McChromaWidthEq4_mmx( uint8_t *src,
+;							int32_t iSrcStride,
+;							uint8_t *pDst,
+;							int32_t iDstStride,
+;							uint8_t *pABCD,
 ;							int32_t iHeigh );
 ;*******************************************************************************
 WELS_EXTERN McChromaWidthEq4_mmx
@@ -81,29 +81,29 @@
 	push esi
 	push edi
 	push ebx
-	
+
 	mov eax, [esp +12 + 20]
 	movd mm3, [eax]
 	WELS_Zero mm7
 	punpcklbw mm3, mm3
 	movq      mm4, mm3
-	punpcklwd mm3, mm3       
-	punpckhwd mm4, mm4		 
-	
+	punpcklwd mm3, mm3
+	punpckhwd mm4, mm4
+
 	movq	  mm5, mm3
 	punpcklbw mm3, mm7
 	punpckhbw mm5, mm7
-	
+
 	movq	  mm6, mm4
 	punpcklbw mm4, mm7
 	punpckhbw mm6, mm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
+
+	mov esi, [esp +12+ 4]
+	mov eax, [esp + 12 + 8]
+	mov edi, [esp + 12 + 12]
+	mov edx, [esp + 12 + 16]
+    mov ecx, [esp + 12 + 24]
+
 	lea ebx, [esi + eax]
 	movd mm0, [esi]
 	movd mm1, [esi+1]
@@ -110,17 +110,17 @@
 	punpcklbw mm0, mm7
 	punpcklbw mm1, mm7
 .xloop:
-	
+
 	pmullw mm0, mm3
 	pmullw mm1, mm5
 	paddw  mm0, mm1
-	
+
 	movd  mm1, [ebx]
 	punpcklbw mm1, mm7
 	movq mm2, mm1
 	pmullw mm1, mm4
 	paddw mm0, mm1
-	
+
 	movd mm1, [ebx+1]
 	punpcklbw mm1, mm7
 	movq mm7, mm1
@@ -130,13 +130,13 @@
 
 	paddw mm0, [h264_d0x20_mmx]
 	psrlw mm0, 6
-	
+
 	WELS_Zero mm7
 	packuswb mm0, mm7
-	movd [edi], mm0	
+	movd [edi], mm0
 
 	movq mm0, mm2
-	
+
 	lea edi, [edi +edx  ]
 	lea ebx, [ebx + eax]
 
@@ -151,11 +151,11 @@
 
 ALIGN 16
 ;*******************************************************************************
-; void McChromaWidthEq8_sse2( uint8_t *pSrc, 
-;						int32_t iSrcStride, 
-;						uint8_t *pDst, 
-;						int32_t iDstStride, 
-;						uint8_t *pABCD, 
+; void McChromaWidthEq8_sse2( uint8_t *pSrc,
+;						int32_t iSrcStride,
+;						uint8_t *pDst,
+;						int32_t iDstStride,
+;						uint8_t *pABCD,
 ;						int32_t iheigh );
 ;*******************************************************************************
 WELS_EXTERN McChromaWidthEq8_sse2
@@ -163,30 +163,30 @@
 	push esi
 	push edi
 	push ebx
-	
+
 	mov eax, [esp +12 + 20]
 	movd xmm3, [eax]
 	WELS_Zero xmm7
 	punpcklbw  xmm3, xmm3
 	punpcklwd  xmm3, xmm3
-	
+
 	movdqa	   xmm4, xmm3
 	punpckldq  xmm3, xmm3
 	punpckhdq  xmm4, xmm4
 	movdqa     xmm5, xmm3
 	movdqa	   xmm6, xmm4
-	
+
 	punpcklbw  xmm3, xmm7
 	punpckhbw  xmm5, xmm7
 	punpcklbw  xmm4, xmm7
 	punpckhbw  xmm6, xmm7
-	
-	mov esi, [esp +12+ 4]   
-	mov eax, [esp + 12 + 8]   
-	mov edi, [esp + 12 + 12]  
-	mov edx, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-		
+
+	mov esi, [esp +12+ 4]
+	mov eax, [esp + 12 + 8]
+	mov edi, [esp + 12 + 12]
+	mov edx, [esp + 12 + 16]
+    mov ecx, [esp + 12 + 24]
+
 	lea ebx, [esi + eax]
 	movq xmm0, [esi]
 	movq xmm1, [esi+1]
@@ -193,17 +193,17 @@
 	punpcklbw xmm0, xmm7
 	punpcklbw xmm1, xmm7
 .xloop:
-	
+
 	pmullw xmm0, xmm3
 	pmullw xmm1, xmm5
 	paddw  xmm0, xmm1
-	
+
 	movq  xmm1, [ebx]
 	punpcklbw xmm1, xmm7
 	movdqa xmm2, xmm1
 	pmullw xmm1, xmm4
 	paddw xmm0, xmm1
-	
+
 	movq xmm1, [ebx+1]
 	punpcklbw xmm1, xmm7
 	movdqa xmm7, xmm1
@@ -213,19 +213,19 @@
 
 	paddw xmm0, [h264_d0x20_sse2]
 	psrlw xmm0, 6
-	
+
 	WELS_Zero xmm7
 	packuswb xmm0, xmm7
-	movq [edi], xmm0	
+	movq [edi], xmm0
 
 	movdqa xmm0, xmm2
-	
+
 	lea edi, [edi +edx  ]
 	lea ebx, [ebx + eax]
 
 	dec ecx
 	jnz near .xloop
-	
+
 	pop ebx
 	pop edi
 	pop esi
@@ -237,8 +237,8 @@
 ALIGN 16
 ;***********************************************************************
 ; void McChromaWidthEq8_ssse3( uint8_t *pSrc,
-;						 int32_t iSrcStride, 
-;                        uint8_t *pDst,  
+;						 int32_t iSrcStride,
+;                        uint8_t *pDst,
 ;                        int32_t iDstStride,
 ;                        uint8_t *pABCD,
 ;					     int32_t iHeigh);
@@ -248,23 +248,23 @@
 	push ebx
 	push esi
 	push edi
-		
+
 	mov eax, [esp + 12 + 20]
 
     pxor      xmm7, xmm7
-    movd   xmm5, [eax]   
-    punpcklwd xmm5, xmm5  
-    punpckldq xmm5, xmm5 
+    movd   xmm5, [eax]
+    punpcklwd xmm5, xmm5
+    punpckldq xmm5, xmm5
     movdqa    xmm6, xmm5
     punpcklqdq xmm5, xmm5
-    punpckhqdq xmm6, xmm6    
-    
-	mov eax, [esp + 12 + 4]   
-	mov edx, [esp + 12 + 8]   
-	mov esi, [esp + 12 + 12]  
-	mov edi, [esp + 12 + 16]  
-    mov ecx, [esp + 12 + 24]   
-    
+    punpckhqdq xmm6, xmm6
+
+	mov eax, [esp + 12 + 4]
+	mov edx, [esp + 12 + 8]
+	mov esi, [esp + 12 + 12]
+	mov edi, [esp + 12 + 16]
+    mov ecx, [esp + 12 + 24]
+
     sub esi, edi
     sub esi, edi
 	movdqa xmm7, [h264_d0x20_sse2]
@@ -273,16 +273,16 @@
 	movdqa xmm1, xmm0
 	psrldq xmm1, 1
 	punpcklbw xmm0, xmm1
-	
-.hloop_chroma:	
+
+.hloop_chroma:
 	lea	esi, [esi+2*edi]
-	
+
 	movdqu xmm2, [eax+edx]
 	movdqa xmm3, xmm2
 	psrldq xmm3, 1
 	punpcklbw xmm2, xmm3
 	movdqa      xmm4, xmm2
-	
+
     pmaddubsw  xmm0, xmm5
     pmaddubsw  xmm2, xmm6
     paddw      xmm0, xmm2
@@ -289,8 +289,8 @@
     paddw      xmm0, xmm7
 	psrlw      xmm0, 6
     packuswb   xmm0, xmm0
-    movq       [esi],xmm0	
-    
+    movq       [esi],xmm0
+
     lea eax, [eax+2*edx]
     movdqu xmm2, [eax]
     movdqa xmm3, xmm2
@@ -297,7 +297,7 @@
     psrldq xmm3, 1
     punpcklbw xmm2, xmm3
     movdqa      xmm0, xmm2
-    
+
     pmaddubsw  xmm4, xmm5
     pmaddubsw  xmm2, xmm6
     paddw      xmm4, xmm2
@@ -304,8 +304,8 @@
     paddw      xmm4, xmm7
 	psrlw      xmm4, 6
     packuswb   xmm4, xmm4
-    movq       [esi+edi],xmm4	
-	
+    movq       [esi+edi],xmm4
+
 	sub ecx, 2
 	jnz .hloop_chroma
 	pop edi
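For reference, the McChroma* kernels above all evaluate the H.264 bilinear chroma interpolation

    pred(x,y) = (A*p00 + B*p01 + C*p10 + D*p11 + 32) >> 6

where the weights A=(8-dx)*(8-dy), B=dx*(8-dy), C=(8-dx)*dy, D=dx*dy come from the 1/8-pel offsets and arrive prepacked in pABCD; h264_d0x20 is the rounding constant 32. A minimal scalar sketch, not part of this patch (hypothetical name, taking dx/dy directly instead of the packed weights):

    #include <stdint.h>

    static void McChromaScalarRef(const uint8_t *pSrc, int32_t iSrcStride,
                                  uint8_t *pDst, int32_t iDstStride,
                                  int32_t iDx, int32_t iDy, /* 1/8-pel offsets, 0..7 */
                                  int32_t iWidth, int32_t iHeight) {
      const int32_t iA = (8 - iDx) * (8 - iDy), iB = iDx * (8 - iDy);
      const int32_t iC = (8 - iDx) * iDy,       iD = iDx * iDy;
      for (int32_t y = 0; y < iHeight; y++) {
        for (int32_t x = 0; x < iWidth; x++) {
          /* the weights sum to 64, so (sum + 32) >> 6 always fits a byte */
          pDst[x] = (uint8_t)((iA * pSrc[x] + iB * pSrc[x + 1] +
                               iC * pSrc[x + iSrcStride] +
                               iD * pSrc[x + iSrcStride + 1] + 32) >> 6);
        }
        pSrc += iSrcStride;
        pDst += iDstStride;
      }
    }

Note how the assembly reuses the bottom source row of one output row as the top row of the next (the movq mm0, mm2 / movdqa xmm0, xmm2 copies), so each loop iteration loads only one new source row.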
--- a/codec/encoder/core/asm/mc_luma.asm
+++ b/codec/encoder/core/asm/mc_luma.asm
@@ -91,10 +91,10 @@
 
 ALIGN 16
 ;***********************************************************************
-; void McHorVer20WidthEq16_sse2(  uint8_t *pSrc, 
-;								int32_t iSrcStride, 
-;								uint8_t *pDst, 
-;								int32_t iDstStride, 
+; void McHorVer20WidthEq16_sse2(  uint8_t *pSrc,
+;								int32_t iSrcStride,
+;								uint8_t *pDst,
+;								int32_t iDstStride,
 ;								int32_t iHeight,
 ;                      );
 ;***********************************************************************
@@ -101,19 +101,19 @@
 McHorVer20WidthEq16_sse2:
 	push	esi
 	push	edi
-	
 
-	mov esi, [esp + 12]         
-	mov eax, [esp + 16]         
-	mov edi, [esp + 20]         
-	mov ecx, [esp + 28]         
-	mov edx, [esp + 24]			
-	sub esi, 2                  
-	
+
+	mov esi, [esp + 12]
+	mov eax, [esp + 16]
+	mov edi, [esp + 20]
+	mov ecx, [esp + 28]
+	mov edx, [esp + 24]
+	sub esi, 2
+
 	WELS_Zero  xmm7
 	movdqa xmm6, [h264_w0x10_1]
 .y_loop:
-	
+
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@@ -126,7 +126,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -152,7 +152,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3+8]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -165,8 +165,8 @@
 	psraw xmm0, 5
 	packuswb xmm0, xmm7
 	movq [edi+8], xmm0
-	
-	
+
+
 	add esi, eax
 	add edi, edx
 	dec ecx
@@ -178,9 +178,9 @@
 
 ALIGN 16
 ;***********************************************************************
-; void McHorVer22Width8HorFirst_sse2( uint8_t*pSrc, 
-;									int32_t iSrcStride, 
-;									uint8_t* pTap,	
+; void McHorVer22Width8HorFirst_sse2( uint8_t*pSrc,
+;									int32_t iSrcStride,
+;									uint8_t* pTap,
 ;									int32_t iTapStride,
 ;									int32_t iHeight);
 ;***********************************************************************
@@ -193,11 +193,11 @@
 	mov edi, [esp+24]		;tap
 	mov edx, [esp+28]	;tap_stride
 	mov ebx, [esp+32]	;i_height
-	pxor xmm7, xmm7	
-	
+	pxor xmm7, xmm7
+
 	sub esi, eax				;;;;;;;;need more 5 lines.
 	sub esi, eax
-		
+
 .yloop_width_8:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
@@ -211,7 +211,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -221,7 +221,7 @@
 	psllw xmm4, 2
 	paddw xmm0, xmm4
 	movdqa [edi], xmm0
-		
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@@ -230,12 +230,12 @@
 	pop edi
 	pop esi
 	ret
-	
+
 ;***********************************************************************
-; void McHorVer02WidthEq8_sse2( uint8_t *pSrc, 
-;                       int32_t iSrcStride, 
-;                       uint8_t *pDst, 
-;                       int32_t iDstStride, 
+; void McHorVer02WidthEq8_sse2( uint8_t *pSrc,
+;                       int32_t iSrcStride,
+;                       uint8_t *pDst,
+;                       int32_t iDstStride,
 ;                       int32_t iHeight )
 ;***********************************************************************
 ALIGN 16
@@ -242,18 +242,18 @@
 McHorVer02WidthEq8_sse2:
 	push esi
 	push edi
-	
-	mov esi, [esp + 12]           
-	mov edx, [esp + 16]	          
-	mov edi, [esp + 20]           
-	mov eax, [esp + 24]           
-	mov ecx, [esp + 28]           
 
+	mov esi, [esp + 12]
+	mov edx, [esp + 16]
+	mov edi, [esp + 20]
+	mov eax, [esp + 24]
+	mov ecx, [esp + 28]
+
 	sub esi, edx
 	sub esi, edx
 
 	WELS_Zero xmm7
-			
+
 	SSE_LOAD_8P xmm0, xmm7, [esi]
 	SSE_LOAD_8P xmm1, xmm7, [esi+edx]
 	lea esi, [esi+2*edx]
@@ -262,8 +262,8 @@
 	lea esi, [esi+2*edx]
 	SSE_LOAD_8P xmm4, xmm7, [esi]
 	SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-	
-.start:	
+
+.start:
 	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .xx_exit
@@ -273,7 +273,7 @@
 	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
 	dec ecx
 	jz near .xx_exit
-	
+
 	lea edi, [edi+2*eax]
 	SSE_LOAD_8P xmm7, xmm0, [esi+edx]
 	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
@@ -356,11 +356,11 @@
 
 
 ;***********************************************************************
-; void McHorVer02_sse2(	uint8_t *pSrc, 
-;                       int32_t iSrcStride, 
-;                       uint8_t *pDst, 
+; void McHorVer02_sse2(	uint8_t *pSrc,
+;                       int32_t iSrcStride,
+;                       uint8_t *pDst,
 ;                       int32_t iDstStride,
-;						int32_t iWidth, 
+;						int32_t iWidth,
 ;                       int32_t iHeight )
 ;***********************************************************************
 ALIGN 16
@@ -368,19 +368,19 @@
 	push esi
 	push edi
 	push ebx
-	
-	mov esi, [esp + 16]           
-	mov edx, [esp + 20]	          
-	mov edi, [esp + 24]           
-	mov eax, [esp + 28]           
-	mov ecx, [esp + 36]           
-	mov ebx, [esp + 32]			  
+
+	mov esi, [esp + 16]
+	mov edx, [esp + 20]
+	mov edi, [esp + 24]
+	mov eax, [esp + 28]
+	mov ecx, [esp + 36]
+	mov ebx, [esp + 32]
 	shr ebx, 3
 	sub esi, edx
 	sub esi, edx
-	
-.xloop:	
-	WELS_Zero xmm7			
+
+.xloop:
+	WELS_Zero xmm7
 	SSE_LOAD_8P xmm0, xmm7, [esi]
 	SSE_LOAD_8P xmm1, xmm7, [esi+edx]
 	lea esi, [esi+2*edx]
@@ -389,7 +389,7 @@
 	lea esi, [esi+2*edx]
 	SSE_LOAD_8P xmm4, xmm7, [esi]
 	SSE_LOAD_8P xmm5, xmm7, [esi+edx]
-	
+
 	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	lea esi, [esi+2*edx]
@@ -402,8 +402,8 @@
 	movdqa xmm5,xmm6
 	add edi, eax
 	sub esi, edx
-	
-.start:	
+
+.start:
 	FILTER_HV_W8 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .x_loop_dec
@@ -413,7 +413,7 @@
 	FILTER_HV_W8 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [edi+eax]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*eax]
 	SSE_LOAD_8P xmm7, xmm0, [esi+edx]
 	FILTER_HV_W8 xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
@@ -454,16 +454,16 @@
 	SSE_LOAD_8P xmm5, xmm6, [esi+edx]
 	jmp near .start
 
-.x_loop_dec:	
+.x_loop_dec:
 	dec ebx
 	jz  near .xx_exit
-	mov esi, [esp + 16]           
-	mov edi, [esp + 24]           
+	mov esi, [esp + 16]
+	mov edi, [esp + 24]
 	sub esi, edx
 	sub esi, edx
 	add esi, 8
 	add edi, 8
-	mov ecx, [esp + 36] 
+	mov ecx, [esp + 36]
 	jmp near .xloop
 
 .xx_exit:
@@ -473,12 +473,12 @@
 	ret
 
 
-ALIGN 16                  
+ALIGN 16
 ;***********************************************************************
-; void McHorVer20_sse2(		uint8_t *pSrc, 
-;                       int32_t iSrcStride, 
-;						uint8_t *pDst, 
-;						int32_t iDstStride, 
+; void McHorVer20_sse2(		uint8_t *pSrc,
+;                       int32_t iSrcStride,
+;						uint8_t *pDst,
+;						int32_t iDstStride,
 ;						int32_t iWidth,
 ;						int32_t iHeight
 ;                      );
@@ -487,19 +487,19 @@
 	push esi
 	push edi
 	push ebx
-	mov esi, [esp+16]     
-	mov eax, [esp+20]	
-	mov edi, [esp+24]	
-	mov edx, [esp+28]	
-	mov ecx, [esp+32]	
-	mov ebx, [esp+36]	
+	mov esi, [esp+16]
+	mov eax, [esp+20]
+	mov edi, [esp+24]
+	mov edx, [esp+28]
+	mov ecx, [esp+32]
+	mov ebx, [esp+36]
 	sub esi, 2
-	pxor xmm7, xmm7	
-	
+	pxor xmm7, xmm7
+
 	cmp ecx, 9
-	jne near .width_17	
-	
-.yloop_width_9:	
+	jne near .width_17
+
+.yloop_width_9:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@@ -512,7 +512,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	movdqa xmm7, xmm2
 	paddw   xmm7, xmm3
 	movdqa xmm6, xmm4
@@ -526,12 +526,12 @@
 	paddw xmm0, [h264_w0x10_1]
 	psraw  xmm0, 5
 	packuswb xmm0, xmm0
-	movd [edi], xmm0	
-	
+	movd [edi], xmm0
+
 	pxor  xmm7, xmm7
 	movq xmm0, [esi+6]
 	punpcklbw xmm0, xmm7
-	
+
 	paddw xmm4, xmm1
 	paddw xmm5, xmm3
 	psllw xmm5, 2
@@ -543,8 +543,8 @@
 	paddw xmm2, [h264_w0x10_1]
 	psraw  xmm2, 5
 	packuswb xmm2, xmm2
-	movq [edi+1], xmm2	
-		
+	movq [edi+1], xmm2
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@@ -553,8 +553,8 @@
 	pop edi
 	pop esi
 	ret
-	
-	
+
+
 .width_17:
 .yloop_width_17:
 	movq xmm0, [esi]
@@ -569,7 +569,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -582,7 +582,7 @@
 	psraw  xmm0, 5
 	packuswb xmm0, xmm0
 	movq [edi], xmm0
-		
+
 	movq xmm0, [esi+8]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5+8]
@@ -595,7 +595,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3+8]
 	punpcklbw xmm5, xmm7
-	
+
 	movdqa xmm7, xmm2
 	paddw   xmm7, xmm3
 	movdqa xmm6, xmm4
@@ -610,12 +610,12 @@
 	psraw  xmm0, 5
 	packuswb xmm0, xmm0
 	movd [edi+8], xmm0
-	
-	
+
+
 	pxor  xmm7, xmm7
 	movq xmm0, [esi+6+8]
 	punpcklbw xmm0, xmm7
-	
+
 	paddw xmm4, xmm1
 	paddw xmm5, xmm3
 	psllw xmm5, 2
@@ -627,7 +627,7 @@
 	paddw xmm2, [h264_w0x10_1]
 	psraw  xmm2, 5
 	packuswb xmm2, xmm2
-	movq [edi+9], xmm2		
+	movq [edi+9], xmm2
 	add esi, eax
 	add edi, edx
 	dec ebx
@@ -636,14 +636,14 @@
 	pop edi
 	pop esi
 	ret
-	
-	
 
+
+
 ALIGN 16
 ;***********************************************************************
 ;void McHorVer22HorFirst_sse2
-;							(uint8_t *pSrc, 
-;							int32_t iSrcStride, 
+;							(uint8_t *pSrc,
+;							int32_t iSrcStride,
 ;							uint8_t * pTap,
 ;							int32_t iTapStride,
 ;							int32_t iWidth,int32_t iHeight);
@@ -652,21 +652,21 @@
 	push esi
 	push edi
 	push ebx
-	mov esi, [esp+16]     
-	mov eax, [esp+20]	
-	mov edi, [esp+24]	
-	mov edx, [esp+28]	
-	mov ecx, [esp+32]	
-	mov ebx, [esp+36]	
-	pxor xmm7, xmm7	
-	
+	mov esi, [esp+16]
+	mov eax, [esp+20]
+	mov edi, [esp+24]
+	mov edx, [esp+28]
+	mov ecx, [esp+32]
+	mov ebx, [esp+36]
+	pxor xmm7, xmm7
+
 	sub esi, eax				;;;;;;;;need more 5 lines.
 	sub esi, eax
-	
+
 	cmp ecx, 9
-	jne near .width_17	
-	
-.yloop_width_9:	
+	jne near .width_17
+
+.yloop_width_9:
 	movq xmm0, [esi]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5]
@@ -679,7 +679,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	movdqa xmm7, xmm2
 	paddw   xmm7, xmm3
 	movdqa xmm6, xmm4
@@ -690,12 +690,12 @@
 	paddw xmm0, xmm6
 	psllw xmm6, 2
 	paddw xmm0, xmm6
-	movd [edi], xmm0	
-	
+	movd [edi], xmm0
+
 	pxor  xmm7, xmm7
 	movq xmm0, [esi+6]
 	punpcklbw xmm0, xmm7
-	
+
 	paddw xmm4, xmm1
 	paddw xmm5, xmm3
 	psllw xmm5, 2
@@ -704,9 +704,9 @@
 	paddw xmm2, xmm5
 	psllw xmm5, 2
 	paddw xmm2, xmm5
-	movq [edi+2], xmm2	
-	movhps [edi+2+8], xmm2	
-	
+	movq [edi+2], xmm2
+	movhps [edi+2+8], xmm2
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@@ -715,8 +715,8 @@
 	pop edi
 	pop esi
 	ret
-	
-	
+
+
 .width_17:
 .yloop_width_17:
 	movq xmm0, [esi]
@@ -731,7 +731,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3]
 	punpcklbw xmm5, xmm7
-	
+
 	paddw xmm2, xmm3
 	paddw xmm4, xmm5
 	psllw xmm4, 2
@@ -741,7 +741,7 @@
 	psllw xmm4, 2
 	paddw xmm0, xmm4
 	movdqa [edi], xmm0
-		
+
 	movq xmm0, [esi+8]
 	punpcklbw xmm0, xmm7
 	movq xmm1, [esi+5+8]
@@ -754,7 +754,7 @@
 	punpcklbw xmm4, xmm7
 	movq xmm5, [esi+3+8]
 	punpcklbw xmm5, xmm7
-	
+
 	movdqa xmm7, xmm2
 	paddw   xmm7, xmm3
 	movdqa xmm6, xmm4
@@ -766,12 +766,12 @@
 	psllw xmm6, 2
 	paddw xmm0, xmm6
 	movd [edi+16], xmm0
-	
-	
+
+
 	pxor  xmm7, xmm7
 	movq xmm0, [esi+6+8]
 	punpcklbw xmm0, xmm7
-	
+
 	paddw xmm4, xmm1
 	paddw xmm5, xmm3
 	psllw xmm5, 2
@@ -780,9 +780,9 @@
 	paddw xmm2, xmm5
 	psllw xmm5, 2
 	paddw xmm2, xmm5
-	movq [edi+18], xmm2	
-	movhps [edi+18+8], xmm2	
-	
+	movq [edi+18], xmm2
+	movhps [edi+18+8], xmm2
+
 	add esi, eax
 	add edi, edx
 	dec ebx
@@ -791,23 +791,23 @@
 	pop edi
 	pop esi
 	ret
-	
-	
+
+
 %macro FILTER_VER 9
 	paddw  %1, %6
 	movdqa %7, %2
 	movdqa %8, %3
-	
-	
+
+
 	paddw %7, %5
 	paddw %8, %4
-	
-	psubw  %1, %7   
-	psraw   %1, 2	  
-	paddw  %1, %8   
-	psubw  %1, %7 
-	psraw   %1, 2	
-	paddw  %8, %1   
+
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %1, %8
+	psubw  %1, %7
+	psraw   %1, 2
+	paddw  %8, %1
 	paddw  %8, [h264_mc_hc_32]
 	psraw   %8, 6
 	packuswb %8, %8
@@ -815,8 +815,8 @@
 %endmacro
 ;***********************************************************************
 ;void McHorVer22VerLastAlign_sse2(
-;											uint8_t *pTap, 
-;											int32_t iTapStride, 
+;											uint8_t *pTap,
+;											int32_t iTapStride,
 ;											uint8_t * pDst,
 ;											int32_t iDstStride,
 ;											int32_t iWidth,
@@ -828,15 +828,15 @@
 	push edi
 	push ebx
 	push ebp
-	
+
 	mov esi, [esp+20]
 	mov eax, [esp+24]
 	mov edi, [esp+28]
 	mov edx, [esp+32]
 	mov ebx, [esp+36]
-	mov ecx, [esp+40]	
-	shr ebx, 3	
-	
+	mov ecx, [esp+40]
+	shr ebx, 3
+
 .width_loop:
 	movdqa xmm0, [esi]
 	movdqa xmm1, [esi+eax]
@@ -846,12 +846,12 @@
 	lea esi, [esi+2*eax]
 	movdqa xmm4, [esi]
 	movdqa xmm5, [esi+eax]
-	
+
 	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	lea esi, [esi+2*eax]
 	movdqa xmm6, [esi]
-	
+
 	movdqa xmm0, xmm1
 	movdqa xmm1, xmm2
 	movdqa xmm2, xmm3
@@ -858,61 +858,61 @@
 	movdqa xmm3, xmm4
 	movdqa xmm4, xmm5
 	movdqa xmm5, xmm6
-	
+
 	add edi, edx
-	sub esi, eax		
-	
+	sub esi, eax
+
 .start:
 	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm6, [esi]
 	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm7, [esi+eax]
 	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm0, [esi]
 	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm1, [esi+eax]
 	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm2, [esi]
 	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm3, [esi+eax]
 	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqa xmm4, [esi]
 	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqa xmm5, [esi+eax]
 	jmp near .start
-	
+
 .x_loop_dec:
 	dec ebx
 	jz near .exit
@@ -922,9 +922,9 @@
 	add esi, 16
 	add edi, 8
 	jmp .width_loop
-	
-	
-	
+
+
+
 .exit:
 	pop ebp
 	pop ebx
@@ -934,8 +934,8 @@
 
 ;***********************************************************************
 ;void McHorVer22VerLastUnAlign_sse2(
-;											uint8_t *pTap, 
-;											int32_t iTapStride, 
+;											uint8_t *pTap,
+;											int32_t iTapStride,
 ;											uint8_t * pDst,
 ;											int32_t iDstStride,
 ;											int32_t iWidth,
@@ -947,15 +947,15 @@
 	push edi
 	push ebx
 	push ebp
-	
+
 	mov esi, [esp+20]
 	mov eax, [esp+24]
 	mov edi, [esp+28]
 	mov edx, [esp+32]
 	mov ebx, [esp+36]
-	mov ecx, [esp+40]	
-	shr ebx, 3	
-	
+	mov ecx, [esp+40]
+	shr ebx, 3
+
 .width_loop:
 	movdqu xmm0, [esi]
 	movdqu xmm1, [esi+eax]
@@ -965,12 +965,12 @@
 	lea esi, [esi+2*eax]
 	movdqu xmm4, [esi]
 	movdqu xmm5, [esi+eax]
-	
+
 	FILTER_VER xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	lea esi, [esi+2*eax]
 	movdqu xmm6, [esi]
-	
+
 	movdqa xmm0, xmm1
 	movdqa xmm1, xmm2
 	movdqa xmm2, xmm3
@@ -977,61 +977,61 @@
 	movdqa xmm3, xmm4
 	movdqa xmm4, xmm5
 	movdqa xmm5, xmm6
-	
+
 	add edi, edx
-	sub esi, eax		
-	
+	sub esi, eax
+
 .start:
 	FILTER_VER xmm0,xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqu xmm6, [esi]
 	FILTER_VER xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqu xmm7, [esi+eax]
 	FILTER_VER  xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, [edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqu xmm0, [esi]
 	FILTER_VER  xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqu xmm1, [esi+eax]
 	FILTER_VER  xmm4, xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqu xmm2, [esi]
 	FILTER_VER  xmm5, xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,[edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqu xmm3, [esi+eax]
 	FILTER_VER  xmm6, xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,[edi]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea esi, [esi+2*eax]
 	movdqu xmm4, [esi]
 	FILTER_VER  xmm7, xmm0, xmm1, xmm2, xmm3,xmm4,xmm5,xmm6, [edi+edx]
 	dec ecx
 	jz near .x_loop_dec
-	
+
 	lea edi, [edi+2*edx]
 	movdqu xmm5, [esi+eax]
 	jmp near .start
-	
+
 .x_loop_dec:
 	dec ebx
 	jz near .exit
@@ -1041,9 +1041,9 @@
 	add esi, 16
 	add edi, 8
 	jmp .width_loop
-	
-	
-	
+
+
+
 .exit:
 	pop ebp
 	pop ebx
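The luma half-pel kernels above (McHorVer20* horizontal, McHorVer02* vertical, McHorVer22* separable in both directions) all apply the standard H.264 6-tap filter (1, -5, 20, 20, -5, 1); h264_w0x10_1 holds the rounding constant 16 added before the shift by 5. A scalar sketch of the horizontal case, not part of this patch (hypothetical names):

    #include <stdint.h>

    static uint8_t Clip255(int32_t v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    static void McHorVer20ScalarRef(const uint8_t *pSrc, int32_t iSrcStride,
                                    uint8_t *pDst, int32_t iDstStride,
                                    int32_t iWidth, int32_t iHeight) {
      for (int32_t y = 0; y < iHeight; y++) {
        for (int32_t x = 0; x < iWidth; x++) {
          /* 6-tap filter (1,-5,20,20,-5,1) grouped as a - 5b + 20c */
          const int32_t a = pSrc[x - 2] + pSrc[x + 3];
          const int32_t b = pSrc[x - 1] + pSrc[x + 2];
          const int32_t c = pSrc[x]     + pSrc[x + 1];
          pDst[x] = Clip255((a - 5 * b + 20 * c + 16) >> 5);
        }
        pSrc += iSrcStride;
        pDst += iDstStride;
      }
    }

The grouped form is exactly what the psllw/psubw/paddw sequences compute: 4c - b, then a + 4c - b, then a + 20c - 5b after one more shift-and-add, avoiding multiplies entirely.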
--- a/codec/encoder/core/asm/memzero.asm
+++ b/codec/encoder/core/asm/memzero.asm
@@ -32,8 +32,8 @@
 ;*  memzero.asm
 ;*
 ;*  Abstract
-;*      
 ;*
+;*
 ;*  History
 ;*      9/16/2009 Created
 ;*
@@ -47,8 +47,8 @@
 ; Code
 ;***********************************************************************
 
-SECTION .text			
-		
+SECTION .text
+
 ALIGN 16
 ;***********************************************************************
 ;_inline void __cdecl WelsPrefetchZero_mmx(int8_t const*_A);
@@ -57,7 +57,7 @@
 WelsPrefetchZero_mmx:
 	mov  eax,[esp+4]
 	prefetchnta [eax]
-	ret 			
+	ret
 
 
 ALIGN 16
@@ -69,7 +69,7 @@
 		mov		eax,	[esp + 4]          ; dst
 		mov		ecx,	[esp + 8]
 		neg		ecx
-			
+
 		pxor	xmm0,		xmm0
 .memzeroa64_sse2_loops:
 		movdqa	[eax],		xmm0
@@ -77,12 +77,12 @@
 		movdqa	[eax+32],	xmm0
 		movdqa	[eax+48],	xmm0
 		add		eax, 0x40
-		
+
 		add ecx, 0x40
 		jnz near .memzeroa64_sse2_loops
-			
-		ret	
 
+		ret
+
 ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
@@ -92,7 +92,7 @@
 		mov		eax,	[esp + 4]          ; dst
 		mov		ecx,	[esp + 8]
 		neg		ecx
-			
+
 		pxor	mm0,		mm0
 .memzero64_mmx_loops:
 		movq	[eax],		mm0
@@ -102,16 +102,16 @@
 		movq	[eax+32],	mm0
 		movq	[eax+40],	mm0
 		movq	[eax+48],	mm0
-		movq	[eax+56],	mm0		
+		movq	[eax+56],	mm0
 		add		eax,		0x40
-		
+
 		add ecx, 0x40
 		jnz near .memzero64_mmx_loops
-			
-		WELSEMMS	
-		ret	
-	
-ALIGN 16		
+
+		WELSEMMS
+		ret
+
+ALIGN 16
 ;***********************************************************************
 ;   void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
 ;***********************************************************************
@@ -119,17 +119,17 @@
 WelsSetMemZeroSize8_mmx:
 		mov		eax,	[esp + 4]		; dst
 		mov		ecx,	[esp + 8]		; size
-		neg		ecx			
+		neg		ecx
 		pxor	mm0,		mm0
-		
+
 .memzero8_mmx_loops:
 		movq	[eax],		mm0
 		add		eax,		0x08
-	
+
 		add		ecx,		0x08
 		jnz near .memzero8_mmx_loops
-		
-		WELSEMMS	
-		ret	
 
-							
+		WELSEMMS
+		ret
+
+
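The memzero routines above simply unroll wide stores: 64 bytes per iteration via four aligned movdqa (SSE2) or eight movq (MMX). An intrinsics sketch of WelsSetMemZeroSize64_sse2, not part of this patch (it assumes, as the aligned stores in the assembly do, that dst is 16-byte aligned and size is a multiple of 64):

    #include <stdint.h>
    #include <emmintrin.h> /* SSE2 intrinsics */

    static void SetMemZeroSize64Ref(void *pDst, int32_t iSize) {
      uint8_t *p = (uint8_t *)pDst;
      const __m128i kZero = _mm_setzero_si128();
      for (int32_t i = 0; i < iSize; i += 64) {
        /* four 16-byte aligned stores per 64-byte chunk */
        _mm_store_si128((__m128i *)(p + i),      kZero);
        _mm_store_si128((__m128i *)(p + i + 16), kZero);
        _mm_store_si128((__m128i *)(p + i + 32), kZero);
        _mm_store_si128((__m128i *)(p + i + 48), kZero);
      }
    }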
--- a/codec/encoder/core/asm/quant.asm
+++ b/codec/encoder/core/asm/quant.asm
@@ -44,17 +44,17 @@
 
 BITS 32
 
-SECTION .text	
+SECTION .text
 ;************************************************
-;NEW_QUANT 
+;NEW_QUANT
 ;************************************************
 
 %macro SSE2_Quant8  5
 		MOVDQ	%1, %5
-		pxor	%2, %2							
-		pcmpgtw	%2, %1							
-		pxor	%1, %2							
-		psubw	%1, %2							
+		pxor	%2, %2
+		pcmpgtw	%2, %1
+		pxor	%1, %2
+		psubw	%1, %2
 		paddusw	%1, %3
 		pmulhuw	%1, %4
 		pxor	%1, %2
@@ -64,10 +64,10 @@
 
 %macro SSE2_QuantMax8  6
 		MOVDQ	%1, %5
-		pxor	%2, %2							
-		pcmpgtw	%2, %1							
-		pxor	%1, %2							
-		psubw	%1, %2								
+		pxor	%2, %2
+		pcmpgtw	%2, %1
+		pxor	%1, %2
+		psubw	%1, %2
 		paddusw	%1, %3
 		pmulhuw	%1, %4
 		pmaxsw	%6, %1
@@ -86,17 +86,17 @@
 WELS_EXTERN WelsQuant4x4_sse2
 align 16
 WelsQuant4x4_sse2:
-		mov		eax,  [ff]		
-		mov		ecx,  [mf]			
+		mov		eax,  [ff]
+		mov		ecx,  [mf]
 		MOVDQ	xmm2, [eax]
 		MOVDQ	xmm3, [ecx]
-		
+
 		mov		edx,  [pDct]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]	
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
 
 		ret
-	
+
 ;***********************************************************************
 ;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
 ;***********************************************************************
@@ -104,17 +104,17 @@
 align 16
 WelsQuant4x4Dc_sse2:
 		mov		ax,		[mf]
-		SSE2_Copy8Times xmm3, eax						
-		
+		SSE2_Copy8Times xmm3, eax
+
 		mov		cx, [ff]
-		SSE2_Copy8Times xmm2, ecx						
+		SSE2_Copy8Times xmm2, ecx
 
 		mov		edx,  [pDct]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
-				
-		ret		
-		
+
+		ret
+
 ;***********************************************************************
 ;	void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff,  int16_t *mf);
 ;***********************************************************************
@@ -121,20 +121,20 @@
 WELS_EXTERN WelsQuantFour4x4_sse2
 align 16
 WelsQuantFour4x4_sse2:
-		mov		eax,  [ff]		
-		mov		ecx,  [mf]			
+		mov		eax,  [ff]
+		mov		ecx,  [mf]
 		MOVDQ	xmm2, [eax]
 		MOVDQ	xmm3, [ecx]
-		
-		mov		edx,  [pDct]	
+
+		mov		edx,  [pDct]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]	
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x20]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x30]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x40]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x50]
 		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x60]
-		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x70]	
+		SSE2_Quant8	xmm0, xmm1, xmm2, xmm3, [edx + 0x70]
 
 		ret
 
@@ -144,17 +144,17 @@
 WELS_EXTERN WelsQuantFour4x4Max_sse2
 align 16
 WelsQuantFour4x4Max_sse2:
-		mov		eax,  [ff]		
-		mov		ecx,  [mf]			
+		mov		eax,  [ff]
+		mov		ecx,  [mf]
 		MOVDQ	xmm2, [eax]
 		MOVDQ	xmm3, [ecx]
-		
-		mov		edx,  [pDct]		
+
+		mov		edx,  [pDct]
 		pxor	xmm4, xmm4
 		pxor	xmm5, xmm5
 		pxor	xmm6, xmm6
 		pxor	xmm7, xmm7
-		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx	   ], xmm4		
+		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx	   ], xmm4
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x10], xmm4
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x20], xmm5
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x30], xmm5
@@ -162,20 +162,20 @@
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x50], xmm6
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x60], xmm7
 		SSE2_QuantMax8	xmm0, xmm1, xmm2, xmm3, [edx + 0x70], xmm7
-		
+
 		SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
-		pmaxsw  xmm0,  xmm4	
+		pmaxsw  xmm0,  xmm4
 		pmaxsw  xmm0,  xmm5
-		pmaxsw  xmm0,  xmm7			
+		pmaxsw  xmm0,  xmm7
 		movdqa	xmm1,  xmm0
 		punpckhqdq	xmm0, xmm1
 		pmaxsw	xmm0, xmm1
 
-		mov		edx,  [max]	
-		movq	[edx], xmm0	
-			
-		ret		
+		mov		edx,  [max]
+		movq	[edx], xmm0
 
+		ret
+
 %macro  MMX_Copy4Times 2
 		movd		%1, %2
 		punpcklwd	%1, %1
@@ -185,10 +185,10 @@
 SECTION .text
 
 %macro MMX_Quant4  4
-		pxor	%2, %2							
-		pcmpgtw	%2, %1							
-		pxor	%1, %2							
-		psubw	%1, %2							
+		pxor	%2, %2
+		pcmpgtw	%2, %1
+		pxor	%1, %2
+		psubw	%1, %2
 		paddusw	%1, %3
 		pmulhuw	%1, %4
 		pxor	%1, %2
@@ -211,13 +211,13 @@
 		movd		mm3,			[eax + 0x40]
 		movd		mm1,			[eax + 0x60]
 		punpcklwd	mm3,			mm1
-		
+
 		mov			cx,				0
 		mov			[eax],			cx
 		mov			[eax + 0x20],	cx
 		mov			[eax + 0x40],	cx
 		mov			[eax + 0x60],	cx
-		
+
 		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
 		movq		mm5,			mm3
 		paddw		mm3,			mm0
@@ -229,22 +229,22 @@
 		paddw		mm1,			mm3
 		psubw		mm3,			mm5
 		punpcklwd	mm1,			mm3
-		
+
 		;quant_2x2_dc
 		mov			ax,				[mf]
-		MMX_Copy4Times	mm3,		eax		
+		MMX_Copy4Times	mm3,		eax
 		mov			cx,				[ff]
 		MMX_Copy4Times	mm2,		ecx
 		MMX_Quant4		mm1,	mm0,	mm2,	mm3
-		
+
 		; store dct_2x2
-		mov			edx,			[dct2x2]	
+		mov			edx,			[dct2x2]
 		movq		[edx],			mm1
 		mov			ecx,			[iChromaDc]
 		movq		[ecx],			mm1
-		
+
 		; pNonZeroCount of dct_2x2
-		pcmpeqb		mm2,			mm2		; mm2 = FF 
+		pcmpeqb		mm2,			mm2		; mm2 = FF
 		pxor		mm3,			mm3
 		packsswb	mm1,			mm3
 		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
@@ -251,10 +251,10 @@
 		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
 		psadbw		mm1,			mm3		;
 		movd		eax,			mm1
-					
+
 		WELSEMMS
 		ret
-	
+
 ;***********************************************************************
 ;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff,  int16_t mf);
 ;***********************************************************************
@@ -269,7 +269,7 @@
 		movd		mm3,			[eax + 0x40]
 		movd		mm1,			[eax + 0x60]
 		punpcklwd	mm3,			mm1
-		
+
 		;hdm_2x2,	mm0 = dct0 dct1, mm3 = dct2 dct3
 		movq		mm5,			mm3
 		paddw		mm3,			mm0
@@ -281,16 +281,16 @@
 		paddw		mm1,			mm3
 		psubw		mm3,			mm5
 		punpcklwd	mm1,			mm3
-		
+
 		;quant_2x2_dc
 		mov			ax,				[mf]
-		MMX_Copy4Times	mm3,		eax		
+		MMX_Copy4Times	mm3,		eax
 		mov			cx,				[ff]
 		MMX_Copy4Times	mm2,		ecx
 		MMX_Quant4		mm1,	mm0,	mm2,	mm3
-		
+
 		; pNonZeroCount of dct_2x2
-		pcmpeqb		mm2,			mm2		; mm2 = FF 
+		pcmpeqb		mm2,			mm2		; mm2 = FF
 		pxor		mm3,			mm3
 		packsswb	mm1,			mm3
 		pcmpeqb		mm1,			mm3		; set FF if equal, 0 if not equal
@@ -297,16 +297,16 @@
 		psubsb		mm1,			mm2		; set 0 if equal, 1 if not equal
 		psadbw		mm1,			mm3		;
 		movd		eax,			mm1
-			
-		WELSEMMS		
-		ret	
-		
-		
-%macro SSE2_DeQuant8 3  
+
+		WELSEMMS
+		ret
+
+
+%macro SSE2_DeQuant8 3
     MOVDQ  %2, %1
     pmullw %2, %3
     MOVDQ  %1, %2
-%endmacro 
+%endmacro
 
 
 ALIGN  16
@@ -329,7 +329,7 @@
 ;***********************************************************************====
 ;void WelsDequantFour4x4_sse2(int16_t *pDct, const uint16_t* mf);
 ;***********************************************************************====
-    
+
 align 16
 
 WELS_EXTERN WelsDequantFour4x4_sse2
@@ -356,15 +356,15 @@
 WELS_EXTERN WelsDequantIHadamard4x4_sse2
 align 16
 WelsDequantIHadamard4x4_sse2:
-		mov			eax,			[esp + 4]				
+		mov			eax,			[esp + 4]
 		mov			cx,				[esp + 8]
-		
+
 		; WelsDequantLumaDc4x4
-		SSE2_Copy8Times	xmm1,		ecx		
+		SSE2_Copy8Times	xmm1,		ecx
 		;psrlw		xmm1,		2		; for the (>>2) in ihdm
 		MOVDQ		xmm0,		[eax]
 		MOVDQ		xmm2,		[eax+0x10]
-		pmullw		xmm0,		xmm1		
+		pmullw		xmm0,		xmm1
 		pmullw		xmm2,		xmm1
 
 		; ihdm_4x4
@@ -371,24 +371,23 @@
 		movdqa		xmm1,		xmm0
 		psrldq		xmm1,		8
 		movdqa		xmm3,		xmm2
-		psrldq		xmm3,		8		
-		
-		SSE2_SumSub		xmm0, xmm3,	xmm5					; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3	
-		SSE2_SumSub		xmm1, xmm2, xmm5					; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2														
+		psrldq		xmm3,		8
+
+		SSE2_SumSub		xmm0, xmm3,	xmm5					; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
+		SSE2_SumSub		xmm1, xmm2, xmm5					; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
 		SSE2_SumSub		xmm3, xmm2, xmm5					; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
 		SSE2_SumSub		xmm0, xmm1, xmm5               		; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1
 
-		SSE2_TransTwo4x4W	xmm2, xmm1, xmm3, xmm0, xmm4		
-		SSE2_SumSub		xmm2, xmm4,	xmm5		
-		SSE2_SumSub		xmm1, xmm0, xmm5																		
-		SSE2_SumSub		xmm4, xmm0, xmm5							
-		SSE2_SumSub		xmm2, xmm1, xmm5 
+		SSE2_TransTwo4x4W	xmm2, xmm1, xmm3, xmm0, xmm4
+		SSE2_SumSub		xmm2, xmm4,	xmm5
+		SSE2_SumSub		xmm1, xmm0, xmm5
+		SSE2_SumSub		xmm4, xmm0, xmm5
+		SSE2_SumSub		xmm2, xmm1, xmm5
 		SSE2_TransTwo4x4W	xmm0, xmm1, xmm4, xmm2, xmm3
-		
+
 		punpcklqdq	xmm0,		xmm1
 		MOVDQ		[eax],		xmm0
-		
+
 		punpcklqdq	xmm2,		xmm3
-		MOVDQ		[eax+16],	xmm2			
+		MOVDQ		[eax+16],	xmm2
 		ret
-	
\ No newline at end of file
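The SSE2_Quant8 macro patched above quantizes eight coefficients at once with a sign-magnitude trick: pcmpgtw/pxor/psubw fold out the sign, paddusw adds the rounding offset ff with saturation, pmulhuw keeps the high 16 bits of the multiply by mf, and a final pxor/psubw restores the sign. A scalar sketch, not part of this patch (hypothetical name; the explicit clamp mirrors the saturating paddusw):

    #include <stdint.h>

    static void Quant8ScalarRef(int16_t *pDct, const uint16_t *pFF,
                                const uint16_t *pMF, int32_t iCount) {
      for (int32_t i = 0; i < iCount; i++) {
        const int32_t iSign = pDct[i] < 0 ? -1 : 1;
        uint32_t uiSum = (uint32_t)(iSign * pDct[i]) + pFF[i]; /* |x| + ff */
        if (uiSum > 65535) uiSum = 65535;                      /* paddusw saturates */
        /* pmulhuw: high 16 bits of the 16x16 unsigned product */
        pDct[i] = (int16_t)(iSign * (int32_t)((uiSum * pMF[i]) >> 16));
      }
    }

WelsQuantFour4x4Max_sse2 additionally folds the quantized magnitudes together with pmaxsw and writes the per-4x4-block maxima to its max argument, which lets the caller detect all-zero blocks cheaply.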
--- a/codec/encoder/core/asm/satd_sad.asm
+++ b/codec/encoder/core/asm/satd_sad.asm
@@ -37,7 +37,7 @@
 ;*      WelsSampleSatd16x8_sse2
 ;*      WelsSampleSatd8x16_sse2
 ;*      WelsSampleSatd16x16_sse2
-;*      
+;*
 ;*      WelsSampleSad16x8_sse2
 ;*      WelsSampleSad16x16_sse2
 ;*
@@ -99,12 +99,12 @@
 
 %macro SSE2_HDMTwo4x4 5 ;in: xmm1,xmm2,xmm3,xmm4  pOut: xmm4,xmm2,xmm1,xmm3
    SSE2_SumSub %1, %2, %5
-   SSE2_SumSub %3, %4, %5 
-   SSE2_SumSub %2, %4, %5 
-   SSE2_SumSub %1, %3, %5 
-%endmacro 
+   SSE2_SumSub %3, %4, %5
+   SSE2_SumSub %2, %4, %5
+   SSE2_SumSub %1, %3, %5
+%endmacro
 
-%macro SSE2_SumAbs4 7  
+%macro SSE2_SumAbs4 7
 	WELS_AbsW %1, %3
 	WELS_AbsW %2, %3
 	WELS_AbsW %4, %6
@@ -113,13 +113,13 @@
 	paddusw       %4, %5
 	paddusw       %7, %1
 	paddusw       %7, %4
-%endmacro 
+%endmacro
 
 %macro  SSE2_SumWHorizon 3
 	movhlps		%2, %1			; x2 = xx xx xx xx d7 d6 d5 d4
 	paddw		%1, %2			; x1 = xx xx xx xx d37 d26 d15 d04
-	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04 
-	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26 
+	punpcklwd	%1, %3			; x1 =  d37  d26 d15 d04
+	movhlps		%2, %1			; x2 = xxxx xxxx d37 d26
 	paddd		%1, %2			; x1 = xxxx xxxx d1357 d0246
 	pshuflw		%2, %1, 0x4e	; x2 = xxxx xxxx d0246 d1357
 	paddd		%1, %2			; x1 = xxxx xxxx xxxx  d01234567
@@ -132,12 +132,12 @@
 	lea                 ecx, [ecx+2*edx]
 	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[eax],[ecx]
 	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[eax+ebx],[ecx+edx]
-	
+
 	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
 	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5 
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
 	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
-	
+
 	lea					eax,    [eax+2*ebx]
     lea					ecx,    [ecx+2*edx]
 	SSE2_LoadDiff8P    xmm0,xmm4,xmm7,[eax],[ecx]
@@ -146,11 +146,11 @@
 	lea                 ecx, [ecx+2*edx]
 	SSE2_LoadDiff8P    xmm2,xmm4,xmm7,[eax],[ecx]
 	SSE2_LoadDiff8P    xmm3,xmm5,xmm7,[eax+ebx],[ecx+edx]
-	
+
 	SSE2_HDMTwo4x4       xmm0,xmm1,xmm2,xmm3,xmm4
 	SSE2_TransTwo4x4W     xmm3,xmm1,xmm0,xmm2,xmm4
-	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5 
-	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6	
+	SSE2_HDMTwo4x4       xmm3,xmm1,xmm2,xmm4,xmm5
+	SSE2_SumAbs4         xmm4,xmm1,xmm0,xmm2,xmm3,xmm5,xmm6
 %endmacro
 
 ;***********************************************************************
@@ -165,8 +165,8 @@
 	mov       eax,  [esp+8]
 	mov       ebx,  [esp+12]
 	mov       ecx,  [esp+16]
-	mov       edx,  [esp+20]    
-	
+	mov       edx,  [esp+20]
+
     movd      xmm0, [eax]
     movd      xmm1, [eax+ebx]
     lea       eax , [eax+2*ebx]
@@ -174,7 +174,7 @@
     movd      xmm3, [eax+ebx]
     punpckldq xmm0, xmm2
     punpckldq xmm1, xmm3
-   
+
     movd      xmm4, [ecx]
     movd      xmm5, [ecx+edx]
     lea       ecx , [ecx+2*edx]
@@ -188,7 +188,7 @@
     punpcklbw xmm1, xmm6
     punpcklbw xmm4, xmm6
     punpcklbw xmm5, xmm6
-    
+
     psubw     xmm0, xmm4
     psubw     xmm1, xmm5
 
@@ -196,7 +196,7 @@
     paddw     xmm0, xmm1
     psubw     xmm2, xmm1
     SSE2_XSawp qdq, xmm0, xmm2, xmm3
-    
+
     movdqa     xmm4, xmm0
     paddw      xmm0, xmm3
     psubw      xmm4, xmm3
@@ -204,7 +204,7 @@
     movdqa         xmm2, xmm0
     punpcklwd      xmm0, xmm4
     punpckhwd      xmm4, xmm2
-    
+
 	SSE2_XSawp     dq,  xmm0, xmm4, xmm3
 	SSE2_XSawp     qdq, xmm0, xmm3, xmm5
 
@@ -211,16 +211,16 @@
     movdqa         xmm7, xmm0
     paddw          xmm0, xmm5
     psubw          xmm7, xmm5
-    
+
 	SSE2_XSawp     qdq,  xmm0, xmm7, xmm1
 
     movdqa         xmm2, xmm0
     paddw          xmm0, xmm1
     psubw          xmm2, xmm1
-    
-    WELS_AbsW  xmm0, xmm3   
+
+    WELS_AbsW  xmm0, xmm3
     paddusw        xmm6, xmm0
-	WELS_AbsW  xmm2, xmm4   
+	WELS_AbsW  xmm2, xmm4
     paddusw        xmm6, xmm2
     SSE2_SumWHorizon1  xmm6, xmm4
 	movd           eax,  xmm6
@@ -228,7 +228,7 @@
     shr            eax,  1
 	pop            ebx
 	ret
- 
+
  ;***********************************************************************
  ;
  ;int32_t WelsSampleSatd8x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
@@ -241,16 +241,16 @@
 	 mov    eax,    [esp+8]
 	 mov    ebx,    [esp+12]
 	 mov    ecx,    [esp+16]
-	 mov    edx,    [esp+20]    
+	 mov    edx,    [esp+20]
 	 pxor   xmm6,   xmm6
-     pxor   xmm7,   xmm7     
-     SSE2_GetSatd8x8	 
+     pxor   xmm7,   xmm7
+     SSE2_GetSatd8x8
      psrlw   xmm6,  1
 	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
 	 movd    eax,   xmm6
 	 pop     ebx
 	 ret
- 
+
  ;***********************************************************************
  ;
  ;int32_t WelsSampleSatd8x16_sse2( uint8_t *, int32_t, uint8_t *, int32_t, );
@@ -263,15 +263,15 @@
 	 mov    eax,    [esp+8]
 	 mov    ebx,    [esp+12]
 	 mov    ecx,    [esp+16]
-	 mov    edx,    [esp+20]    
+	 mov    edx,    [esp+20]
 	 pxor   xmm6,   xmm6
-     pxor   xmm7,   xmm7  
-        
-	 SSE2_GetSatd8x8	 
+     pxor   xmm7,   xmm7
+
+	 SSE2_GetSatd8x8
      lea    eax,    [eax+2*ebx]
-     lea    ecx,    [ecx+2*edx]     
-	 SSE2_GetSatd8x8	
-	  
+     lea    ecx,    [ecx+2*edx]
+	 SSE2_GetSatd8x8
+
 	 psrlw   xmm6,  1
 	 SSE2_SumWHorizon   xmm6,xmm4,xmm7
 	 movd    eax,   xmm6
@@ -290,15 +290,15 @@
 	mov    eax,    [esp+8]
 	mov    ebx,    [esp+12]
 	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]    
+	mov    edx,    [esp+20]
 	pxor   xmm6,   xmm6
     pxor   xmm7,   xmm7
-    
+
 	SSE2_GetSatd8x8
 	mov    eax,    [esp+8]
     mov    ecx,    [esp+16]
     add    eax,    8
-    add    ecx,    8    
+    add    ecx,    8
 	SSE2_GetSatd8x8
 
 	psrlw   xmm6,  1
@@ -319,25 +319,25 @@
 	mov    eax,    [esp+8]
 	mov    ebx,    [esp+12]
 	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]    
+	mov    edx,    [esp+20]
 	pxor   xmm6,   xmm6
     pxor   xmm7,   xmm7
-    
-	SSE2_GetSatd8x8		
+
+	SSE2_GetSatd8x8
 	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]	
+	lea    ecx,    [ecx+2*edx]
 	SSE2_GetSatd8x8
-	
+
 	mov    eax,    [esp+8]
 	mov    ecx,    [esp+16]
 	add    eax,    8
 	add    ecx,    8
-	
-	SSE2_GetSatd8x8	
+
+	SSE2_GetSatd8x8
 	lea    eax,    [eax+2*ebx]
-	lea    ecx,    [ecx+2*edx]	
+	lea    ecx,    [ecx+2*edx]
 	SSE2_GetSatd8x8
-	
+
  ; each column sum of SATD is necessarily even, so we don't lose any precision by shifting first.
     psrlw   xmm6,  1
 	SSE2_SumWHorizon   xmm6,xmm4,xmm7
@@ -353,18 +353,18 @@
 
 ;***********************************************************************
 ;
-;Pixel_satd_intra_sse2 BEGIN 
+;Pixel_satd_intra_sse2 BEGIN
 ;
 ;***********************************************************************
 
-%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 
+%macro SSE41_I16x16Get8WSumSub 3 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3
 	pmaddubsw    %1, xmm5
 	movdqa       %2, %1
 	pmaddwd      %1, xmm7
 	pmaddwd      %2, xmm6
 	movdqa       %3, %1
-	punpckldq    %1, %2 
-	punpckhdq    %2, %3 
+	punpckldq    %1, %2
+	punpckhdq    %2, %3
 	movdqa       %3, %1
 	punpcklqdq   %1, %2
 	punpckhqdq   %3, %2
@@ -373,14 +373,14 @@
 	packssdw     %1, %3
 	psllw        %1, 2
 %endmacro
-%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2  
+%macro SSE41_ChromaGet8WSumSub 4 ;xmm5 HSumSubDB1, xmm6 HSumSubDW1, xmm7 PDW1 : in %1, pOut %1, %3 : %4 tempsse2
 	pmaddubsw    %1, xmm5
 	movdqa       %2, %1
 	pmaddwd      %1, xmm7
 	pmaddwd      %2, xmm6
 	movdqa       %3, %1
-	punpckldq    %1, %2 
-	punpckhdq    %2, %3 
+	punpckldq    %1, %2
+	punpckhdq    %2, %3
 	movdqa       %3, %1
 	punpcklqdq   %1, %2
 	punpckhqdq   %3, %2
@@ -387,7 +387,7 @@
 ;    paddd        xmm4, %1 ;for dc
 ;	 paddd        xmm4, %3 ;for dc
 	movdqa       %4, %1
-	punpcklqdq   %4, %3 
+	punpcklqdq   %4, %3
 	packssdw     %1, %3
 	psllw        %1, 2
 %endmacro
@@ -415,25 +415,25 @@
 	pinsrw      xmm0,   word[esi+%2+8], 4
 	psubsw      xmm0,   xmm7
 	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0 
+	paddw       xmm4,   xmm0
 	pxor        xmm0,   xmm0
 	pinsrw      xmm0,   word[esi+%2+2],  0
 	pinsrw      xmm0,   word[esi+%2+10], 4
 	psubsw      xmm0,   xmm1
 	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0 
+	paddw       xmm4,   xmm0
 	pxor        xmm0,   xmm0
 	pinsrw      xmm0,   word[esi+%2+4],  0
 	pinsrw      xmm0,   word[esi+%2+12], 4
 	psubsw      xmm0,   xmm3
 	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0 
+	paddw       xmm4,   xmm0
 	pxor        xmm0,   xmm0
 	pinsrw      xmm0,   word[esi+%2+6],  0
 	pinsrw      xmm0,   word[esi+%2+14], 4
 	psubsw      xmm0,   xmm2
 	pabsw       xmm0,   xmm0
-	paddw       xmm4,   xmm0 
+	paddw       xmm4,   xmm0
 %endmacro
 %macro SSE41_GetX38x4SatdH  3
 	movq        xmm0,   [esi+%3+8*%1]
@@ -455,7 +455,7 @@
 	psubsw      xmm0,   xmm7
 	pabsw       xmm0,   xmm0
 	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2 
+	paddw       xmm6,   xmm2
 %endmacro
 %macro SSE41_ChromaGetX38x4SatdDC 1
 	shl         %1,     4
@@ -463,13 +463,13 @@
 	psubsw      xmm0,   xmm7
 	pabsw       xmm0,   xmm0
 	paddw       xmm6,   xmm0
-	paddw       xmm6,   xmm2 
+	paddw       xmm6,   xmm2
 %endmacro
 %macro SSE41_I16x16GetX38x4Satd 2
 	SSE41_GetX38x4SatdDec
 	SSE41_GetX38x4SatdV   %1, %2
 	SSE41_GetX38x4SatdH   %1, %2, 32
-	SSE41_I16X16GetX38x4SatdDC 
+	SSE41_I16X16GetX38x4SatdDC
 %endmacro
 %macro SSE41_ChromaGetX38x4Satd 2
 	SSE41_GetX38x4SatdDec
@@ -478,11 +478,11 @@
 	SSE41_ChromaGetX38x4SatdDC %1
 %endmacro
 %macro SSE41_HSum8W 3
-	pmaddwd     %1, %2 
-	movhlps     %3, %1 
-	paddd       %1, %3 
-	pshuflw     %3, %1,0Eh 
-	paddd       %1, %3 
+	pmaddwd     %1, %2
+	movhlps     %3, %1
+	paddd       %1, %3
+	pshuflw     %3, %1,0Eh
+	paddd       %1, %3
 %endmacro
 
 WELS_EXTERN WelsIntra16x16Combined3Satd_sse41
@@ -493,7 +493,7 @@
 	mov    ecx,    [esp+16]
 	mov    edx,    [esp+20]
 	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]    
+	mov    ebx,    [esp+28]
 	mov    esi,    [esp+40] ;temp_satd
 	pxor        xmm4,   xmm4
 	movdqa      xmm5,   [HSumSubDB1]
@@ -507,29 +507,29 @@
 	SSE41_I16x16Get8WSumSub xmm0, xmm2, xmm3
 	SSE41_I16x16Get8WSumSub xmm1, xmm2, xmm3
 	movdqa      [esi],  xmm0 ;V
-	movdqa      [esi+16], xmm1 
+	movdqa      [esi+16], xmm1
 	add         ecx,    edx
 	pinsrb      xmm0,   byte[ecx-1], 0
 	pinsrb      xmm0,   byte[ecx+edx-1], 1
-	lea         ecx,    [ecx+2*edx]  
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     2
 	pinsrb      xmm0,   byte[ecx+edx-1], 3
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     4
 	pinsrb      xmm0,   byte[ecx+edx-1], 5
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     6
 	pinsrb      xmm0,   byte[ecx+edx-1], 7
-	lea         ecx,    [ecx+2*edx]  
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     8
 	pinsrb      xmm0,   byte[ecx+edx-1], 9
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     10
 	pinsrb      xmm0,   byte[ecx+edx-1], 11
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     12
 	pinsrb      xmm0,   byte[ecx+edx-1], 13
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     14
 	pinsrb      xmm0,   byte[ecx+edx-1], 15
 	movhlps		xmm1,   xmm0
@@ -549,7 +549,7 @@
 	pxor        xmm6,   xmm6 ;DC
 	mov         ecx,    0
 	mov         edi,    0
-.loop16x16_get_satd:    
+.loop16x16_get_satd:
 .loopStart1:
 	SSE41_I16x16GetX38x4Satd ecx, edi
 	inc          ecx
@@ -562,8 +562,8 @@
 	mov         ecx, 0
 	add         edi, 16
 	jmp         .loop16x16_get_satd
- .loop16x16_get_satd_end:   
-	MMX_DW_1_2REG    xmm0, xmm1 
+ .loop16x16_get_satd_end:
+	MMX_DW_1_2REG    xmm0, xmm1
 	psrlw       xmm4, 1 ;/2
 	psrlw       xmm5, 1 ;/2
 	psrlw       xmm6, 1 ;/2
@@ -570,7 +570,7 @@
 	SSE41_HSum8W     xmm4, xmm0, xmm1
 	SSE41_HSum8W     xmm5, xmm0, xmm1
 	SSE41_HSum8W     xmm6, xmm0, xmm1
-	
+
 	; comparing order: DC H V
 	movd      ebx, xmm6 ;DC
 	movd      edi, xmm5 ;H
@@ -577,33 +577,33 @@
 	movd      ecx, xmm4 ;V
 	mov      edx, [esp+36]
 	shl       edx, 1
-	add       edi, edx 
-	add       ebx, edx 
+	add       edi, edx
+	add       ebx, edx
 	mov       edx, [esp+32]
 	cmp       ebx, edi
 	jge near   not_dc_16x16
 	cmp        ebx, ecx
 	jge near   not_dc_h_16x16
-	
+
 	; for DC mode
 	mov       dword[edx], 2;I16_PRED_DC
-	mov       eax, ebx 
+	mov       eax, ebx
 	jmp near return_satd_intra_16x16_x3
 not_dc_16x16:
-	; for H mode 
+	; for H mode
 	cmp       edi, ecx
 	jge near   not_dc_h_16x16
 	mov       dword[edx], 1;I16_PRED_H
-	mov       eax, edi 
+	mov       eax, edi
 	jmp near return_satd_intra_16x16_x3
 not_dc_h_16x16:
 	; for V mode
 	mov       dword[edx], 0;I16_PRED_V
 	mov       eax, ecx
-return_satd_intra_16x16_x3: 
+return_satd_intra_16x16_x3:
 	WELSEMMS
-	pop         edi 
-	pop         esi 
+	pop         edi
+	pop         esi
 	pop         ebx
 ret
 
@@ -619,13 +619,13 @@
 	add         ecx,    edx
 	pinsrb      xmm0,   byte[ecx-1], 0
 	pinsrb      xmm0,   byte[ecx+edx-1], 1
-	lea         ecx,    [ecx+2*edx]  
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     2
 	pinsrb      xmm0,   byte[ecx+edx-1], 3
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     4
 	pinsrb      xmm0,   byte[ecx+edx-1], 5
-	lea         ecx,    [ecx+2*edx] 
+	lea         ecx,    [ecx+2*edx]
 	pinsrb      xmm0,   byte[ecx-1],     6
 	pinsrb      xmm0,   byte[ecx+edx-1], 7
 	punpcklqdq  xmm0,   xmm0
@@ -634,10 +634,10 @@
 ;(sum+2)>>2
 	movdqa      xmm6,   [PDQ2]
 	movdqa      xmm5,   xmm4
-	punpckhqdq  xmm5,   xmm1    
+	punpckhqdq  xmm5,   xmm1
 	paddd       xmm5,   xmm6
 	psrld       xmm5,   2
-;(sum1+sum2+4)>>3   
+;(sum1+sum2+4)>>3
 	paddd       xmm6,   xmm6
 	paddd       xmm4,   xmm1
 	paddd       xmm4,   xmm6
@@ -644,8 +644,8 @@
 	psrld       xmm4,   3
 ;satd *16
 	pslld       xmm5,   4
-	pslld       xmm4,   4    
-;temp satd    
+	pslld       xmm4,   4
+;temp satd
 	movdqa      xmm6,   xmm4
 	punpcklqdq  xmm4,   xmm5
 	psllq       xmm4,   32
@@ -655,12 +655,12 @@
 	psllq       xmm5,   32
 	psrlq       xmm5,   32
 	movdqa      [esi+48], xmm5
-	
+
 	pxor        xmm4,   xmm4 ;V
 	pxor        xmm5,   xmm5 ;H
 	pxor        xmm6,   xmm6 ;DC
 	mov         ecx,    0
-loop_chroma_satdx3_cb_cr:    
+loop_chroma_satdx3_cb_cr:
 	SSE41_ChromaGetX38x4Satd ecx, 0
 	inc             ecx
 	cmp             ecx, 2
@@ -668,13 +668,13 @@
 %endmacro
 
 %macro SSEReg2MMX 3
-	movdq2q     %2, %1 
-	movhlps     %1, %1 
-	movdq2q     %3, %1 
+	movdq2q     %2, %1
+	movhlps     %1, %1
+	movdq2q     %3, %1
 %endmacro
 %macro MMXReg2SSE 4
-	movq2dq     %1, %3 
-	movq2dq     %2, %4 
+	movq2dq     %1, %3
+	movq2dq     %2, %4
 	punpcklqdq  %1, %2
 %endmacro
 ;for reduce the code size of WelsIntraChroma8x8Combined3Satd_sse41
@@ -687,10 +687,10 @@
 	mov    ecx,    [esp+16]
 	mov    edx,    [esp+20]
 	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]    
+	mov    ebx,    [esp+28]
 	mov    esi,    [esp+40] ;temp_satd
 	xor    edi,    edi
-loop_chroma_satdx3: 
+loop_chroma_satdx3:
 	SSE41_ChromaGetX38x8Satd
 	cmp             edi, 1
 	je              loop_chroma_satdx3end
@@ -701,16 +701,16 @@
 	mov         ecx,  [esp+44]
 	mov         eax,  [esp+48]
 	jmp         loop_chroma_satdx3
-loop_chroma_satdx3end:    
+loop_chroma_satdx3end:
 	MMXReg2SSE  xmm0, xmm3, mm0, mm1
 	MMXReg2SSE  xmm1, xmm3, mm2, mm3
 	MMXReg2SSE  xmm2, xmm3, mm5, mm6
-	
+
 	paddw       xmm4, xmm0
 	paddw       xmm5, xmm1
 	paddw       xmm6, xmm2
-	
-	MMX_DW_1_2REG    xmm0, xmm1 
+
+	MMX_DW_1_2REG    xmm0, xmm1
 	psrlw       xmm4, 1 ;/2
 	psrlw       xmm5, 1 ;/2
 	psrlw       xmm6, 1 ;/2
@@ -730,57 +730,57 @@
 	jge near   not_dc_8x8
 	cmp        ebx, ecx
 	jge near   not_dc_h_8x8
-	
+
 	; for DC mode
 	mov       dword[edx], 0;I8_PRED_DC
-	mov       eax, ebx 
+	mov       eax, ebx
 	jmp near return_satd_intra_8x8_x3
 not_dc_8x8:
-	; for H mode 
+	; for H mode
 	cmp       edi, ecx
 	jge near   not_dc_h_8x8
 	mov       dword[edx], 1;I8_PRED_H
-	mov       eax, edi 
+	mov       eax, edi
 	jmp near return_satd_intra_8x8_x3
 not_dc_h_8x8:
 	; for V mode
 	mov       dword[edx], 2;I8_PRED_V
 	mov       eax, ecx
-return_satd_intra_8x8_x3: 
+return_satd_intra_8x8_x3:
 	WELSEMMS
-	pop         edi 
-	pop         esi 
+	pop         edi
+	pop         esi
 	pop         ebx
 ret
 
-	
+
 ;***********************************************************************
 ;
-;Pixel_satd_intra_sse2 END 
+;Pixel_satd_intra_sse2 END
 ;
 ;***********************************************************************
 %macro SSSE3_Get16BSadHVDC 2
-  movd        xmm6,%1 
-  pshufb      xmm6,xmm1 
+  movd        xmm6,%1
+  pshufb      xmm6,xmm1
   movdqa      %1,  xmm6
-  movdqa      xmm0,%2 
-  psadbw      xmm0,xmm7 
-  paddw       xmm4,xmm0 
   movdqa      xmm0,%2
-  psadbw      xmm0,xmm5 
-  paddw       xmm2,xmm0 
+  psadbw      xmm0,xmm7
+  paddw       xmm4,xmm0
+  movdqa      xmm0,%2
+  psadbw      xmm0,xmm5
+  paddw       xmm2,xmm0
   psadbw      xmm6,%2
-  paddw       xmm3,xmm6 
+  paddw       xmm3,xmm6
 %endmacro
 %macro WelsAddDCValue 4
     movzx   %2, byte %1
-    mov    %3, %2 
+    mov    %3, %2
     add     %4, %2
-%endmacro   
+%endmacro
 
 ;***********************************************************************
 ;
-;Pixel_sad_intra_ssse3 BEGIN 
+;Pixel_sad_intra_ssse3 BEGIN
 ;
 ;***********************************************************************
 WELS_EXTERN WelsIntra16x16Combined3Sad_ssse3
@@ -792,14 +792,14 @@
 	mov    edx,    [esp+20]
 	mov    edi,    [esp+40] ;temp_sad
 	sub    ecx,    edx
-    movdqa      xmm5,[ecx] 
+    movdqa      xmm5,[ecx]
     pxor        xmm0,xmm0
-    psadbw      xmm0,xmm5 
-    movhlps     xmm1,xmm0 
-    paddw       xmm0,xmm1 
+    psadbw      xmm0,xmm5
+    movhlps     xmm1,xmm0
+    paddw       xmm0,xmm1
     movd        eax,xmm0
-     
-    add         ecx,edx 
+
+    add         ecx,edx
     lea         ebx, [edx+2*edx]
     WelsAddDCValue [ecx-1      ], esi, [edi   ], eax
     WelsAddDCValue [ecx-1+edx  ], esi, [edi+16], eax
@@ -824,45 +824,45 @@
     WelsAddDCValue [ecx-1+edx*2], esi, [edi+32], eax
     WelsAddDCValue [ecx-1+ebx  ], esi, [edi+48], eax
     sub        edi, 192
-    add         eax,10h 
-    shr         eax,5 
-    movd        xmm7,eax 
+    add         eax,10h
+    shr         eax,5
+    movd        xmm7,eax
     pxor        xmm1,xmm1
     pshufb      xmm7,xmm1
-    pxor        xmm4,xmm4 
-    pxor        xmm3,xmm3 
-    pxor        xmm2,xmm2 
-;sad begin  
+    pxor        xmm4,xmm4
+    pxor        xmm3,xmm3
+    pxor        xmm2,xmm2
+;sad begin
 	mov    eax,    [esp+24]
-	mov    ebx,    [esp+28]    
+	mov    ebx,    [esp+28]
     lea         esi, [ebx+2*ebx]
     SSSE3_Get16BSadHVDC [edi], [eax]
     SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
     SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
     SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64  
+    add         edi, 64
     lea         eax, [eax+4*ebx]
     SSSE3_Get16BSadHVDC [edi], [eax]
     SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
     SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
     SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64  
+    add         edi, 64
     lea         eax, [eax+4*ebx]
     SSSE3_Get16BSadHVDC [edi], [eax]
     SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
     SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
     SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    add         edi, 64  
+    add         edi, 64
     lea         eax, [eax+4*ebx]
     SSSE3_Get16BSadHVDC [edi], [eax]
     SSSE3_Get16BSadHVDC [edi+16], [eax+ebx]
     SSSE3_Get16BSadHVDC [edi+32], [eax+2*ebx]
     SSSE3_Get16BSadHVDC [edi+48], [eax+esi]
-    
-    pslldq      xmm3,4 
-    por         xmm3,xmm2 
-    movhlps     xmm1,xmm3 
-    paddw       xmm3,xmm1 
+
+    pslldq      xmm3,4
+    por         xmm3,xmm2
+    movhlps     xmm1,xmm3
+    paddw       xmm3,xmm1
     movhlps     xmm0,xmm4
     paddw       xmm4,xmm0
 ; comparing order: DC H V
@@ -872,8 +872,8 @@
 	movd        esi, xmm3 ;H
 	mov         eax, [esp+36] ;lamda
 	shl         eax, 1
-	add         esi, eax 
-	add         ebx, eax 
+	add         esi, eax
+	add         ebx, eax
 	mov         edx, [esp+32]
 	cmp         ebx, esi
 	jge near   not_dc_16x16_sad
@@ -881,7 +881,7 @@
 	jge near   not_dc_h_16x16_sad
 	; for DC mode
 	mov       dword[edx], 2;I16_PRED_DC
-	mov       eax, ebx 
+	mov       eax, ebx
     sub        edi, 192
 %assign x 0
 %rep 16
@@ -890,11 +890,11 @@
 %endrep
 	jmp near return_sad_intra_16x16_x3
 not_dc_16x16_sad:
-	; for H mode 
+	; for H mode
 	cmp       esi, ecx
 	jge near   not_dc_h_16x16_sad
 	mov       dword[edx], 1;I16_PRED_H
-	mov       eax, esi 
+	mov       eax, esi
 	jmp near return_sad_intra_16x16_x3
 not_dc_h_16x16_sad:
 	; for V mode
@@ -914,12 +914,12 @@
 
 ;***********************************************************************
 ;
-;Pixel_sad_intra_ssse3 END 
+;Pixel_sad_intra_ssse3 END
 ;
 ;***********************************************************************
 ;***********************************************************************
 ;
-;Pixel_satd_wxh_sse41 BEGIN 
+;Pixel_satd_wxh_sse41 BEGIN
 ;
 ;***********************************************************************
 
@@ -934,9 +934,9 @@
 	movq             xmm2, [ecx]
 	punpcklqdq       xmm2, xmm2
 	pmaddubsw        xmm2, xmm7
-	movq             xmm3, [ecx+edx]	
-	punpcklqdq       xmm3, xmm3	
-	pmaddubsw        xmm3, xmm7	
+	movq             xmm3, [ecx+edx]
+	punpcklqdq       xmm3, xmm3
+	pmaddubsw        xmm3, xmm7
 	psubsw           xmm0, xmm2
 	psubsw           xmm1, xmm3
 	movq             xmm2, [eax+2*ebx]
@@ -948,12 +948,12 @@
 	movq             xmm4, [ecx+2*edx]
 	punpcklqdq       xmm4, xmm4
 	pmaddubsw        xmm4, xmm7
-	movq             xmm5, [ecx+edi]	
-	punpcklqdq       xmm5, xmm5	
+	movq             xmm5, [ecx+edi]
+	punpcklqdq       xmm5, xmm5
 	pmaddubsw        xmm5, xmm7
 	psubsw           xmm2, xmm4
 	psubsw           xmm3, xmm5
-	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4	
+	SSE2_HDMTwo4x4   xmm0, xmm1, xmm2, xmm3, xmm4
 	pabsw            xmm0, xmm0
 	pabsw            xmm2, xmm2
 	pabsw            xmm1, xmm1
@@ -970,18 +970,18 @@
 	pslld            xmm2, 16
 	psrld            xmm4, 16
 	por              xmm2, xmm4
-	pmaxuw           xmm0, xmm2	
+	pmaxuw           xmm0, xmm2
 	paddw            xmm6, xmm0
 %endmacro
 
 %macro SSSE3_SumWHorizon 4 ;eax, srcSSE, tempSSE, tempSSE
-	MMX_DW_1_2REG    %3, %4 
-	pmaddwd     %2, %3 
-	movhlps     %4, %2 
-	paddd       %2, %4 
-	pshuflw     %4, %2,0Eh 
-	paddd       %2, %4 
-	movd		%1, %2 
+	MMX_DW_1_2REG    %3, %4
+	pmaddwd     %2, %3
+	movhlps     %4, %2
+	paddd       %2, %4
+	pshuflw     %4, %2,0Eh
+	paddd       %2, %4
+	movd		%1, %2
 %endmacro
 ;***********************************************************************
 ;
@@ -990,53 +990,53 @@
 ;***********************************************************************
 WELS_EXTERN WelsSampleSatd4x4_sse41
 WelsSampleSatd4x4_sse41:
-	push        ebx  
-	mov         eax,[esp+8] 
-	mov         ebx,[esp+12] 
-	mov         ecx,[esp+16] 
-	mov         edx,[esp+20] 
-	movdqa      xmm4,[HSwapSumSubDB1] 
-	movd        xmm2,[ecx] 
-	movd        xmm5,[ecx+edx] 
-	shufps      xmm2,xmm5,0 
-	movd        xmm3,[ecx+edx*2] 
+	push        ebx
+	mov         eax,[esp+8]
+	mov         ebx,[esp+12]
+	mov         ecx,[esp+16]
+	mov         edx,[esp+20]
+	movdqa      xmm4,[HSwapSumSubDB1]
+	movd        xmm2,[ecx]
+	movd        xmm5,[ecx+edx]
+	shufps      xmm2,xmm5,0
+	movd        xmm3,[ecx+edx*2]
 	lea         ecx, [edx*2+ecx]
-	movd        xmm5,[ecx+edx] 
-	shufps      xmm3,xmm5,0 
-	movd        xmm0,[eax] 
-	movd        xmm5,[eax+ebx] 
-	shufps      xmm0,xmm5,0 
-	movd        xmm1,[eax+ebx*2] 
+	movd        xmm5,[ecx+edx]
+	shufps      xmm3,xmm5,0
+	movd        xmm0,[eax]
+	movd        xmm5,[eax+ebx]
+	shufps      xmm0,xmm5,0
+	movd        xmm1,[eax+ebx*2]
 	lea         eax, [ebx*2+eax]
-	movd        xmm5,[eax+ebx] 
-	shufps      xmm1,xmm5,0 
-	pmaddubsw   xmm0,xmm4 
-	pmaddubsw   xmm1,xmm4 
-	pmaddubsw   xmm2,xmm4 
-	pmaddubsw   xmm3,xmm4 
-	psubw       xmm0,xmm2 
-	psubw       xmm1,xmm3 
-	movdqa      xmm2,xmm0 
-	paddw       xmm0,xmm1 
-	psubw       xmm1,xmm2 
-	movdqa      xmm2,xmm0 
-	punpcklqdq  xmm0,xmm1 
-	punpckhqdq  xmm2,xmm1 
-	movdqa      xmm1,xmm0 
-	paddw       xmm0,xmm2 
-	psubw       xmm2,xmm1 
-	movdqa      xmm1,xmm0 
-	pblendw     xmm0,xmm2,0AAh 
-	pslld       xmm2,16 
-	psrld       xmm1,16 
-	por         xmm2,xmm1 
-	pabsw       xmm0,xmm0 
-	pabsw       xmm2,xmm2 
-	pmaxsw      xmm0,xmm2 
+	movd        xmm5,[eax+ebx]
+	shufps      xmm1,xmm5,0
+	pmaddubsw   xmm0,xmm4
+	pmaddubsw   xmm1,xmm4
+	pmaddubsw   xmm2,xmm4
+	pmaddubsw   xmm3,xmm4
+	psubw       xmm0,xmm2
+	psubw       xmm1,xmm3
+	movdqa      xmm2,xmm0
+	paddw       xmm0,xmm1
+	psubw       xmm1,xmm2
+	movdqa      xmm2,xmm0
+	punpcklqdq  xmm0,xmm1
+	punpckhqdq  xmm2,xmm1
+	movdqa      xmm1,xmm0
+	paddw       xmm0,xmm2
+	psubw       xmm2,xmm1
+	movdqa      xmm1,xmm0
+	pblendw     xmm0,xmm2,0AAh
+	pslld       xmm2,16
+	psrld       xmm1,16
+	por         xmm2,xmm1
+	pabsw       xmm0,xmm0
+	pabsw       xmm2,xmm2
+	pmaxsw      xmm0,xmm2
 	SSSE3_SumWHorizon eax, xmm0, xmm5, xmm7
-	pop         ebx  
-	ret 
- 
+	pop         ebx
+	ret
+
 ;***********************************************************************
 ;
 ;int32_t WelsSampleSatd8x8_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
@@ -1051,10 +1051,10 @@
 	mov    eax,    [esp+16]
 	mov    ebx,    [esp+20]
 	mov    ecx,    [esp+24]
-	mov    edx,    [esp+28]    
+	mov    edx,    [esp+28]
 	movdqa      xmm7, [HSumSubDB1]
-	lea         esi,  [ebx+ebx*2] 
-	lea         edi,  [edx+edx*2] 
+	lea         esi,  [ebx+ebx*2]
+	lea         edi,  [edx+edx*2]
 	pxor		xmm6, xmm6
 	SSE41_GetSatd8x4
 	lea			eax,	[eax+4*ebx]
@@ -1065,7 +1065,7 @@
 	pop 		esi
 	pop 		ebx
 	ret
- 
+
 ;***********************************************************************
 ;
 ;int32_t WelsSampleSatd8x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
@@ -1078,17 +1078,17 @@
 	push   esi
 	push   edi
 	push   ebp
-%define pushsize   16	
+%define pushsize   16
 	mov    eax,    [esp+pushsize+4]
 	mov    ebx,    [esp+pushsize+8]
 	mov    ecx,    [esp+pushsize+12]
-	mov    edx,    [esp+pushsize+16]    
+	mov    edx,    [esp+pushsize+16]
 	movdqa      xmm7, [HSumSubDB1]
-	lea         esi,  [ebx+ebx*2] 
-	lea         edi,  [edx+edx*2] 
+	lea         esi,  [ebx+ebx*2]
+	lea         edi,  [edx+edx*2]
 	pxor        xmm6, xmm6
 	mov         ebp,    0
-loop_get_satd_8x16:	
+loop_get_satd_8x16:
 	SSE41_GetSatd8x4
 	lea			eax,  [eax+4*ebx]
 	lea			ecx,  [ecx+4*edx]
@@ -1116,10 +1116,10 @@
 	mov    eax,    [esp+16]
 	mov    ebx,    [esp+20]
 	mov    ecx,    [esp+24]
-	mov    edx,    [esp+28]    
+	mov    edx,    [esp+28]
 	movdqa      xmm7, [HSumSubDB1]
-	lea         esi,  [ebx+ebx*2] 
-	lea         edi,  [edx+edx*2] 
+	lea         esi,  [ebx+ebx*2]
+	lea         edi,  [edx+edx*2]
 	pxor		xmm6,   xmm6
 	SSE41_GetSatd8x4
 	lea			eax,  [eax+4*ebx]
@@ -1144,7 +1144,7 @@
 ;int32_t WelsSampleSatd16x16_sse41( uint8_t *, int32_t, uint8_t *, int32_t, );
 ;
 ;***********************************************************************
-   
+
 WELS_EXTERN WelsSampleSatd16x16_sse41
 align 16
 WelsSampleSatd16x16_sse41:
@@ -1152,17 +1152,17 @@
 	push   esi
 	push   edi
 	push   ebp
-	%define pushsize   16	
+	%define pushsize   16
 	mov    eax,    [esp+pushsize+4]
 	mov    ebx,    [esp+pushsize+8]
 	mov    ecx,    [esp+pushsize+12]
-	mov    edx,    [esp+pushsize+16]    
+	mov    edx,    [esp+pushsize+16]
 	movdqa      xmm7, [HSumSubDB1]
-	lea         esi,  [ebx+ebx*2] 
-	lea         edi,  [edx+edx*2] 
+	lea         esi,  [ebx+ebx*2]
+	lea         edi,  [edx+edx*2]
 	pxor		xmm6,   xmm6
 	mov         ebp,    0
-loop_get_satd_16x16_left:	
+loop_get_satd_16x16_left:
 	SSE41_GetSatd8x4
 	lea			eax,  [eax+4*ebx]
 	lea			ecx,  [ecx+4*edx]
@@ -1206,8 +1206,8 @@
 	lea    ecx,    [ecx+2*edx]
 	movdqu xmm1,   [ecx]
 	MOVDQ  xmm2,   [eax];[eax] must aligned 16
-	psadbw xmm1,   xmm2 
-	paddw  xmm0,   xmm1	
+	psadbw xmm1,   xmm2
+	paddw  xmm0,   xmm1
 	movdqu xmm1,   [ecx+edx]
 	MOVDQ  xmm2,   [eax+ebx]
 	psadbw xmm1,   xmm2
@@ -1218,7 +1218,7 @@
 %macro SSE2_GetSad4x16 0
 	movdqu xmm0,   [ecx]
 	MOVDQ  xmm2,   [eax]
-	psadbw xmm0,   xmm2 
+	psadbw xmm0,   xmm2
 	paddw  xmm7,   xmm0
 	movdqu xmm1,   [ecx+edx]
 	MOVDQ  xmm2,   [eax+ebx]
@@ -1226,8 +1226,8 @@
 	paddw  xmm7,   xmm1
 	movdqu xmm1,   [ecx+2*edx]
 	MOVDQ  xmm2,   [eax+2*ebx];[eax] must aligned 16
-	psadbw xmm1,   xmm2 
-	paddw  xmm7,   xmm1	
+	psadbw xmm1,   xmm2
+	paddw  xmm7,   xmm1
 	movdqu xmm1,   [ecx+edi]
 	MOVDQ  xmm2,   [eax+esi]
 	psadbw xmm1,   xmm2
@@ -1265,17 +1265,17 @@
 WelsSampleSad16x16_sse2:
 	push ebx
 	push edi
-	push esi	
-		
+	push esi
+
 	%define _STACK_SIZE		12
-	
+
 	mov eax, [esp+_STACK_SIZE+4 ]
 	mov	ebx, [esp+_STACK_SIZE+8 ]
 	lea esi, [3*ebx]
 	mov ecx, [esp+_STACK_SIZE+12]
-	mov edx, [esp+_STACK_SIZE+16]	
-	lea edi, [3*edx]	
-	
+	mov edx, [esp+_STACK_SIZE+16]
+	lea edi, [3*edx]
+
 	pxor   xmm7,   xmm7
 	SSE2_GetSad4x16
 	lea   eax,    [eax+4*ebx]
@@ -1290,14 +1290,14 @@
 	movhlps xmm0, xmm7
 	paddw xmm0, xmm7
 	movd eax, xmm0
-	
-	%undef _STACK_SIZE	
-	
+
+	%undef _STACK_SIZE
+
 	pop esi
 	pop edi
 	pop ebx
 	ret
-   
+
 ;***********************************************************************
 ;
 ;int32_t WelsSampleSad16x8_sse2( uint8_t *, int32_t, uint8_t *, int32_t, )
@@ -1312,10 +1312,10 @@
 	mov    eax,    [esp+8]
 	mov    ebx,    [esp+12]
 	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]    
+	mov    edx,    [esp+20]
 	movdqu xmm0,   [ecx]
 	MOVDQ  xmm2,   [eax]
-	psadbw xmm0,   xmm2 
+	psadbw xmm0,   xmm2
 	movdqu xmm1,   [ecx+edx]
 	MOVDQ  xmm2,   [eax+ebx]
 	psadbw xmm1,   xmm2
@@ -1339,19 +1339,19 @@
 	mov    eax,    [esp+8]
 	mov    ebx,    [esp+12]
 	mov    ecx,    [esp+16]
-	mov    edx,    [esp+20]    
+	mov    edx,    [esp+20]
     pxor   xmm6,   xmm6
-	
+
 	SSE2_GetSad8x4
     lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
-    SSE2_GetSad8x4    
+    SSE2_GetSad8x4
     lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	SSE2_GetSad8x4
     lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
-    SSE2_GetSad8x4    
+    SSE2_GetSad8x4
 
     movhlps    xmm0, xmm6
 	paddw      xmm0, xmm6
@@ -1375,15 +1375,15 @@
 	push   edi
 	mov    eax,    [esp+12]
 	mov    ebx,    [esp+16]
-    
+
     pxor   xmm7,   xmm7
-    
+
     mov    edi,    ecx
     and    edi,    0x07
-    sub    ecx,    edi   
+    sub    ecx,    edi
     mov    edx,    8
     sub    edx,    edi
-    
+
     shl    edi,    3
     shl    edx,    3
     movd   xmm5,   edi
@@ -1391,10 +1391,10 @@
 	mov    edi,    8
 	add    edi,    ecx
     mov    edx,    [esp+24]
-    
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -1402,17 +1402,17 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -1420,7 +1420,7 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
 
@@ -1427,10 +1427,10 @@
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -1438,17 +1438,17 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -1456,10 +1456,10 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
     movhlps    xmm0, xmm7
 	paddw      xmm0, xmm7
 	movd       eax,  xmm0
@@ -1469,12 +1469,12 @@
     push   ebx
     mov    eax,    [esp+8]
 	mov    ebx,    [esp+12]
-	mov    edx,    [esp+20]    
+	mov    edx,    [esp+20]
 	pxor   xmm6,   xmm6
 	SSE2_GetSad8x4
     lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
-    SSE2_GetSad8x4    
+    SSE2_GetSad8x4
     movhlps    xmm0, xmm6
 	paddw      xmm0, xmm6
 	movd       eax,  xmm0
@@ -1485,7 +1485,7 @@
 
 ;***********************************************************************
 ;
-;Pixel_sad_wxh_sse2 END 
+;Pixel_sad_wxh_sse2 END
 ;
 ;***********************************************************************
 
@@ -1492,7 +1492,7 @@
 
 ;***********************************************************************
 ;
-;Pixel_sad_4_wxh_sse2 BEGIN 
+;Pixel_sad_4_wxh_sse2 BEGIN
 ;
 ;***********************************************************************
 
@@ -1525,20 +1525,20 @@
 	movdqu xmm3,   [ecx]
 	psadbw xmm3,   xmm0
 	paddw  xmm4,   xmm3
-	
+
 	movdqa xmm1,   [eax+ebx]
 	movdqu xmm3,   [ecx+edx]
 	psadbw xmm3,   xmm1
 	paddw  xmm4,   xmm3
-	
+
 	movdqu xmm2,   [ecx+edx-1]
 	psadbw xmm2,   xmm0
 	paddw  xmm6,   xmm2
-	
+
 	movdqu xmm3,   [ecx+edx+1]
 	psadbw xmm3,   xmm0
 	paddw  xmm7,   xmm3
-	
+
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	movdqa xmm2,   [eax]
@@ -1599,30 +1599,30 @@
 	movdqu xmm3,   [ecx]
 	psadbw xmm2,   xmm3
 	paddw xmm5,   xmm2
-	
+
 	movdqu xmm2,   [ecx-1]
 	psadbw xmm2,   xmm0
 	paddw xmm6,   xmm2
-	
+
 	movdqu xmm3,   [ecx+1]
 	psadbw xmm3,   xmm0
 	paddw xmm7,   xmm3
-	
+
 	movdqu xmm3,   [ecx+edx]
 	psadbw xmm0,   xmm3
 	paddw xmm5,   xmm0
-	
+
 	mov        ecx,  [esp+24]
 	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0 
+	paddw      xmm4, xmm0
 	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0 
+	paddw      xmm5, xmm0
 	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0 
+	paddw      xmm6, xmm0
 	movhlps    xmm0, xmm7
 	paddw      xmm7, xmm0
 	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7 
+	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [ecx],xmm4
 	pop  ebx
@@ -1646,20 +1646,20 @@
 	movdqu xmm3,   [edi]
 	psadbw xmm3,   xmm0
 	paddw xmm4,   xmm3
-	
+
 	movdqa xmm1,   [eax+ebx]
 	movdqu xmm3,   [edi+edx]
 	psadbw xmm3,   xmm1
 	paddw xmm4,   xmm3
-	
+
 	movdqu xmm2,   [edi+edx-1]
 	psadbw xmm2,   xmm0
 	paddw xmm6,   xmm2
-	
+
 	movdqu xmm3,   [edi+edx+1]
 	psadbw xmm3,   xmm0
 	paddw xmm7,   xmm3
-	
+
 	lea    eax,    [eax+2*ebx]
 	lea    edi,    [edi+2*edx]
 	movdqa xmm2,   [eax]
@@ -1688,36 +1688,36 @@
 	movdqu xmm3,   [edi]
 	psadbw xmm0,   xmm3
 	paddw xmm5,   xmm0
-	
+
 	movdqu xmm0,   [edi-1]
 	psadbw xmm0,   xmm1
 	paddw xmm6,   xmm0
-	
+
 	movdqu xmm3,   [edi+1]
 	psadbw xmm3,   xmm1
 	paddw xmm7,   xmm3
-	
+
 	movdqu xmm3,   [edi+edx]
 	psadbw xmm1,   xmm3
 	paddw xmm5,   xmm1
-	
+
 	mov        edi,  [esp+28]
 	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0 
+	paddw      xmm4, xmm0
 	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0 
+	paddw      xmm5, xmm0
 	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0 
+	paddw      xmm6, xmm0
 	movhlps    xmm0, xmm7
 	paddw      xmm7, xmm0
 	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7 
+	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [edi],xmm4
 	pop  edi
 	pop  ebx
 	ret
-	
+
 WELS_EXTERN WelsSampleSadFour8x16_sse2
 WelsSampleSadFour8x16_sse2:
 	push ebx
@@ -1737,10 +1737,10 @@
 	movhps xmm3,   [edi+edx]
 	psadbw xmm3,   xmm0
 	paddw  xmm4,   xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
@@ -1749,191 +1749,191 @@
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	mov        edi,  [esp+28]
 	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0 
+	paddw      xmm4, xmm0
 	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0 
+	paddw      xmm5, xmm0
 	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0 
+	paddw      xmm6, xmm0
 	movhlps    xmm0, xmm7
 	paddw      xmm7, xmm0
 	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7 
+	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [edi],xmm4
 	pop  edi
 	pop  ebx
 	ret
-	
-	
+
+
 WELS_EXTERN WelsSampleSadFour8x8_sse2
 WelsSampleSadFour8x8_sse2:
 	push ebx
@@ -1953,10 +1953,10 @@
 	movhps xmm3,   [edi+edx]
 	psadbw xmm3,   xmm0
 	paddw  xmm4,   xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
@@ -1965,99 +1965,99 @@
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	movq   xmm0,  [eax]
 	movhps xmm0,  [eax+ebx]
 	psadbw xmm3,  xmm0
 	paddw  xmm4,  xmm3
-	
-	
+
+
 	movq   xmm1,  [edi+edx-1]
 	movq   xmm3,  [edi+edx+1]
-	
+
 	lea    eax,   [eax+2*ebx]
 	lea    edi,   [edi+2*edx]
 	movhps xmm1,  [edi-1]
 	movhps xmm3,  [edi+1]
-	
+
 	psadbw xmm1,  xmm0
 	paddw  xmm6,  xmm1
 	psadbw xmm3,  xmm0
 	paddw  xmm7,  xmm3
-	
+
 	movq   xmm3,  [edi]
 	movhps xmm3,  [edi+edx]
 	psadbw xmm0,  xmm3
 	paddw  xmm5,  xmm0
-	
+
 	mov        edi,  [esp+28]
 	movhlps    xmm0, xmm4
-	paddw      xmm4, xmm0 
+	paddw      xmm4, xmm0
 	movhlps    xmm0, xmm5
-	paddw      xmm5, xmm0 
+	paddw      xmm5, xmm0
 	movhlps    xmm0, xmm6
-	paddw      xmm6, xmm0 
+	paddw      xmm6, xmm0
 	movhlps    xmm0, xmm7
 	paddw      xmm7, xmm0
 	punpckldq  xmm4, xmm5
-	punpckldq  xmm6, xmm7 
+	punpckldq  xmm6, xmm7
 	punpcklqdq xmm4, xmm6
 	movdqa     [edi],xmm4
 	pop  edi
 	pop  ebx
 	ret
-	
+
 WELS_EXTERN WelsSampleSadFour4x4_sse2
 WelsSampleSadFour4x4_sse2:
 	push ebx
@@ -2080,23 +2080,23 @@
 	punpckldq  xmm1, xmm2
 	movd       xmm2, [edi+edx-1]
 	movd       xmm3, [edi+edx+1]
-	
+
 	lea        edi,  [edi+2*edx]
-	
+
 	movd       xmm4, [edi]
 	movd       xmm5, [edi-1]
 	punpckldq  xmm2, xmm5
 	movd       xmm5, [edi+1]
 	punpckldq  xmm3, xmm5
-	
+
 	movd       xmm5, [edi+edx]
 	punpckldq  xmm4, xmm5
-	
+
 	punpcklqdq xmm1, xmm4 ;-L
-	
+
 	movd       xmm5, [edi+edx-1]
 	movd       xmm6, [edi+edx+1]
-	
+
 	lea        edi,  [edi+2*edx]
 	movd       xmm7, [edi-1]
 	punpckldq  xmm5, xmm7
@@ -2107,12 +2107,12 @@
 	movd       xmm6, [edi]
 	movd       xmm7, [edi+edx]
 	punpckldq  xmm6, xmm7
-	punpcklqdq xmm4, xmm6 ;+L 
+	punpcklqdq xmm4, xmm6 ;+L
 	psadbw     xmm1, xmm0
 	psadbw     xmm2, xmm0
 	psadbw     xmm3, xmm0
 	psadbw     xmm4, xmm0
-	
+
 	movhlps    xmm0, xmm1
 	paddw      xmm1, xmm0
 	movhlps    xmm0, xmm2
@@ -2123,13 +2123,13 @@
 	paddw      xmm4, xmm0
 	mov        edi,  [esp+28]
 	punpckldq  xmm1, xmm4
-	punpckldq  xmm2, xmm3 
+	punpckldq  xmm2, xmm3
 	punpcklqdq xmm1, xmm2
 	movdqa     [edi],xmm1
 	pop  edi
 	pop  ebx
 	ret
-	
+
 ;***********************************************************************
 ;
 ;Pixel_sad_4_wxh_sse2 END
@@ -2150,40 +2150,40 @@
 %define pix2address  esp+pushsize+12
 %define pix2stride   esp+pushsize+16
 
-    mov		  eax, [pix1address]    
-    mov		  ebx, [pix1stride ]    
-    mov		  ecx, [pix2address]    
-    mov		  edx, [pix2stride ]    
+    mov		  eax, [pix1address]
+    mov		  ebx, [pix1stride ]
+    mov		  ecx, [pix2address]
+    mov		  edx, [pix2stride ]
 
 	movd	  mm0, [eax]
 	movd	  mm1, [eax+ebx]
 	punpckldq mm0, mm1
-	
+
 	movd      mm3, [ecx]
 	movd      mm4, [ecx+edx]
 	punpckldq mm3, mm4
 	psadbw    mm0, mm3
-	
+
 	lea       eax, [eax+2*ebx]
 	lea       ecx, [ecx+2*edx]
-	
+
 	movd      mm1, [eax]
 	movd      mm2, [eax+ebx]
 	punpckldq mm1, mm2
-	
+
 	movd      mm3, [ecx]
 	movd      mm4, [ecx+edx]
 	punpckldq mm3, mm4
 	psadbw    mm1, mm3
 	paddw     mm0, mm1
-	
+
     movd      eax, mm0
 
 	WELSEMMS
     pop ebx
-%undef pushsize     
-%undef pix1address	
-%undef pix1stride   
-%undef pix2address  
-%undef pix2stride   
+%undef pushsize
+%undef pix1address
+%undef pix1stride
+%undef pix2address
+%undef pix2stride
     ret
\ No newline at end of file
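
The file above implements the encoder's block-matching cost kernels: the WelsSampleSad* routines sum absolute pixel differences with psadbw, the WelsSampleSatd* routines sum the absolute coefficients of a Hadamard transform of the difference block (the pabsw/pmaxsw and SSSE3_SumWHorizon sequence at the top of this section is the tail of that reduction), and the WelsSampleSadFour* variants score one source block against four neighbouring reference positions (shifted by one row vertically and by ±1 horizontally) in a single pass, writing four 32-bit sums. As a plain-C reference for the simplest of these, a sketch of the 16x16 SAD; the argument order follows the banner comments, and the scalar form is illustrative only, not code from the repository:

    #include <stdint.h>
    #include <stdlib.h>

    /* Scalar model of WelsSampleSad16x16: sum of absolute differences
     * between two 16x16 blocks, given their row strides. The asm above
     * computes the same value 16 bytes at a time with psadbw. */
    static int32_t SampleSad16x16_c (const uint8_t* pSrc, int32_t iSrcStride,
                                     const uint8_t* pRef, int32_t iRefStride) {
      int32_t iSad = 0;
      for (int y = 0; y < 16; y++) {
        for (int x = 0; x < 16; x++)
          iSad += abs (pSrc[x] - pRef[x]);
        pSrc += iSrcStride;
        pRef += iRefStride;
      }
      return iSad;
    }
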
--- a/codec/encoder/core/asm/score.asm
+++ b/codec/encoder/core/asm/score.asm
@@ -45,7 +45,7 @@
 bits 32
 
 ;***********************************************************************
-; Macros 
+; Macros
 ;***********************************************************************
 
 ;***********************************************************************
@@ -59,7 +59,7 @@
 sse2_1: dw 1, 1, 1, 1, 1, 1, 1, 1
 align 16
 sse2_b1: db 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
-i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 
+i_ds_table: db 3, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 align 16
 sse2_plane_inc_minus: dw -7, -6, -5, -4, -3, -2, -1, 0
 align 16
@@ -139,7 +139,7 @@
     db  4, 8, 5, 8, 8,12, 1, 4, 4, 8
     db  4, 7, 7,11, 4, 8, 7,11, 8,11
     db 11,15, 1, 4, 3, 7, 4, 7, 7,11
-    db  3, 7, 6,10, 7,10,10,14, 4, 7 
+    db  3, 7, 6,10, 7,10,10,14, 4, 7
     db  7,11, 7,10,10,14, 7,11,10,14
     db 11,14,14,18, 0, 4, 3, 7, 3, 6
     db  6,10, 3, 7, 6,10, 7,10,10,14
@@ -191,7 +191,7 @@
 	movdqa     [eax],xmm0
 	movdqa     [eax+16], xmm1
 	ret
-	
+
 ;***********************************************************************
 ;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
 ;***********************************************************************
@@ -206,7 +206,7 @@
 	pinsrw		xmm0, eax, 7			; xmm0[7]	=	[8]
 	pinsrw		xmm1, ecx, 0			; xmm1[0]	=	[7]
 	pshufb		xmm1, [pb_scanacdc_maskb]
-	pshufb		xmm0, [pb_scanacdc_maska]	
+	pshufb		xmm0, [pb_scanacdc_maska]
 
 	mov        eax,  [esp+4]
 	movdqa     [eax],xmm0
@@ -224,7 +224,7 @@
 	movdqa     xmm2, xmm0
 	punpcklqdq xmm0, xmm1
 	punpckhqdq xmm2, xmm1
-	
+
 	movdqa     xmm3, xmm0
 	punpckldq  xmm0, xmm2
 	punpckhdq  xmm3, xmm2
@@ -236,10 +236,10 @@
 	pextrw     edx,  xmm3, 0
 	pinsrw     xmm3, eax,  0
 	pinsrw     xmm0, edx,  3
-	
+
 	pshufhw    xmm1, xmm0, 0x93
 	pshuflw    xmm2, xmm3, 0x39
-    
+
     movdqa     xmm3, xmm2
     psrldq     xmm1, 2
     pslldq     xmm3, 14
@@ -255,13 +255,13 @@
 ;void int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
 ;***********************************************************************
 ALIGN 16
-WELS_EXTERN WelsCalculateSingleCtr4x4_sse2 
+WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
 WelsCalculateSingleCtr4x4_sse2:
 	push      ebx
 	mov       eax,  [esp+8]
 	movdqa    xmm0, [eax]
 	movdqa    xmm1, [eax+16]
-	
+
 	packsswb  xmm0, xmm1
 
     pxor      xmm3, xmm3
@@ -317,7 +317,7 @@
 	and       edx,  0xff
 	shr       ecx,  8
 ;	and       ecx,  0xff	; we do not need this due to high 16bits equal to 0 yet
-	xor       eax,  eax	
+	xor       eax,  eax
 	add       al,  [nozero_count_table+ecx]
 	add       al,  [nozero_count_table+edx]
 	ret
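
score.asm ends with a nonzero-coefficient counter that splits a 16-bit "coefficient is nonzero" mask into two bytes and resolves each through the 256-entry nozero_count_table, i.e. a per-byte popcount. A scalar sketch of the same computation follows; the mask-building instructions fall outside the hunks shown, so the table is replaced by a direct, equivalent loop:

    #include <stdint.h>

    /* Counts nonzero levels in a scanned 4x4 block. The asm builds a
     * 16-bit nonzero mask and popcounts it byte-wise via
     * nozero_count_table; counting directly is equivalent. */
    static int32_t GetNoneZeroCount_c (const int16_t* pLevel) {
      int32_t iCount = 0;
      for (int i = 0; i < 16; i++)
        iCount += (pLevel[i] != 0);
      return iCount;
    }
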
--- a/codec/encoder/core/asm/vaa.asm
+++ b/codec/encoder/core/asm/vaa.asm
@@ -38,7 +38,7 @@
 ;*      04/14/2010	Created
 ;*		06/07/2010	Added AnalysisVaaInfoIntra_sse2(ssse3)
 ;*		06/10/2010	Tune rc_sad_frame_sse2 and got about 40% improvement
-;*		08/11/2010	Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2 
+;*		08/11/2010	Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
 ;*
 ;*************************************************************************/
 %include "asm_inc.asm"
@@ -167,7 +167,7 @@
 	mov ebp, esp
 	and ebp, 0fh
 	sub esp, ebp
-	sub esp, 32	
+	sub esp, 32
 	%define PUSH_SIZE	52	; 20 + 32
 
 	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
@@ -179,31 +179,31 @@
 	add edx, ecx		; iLineSize x 3 [edx]
 	mov eax, ebx
 	sal eax, $1			; iLineSize x 4 [eax]
-	
+
 	pxor xmm7, xmm7
-	
+
 	; loops
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0	
+	movq [esp], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+8], xmm0	
+	movq [esp+8], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0	
+	movq [esp+16], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
 	movq [esp+24], xmm0
-		
+
 	movdqa xmm0, [esp]		; block 0~7
 	movdqa xmm1, [esp+16]	; block 8~15
 	movdqa xmm2, xmm0
 	paddw xmm0, xmm1
 	SUM_WORD_8x2_SSE2 xmm0, xmm3
-	
+
 	pmullw xmm1, xmm1
 	pmullw xmm2, xmm2
 	movdqa xmm3, xmm1
@@ -219,7 +219,7 @@
 	paddd xmm1, xmm2
 	pshufd xmm2, xmm1, 0B1h
 	paddd xmm1, xmm2
-	
+
 	movd ebx, xmm0
 	and ebx, 0ffffh		; effective low word truncated
 	mov ecx, ebx
@@ -227,7 +227,7 @@
 	sar ebx, $4
 	movd eax, xmm1
 	sub eax, ebx
-	
+
 	%undef PUSH_SIZE
 	add esp, 32
 	add esp, ebp
@@ -253,7 +253,7 @@
 	mov ebp, esp
 	and ebp, 0fh
 	sub esp, ebp
-	sub esp, 32	
+	sub esp, 32
 	%define PUSH_SIZE	52	; 20 + 32
 
 	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
@@ -265,25 +265,25 @@
 	add edx, ecx		; iLineSize x 3 [edx]
 	mov eax, ebx
 	sal eax, $1			; iLineSize x 4 [eax]
-	
+
 	pxor xmm7, xmm7
-	
+
 	; loops
 	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0	
+	movq [esp], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
-	movq [esp+8], xmm1	
+	movq [esp+8], xmm1
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0	
+	movq [esp+16], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
 	movq [esp+24], xmm1
-		
+
 	movdqa xmm0, [esp]		; block 0~7
 	movdqa xmm1, [esp+16]	; block 8~15
 	movdqa xmm2, xmm0
@@ -305,7 +305,7 @@
 	paddd xmm1, xmm2
 	pshufd xmm2, xmm1, 0B1h
 	paddd xmm1, xmm2
-	
+
 	movd ebx, xmm0
 	and ebx, 0ffffh		; effective low work truncated
 	mov ecx, ebx
@@ -313,7 +313,7 @@
 	sar ebx, $4
 	movd eax, xmm1
 	sub eax, ebx
-	
+
 	%undef PUSH_SIZE
 	add esp, 32
 	add esp, ebp
@@ -323,7 +323,7 @@
 	pop edx
 	pop ebx
 	ret
-	
+
 WELS_EXTERN MdInterAnalysisVaaInfo_sse41
 ;***********************************************************************
 ;	uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
@@ -331,11 +331,11 @@
 ALIGN 16
 MdInterAnalysisVaaInfo_sse41:
 	mov eax, [esp+4]
-	movdqa xmm0, [eax]	; load 4 sad_8x8	
+	movdqa xmm0, [eax]	; load 4 sad_8x8
 	pshufd xmm1, xmm0, 01Bh
 	paddd xmm1, xmm0
 	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2	
+	paddd xmm1, xmm2
 	psrad xmm1, 02h		; iAverageSad
 	movdqa xmm2, xmm1
 	psrad xmm2, 06h
@@ -342,7 +342,7 @@
 	movdqa xmm3, xmm0	; iSadBlock
 	psrad xmm3, 06h
 	psubd xmm3, xmm2
-	pmulld xmm3, xmm3	; [comment]: pmulld from SSE4.1 instruction sets	
+	pmulld xmm3, xmm3	; [comment]: pmulld from SSE4.1 instruction sets
 	pshufd xmm4, xmm3, 01Bh
 	paddd xmm4, xmm3
 	pshufd xmm3, xmm4, 0B1h
@@ -354,7 +354,7 @@
 	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
 	movmskps eax, xmm0
 	ret
-.threshold_exit:	
+.threshold_exit:
 	mov eax, 15
 	ret
 
@@ -365,11 +365,11 @@
 ALIGN 16
 MdInterAnalysisVaaInfo_sse2:
 	mov eax, [esp+4]
-	movdqa xmm0, [eax]	; load 4 sad_8x8	
+	movdqa xmm0, [eax]	; load 4 sad_8x8
 	pshufd xmm1, xmm0, 01Bh
 	paddd xmm1, xmm0
 	pshufd xmm2, xmm1, 0B1h
-	paddd xmm1, xmm2	
+	paddd xmm1, xmm2
 	psrad xmm1, 02h		; iAverageSad
 	movdqa xmm2, xmm1
 	psrad xmm2, 06h
@@ -376,9 +376,9 @@
 	movdqa xmm3, xmm0	; iSadBlock
 	psrad xmm3, 06h
 	psubd xmm3, xmm2
-	
+
 	; to replace pmulld functionality as below
-	movdqa xmm2, xmm3	
+	movdqa xmm2, xmm3
 	pmuludq xmm2, xmm3
 	pshufd xmm4, xmm3, 0B1h
 	pmuludq xmm4, xmm4
@@ -385,8 +385,8 @@
 	movdqa xmm5, xmm2
 	punpckldq xmm5, xmm4
 	punpckhdq xmm2, xmm4
-	punpcklqdq xmm5, xmm2	
-	
+	punpcklqdq xmm5, xmm2
+
 	pshufd xmm4, xmm5, 01Bh
 	paddd xmm4, xmm5
 	pshufd xmm5, xmm4, 0B1h
@@ -398,6 +398,6 @@
 	pcmpgtd xmm0, xmm1	; iSadBlock > iAverageSad
 	movmskps eax, xmm0
 	ret
-.threshold_exit:	
+.threshold_exit:
 	mov eax, 15
 	ret
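
Both MdInterAnalysisVaaInfo variants above make the same macroblock-analysis decision: average the four 8x8 SADs, measure how strongly the blocks deviate from that average, and return either 15 from the .threshold_exit path or a 4-bit movmskps mask of the blocks whose SAD exceeds the average. A scalar sketch follows; the variance threshold constant and the branch direction sit between the hunks shown, so kVarThreshold and the early-return condition are assumptions:

    #include <stdint.h>

    static uint8_t MdInterAnalysisVaaInfo_c (const int32_t pSad8x8[4]) {
      const int32_t kVarThreshold = 0; /* hypothetical; real constant not shown */
      int32_t iAvg = (pSad8x8[0] + pSad8x8[1] + pSad8x8[2] + pSad8x8[3]) >> 2;
      int32_t iVar = 0;
      for (int i = 0; i < 4; i++) {
        int32_t iDiff = (pSad8x8[i] >> 6) - (iAvg >> 6); /* psrad 06h on both */
        iVar += iDiff * iDiff;                           /* pmulld, or its SSE2 emulation */
      }
      if (iVar <= kVarThreshold)                         /* assumed branch direction */
        return 15;                                       /* .threshold_exit */
      uint8_t uiMask = 0;
      for (int i = 0; i < 4; i++)
        uiMask |= (uint8_t) ((pSad8x8[i] > iAvg) << i);  /* pcmpgtd + movmskps */
      return uiMask;
    }

The SSE2 variant also shows how to emulate pmulld (an SSE4.1 instruction) with two pmuludq passes and a dword re-interleave, exactly as the comment in the hunk notes.
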
--- a/codec/encoder/plus/res/welsenc.rc
+++ b/codec/encoder/plus/res/welsenc.rc
@@ -27,18 +27,18 @@
 // TEXTINCLUDE
 //
 
-1 TEXTINCLUDE 
+1 TEXTINCLUDE
 BEGIN
     "resource.h\0"
 END
 
-2 TEXTINCLUDE 
+2 TEXTINCLUDE
 BEGIN
     "#include ""afxres.h""\r\n"
     "\0"
 END
 
-3 TEXTINCLUDE 
+3 TEXTINCLUDE
 BEGIN
     "\r\n"
     "\0"
--- a/processing/build/linux/makefile
+++ b/processing/build/linux/makefile
@@ -3,7 +3,7 @@
 
 OUTDIR    = ../../../bin/linux
 BINDIR    = ../../bin
-OBJDIR    = ../../obj  
+OBJDIR    = ../../obj
 SRCDIRS   = ../../src/asm \
             ../../src/common \
             ../../src/adaptivequantization \
@@ -12,7 +12,7 @@
             ../../src/downsample \
             ../../src/scenechangedetection \
             ../../src/vaacalc \
-            ../../src/complexityanalysis 
+            ../../src/complexityanalysis
 SRCDIRS  += ../../src/imagerotate
 
 
@@ -28,7 +28,7 @@
 endif
 ASMFLAGS  = -f elf -DNOPREFIX  -I ../../src/asm/
 LDFLAGS   = -lstdc++ -ldl
-          
+
 SRCEXTS  = .cpp
 ifeq ($(NASM), 1)
 SRCEXTS += .asm
@@ -54,11 +54,11 @@
 .SUFFIXES:
 
 all: $(TARGETLIB)
-	
+
 %.d:%.cpp
 	@echo -n $(dir $<) > $@
 	@$(DEPEND_cpp.d) $< >> $@
-	
+
 %.d:%.asm
 	@echo -n $(dir $<) > $@
 	@$(DEPEND_asm.d) $< >> $@
@@ -67,9 +67,9 @@
 
 %.o:%.cpp
 	$(COMPILE.cpp) $< -o $@
-	
+
 %.o:%.asm
-	$(COMPILE.asm) $< -o $@	
+	$(COMPILE.asm) $< -o $@
 
 tags: $(HEADERS) $(SOURCES)
 	etags $(HEADERS) $(SOURCES)
--- a/processing/src/asm/asm_inc.asm
+++ b/processing/src/asm/asm_inc.asm
@@ -43,7 +43,7 @@
 ; Options, for DEBUG
 ;***********************************************************************
 
-%if 1 
+%if 1
 	%define MOVDQ movdqa
 %else
 	%define MOVDQ movdqu
@@ -58,7 +58,7 @@
 BITS 32
 
 ;***********************************************************************
-; Macros 
+; Macros
 ;***********************************************************************
 
 %macro WELS_EXTERN 1
@@ -74,7 +74,7 @@
 	pxor        %2, %2
     psubw       %2, %1
     pmaxsw      %1, %2
-%endmacro 	
+%endmacro
 
 %macro MMX_XSwap  4
     movq		%4, %2
@@ -105,7 +105,7 @@
     SSE2_XSawp qdq, %5, %2, %3
 %endmacro
 
-;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4 
+;in: xmm0, xmm1, xmm2, xmm3  pOut:  xmm0, xmm1, xmm3, xmm4
 %macro SSE2_TransTwo4x4W 5
     SSE2_XSawp wd,  %1, %2, %5
     SSE2_XSawp wd,  %3, %4, %2
@@ -125,26 +125,26 @@
 	movdqa	%6, %9
 	movdqa	%9, %4
 	SSE2_XSawp bw,  %7, %6, %4
-	
-	SSE2_XSawp wd,  %1, %3, %6	
+
+	SSE2_XSawp wd,  %1, %3, %6
 	SSE2_XSawp wd,  %8, %2, %3
 	SSE2_XSawp wd,  %5, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %3	
+	movdqa	%9, %3
 	SSE2_XSawp wd,  %7, %4, %3
-	
-	SSE2_XSawp dq,  %1, %5, %4	
+
+	SSE2_XSawp dq,  %1, %5, %4
 	SSE2_XSawp dq,  %6, %2, %5
 	SSE2_XSawp dq,  %8, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %5		
+	movdqa	%9, %5
 	SSE2_XSawp dq,  %7, %3, %5
-	
+
 	SSE2_XSawp qdq,  %1, %8, %3
 	SSE2_XSawp qdq,  %4, %2, %8
 	SSE2_XSawp qdq,  %6, %7, %2
 	movdqa	%7, %9
-	movdqa	%9, %1		
+	movdqa	%9, %1
 	SSE2_XSawp qdq,  %7, %5, %1
 	movdqa	%5, %9
 %endmacro
@@ -170,9 +170,9 @@
 %macro butterfly_1to16_sse	3	; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
 	mov %3h, %3l
 	movd %1, e%3x		; i.e, 1% = eax (=b0)
-	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0	
-	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0	
-%endmacro  
+	pshuflw %2, %1, 00h	; ..., b0 b0 b0 b0 b0 b0 b0 b0
+	pshufd %1, %2, 00h	; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
+%endmacro
 
 ;copy a dw into a xmm for 8 times
 %macro  SSE2_Copy8Times 2
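
The absolute-value macro near the top of asm_inc.asm (pxor / psubw / pmaxsw) relies on the identity abs(x) = max(x, -x). A one-lane C sketch, including the one caveat the SIMD form inherits:

    #include <stdint.h>

    /* abs(x) = max(x, -x), the idiom behind pxor/psubw/pmaxsw above.
     * For INT16_MIN the negation wraps and the result stays INT16_MIN,
     * the same behaviour pmaxsw exhibits; transform residuals never
     * reach that value, so the kernels are unaffected. */
    static inline int16_t AbsW_c (int16_t iW) {
      int16_t iNeg = (int16_t) (0 - iW);
      return iW > iNeg ? iW : iNeg;
    }
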
--- a/processing/src/asm/cpuid.asm
+++ b/processing/src/asm/cpuid.asm
@@ -84,12 +84,12 @@
 ;   void WelsCPUId( int32_t uiIndex, int32_t *pFeatureA, int32_t *pFeatureB, int32_t *pFeatureC, int32_t *pFeatureD )
 ;****************************************************************************************************
 WelsCPUId:
-	push	ebx	
+	push	ebx
 	push	edi
-	
+
 	mov     eax, [esp+12]	; operating index
     cpuid					; cpuid
-	
+
 	; processing various information return
 	mov     edi, [esp+16]
     mov     [edi], eax
@@ -100,10 +100,10 @@
     mov     edi, [esp+28]
     mov     [edi], edx
 
-	pop		edi	
+	pop		edi
     pop     ebx
 	ret
-	
+
 WELS_EXTERN WelsCPUSupportAVX
 ; need call after cpuid=1 and eax, ecx flag got then
 ALIGN 16
@@ -139,7 +139,7 @@
 WelsCPUSupportFMA:
 	mov eax, [esp+4]
 	mov ecx, [esp+8]
-	
+
 	; refer to detection of FMA addressed in INTEL AVX manual document
 	and ecx, 018001000H
 	cmp ecx, 018001000H		; check OSXSAVE, AVX, FMA feature flags
@@ -153,7 +153,7 @@
 	mov eax, 1
 	ret
 fma_not_supported:
-	mov eax, 0	
+	mov eax, 0
 	ret
 
 WELS_EXTERN WelsEmms
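
WelsCPUSupportFMA above receives the EAX/ECX results of a prior CPUID leaf-1 query on the stack and tests three ECX feature bits at once: 0x18001000 is OSXSAVE (bit 27) | AVX (bit 28) | FMA (bit 12). A sketch of just that flag test; whatever completes the OS-support check between the hunks is not modeled:

    #include <stdint.h>

    /* Mirrors 'and ecx, 018001000H / cmp ecx, 018001000H' above.
     * uiEcx is the ECX output of CPUID leaf 1. */
    static int CpuSupportsFmaFlags_c (uint32_t uiEcx) {
      const uint32_t kMask = (1u << 27) | (1u << 28) | (1u << 12); /* 0x18001000 */
      return (uiEcx & kMask) == kMask;
    }
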
--- a/processing/src/asm/denoisefilter.asm
+++ b/processing/src/asm/denoisefilter.asm
@@ -55,25 +55,25 @@
 ; Code
 ;***********************************************************************
 SECTION .text
-	
+
 %macro	WEIGHT_LINE	9
 		movq		%2,	%9
 		punpcklbw	%2,	%7
 		movdqa		%8,	%2
-		
+
 		movdqa		%1,	%6
 		psubusb		%1,	%8
 		psubusb		%8,	%6
 		por			%8,	%1		; ABS(curPixel - centerPixel);
-		
+
 		movdqa		%1,	%3
 		psubusb		%1,	%8
 
 		pmullw		%1,	%1
 		psrlw		%1,	5
-		pmullw		%2,	%1		
+		pmullw		%2,	%1
 		paddusw		%4,	%1
-		paddusw		%5,	%2	
+		paddusw		%5,	%2
 %endmacro
 
 %macro	WEIGHT_LINE1_UV	4
@@ -91,12 +91,12 @@
 		punpcklbw	%2,	%4
 		psllw		%2,	1
 		paddw		%3,	%2
-		
+
 		movdqa		%2,	%1
 		psrldq		%2,	3
 		punpcklbw	%2,	%4
 		paddw		%3,	%2
-		
+
 		movdqa		%2,	%1
 		psrldq		%2,	4
 		punpcklbw	%2,	%4
@@ -119,13 +119,13 @@
 		punpcklbw	%2,	%4
 		psllw		%2,	2
 		paddw		%3,	%2
-		
+
 		movdqa		%2,	%1
 		psrldq		%2,	3
 		punpcklbw	%2,	%4
 		psllw		%2,	1
 		paddw		%3,	%2
-		
+
 		movdqa		%2,	%1
 		psrldq		%2,	4
 		punpcklbw	%2,	%4
@@ -149,13 +149,13 @@
 		punpcklbw	%2,	%4
 		pmullw		%2,	[sse2_20]
 		paddw		%3,	%2
-		
+
 		movdqa		%2,	%1
 		psrldq		%2,	3
 		punpcklbw	%2,	%4
 		psllw		%2,	2
 		paddw		%3,	%2
-		
+
 		movdqa		%2,	%1
 		psrldq		%2,	4
 		punpcklbw	%2,	%4
@@ -177,7 +177,7 @@
 %define		stride		esp + pushsize + 8
 BilateralLumaFilter8_sse2:
 		push		ebx
-		
+
 		pxor		xmm7,	xmm7
 		mov			eax,	[pixel]
 		mov			ebx,	eax
@@ -186,23 +186,23 @@
 		movdqa		xmm3,	[sse2_32]
 		pxor		xmm4,	xmm4		; nTotWeight
 		pxor		xmm5,	xmm5		; nSum
-		
+
 		dec			eax
 		mov			ecx,	[stride]
-		
+
 		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 4
 		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 5
-		
+
 		sub			eax,	ecx
 		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 1
 		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 2
 		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 3
-		
+
 		lea			eax,	[eax + ecx * 2]
 		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax]			; pixel 6
 		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 1]		; pixel 7
 		WEIGHT_LINE	xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0,  [eax + 2]		; pixel 8
-		
+
 		pcmpeqw		xmm0,	xmm0
 		psrlw		xmm0,	15
 		psllw		xmm0,	8
@@ -211,10 +211,10 @@
 		paddusw		xmm5,	xmm0
 		psrlw		xmm5,	8
 		packuswb	xmm5,	xmm5
-		movq		[ebx],	xmm5		
-		
+		movq		[ebx],	xmm5
+
 		pop ebx
-		ret	
+		ret
 
 WELS_EXTERN WaverageChromaFilter8_sse2
 ;***********************************************************************
@@ -231,33 +231,33 @@
 WaverageChromaFilter8_sse2:
 		mov		edx,	[esp + 4]	; pixels
 		mov		ecx,	[esp + 8]	; stride
-		
+
 		mov		eax,	ecx
 		add		eax,	eax
 		sub		edx,	eax			; pixels - 2 * stride
 		sub		edx,	2
-			
-		pxor	xmm0,	xmm0	
+
+		pxor	xmm0,	xmm0
 		pxor	xmm3,	xmm3
-	
+
 		movdqu		xmm1,	[edx]
 		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
-		
+
 		movdqu		xmm1,	[edx + ecx]
-		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0	
-		
-		add		edx,	eax	
+		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
+
+		add		edx,	eax
 		movdqu		xmm1,	[edx]
 		WEIGHT_LINE3_UV	xmm1,	xmm2,	xmm3,	xmm0
-		
+
 		movdqu		xmm1,	[edx + ecx]
-		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0	
-		
+		WEIGHT_LINE2_UV	xmm1,	xmm2,	xmm3,	xmm0
+
 		movdqu		xmm1,	[edx + ecx * 2]
-		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0		
-	
+		WEIGHT_LINE1_UV	xmm1,	xmm2,	xmm3,	xmm0
+
 		psrlw		xmm3,		6
 		packuswb	xmm3,		xmm3
-		movq		[edx + 2],		xmm3			
+		movq		[edx + 2],		xmm3
 
-		ret	
\ No newline at end of file
+		ret
\ No newline at end of file
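
The WEIGHT_LINE macro in denoisefilter.asm computes one neighbour's contribution to the bilateral luma filter: the absolute difference to the centre pixel is folded into a weight ((32 - d)^2) >> 5, clamped to zero for d >= 32 by the saturating psubusb, and both the weight and the weighted pixel are accumulated. A scalar sketch of that single step; the final normalisation is split across hunks and is left out:

    #include <stdint.h>

    /* One WEIGHT_LINE tap: weight a neighbour by its similarity to the
     * centre pixel and accumulate. Weights run from 32 (identical)
     * down to 0 (difference of 32 or more). */
    static void WeightPixel_c (uint8_t uiPixel, uint8_t uiCenter,
                               uint32_t* pSumWeight, uint32_t* pSumWeighted) {
      int iDiff = uiPixel > uiCenter ? uiPixel - uiCenter : uiCenter - uiPixel;
      uint32_t uiWeight = iDiff < 32
          ? ((uint32_t) ((32 - iDiff) * (32 - iDiff)) >> 5) : 0;
      *pSumWeight   += uiWeight;            /* paddusw %4, %1 */
      *pSumWeighted += uiWeight * uiPixel;  /* pmullw + paddusw %5, %2 */
    }
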
--- a/processing/src/asm/downsample_bilinear.asm
+++ b/processing/src/asm/downsample_bilinear.asm
@@ -92,11 +92,11 @@
 	mov edi, [esp+24]	; pDst
 	mov edx, [esp+28]	; iDstStride
 	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
+	mov ecx, [esp+36]	; iSrcStride
 	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1			; iSrcHeight >> 1	
 
+	sar ebp, $1			; iSrcHeight >> 1
+
 .yloops:
 	mov eax, [esp+40]	; iSrcWidth
 	sar eax, $1			; iSrcWidth >> 1
@@ -112,7 +112,7 @@
 	;=> target:
 	;: H G F E D C B A, P O N M L K J I
 	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	movq mm0, [esi]			; 1st pSrc line
 	movq mm1, [esi+8]		; 1st pSrc line + 8
 	movq mm2, [esi+ecx]		; 2nd pSrc line
@@ -140,7 +140,7 @@
 	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
 
 	; to handle mm4, mm5, mm6, mm7
-	movq mm0, mm4		; 
+	movq mm0, mm4		;
 	punpckldq mm0, mm5 	; H G F E D C B A
 	punpckhdq mm4, mm5 	; h g f e d c b a
 
@@ -152,7 +152,7 @@
 	pavgb mm0, mm4		; (A+a+1)>>1, .., (H+h+1)>>1, temp_row1
 	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
 	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
-	
+
 	; 2nd part horizonal loop: x16 bytes
 	;               mem  hi<-       ->lo
 	;1st Line Src:	mm0: d D c C b B a A	mm1: h H g G f F e E
@@ -245,11 +245,11 @@
 	mov edi, [esp+24]	; pDst
 	mov edx, [esp+28]	; iDstStride
 	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
+	mov ecx, [esp+36]	; iSrcStride
 	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1		; iSrcHeight >> 1	
 
+	sar ebp, $1		; iSrcHeight >> 1
+
 .yloops:
 	mov eax, [esp+40]	; iSrcWidth
 	sar eax, $1		; iSrcWidth >> 1
@@ -265,7 +265,7 @@
 	;=> target:
 	;: H G F E D C B A, P O N M L K J I
 	;: h g f e d c b a, p o n m l k j i
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	movq mm0, [esi]			; 1st pSrc line
 	movq mm1, [esi+8]		; 1st pSrc line + 8
 	movq mm2, [esi+ecx]		; 2nd pSrc line
@@ -293,7 +293,7 @@
 	pshufw mm7, mm7, 0d8h  	; p o n m P O N M ; 11011000 B: mm7
 
 	; to handle mm4, mm5, mm6, mm7
-	movq mm0, mm4		; 
+	movq mm0, mm4		;
 	punpckldq mm0, mm5 	; H G F E D C B A
 	punpckhdq mm4, mm5 	; h g f e d c b a
 
@@ -306,7 +306,7 @@
 	pavgb mm1, mm6		; (I+i+1)>>1, .., (P+p+1)>>1, temp_row2
 	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
 
-	movq [edi  ], mm0	
+	movq [edi  ], mm0
 
 	; next SMB
 	lea esi, [esi+16]
@@ -349,11 +349,11 @@
 	mov edi, [esp+24]	; pDst
 	mov edx, [esp+28]	; iDstStride
 	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
+	mov ecx, [esp+36]	; iSrcStride
 	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1		; iSrcHeight >> 1	
 
+	sar ebp, $1		; iSrcHeight >> 1
+
 .yloops:
 	mov eax, [esp+40]	; iSrcWidth
 	sar eax, $1		; iSrcWidth >> 1
@@ -369,9 +369,9 @@
 	;=> target:
 	;: H G F E D C B A
 	;: h g f e d c b a
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movq mm0, [esi]			; 1st pSrc line	
-	movq mm1, [esi+ecx]		; 2nd pSrc line	
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movq mm0, [esi]			; 1st pSrc line
+	movq mm1, [esi+ecx]		; 2nd pSrc line
 
 	; to handle mm0, mm1, mm2, mm3
 	pshufw mm2, mm0, 0d8h	; d D b B c C a A ; 11011000 B
@@ -382,19 +382,19 @@
 	pshufw mm4, mm1, 0d8h	; h H f F g G e E ; 11011000 B
 	pshufw mm5, mm4, 04eh	; g G e E h H f F ; 01001110 B
 	punpcklbw mm4, mm5		; h g H G f e F E
-	pshufw mm4, mm4, 0d8h  	; h g f e H G F E ; 11011000 B: mm5	
+	pshufw mm4, mm4, 0d8h  	; h g f e H G F E ; 11011000 B: mm5
 
 	; to handle mm2, mm4
-	movq mm0, mm2		; 
+	movq mm0, mm2		;
 	punpckldq mm0, mm4 	; H G F E D C B A
 	punpckhdq mm2, mm4 	; h g f e d c b a
 
 	; avg within MB horizon width (16 x 2 lines)
 	pavgb mm0, mm2		; (H+h+1)>>1, .., (A+a+1)>>1, temp_row1, 2
-	pshufw mm1, mm0, 04eh	; 01001110 B	
+	pshufw mm1, mm0, 04eh	; 01001110 B
 	pavgb mm0, mm1		; (temp_row1+temp_row2+1)>>1, pending here and wait another horizonal part done then write memory once
 
-	movd [edi],	mm0	
+	movd [edi],	mm0
 
 	; next unit
 	lea esi, [esi+8]
@@ -440,11 +440,11 @@
 	mov edi, [esp+24]	; pDst
 	mov edx, [esp+28]	; iDstStride
 	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
+	mov ecx, [esp+36]	; iSrcStride
 	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1			; iSrcHeight >> 1	
 
+	sar ebp, $1			; iSrcHeight >> 1
+
 	movdqa xmm7, [shufb_mask_low]	; mask low
 	movdqa xmm6, [shufb_mask_high]	; mask high
 
@@ -467,13 +467,13 @@
 	;: p o n m l k j i h g f e d c b a
 	;: P ..                          A
 	;: p ..                          a
-	
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	movdqa xmm0, [esi]			; 1st_src_line
 	movdqa xmm1, [esi+16]		; 1st_src_line + 16
 	movdqa xmm2, [esi+ecx]		; 2nd_src_line
-	movdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16	
-	
+	movdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16
+
 	; packing & avg
 	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
 	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
@@ -487,7 +487,7 @@
 	pshufb xmm1, xmm7
 	pshufb xmm5, xmm6
 ;	psubb xmm5, xmm1
-;	psrlw xmm5, 8	
+;	psrlw xmm5, 8
 	pavgb xmm1, xmm5
 
 	movdqa xmm4, xmm2
@@ -494,7 +494,7 @@
 	pshufb xmm2, xmm7
 	pshufb xmm4, xmm6
 ;	psubb xmm4, xmm2
-;	psrlw xmm4, 8	
+;	psrlw xmm4, 8
 	pavgb xmm2, xmm4
 
 	movdqa xmm5, xmm3
@@ -501,13 +501,13 @@
 	pshufb xmm3, xmm7
 	pshufb xmm5, xmm6
 ;	psubb xmm5, xmm3
-;	psrlw xmm5, 8	
+;	psrlw xmm5, 8
 	pavgb xmm3, xmm5
-	
-	packuswb xmm0, xmm1	
-	packuswb xmm2, xmm3	
-	pavgb xmm0, xmm2	
 
+	packuswb xmm0, xmm1
+	packuswb xmm2, xmm3
+	pavgb xmm0, xmm2
+
 	; write pDst
 	movdqa [edi], xmm0
 
@@ -526,7 +526,7 @@
 
 	dec ebp
 	jg near .yloops
-	
+
 	pop ebp
 	pop	edi
 	pop esi
@@ -551,11 +551,11 @@
 	mov edi, [esp+24]	; pDst
 	mov edx, [esp+28]	; iDstStride
 	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
+	mov ecx, [esp+36]	; iSrcStride
 	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1		; iSrcHeight >> 1	
-	movdqa xmm7, [shufb_mask_low]	; mask low	
+
+	sar ebp, $1		; iSrcHeight >> 1
+	movdqa xmm7, [shufb_mask_low]	; mask low
 	movdqa xmm6, [shufb_mask_high]	; mask high
 
 .yloops:
@@ -574,10 +574,10 @@
 	;: H G F E D C B A, P O N M L K J I
 	;: h g f e d c b a, p o n m l k j i
 
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movdqa xmm0, [esi]			; 1st_src_line	
-	movdqa xmm1, [esi+ecx]		; 2nd_src_line	
-	
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movdqa xmm0, [esi]			; 1st_src_line
+	movdqa xmm1, [esi+ecx]		; 2nd_src_line
+
 	; packing & avg
 	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
 	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
@@ -584,7 +584,7 @@
 	pshufb xmm2, xmm6			; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
 	; another implementation for xmm2 high bits
 ;	psubb xmm2, xmm0			; h 0 g 0 f 0 e 0 d 0 c 0 b 0 a 0
-;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a	
+;	psrlw xmm2, 8				; 0 h 0 g 0 f 0 e 0 d 0 c 0 b 0 a
 	pavgb xmm0, xmm2
 
 	movdqa xmm3, xmm1
@@ -591,14 +591,14 @@
 	pshufb xmm1, xmm7
 	pshufb xmm3, xmm6
 ;	psubb xmm3, xmm1
-;	psrlw xmm3, 8	
+;	psrlw xmm3, 8
 	pavgb xmm1, xmm3
 
-	pavgb xmm0, xmm1	
-	packuswb xmm0, xmm1	
+	pavgb xmm0, xmm1
+	packuswb xmm0, xmm1
 
 	; write pDst
-	movq [edi], xmm0	
+	movq [edi], xmm0
 
 	; next SMB
 	lea esi, [esi+16]
@@ -615,7 +615,7 @@
 
 	dec ebp
 	jg near .yloops
-	
+
 	pop ebp
 	pop edi
 	pop esi
@@ -641,12 +641,12 @@
 	mov edi, [esp+24]	; pDst
 	mov edx, [esp+28]	; iDstStride
 	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
+	mov ecx, [esp+36]	; iSrcStride
 	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1			; iSrcHeight >> 1	
 
-	movdqa xmm7, [shufb_mask_low]	; mask low	
+	sar ebp, $1			; iSrcHeight >> 1
+
+	movdqa xmm7, [shufb_mask_low]	; mask low
 	movdqa xmm6, [shufb_mask_high]	; mask high
 
 .yloops:
@@ -668,13 +668,13 @@
 	;: p o n m l k j i h g f e d c b a
 	;: P ..                          A
 	;: p ..                          a
-	
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
+
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 	movntdqa xmm0, [esi]			; 1st_src_line
 	movntdqa xmm1, [esi+16]		; 1st_src_line + 16
 	movntdqa xmm2, [esi+ecx]		; 2nd_src_line
-	movntdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16	
-	
+	movntdqa xmm3, [esi+ecx+16]	; 2nd_src_line + 16
+
 	; packing & avg
 	movdqa xmm4, xmm0			; h H g G f F e E d D c C b B a A
 	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
@@ -703,11 +703,11 @@
 ;	psubb xmm5, xmm3
 ;	psrlw xmm5, 8
 	pavgb xmm3, xmm5
-	
-	packuswb xmm0, xmm1	
-	packuswb xmm2, xmm3	
-	pavgb xmm0, xmm2	
 
+	packuswb xmm0, xmm1
+	packuswb xmm2, xmm3
+	pavgb xmm0, xmm2
+
 	; write pDst
 	movdqa [edi], xmm0
 
@@ -726,7 +726,7 @@
 
 	dec ebp
 	jg near .yloops
-	
+
 	pop ebp
 	pop	edi
 	pop esi
@@ -751,10 +751,10 @@
 	mov edi, [esp+24]	; pDst
 	mov edx, [esp+28]	; iDstStride
 	mov esi, [esp+32]	; pSrc
-	mov ecx, [esp+36]	; iSrcStride	
+	mov ecx, [esp+36]	; iSrcStride
 	mov ebp, [esp+44]	; iSrcHeight
-	
-	sar ebp, $1		; iSrcHeight >> 1	
+
+	sar ebp, $1		; iSrcHeight >> 1
 	movdqa xmm7, [shufb_mask_low]	; mask low
 	movdqa xmm6, [shufb_mask_high]	; mask high
 
@@ -774,10 +774,10 @@
 	;: H G F E D C B A, P O N M L K J I
 	;: h g f e d c b a, p o n m l k j i
 
-	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;	
-	movntdqa xmm0, [esi]			; 1st_src_line	
-	movntdqa xmm1, [esi+ecx]		; 2nd_src_line	
-	
+	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+	movntdqa xmm0, [esi]			; 1st_src_line
+	movntdqa xmm1, [esi+ecx]		; 2nd_src_line
+
 	; packing & avg
 	movdqa xmm2, xmm0			; h H g G f F e E d D c C b B a A
 	pshufb xmm0, xmm7			; 0 H 0 G 0 F 0 E 0 D 0 C 0 B 0 A
@@ -793,11 +793,11 @@
 ;	psrlw xmm3, 8
 	pavgb xmm1, xmm3
 
-	pavgb xmm0, xmm1	
-	packuswb xmm0, xmm1	
+	pavgb xmm0, xmm1
+	packuswb xmm0, xmm1
 
 	; write pDst
-	movq [edi], xmm0	
+	movq [edi], xmm0
 
 	; next SMB
 	lea esi, [esi+16]
@@ -814,7 +814,7 @@
 
 	dec ebp
 	jg near .yloops
-	
+
 	pop ebp
 	pop edi
 	pop esi
@@ -858,7 +858,7 @@
 %define		xInverse		esp + 20
 %define		dstStep			esp + 24
 	sub		esp,			localsize
-	
+
 	pxor	xmm0,	xmm0
 	mov		edx,	32767
 	mov		eax,	[uiScaleX]
@@ -871,7 +871,7 @@
 	psllq	xmm1,		32
 	por		xmm1,		xmm2					; 0 0  uinc  -uinc   (dword)
 	pshufd	xmm7,		xmm1,	01000100b		; xmm7: uinc -uinc uinc -uinc
-	
+
 	mov		eax,	[uiScaleY]
 	and		eax,	32767
 	mov		ebx,	eax
@@ -882,15 +882,15 @@
 	psllq	xmm6,		32
 	por		xmm6,		xmm2					; 0 0 vinc -vinc (dword)
 	pshufd	xmm6,		xmm6,	01010000b		; xmm6: vinc vinc -vinc -vinc
-	
+
 	mov		edx,		40003fffh
 	movd	xmm5,		edx
 	punpcklwd	xmm5,	xmm0					; 16384 16383
 	pshufd	xmm5,		xmm5,	01000100b		; xmm5: 16384 16383 16384 16383
-	
 
+
 DOWNSAMPLE:
-	
+
 	mov		eax,			[dwDstHeight]
 	mov		edi,			[pDstData]
 	mov		edx,			[dwDstStride]
@@ -901,10 +901,10 @@
 	mov		[tmpHeight],	eax
 	mov		eax,			16384
 	mov		[yInverse],		eax
-	
+
 	pshufd	xmm4,		xmm5,	01010000b	; initial v to 16384 16384 16383 16383
-	
-HEIGHT:	
+
+HEIGHT:
 	mov		eax,	[yInverse]
 	mov		esi,	[pSrcData]
 	shr		eax,	15
@@ -912,18 +912,18 @@
 	add		esi,	eax					; get current row address
 	mov		ebp,	esi
 	add		ebp,	[dwSrcStride]
-	
+
 	mov		eax,		16384
 	mov		[xInverse],		eax
 	mov		ecx,			[dwDstWidth]
 	dec		ecx
-	
+
 	movdqa	xmm3,		xmm5			; initial u to 16384 16383 16384 16383
-	
+
 WIDTH:
 	mov		eax,		[xInverse]
 	shr		eax,		15
-	
+
 	movd	xmm1,		[esi+eax]		; xxxxxxba
 	movd	xmm2,		[ebp+eax]		; xxxxxxdc
 	pxor	xmm0,		xmm0
@@ -930,7 +930,7 @@
 	punpcklwd	xmm1,	xmm2			; xxxxdcba
 	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
 	punpcklwd	xmm1,	xmm0			; 000d000c000b000a
-	
+
 	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
 	pmaddwd	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
 	movdqa	xmm0,	xmm2
@@ -942,20 +942,20 @@
 	pshufd	xmm1,	xmm2,	00001110b
 	paddq	xmm2,	xmm1
 	psrlq	xmm2,	29
-	
+
 	movd	eax,	xmm2
 	inc		eax
 	shr		eax,	1
 	mov		[edi],	al
 	inc		edi
-	
+
 	mov		eax,		[uiScaleX]
 	add		[xInverse],	eax
-	
+
 	paddw	xmm3,		xmm7			; inc u
 	psllw	xmm3,		1
 	psrlw	xmm3,		1
-	
+
 	loop	WIDTH
 
 WIDTH_END:
@@ -964,41 +964,41 @@
 	mov		cl,			[esi+eax]
 	mov		[edi],		cl
 	inc		edi
-	
+
 	mov		eax,		[uiScaleY]
 	add		[yInverse],	eax
 	add		edi,		[dstStep]
-	
+
 	paddw	xmm4,	xmm6				; inc v
 	psllw	xmm4,	1
 	psrlw	xmm4,	1
-	
+
 	dec		dword [tmpHeight]
 	jg		HEIGHT
 
 
-LAST_ROW:	
+LAST_ROW:
 	mov		eax,	[yInverse]
 	mov		esi,	[pSrcData]
 	shr		eax,	15
 	mul		dword [dwSrcStride]
 	add		esi,	eax					; get current row address
-	
+
 	mov		eax,		16384
 	mov		[xInverse],		eax
 	mov		ecx,			[dwDstWidth]
-	
+
 LAST_ROW_WIDTH:
 	mov		eax,		[xInverse]
 	shr		eax,		15
-	
+
 	mov		al,			[esi+eax]
 	mov		[edi],	al
 	inc		edi
-	
+
 	mov		eax,		[uiScaleX]
 	add		[xInverse],	eax
-	
+
 	loop	LAST_ROW_WIDTH
 
 LAST_ROW_END:
@@ -1026,10 +1026,10 @@
 %undef		xInverse
 %undef		dstStep
 	ret
-	
-	
-	
-	
+
+
+
+
 WELS_EXTERN	GeneralBilinearFastDownsampler_sse2
 ;**************************************************************************************************************
 ;int GeneralBilinearFastDownsampler_sse2(   unsigned char* pDst, const int iDstStride, const int iDstWidth, const int iDstHeight,
@@ -1062,7 +1062,7 @@
 %define		xInverse		esp + 20
 %define		dstStep			esp + 24
 	sub		esp,			localsize
-	
+
 	pxor	xmm0,	xmm0
 	mov		edx,	65535
 	mov		eax,	[uiScaleX]
@@ -1075,7 +1075,7 @@
 	psllq	xmm1,		32
 	por		xmm1,		xmm2					; 0 uinc 0 -uinc
 	pshuflw	xmm7,		xmm1,	10001000b		; xmm7: uinc -uinc uinc -uinc
-	
+
 	mov		eax,	[uiScaleY]
 	and		eax,	32767
 	mov		ebx,	eax
@@ -1086,15 +1086,15 @@
 	psllq	xmm6,		32
 	por		xmm6,		xmm2					; 0 vinc 0 -vinc
 	pshuflw	xmm6,		xmm6,	10100000b		; xmm6: vinc vinc -vinc -vinc
-	
+
 	mov		edx,		80007fffh				; 32768 32767
-	movd	xmm5,		edx					
+	movd	xmm5,		edx
 	pshuflw	xmm5,		xmm5,		01000100b	; 32768 32767 32768 32767
 	mov		ebx,		16384
-	
 
+
 FAST_DOWNSAMPLE:
-	
+
 	mov		eax,			[dwDstHeight]
 	mov		edi,			[pDstData]
 	mov		edx,			[dwDstStride]
@@ -1105,11 +1105,11 @@
 	mov		[tmpHeight],	eax
 	mov		eax,		16384
 	mov		[yInverse],		eax
-	
+
 	pshuflw	xmm4,		xmm5,	01010000b
 	psrlw	xmm4,		1				; initial v to 16384 16384 16383 16383
-	
-FAST_HEIGHT:	
+
+FAST_HEIGHT:
 	mov		eax,	[yInverse]
 	mov		esi,	[pSrcData]
 	shr		eax,	15
@@ -1117,23 +1117,23 @@
 	add		esi,	eax					; get current row address
 	mov		ebp,	esi
 	add		ebp,	[dwSrcStride]
-	
+
 	mov		eax,		32768
 	mov		[xInverse],		eax
 	mov		ecx,			[dwDstWidth]
 	dec		ecx
-	
+
 	movdqa	xmm3,		xmm5			; initial u to 32768 32767 32768 32767
-	
+
 FAST_WIDTH:
 	mov		eax,		[xInverse]
 	shr		eax,		16
-	
+
 	movd	xmm1,		[esi+eax]		; xxxxxxba
 	movd	xmm2,		[ebp+eax]		; xxxxxxdc
 	punpcklwd	xmm1,	xmm2			; xxxxdcba
 	punpcklbw	xmm1,	xmm0			; 0d0c0b0a
-	
+
 	movdqa	xmm2,	xmm4	; xmm2:  vv(1-v)(1-v)  tmpv
 	pmulhuw	xmm2,	xmm3	; mul u(1-u)u(1-u) on xmm2
 	pmaddwd		xmm2,	xmm1
@@ -1142,17 +1142,17 @@
 	movd	xmm1,	ebx
 	paddd	xmm2,	xmm1
 	psrld	xmm2,	15
-	
+
 	packuswb	xmm2,	xmm0
 	movd	eax,	xmm2
 	mov		[edi],	al
 	inc		edi
-	
+
 	mov		eax,		[uiScaleX]
 	add		[xInverse],	eax
-	
+
 	paddw	xmm3,		xmm7			; inc u
-	
+
 	loop	FAST_WIDTH
 
 FAST_WIDTH_END:
@@ -1161,41 +1161,41 @@
 	mov		cl,			[esi+eax]
 	mov		[edi],		cl
 	inc		edi
-	
+
 	mov		eax,		[uiScaleY]
 	add		[yInverse],	eax
 	add		edi,		[dstStep]
-	
+
 	paddw	xmm4,	xmm6				; inc v
 	psllw	xmm4,	1
 	psrlw	xmm4,	1
-	
+
 	dec		dword [tmpHeight]
 	jg		FAST_HEIGHT
 
 
-FAST_LAST_ROW:	
+FAST_LAST_ROW:
 	mov		eax,	[yInverse]
 	mov		esi,	[pSrcData]
 	shr		eax,	15
 	mul		dword [dwSrcStride]
 	add		esi,	eax					; get current row address
-	
+
 	mov		eax,		32768
 	mov		[xInverse],		eax
 	mov		ecx,			[dwDstWidth]
-	
+
 FAST_LAST_ROW_WIDTH:
 	mov		eax,		[xInverse]
 	shr		eax,		16
-	
+
 	mov		al,			[esi+eax]
 	mov		[edi],	al
 	inc		edi
-	
+
 	mov		eax,		[uiScaleX]
 	add		[xInverse],	eax
-	
+
 	loop	FAST_LAST_ROW_WIDTH
 
 FAST_LAST_ROW_END:
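
Both GeneralBilinear*Downsampler loops above walk the destination raster with fixed-point source coordinates (the uiScaleX/uiScaleY increments) and blend a 2x2 source neighbourhood per output pixel. A scalar model of the accurate variant's inner tap, using 15-bit fractions; it matches the psrlq 29 / inc / shr 1 rounding visible in the WIDTH loop, up to the kernel's exact weight seeding (the asm initialises the u/v fraction pairs with 16384/16383 rather than an exact 2^15 split):

    #include <stdint.h>

    /* Bilinear blend of the 2x2 taps a,b (upper row) and c,d (lower
     * row) with 15-bit fractions u (horizontal) and v (vertical). */
    static uint8_t BilinearTap_c (uint8_t a, uint8_t b, uint8_t c, uint8_t d,
                                  uint32_t u, uint32_t v) { /* u, v in [0, 32767] */
      uint64_t ullAcc = (uint64_t) (32768 - u) * (32768 - v) * a
                      + (uint64_t) u           * (32768 - v) * b
                      + (uint64_t) (32768 - u) * v           * c
                      + (uint64_t) u           * v           * d;
      uint32_t uiTmp = (uint32_t) (ullAcc >> 29);  /* psrlq xmm2, 29 */
      return (uint8_t) ((uiTmp + 1) >> 1);         /* inc eax; shr eax, 1 */
    }

The DyadicBilinearDownsampler kernels earlier in the file handle the common 2:1 case far more cheaply, collapsing each 2x2 block with three pavgb averages instead of multiplies.
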
--- a/processing/src/asm/intra_pred.asm
+++ b/processing/src/asm/intra_pred.asm
@@ -85,7 +85,7 @@
 
 %macro SSE2_PRED_H_16X16_TWO_LINE 1
     lea     eax,	[eax+ecx*2]
-    
+
     COPY_16_TIMES eax,	xmm0
     movdqa  [edx+%1],	xmm0
     COPY_16_TIMESS eax,	xmm0,	ecx
@@ -97,13 +97,13 @@
     mov     edx, [esp+4]    ; pred
     mov     eax, [esp+8]	; pRef
     mov     ecx, [esp+12]   ; stride
-    
+
     COPY_16_TIMES eax,	xmm0
     movdqa  [edx],		xmm0
     COPY_16_TIMESS eax,	xmm0,	ecx
     movdqa  [edx+0x10],	xmm0
-    
-	SSE2_PRED_H_16X16_TWO_LINE   0x20 
+
+	SSE2_PRED_H_16X16_TWO_LINE   0x20
 	SSE2_PRED_H_16X16_TWO_LINE   0x40
 	SSE2_PRED_H_16X16_TWO_LINE   0x60
 	SSE2_PRED_H_16X16_TWO_LINE   0x80
@@ -110,9 +110,9 @@
 	SSE2_PRED_H_16X16_TWO_LINE   0xa0
 	SSE2_PRED_H_16X16_TWO_LINE   0xc0
 	SSE2_PRED_H_16X16_TWO_LINE   0xe0
-   
+
     ret
-    
+
 ;***********************************************************************
 ; void WelsI16x16LumaPredV_sse2(uint8_t *pred, uint8_t *pRef, int32_t stride);
 ;***********************************************************************
@@ -121,10 +121,10 @@
     mov     edx, [esp+4]    ; pred
     mov     eax, [esp+8]	; pRef
     mov     ecx, [esp+12]   ; stride
-    
+
     sub     eax, ecx
     movdqa  xmm0, [eax]
-    
+
     movdqa  [edx], xmm0
     movdqa  [edx+10h], xmm0
     movdqa  [edx+20h], xmm0
@@ -135,11 +135,11 @@
     movdqa  [edx+70h], xmm0
     movdqa  [edx+80h], xmm0
     movdqa  [edx+90h], xmm0
-    movdqa  [edx+160], xmm0 
+    movdqa  [edx+160], xmm0
 	movdqa  [edx+176], xmm0
     movdqa  [edx+192], xmm0
     movdqa  [edx+208], xmm0
     movdqa  [edx+224], xmm0
     movdqa  [edx+240], xmm0
-    
+
     ret
\ No newline at end of file
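
intra_pred.asm above fills a contiguous 16x16 prediction buffer: WelsI16x16LumaPredV replicates the reconstructed row directly above the block into all 16 rows (one 16-byte store each), and WelsI16x16LumaPredH does the same with the left neighbouring column, two rows at a time. A sketch of the vertical case:

    #include <stdint.h>
    #include <string.h>

    /* Scalar model of WelsI16x16LumaPredV: copy the row above the
     * block ('sub eax, ecx' in the asm) into every prediction row.
     * pPred is the packed 16x16 buffer the asm writes in 0x10 steps. */
    static void I16x16LumaPredV_c (uint8_t* pPred, const uint8_t* pRef,
                                   int32_t iStride) {
      const uint8_t* pTop = pRef - iStride;
      for (int y = 0; y < 16; y++)
        memcpy (pPred + 16 * y, pTop, 16);
    }
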
--- a/processing/src/asm/sad.asm
+++ b/processing/src/asm/sad.asm
@@ -67,7 +67,7 @@
 %endmacro
 
 
-  
+
 %macro CACHE_SPLIT_CHECK 3 ; address, width, cacheline
 and    %1,  0x1f|(%3>>1)
 cmp    %1,  (32-%2)|(%3>>1)
@@ -108,15 +108,15 @@
 	push   edi
 	mov    eax,    [esp+12]
 	mov    ebx,    [esp+16]
-    
+
     pxor   xmm7,   xmm7
-    
+
     mov    edi,    ecx
     and    edi,    0x07
-    sub    ecx,    edi   
+    sub    ecx,    edi
     mov    edx,    8
     sub    edx,    edi
-    
+
     shl    edi,    3
     shl    edx,    3
     movd   xmm5,   edi
@@ -124,10 +124,10 @@
 	mov    edi,    8
 	add    edi,    ecx
     mov    edx,    [esp+24]
-    
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -135,17 +135,17 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -153,7 +153,7 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
 
@@ -160,10 +160,10 @@
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -171,17 +171,17 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
 	lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
 	lea    edi,    [edi+2*edx]
-	 
+
     movq   xmm0,   [eax]
 	movhps xmm0,   [eax+ebx]
-		
+
 	movq   xmm1,   [ecx]
 	movq   xmm2,   [edi]
 	movhps xmm1,   [ecx+edx]
@@ -189,10 +189,10 @@
 	psrlq  xmm1,   xmm5
 	psllq  xmm2,   xmm6
 	por    xmm1,   xmm2
-	
+
 	psadbw xmm0,   xmm1
 	paddw  xmm7,   xmm0
-	
+
     movhlps    xmm0, xmm7
 	paddw      xmm0, xmm7
 	movd       eax,  xmm0
@@ -202,12 +202,12 @@
     push   ebx
     mov    eax,    [esp+8]
 	mov    ebx,    [esp+12]
-	mov    edx,    [esp+20]    
+	mov    edx,    [esp+20]
 	pxor   xmm6,   xmm6
 	SSE2_GetSad8x4
     lea    eax,    [eax+2*ebx]
 	lea    ecx,    [ecx+2*edx]
-    SSE2_GetSad8x4    
+    SSE2_GetSad8x4
     movhlps    xmm0, xmm6
 	paddw      xmm0, xmm6
 	movd       eax,  xmm0
--- a/processing/src/asm/vaa.asm
+++ b/processing/src/asm/vaa.asm
@@ -163,7 +163,7 @@
 	paddd	xmm6,	xmm1
 	paddd	xmm6,	xmm3
 	lea		esi,	[esi+ebx*2]
-	lea		edi,	[edi+ebx*2]	
+	lea		edi,	[edi+ebx*2]
 %endmacro
 
 %macro	WELS_SAD_SUM_SQSUM_16x1_SSE2 0
@@ -172,11 +172,11 @@
 	movdqa	xmm3,	xmm1
 	psadbw	xmm3,	xmm2
 	paddd	xmm6,	xmm3
-	
+
 	movdqa	xmm3,	xmm1
 	psadbw	xmm3,	xmm0
 	paddd	xmm5,	xmm3
-	
+
 	movdqa		xmm2,	xmm1
 	punpcklbw	xmm1,	xmm0
 	punpckhbw	xmm2,	xmm0
@@ -184,7 +184,7 @@
 	pmaddwd		xmm2,	xmm2
 	paddd		xmm4,	xmm1
 	paddd		xmm4,	xmm2
-	
+
 	add		esi,	ebx
 	add		edi,	ebx
 %endmacro
@@ -195,16 +195,16 @@
 	movdqa	xmm3,	xmm1
 	psadbw	xmm3,	xmm2
 	paddd	xmm7,	xmm3	; sad
-	
+
 	movdqa	xmm3,	xmm1
 	pmaxub	xmm3,	xmm2
 	pminub	xmm2,	xmm1
 	psubb	xmm3,	xmm2	; diff
-	
+
 	movdqa	xmm2,	xmm1
 	psadbw	xmm2,	xmm0
 	paddd	xmm6,	xmm2	; sum
-	
+
 	movdqa		xmm2,	xmm1
 	punpcklbw	xmm1,	xmm0
 	punpckhbw	xmm2,	xmm0
@@ -212,7 +212,7 @@
 	pmaddwd		xmm2,	xmm2
 	paddd		xmm5,	xmm1
 	paddd		xmm5,	xmm2	; sqsum
-	
+
 	movdqa		xmm1,	xmm3
 	punpcklbw	xmm1,	xmm0
 	punpckhbw	xmm3,	xmm0
@@ -220,7 +220,7 @@
 	pmaddwd		xmm3,	xmm3
 	paddd		xmm4,	xmm1
 	paddd		xmm4,	xmm3	; sqdiff
-	
+
 	add		esi,	ebx
 	add		edi,	ebx
 %endmacro
@@ -238,16 +238,16 @@
 	movdqa	xmm3,		xmm2
 	psadbw	xmm3,		xmm0
 	paddd	sum_ref_reg,			xmm3	; sum_ref
-	
+
 	movdqa	xmm3,		xmm1
 	pmaxub	xmm3,		xmm2
 	pminub	xmm2,		xmm1
 	psubb	xmm3,		xmm2	; abs diff
 	pmaxub	mad_reg,	xmm3	; max abs diff
-	
+
 	psadbw	xmm3,		xmm0
 	paddd	sad_reg,	xmm3	; sad
-	
+
 	add			esi,		ebx
 	add			edi,		ebx
 %endmacro
@@ -285,7 +285,7 @@
 	psllq		xmm3,		32
 	paddd		xmm2,		xmm3
 	paddd		sad_reg,	xmm2		; sqsum
-	
+
 	movdqa	xmm2,		[edi]
 	movdqa	xmm3,		xmm1
 	psadbw	xmm3,		xmm0
@@ -294,13 +294,13 @@
 	psadbw	xmm3,		xmm0
 	pslldq	xmm3,		4
 	paddd	sum_reg,			xmm3	; sum_ref
-	
+
 	movdqa	xmm3,		xmm1
 	pmaxub	xmm3,		xmm2
 	pminub	xmm2,		xmm1
 	psubb	xmm3,		xmm2	; abs diff
 	pmaxub	mad_reg,	xmm3	; max abs diff
-	
+
 	movdqa	xmm1,		xmm3
 	psadbw	xmm3,		xmm0
 	paddd	sad_reg,	xmm3	; sad
@@ -312,7 +312,7 @@
 	pmaddwd		xmm3,	xmm3
 	paddd		sqdiff_reg,	xmm1
 	paddd		sqdiff_reg,	xmm3	; sqdiff
-	
+
 	add		esi,	ebx
 	add		edi,	ebx
 %endmacro
@@ -351,7 +351,7 @@
 	mov ebx, [esp+32]
 	mov ecx, [esp+36]
 	mov edx, [esp+40]
-	pxor xmm0, xmm0	
+	pxor xmm0, xmm0
 .hloop:
 	mov eax, ebx
 	mov ebp, $0
@@ -361,7 +361,7 @@
 	psadbw xmm1, xmm2
 	pshufd xmm2, xmm1, 0f6h	; 11110110 B ; movhlps for float
 	paddd xmm1, xmm2
-	paddd xmm0, xmm1	
+	paddd xmm0, xmm1
 	add ebp, 010h
 	dec eax
 	jnz near .wloop
@@ -384,20 +384,20 @@
 ;   void SampleVariance16x16_sse2(	uint8_t * y_ref, int32_t y_ref_stride, uint8_t * y_src, int32_t y_src_stride,SMotionTextureUnit* pMotionTexture );
 ;***********************************************************************
 ALIGN 16
-SampleVariance16x16_sse2:	
+SampleVariance16x16_sse2:
 	push esi
 	push edi
 	push ebx
-	
+
 	sub esp, 16
 	%define SUM			[esp]
 	%define SUM_CUR		[esp+4]
 	%define SQR			[esp+8]
 	%define SQR_CUR		[esp+12]
-	%define PUSH_SIZE	28	; 12 + 16	
+	%define PUSH_SIZE	28	; 12 + 16
 
 	mov edi, [esp+PUSH_SIZE+4]	; y_ref
-	mov edx, [esp+PUSH_SIZE+8]	; y_ref_stride	
+	mov edx, [esp+PUSH_SIZE+8]	; y_ref_stride
 	mov esi, [esp+PUSH_SIZE+12]	; y_src
 	mov eax, [esp+PUSH_SIZE+16]	; y_src_stride
 	mov ecx, 010h				; height = 16
@@ -422,7 +422,7 @@
 	; sqr += diff * diff;
 	pmaxub xmm0, xmm1
 	pminub xmm1, xmm2
-	psubb xmm0, xmm1				; diff	
+	psubb xmm0, xmm1				; diff
 	SUM_SQR_SSE2 xmm1, xmm0, xmm7	; dst, pSrc, zero
 	movd ebx, xmm1
 	add SQR, ebx
@@ -433,7 +433,7 @@
 	punpcklbw xmm0, xmm7
 	punpckhbw xmm1, xmm7
 	paddw xmm0, xmm1		; 8x2
-	SUM_WORD_8x2_SSE2 xmm0, xmm1	
+	SUM_WORD_8x2_SSE2 xmm0, xmm1
 	movd ebx, xmm0
 	and ebx, 0ffffh
 	add SUM_CUR, ebx
@@ -442,12 +442,12 @@
 	SUM_SQR_SSE2 xmm0, xmm3, xmm7	; dst, pSrc, zero
 	movd ebx, xmm0
 	add SQR_CUR, ebx
-	
+
 	lea edi, [edi+edx]
 	lea esi, [esi+eax]
 	dec ecx
 	jnz near .hloops
-	
+
 	mov ebx, 0
 	mov bx, word SUM
 	sar ebx, 8
@@ -465,7 +465,7 @@
 	sar ecx, 8
 	sub ecx, ebx
 	mov [edi+2], cx				; to store uiTextureIndex
-	
+
 	%undef SUM
 	%undef SUM_CUR
 	%undef SQR
@@ -472,10 +472,10 @@
 	%undef SQR_CUR
 	%undef PUSH_SIZE
 
-	add esp, 16	
+	add esp, 16
 	pop ebx
 	pop edi
-	pop esi	
+	pop esi
 
 	ret
 
@@ -497,7 +497,7 @@
 	mov ebp, esp
 	and ebp, 0fh
 	sub esp, ebp
-	sub esp, 32	
+	sub esp, 32
 	%define PUSH_SIZE	52	; 20 + 32
 
 	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
@@ -509,31 +509,31 @@
 	add edx, ecx		; linesize x 3 [edx]
 	mov eax, ebx
 	sal eax, $1			; linesize x 4 [eax]
-	
+
 	pxor xmm7, xmm7
-	
+
 	; loops
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0	
+	movq [esp], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+8], xmm0	
+	movq [esp+8], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0	
+	movq [esp+16], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
 	movq [esp+24], xmm0
-		
+
 	movdqa xmm0, [esp]		; block 0~7
 	movdqa xmm1, [esp+16]	; block 8~15
 	movdqa xmm2, xmm0
 	paddw xmm0, xmm1
 	SUM_WORD_8x2_SSE2 xmm0, xmm3
-	
+
 	pmullw xmm1, xmm1
 	pmullw xmm2, xmm2
 	movdqa xmm3, xmm1
@@ -549,7 +549,7 @@
 	paddd xmm1, xmm2
 	pshufd xmm2, xmm1, 0B1h
 	paddd xmm1, xmm2
-	
+
 	movd ebx, xmm0
 	and ebx, 0ffffh		; effective low word truncated
 	mov ecx, ebx
@@ -557,7 +557,7 @@
 	sar ebx, $4
 	movd eax, xmm1
 	sub eax, ebx
-	
+
 	%undef PUSH_SIZE
 	add esp, 32
 	add esp, ebp
@@ -567,7 +567,7 @@
 	pop edx
 	pop ebx
 	ret
-        
+
 WELS_EXTERN AnalysisVaaInfoIntra_ssse3
 ;***********************************************************************
 ;	int32_t AnalysisVaaInfoIntra_ssse3(	uint8_t *pDataY, const int32_t linesize );
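 ;
 ; Hedged sketch: the routine averages the 16 4x4 sub-blocks of the macroblock
 ; (that is what the VAA_AVG_BLOCK_* macros compute) and returns the spread of
 ; those averages; the final sub/sar sequence below matches sqsum - sum^2/16:
 ;
 ;   int32_t avg[16];   // per-4x4-block averages of the 16x16 MB at pDataY
 ;   int32_t sum = 0, sqsum = 0;
 ;   for (int i = 0; i < 16; i++) { sum += avg[i]; sqsum += avg[i] * avg[i]; }
 ;   return sqsum - ((sum * sum) >> 4);  // 16 entries, so >>4 is the mean term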
@@ -583,7 +583,7 @@
 	mov ebp, esp
 	and ebp, 0fh
 	sub esp, ebp
-	sub esp, 32	
+	sub esp, 32
 	%define PUSH_SIZE	52	; 20 + 32
 
 	mov esi, [esp+ebp+PUSH_SIZE+4]	; data_y
@@ -595,25 +595,25 @@
 	add edx, ecx		; linesize x 3 [edx]
 	mov eax, ebx
 	sal eax, $1			; linesize x 4 [eax]
-	
+
 	pxor xmm7, xmm7
-	
+
 	; loops
 	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp], xmm0	
+	movq [esp], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
-	movq [esp+8], xmm1	
+	movq [esp+8], xmm1
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
-	movq [esp+16], xmm0	
+	movq [esp+16], xmm0
 
 	lea esi, [esi+eax]
 	VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
 	movq [esp+24], xmm1
-		
+
 	movdqa xmm0, [esp]		; block 0~7
 	movdqa xmm1, [esp+16]	; block 8~15
 	movdqa xmm2, xmm0
@@ -635,7 +635,7 @@
 	paddd xmm1, xmm2
 	pshufd xmm2, xmm1, 0B1h
 	paddd xmm1, xmm2
-	
+
 	movd ebx, xmm0
 	and ebx, 0ffffh		; effective low word truncated
 	mov ecx, ebx
@@ -643,7 +643,7 @@
 	sar ebx, $4
 	movd eax, xmm1
 	sub eax, ebx
-	
+
 	%undef PUSH_SIZE
 	add esp, 32
 	add esp, ebp
@@ -654,12 +654,12 @@
 	pop ebx
 	ret
 %endif
-	
-	
 
+
+
 WELS_EXTERN abs_difference_mbrow_sse2
 ;*************************************************************************************************************
-;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride, 
+;void abs_difference_mbrow_sse2( uint8_t *ref_orig, uint8_t *cur_orig, int32_t iPicStride,
 ;								 int32_t gom_pixel_num, int32_t *pSum)
 ;*************************************************************************************************************
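 ;
 ; Hedged C sketch (the row count of 16 and the in-place accumulation into
 ; *pSum are assumptions read off the surrounding code):
 ;
 ;   int32_t s = 0;
 ;   for (int y = 0; y < 16; y++) {            // one macroblock-row of lines
 ;     for (int x = 0; x < gom_pixel_num; x++)
 ;       s += abs(ref_orig[x] - cur_orig[x]);  // psadbw, 16 pixels at a time
 ;     ref_orig += iPicStride;  cur_orig += iPicStride;
 ;   }
 ;   *pSum += s;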
 ALIGN 16
@@ -691,13 +691,13 @@
 	add		edi,	16
 	cmp		esi,	edx
 	jl		gom_row_loop_p
-	
+
 	sub		esi,	eax
 	sub		edi,	eax
 	add		esi,	ebx
 	add		edi,	ebx
 	loop	mb_width_loop_p
-	
+
 	movdqa	xmm1,	xmm0
 	psrldq	xmm1,	8
 	paddd	xmm1,	xmm0
@@ -710,7 +710,7 @@
 %undef		iPicStride
 %undef		gom_pixel_num
 %undef		pSum
-%undef		pushsize	
+%undef		pushsize
 	pop		ebx
 	pop		edi
 	pop		esi
@@ -721,7 +721,7 @@
 
 WELS_EXTERN sum_sqrsum_mbrow_sse2
 ;*************************************************************************************************************
-;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride, 
+;void sum_sqrsum_mbrow_sse2( uint8_t *cur_orig, int32_t iPicStride,
 ;							 int32_t gom_pixel_num, int32_t *pSum, int32_t *pSqrSum)
 ;*************************************************************************************************************
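 ;
 ; Hedged C sketch: same traversal as abs_difference_mbrow_sse2 above, but over
 ; the current frame only, accumulating a sum and a sum of squares:
 ;
 ;   int32_t s = 0, sq = 0;
 ;   for (int y = 0; y < 16; y++) {
 ;     for (int x = 0; x < gom_pixel_num; x++) {
 ;       s  += cur_orig[x];
 ;       sq += cur_orig[x] * cur_orig[x];      // pmaddwd after unpacking
 ;     }
 ;     cur_orig += iPicStride;
 ;   }
 ;   *pSum += s;  *pSqrSum += sq;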
 ALIGN 16
@@ -759,11 +759,11 @@
 	add			esi,	16
 	cmp			esi,	edx
 	jl			gom_row_loop_i
-	
+
 	sub			esi,	eax
 	add			esi,	ebx
 	loop		mb_width_loop_i
-	
+
 	movdqa		xmm3,	xmm1
 	psrldq		xmm3,	8
 	paddd		xmm1,	xmm3
@@ -770,7 +770,7 @@
 	movd		eax,	xmm1
 	mov			edx,	[pSum]
 	add			[edx],	eax
-	
+
 	movdqa		xmm3,	xmm2
 	psrldq		xmm3,	8
 	paddd		xmm2,	xmm3
@@ -787,7 +787,7 @@
 %undef		gom_pixel_num
 %undef		pSum
 %undef		pSqrSum
-%undef		pushsize	
+%undef		pushsize
 	pop			ebx
 	pop			esi
 	ret
@@ -819,7 +819,7 @@
 	mov		ebx,	[iPicStride]
 	mov		edx,	[psad8x8]
 	mov		eax,	ebx
-	
+
 	shr		dword [iPicWidth],	4					; iPicWidth/16
 	shr		dword [iPicHeight],	4					; iPicHeight/16
 	shl		eax,	4								; iPicStride*16
@@ -839,7 +839,7 @@
 	movd	[edx],		xmm6
 	psrldq	xmm6,		8
 	movd	[edx+4],	xmm6
-	
+
 	pxor	xmm6,	xmm6
 	WELS_SAD_16x2_SSE2
 	WELS_SAD_16x2_SSE2
@@ -849,24 +849,24 @@
 	movd	[edx+8],	xmm6
 	psrldq	xmm6,		8
 	movd	[edx+12],	xmm6
-	
+
 	add		edx,	16
 	sub		esi,	eax
 	sub		edi,	eax
 	add		esi,	16
 	add		edi,	16
-	
+
 	dec		ecx
 	jnz		width_loop
-	
+
 	pop		edi
 	pop		esi
 	add		esi,	eax
 	add		edi,	eax
-	
+
 	dec	dword [iPicHeight]
 	jnz		height_loop
-	
+
 	mov		edx,	[psadframe]
 	movdqa	xmm5,	xmm7
 	psrldq	xmm7,	8
@@ -880,16 +880,16 @@
 %undef		iPicStride
 %undef		psadframe
 %undef		psad8x8
-%undef		pushsize	
+%undef		pushsize
 	pop		ebx
 	pop		edi
 	pop		esi
 	ret
-	
-	
+
+
 WELS_EXTERN VAACalcSadVar_sse2
 ;*************************************************************************************************************
-;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight 
+;void VAACalcSadVar_sse2( uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
 ;		int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
 ;*************************************************************************************************************
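 ;
 ; Hedged C sketch of the outputs (the 8x8 SAD store order -- top-left,
 ; top-right, bottom-left, bottom-right -- follows the movd stores below):
 ;
 ;   int32_t frame_sad = 0;
 ;   for (int my = 0; my < (iPicHeight >> 4); my++)
 ;     for (int mx = 0; mx < (iPicWidth >> 4); mx++) {
 ;       const uint8_t *c = cur_data + my * 16 * iPicStride + mx * 16;
 ;       const uint8_t *r = ref_data + my * 16 * iPicStride + mx * 16;
 ;       int32_t sad[4] = {0, 0, 0, 0}, sum = 0, sqsum = 0;
 ;       for (int y = 0; y < 16; y++)
 ;         for (int x = 0; x < 16; x++) {
 ;           int b = ((y >> 3) << 1) + (x >> 3);              // 8x8 block id
 ;           sad[b] += abs(c[y * iPicStride + x] - r[y * iPicStride + x]);
 ;           sum    += c[y * iPicStride + x];
 ;           sqsum  += c[y * iPicStride + x] * c[y * iPicStride + x];
 ;         }
 ;       for (int b = 0; b < 4; b++) { *psad8x8++ = sad[b]; frame_sad += sad[b]; }
 ;       *psum16x16++ = sum;  *psqsum16x16++ = sqsum;
 ;     }
 ;   *psadframe = frame_sad;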
 
@@ -919,7 +919,7 @@
 	mov		ebx,	[iPicStride]
 	mov		edx,	[psad8x8]
 	mov		eax,	ebx
-	
+
 	shr		dword [iPicWidth],	4					; iPicWidth/16
 	shr		dword [iPicHeight],	4					; iPicHeight/16
 	shl		eax,	4							; iPicStride*16
@@ -945,7 +945,7 @@
 	movd	[edx],		xmm6
 	psrldq	xmm6,		8
 	movd	[edx+4],	xmm6
-	
+
 	pxor	xmm6,	xmm6
 	WELS_SAD_SUM_SQSUM_16x1_SSE2
 	WELS_SAD_SUM_SQSUM_16x1_SSE2
@@ -959,7 +959,7 @@
 	movd	[edx+8],	xmm6
 	psrldq	xmm6,		8
 	movd	[edx+12],	xmm6
-	
+
 	mov		ebp,	[psum16x16]
 	movdqa	xmm1,	xmm5
 	psrldq	xmm1,	8
@@ -966,7 +966,7 @@
 	paddd	xmm5,	xmm1
 	movd	[ebp],	xmm5
 	add		dword [psum16x16], 4
-	
+
 	movdqa	xmm5,	xmm4
 	psrldq	xmm5,	8
 	paddd	xmm4,	xmm5
@@ -973,28 +973,28 @@
 	movdqa	xmm3,	xmm4
 	psrldq	xmm3,	4
 	paddd	xmm4,	xmm3
-	
+
 	mov		ebp,	[psqsum16x16]
 	movd	[ebp],	xmm4
 	add		dword [psqsum16x16], 4
-	
+
 	add		edx,	16
 	sub		esi,	eax
 	sub		edi,	eax
 	add		esi,	16
 	add		edi,	16
-	
+
 	dec		ecx
 	jnz		var_width_loop
-	
+
 	mov		esi,	[tmp_esi]
 	mov		edi,	[tmp_edi]
 	add		esi,	eax
 	add		edi,	eax
-	
+
 	dec	dword [iPicHeight]
 	jnz		var_height_loop
-	
+
 	mov		edx,	[psadframe]
 	movdqa	xmm5,	xmm7
 	psrldq	xmm7,	8
@@ -1001,7 +1001,7 @@
 	paddd	xmm7,	xmm5
 	movd	[edx],	xmm7
 
-	add		esp,	localsize	
+	add		esp,	localsize
 	pop		ebx
 	pop		edi
 	pop		esi
@@ -1020,12 +1020,12 @@
 %undef		pushsize
 %undef		localsize
 	ret
-	
-	
 
+
+
 WELS_EXTERN VAACalcSadSsd_sse2
 ;*************************************************************************************************************
-;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,  
+;void VAACalcSadSsd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
 ;	int32_t iPicStride,int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
 ;*************************************************************************************************************
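 ;
 ; Hedged sketch: same traversal and outputs as VAACalcSadVar_sse2 above, plus
 ; one extra accumulator and store per 16x16 MB:
 ;
 ;   int d = c[y * iPicStride + x] - r[y * iPicStride + x];
 ;   sqdiff += d * d;                  // per pixel, alongside sum/sqsum
 ;   *psqdiff16x16++ = sqdiff;         // per MB, after the sqsum store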
 
@@ -1059,7 +1059,7 @@
 	mov		ebx,	[iPicStride]
 	mov		edx,	[psad8x8]
 	mov		eax,	ebx
-	
+
 	shr		dword [iPicWidth],	4					; iPicWidth/16
 	shr		dword [iPicHeight],	4					; iPicHeight/16
 	shl		eax,	4							; iPicStride*16
@@ -1091,7 +1091,7 @@
 	movd	[edx+4],	xmm7
 	movd	ebp,		xmm1
 	add		[tmp_sadframe],	ebp
-	
+
 	pxor	xmm7,	xmm7
 	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
 	WELS_SAD_SUM_SQSUM_SQDIFF_16x1_SSE2
@@ -1108,7 +1108,7 @@
 	movd	[edx+12],	xmm7
 	movd	ebp,		xmm1
 	add		[tmp_sadframe],	ebp
-	
+
 	mov		ebp,	[psum16x16]
 	movdqa	xmm1,	xmm6
 	psrldq	xmm1,	8
@@ -1115,7 +1115,7 @@
 	paddd	xmm6,	xmm1
 	movd	[ebp],	xmm6
 	add		dword [psum16x16], 4
-	
+
 	mov		ebp,	[psqsum16x16]
 	pshufd	xmm6,	xmm5,	14 ;00001110
 	paddd	xmm6,	xmm5
@@ -1123,7 +1123,7 @@
 	paddd	xmm5,	xmm6
 	movd	[ebp],	xmm5
 	add		dword [psqsum16x16], 4
-	
+
 	mov		ebp,	[psqdiff16x16]
 	pshufd	xmm5,	xmm4,	14	; 00001110
 	paddd	xmm5,	xmm4
@@ -1131,29 +1131,29 @@
 	paddd	xmm4,	xmm5
 	movd	[ebp],	xmm4
 	add		dword	[psqdiff16x16],	4
-	
+
 	add		edx,	16
 	sub		esi,	eax
 	sub		edi,	eax
 	add		esi,	16
 	add		edi,	16
-	
+
 	dec		ecx
 	jnz		sqdiff_width_loop
-	
+
 	mov		esi,	[tmp_esi]
 	mov		edi,	[tmp_edi]
 	add		esi,	eax
 	add		edi,	eax
-	
+
 	dec	dword [iPicHeight]
 	jnz		sqdiff_height_loop
-	
+
 	mov		ebx,	[tmp_sadframe]
 	mov		eax,	[psadframe]
 	mov		[eax],	ebx
 
-	add		esp,	localsize	
+	add		esp,	localsize
 	pop		ebx
 	pop		edi
 	pop		esi
@@ -1174,14 +1174,14 @@
 %undef		pushsize
 %undef		localsize
 	ret
-	
-	
-	
-	
 
+
+
+
+
 WELS_EXTERN VAACalcSadBgd_sse2
 ;*************************************************************************************************************
-;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, 
+;void VAACalcSadBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
 ;				int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
 ;*************************************************************************************************************
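 ;
 ; Hedged C sketch for one 8x8 block at (bx, by); sd is the cur-minus-ref sum
 ; difference and mad the max absolute difference, one byte per block:
 ;
 ;   int32_t sad = 0, sum_c = 0, sum_r = 0, mad = 0;
 ;   for (int y = 0; y < 8; y++)
 ;     for (int x = 0; x < 8; x++) {
 ;       int c = cur_data[(by * 8 + y) * iPicStride + bx * 8 + x];
 ;       int r = ref_data[(by * 8 + y) * iPicStride + bx * 8 + x];
 ;       sad += abs(c - r);  sum_c += c;  sum_r += r;
 ;       if (abs(c - r) > mad) mad = abs(c - r);
 ;     }
 ;   *psad8x8++  = sad;                // also summed into *psadframe
 ;   *p_sd8x8++  = sum_c - sum_r;      // brightness change for bgd detection
 ;   *p_mad8x8++ = (uint8_t)mad;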
 
@@ -1211,7 +1211,7 @@
 	mov		edi,	[ref_data]
 	mov		ebx,	[iPicStride]
 	mov		eax,	ebx
-	
+
 	shr		dword [iPicWidth],	4					; iPicWidth/16
 	shr		dword [iPicHeight],	4					; iPicHeight/16
 	shl		eax,	4							; iPicStride*16
@@ -1234,11 +1234,11 @@
 	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
 	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
 	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	
-	
+
+
 	mov			edx,		[p_mad8x8]
 	WELS_MAX_REG_SSE2	xmm4
-	
+
 	;movdqa		xmm1,	xmm4
 	;punpcklbw	xmm1,	xmm0
 	;punpcklwd	xmm1,	xmm0
@@ -1247,7 +1247,7 @@
 	;punpcklwd	xmm4,	xmm0
 	;movd		[edx+4],	xmm4
 	;add			edx,		8
-	;mov			[p_mad8x8],	edx	
+	;mov			[p_mad8x8],	edx
 	mov			[tmp_ecx],	ecx
 	movhlps		xmm1,	xmm4
 	movd		ecx,	xmm4
@@ -1257,12 +1257,12 @@
 	add			edx,	2
 	mov			[p_mad8x8],	edx
 
-	
+
 	pslldq		xmm7,	4
 	pslldq		xmm6,	4
 	pslldq		xmm5,	4
-	
-	
+
+
 	pxor	xmm4,	xmm4		; pMad8x8
 	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
 	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
@@ -1272,10 +1272,10 @@
 	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
 	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
 	WELS_SAD_SD_MAD_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	
+
 	mov			edx,		[p_mad8x8]
 	WELS_MAX_REG_SSE2	xmm4
-	
+
 	;movdqa		xmm1,	xmm4
 	;punpcklbw	xmm1,	xmm0
 	;punpcklwd	xmm1,	xmm0
@@ -1284,7 +1284,7 @@
 	;punpcklwd	xmm4,	xmm0
 	;movd		[edx+4],	xmm4
 	;add			edx,		8
-	;mov			[p_mad8x8],	edx	
+	;mov			[p_mad8x8],	edx
 	movhlps		xmm1,	xmm4
 	movd		ecx,	xmm4
 	mov			[edx],	cl
@@ -1292,21 +1292,21 @@
 	mov			[edx+1],cl
 	add			edx,	2
 	mov			[p_mad8x8],	edx
-	
+
 	; data in xmm7, xmm6, xmm5:  D1 D3 D0 D2
-	
+
 	mov		edx,	[psad8x8]
 	pshufd	xmm1,	xmm7,	10001101b		; D3 D2 D1 D0
-	movdqa	[edx],	xmm1					
+	movdqa	[edx],	xmm1
 	add		edx,	16
 	mov		[psad8x8],	edx					; sad8x8
-	
+
 	paddd	xmm1,	xmm7					; D1+3 D3+2 D0+1 D2+0
 	pshufd	xmm2,	xmm1,	00000011b
 	paddd	xmm1,	xmm2
 	movd	edx,	xmm1
 	add		ebp,	edx						; sad frame
-	
+
 	mov		edx,	[p_sd8x8]
 	psubd	xmm6,	xmm5
 	pshufd	xmm1,	xmm6,	10001101b
@@ -1313,30 +1313,30 @@
 	movdqa	[edx],	xmm1
 	add		edx,	16
 	mov		[p_sd8x8],	edx
-	
-	
+
+
 	add		edx,	16
 	sub		esi,	eax
 	sub		edi,	eax
 	add		esi,	16
 	add		edi,	16
-	
+
 	mov		ecx,	[tmp_ecx]
 	dec		ecx
 	jnz		bgd_width_loop
-	
+
 	mov		esi,	[tmp_esi]
 	mov		edi,	[tmp_edi]
 	add		esi,	eax
 	add		edi,	eax
-	
+
 	dec		dword [iPicHeight]
 	jnz		bgd_height_loop
-	
+
 	mov		edx,	[psadframe]
 	mov		[edx],	ebp
 
-	add		esp,	localsize	
+	add		esp,	localsize
 	pop		ebx
 	pop		edi
 	pop		esi
@@ -1360,8 +1360,8 @@
 
 WELS_EXTERN VAACalcSadSsdBgd_sse2
 ;*************************************************************************************************************
-;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight, 
-;		 int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, 
+;void VAACalcSadSsdBgd_sse2(uint8_t *cur_data, uint8_t *ref_data, int32_t iPicWidth, int32_t iPicHeight,
+;		 int32_t iPicStride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
 ;			int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
 ;*************************************************************************************************************
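 ;
 ; Hedged sketch: this routine is the union of the two above --
 ;   per 8x8 block:  *psad8x8++, *p_sd8x8++, *p_mad8x8++          (as in VAACalcSadBgd_sse2)
 ;   per 16x16 MB:   *psum16x16++, *psqsum16x16++, *psqdiff16x16++ (as in VAACalcSadSsd_sse2)
 ;   once per frame: *psadframe = total SAD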
 
@@ -1395,7 +1395,7 @@
 	mov		edi,	[ref_data]
 	mov		ebx,	[iPicStride]
 	mov		eax,	ebx
-	
+
 	shr		dword [iPicWidth],	4					; iPicWidth/16
 	shr		dword [iPicHeight],	4					; iPicHeight/16
 	shl		eax,	4							; iPicStride*16
@@ -1418,7 +1418,7 @@
 	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
 	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
 	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	
+
 	mov		edx,		[psad8x8]
 	movdqa	xmm2,		xmm7
 	pshufd	xmm1,		xmm2,		00001110b
@@ -1426,17 +1426,17 @@
 	movd	[edx+4],	xmm1
 	add		edx,		8
 	mov		[psad8x8],	edx			; sad8x8
-	
+
 	paddd	xmm1,				xmm2
 	movd	edx,				xmm1
 	add		[tmp_sadframe],		edx			; iFrameSad
-	
+
 	mov		edx,		[psum16x16]
 	movdqa	xmm1,		xmm6
 	pshufd	xmm2,		xmm1,		00001110b
 	paddd	xmm1,		xmm2
 	movd	[edx],		xmm1				; sum
-	
+
 	mov		edx,		[p_sd8x8]
 	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
 	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
@@ -1444,7 +1444,7 @@
 	movq	[edx],		xmm1
 	add		edx,		8
 	mov		[p_sd8x8],	edx
-	
+
 	mov			edx,		[p_mad8x8]
 	WELS_MAX_REG_SSE2	xmm5
 	;movdqa		xmm1,	xmm5
@@ -1464,7 +1464,7 @@
 	mov			[edx+1],cl
 	add			edx,	2
 	mov			[p_mad8x8],	edx
-	
+
 	psrlq	xmm7,	32
 	psllq	xmm7,	32			; clear sad
 	pxor	xmm6,	xmm6		; sum_8x8 interleaves cur and pRef in Dword,  Sref1 Scur1 Sref0 Scur0
@@ -1477,7 +1477,7 @@
 	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
 	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
 	WELS_SAD_BGD_SQDIFF_16x1_SSE2	xmm7,	xmm6,	xmm5,	xmm4
-	
+
 	mov		edx,		[psad8x8]
 	movdqa	xmm2,		xmm7
 	pshufd	xmm1,		xmm2,		00001110b
@@ -1485,11 +1485,11 @@
 	movd	[edx+4],	xmm1
 	add		edx,		8
 	mov		[psad8x8],	edx			; sad8x8
-	
+
 	paddd	xmm1,				xmm2
 	movd	edx,				xmm1
 	add		[tmp_sadframe],		edx			; iFrameSad
-	
+
 	mov		edx,			[psum16x16]
 	movdqa	xmm1,			xmm6
 	pshufd	xmm2,			xmm1,		00001110b
@@ -1498,7 +1498,7 @@
 	add		[edx],			ebp
 	add		edx,			4
 	mov		[psum16x16],	edx
-	
+
 	mov		edx,			[psqsum16x16]
 	psrlq	xmm7,			32
 	pshufd	xmm2,			xmm7,		00001110b
@@ -1506,7 +1506,7 @@
 	movd	[edx],			xmm2				; sqsum
 	add		edx,			4
 	mov		[psqsum16x16],	edx
-	
+
 	mov		edx,		[p_sd8x8]
 	pshufd	xmm1,		xmm6,		11110101b			; Sref1 Sref1 Sref0 Sref0
 	psubd	xmm6,		xmm1		; 00 diff1 00 diff0
@@ -1514,7 +1514,7 @@
 	movq	[edx],		xmm1
 	add		edx,		8
 	mov		[p_sd8x8],	edx
-	
+
 	mov		edx,		[p_mad8x8]
 	WELS_MAX_REG_SSE2	xmm5
 	;movdqa		xmm1,	xmm5
@@ -1525,7 +1525,7 @@
 	;punpcklwd	xmm5,	xmm0
 	;movd		[edx+4],	xmm5
 	;add			edx,		8
-	;mov			[p_mad8x8],	edx	
+	;mov			[p_mad8x8],	edx
 	movhlps		xmm1,	xmm5
 	movd		ecx,	xmm5
 	mov			[edx],	cl
@@ -1533,7 +1533,7 @@
 	mov			[edx+1],cl
 	add			edx,	2
 	mov			[p_mad8x8],	edx
-	
+
 	mov		edx,		[psqdiff16x16]
 	pshufd	xmm1,		xmm4,		00001110b
 	paddd	xmm4,		xmm1
@@ -1542,30 +1542,30 @@
 	movd	[edx],		xmm4
 	add		edx,		4
 	mov		[psqdiff16x16],	edx
-	
+
 	add		edx,	16
 	sub		esi,	eax
 	sub		edi,	eax
 	add		esi,	16
 	add		edi,	16
-	
+
 	mov		ecx,	[tmp_ecx]
 	dec		ecx
 	jnz		sqdiff_bgd_width_loop
-	
+
 	mov		esi,	[tmp_esi]
 	mov		edi,	[tmp_edi]
 	add		esi,	eax
 	add		edi,	eax
-	
+
 	dec	dword [iPicHeight]
 	jnz		sqdiff_bgd_height_loop
-	
+
 	mov		edx,	[psadframe]
 	mov		ebp,	[tmp_sadframe]
 	mov		[edx],	ebp
 
-	add		esp,	localsize	
+	add		esp,	localsize
 	pop		ebx
 	pop		edi
 	pop		esi
--- a/processing/src/common/WelsVP.def
+++ b/processing/src/common/WelsVP.def
@@ -33,4 +33,4 @@
 LIBRARY		    welsvp.dll
 EXPORTS
                 CreateVpInterface    PRIVATE
-                DestroyVpInterface   PRIVATE      
\ No newline at end of file
+                DestroyVpInterface   PRIVATE
\ No newline at end of file
--- a/processing/src/common/WelsVP.rc
+++ b/processing/src/common/WelsVP.rc
@@ -27,18 +27,18 @@
 // TEXTINCLUDE
 //
 
-1 TEXTINCLUDE 
+1 TEXTINCLUDE
 BEGIN
     "resource.h\0"
 END
 
-2 TEXTINCLUDE 
+2 TEXTINCLUDE
 BEGIN
     "#include ""afxres.h""\r\n"
     "\0"
 END
 
-3 TEXTINCLUDE 
+3 TEXTINCLUDE
 BEGIN
     "\r\n"
     "\0"
--- a/testbin/AutoBuild_Windows_VS2008.bat
+++ b/testbin/AutoBuild_Windows_VS2008.bat
@@ -23,7 +23,7 @@
 rem call VP build
 echo "Welsvp Building....."
 cd %VPProjectDir%
-rem vcclean 
+rem vcclean
 %VCBUILDEXE% WelsVP_2008.vcproj
 
 
@@ -33,7 +33,7 @@
 
 cd %CurDir%
 cd %EncoderProjectDir%
-rem vcclean 
+rem vcclean
 %VCBUILDEXE% WelsEncCore.vcproj
 %VCBUILDEXE% WelsEncPlus.vcproj
 %VCBUILDEXE% encConsole.vcproj
@@ -44,7 +44,7 @@
 
 cd %CurDir%
 cd %DecoderProjectDir%
-rem vcclean 
+rem vcclean
 %VCBUILDEXE% WelsDecCore.vcproj
 %VCBUILDEXE% WelsDecPlus.vcproj
 %VCBUILDEXE% decConsole.vcproj
--- a/testbin/AutoBuild_Windows_VS2010.bat
+++ b/testbin/AutoBuild_Windows_VS2010.bat
@@ -36,7 +36,7 @@
 cd %CurDir%
 cd %EncoderProjectDir%
 echo current directory is %EncoderProjectDir%
-rem vcclean 
+rem vcclean
 
 echo %VCMSBUILDEXE_RELEASE% WelsEncoder_2010.sln
 %VCMSBUILDEXE_RELEASE% WelsEncoder_2010.sln
@@ -49,7 +49,7 @@
 cd %CurDir%
 cd %DecoderProjectDir%
 echo current directory is %DecoderProjectDir%
-rem vcclean 
+rem vcclean
 
 echo %VCMSBUILDEXE_RELEASE% WelsDecoder_2010.sln
 
--- a/testbin/AutoBuild_Windows_VS2012.bat
+++ b/testbin/AutoBuild_Windows_VS2012.bat
@@ -36,7 +36,7 @@
 cd %CurDir%
 cd %EncoderProjectDir%
 echo current directory is %EncoderProjectDir%
-rem vcclean 
+rem vcclean
 
 echo %VCMSBUILDEXE_RELEASE% WelsEncoder_2012.sln
 %VCMSBUILDEXE_RELEASE% WelsEncoder_2012.sln
@@ -49,7 +49,7 @@
 cd %CurDir%
 cd %DecoderProjectDir%
 echo current directory is %DecoderProjectDir%
-rem vcclean 
+rem vcclean
 
 echo %VCMSBUILDEXE_RELEASE% WelsDecoder_2012.sln
 
--- a/testbin/welsenc.cfg
+++ b/testbin/welsenc.cfg
@@ -12,19 +12,19 @@
 EnableFrameCropping 	1 		       # enable frame cropping flag
 
 #============================== LOOP FILTER ==============================
-LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off, 
+LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off,
                                                # 2: on except for slice boundaries,
                                                # 3: two stage. slice boundaries on in second stage
-                                               # 4: Luma on but Chroma off (w.r.t. idc=0)  
+                                               # 4: Luma on but Chroma off (w.r.t. idc=0)
                                                # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
                                                # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
 LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
 LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
 
-InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off, 
+InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off,
                                                # 2: on except for slice boundaries,
                                                # 3: two stage. slice boundaries on in second stage
-                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)  
+                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
                                                # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
                                                # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
 InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
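 #
 # Example (a sketch using the idc values enumerated above): keep deblocking on
 # everywhere except slice boundaries, with neutral strength offsets:
 #   LoopFilterDisableIDC    2
 #   LoopFilterAlphaC0Offset 0
 #   LoopFilterBetaOffset    0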
@@ -51,7 +51,7 @@
 
 #============================== LONG TERM REFERENCE CONTROL ==============================
 EnableLongTermReference             0              # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod                       30             # Long Term Reference Marking Period 
+LtrMarkPeriod                       30             # Long Term Reference Marking Period
 
 #============================== LAYER DEFINITION ==============================
 PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
--- a/testbin/welsenc_vd_1d.cfg
+++ b/testbin/welsenc_vd_1d.cfg
@@ -12,19 +12,19 @@
 EnableFrameCropping 	1 		       # enable frame cropping flag
 
 #============================== LOOP FILTER ==============================
-LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off, 
+LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off,
                                                # 2: on except for slice boundaries,
                                                # 3: two stage. slice boundaries on in second stage
-                                               # 4: Luma on but Chroma off (w.r.t. idc=0)  
+                                               # 4: Luma on but Chroma off (w.r.t. idc=0)
                                                # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
                                                # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
 LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
 LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
 
-InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off, 
+InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off,
                                                # 2: on except for slice boundaries,
                                                # 3: two stage. slice boundaries on in second stage
-                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)  
+                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
                                                # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
                                                # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
 InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
@@ -51,7 +51,7 @@
 
 #============================== LONG TERM REFERENCE CONTROL ==============================
 EnableLongTermReference             1              # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod                       30             # Long Term Reference Marking Period 
+LtrMarkPeriod                       30             # Long Term Reference Marking Period
 
 #============================== LAYER DEFINITION ==============================
 PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)
--- a/testbin/welsenc_vd_rc.cfg
+++ b/testbin/welsenc_vd_rc.cfg
@@ -12,19 +12,19 @@
 EnableFrameCropping 	1 		       # enable frame cropping flag
 
 #============================== LOOP FILTER ==============================
-LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off, 
+LoopFilterDisableIDC       0                   # Loop filter idc (0: on, 1: off,
                                                # 2: on except for slice boundaries,
                                                # 3: two stage. slice boundaries on in second stage
-                                               # 4: Luma on but Chroma off (w.r.t. idc=0)  
+                                               # 4: Luma on but Chroma off (w.r.t. idc=0)
                                                # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
                                                # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
 LoopFilterAlphaC0Offset	0                      # AlphaOffset(-6..+6): valid range
 LoopFilterBetaOffset	0                      # BetaOffset (-6..+6): valid range
 
-InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off, 
+InterLayerLoopFilterDisableIDC       0         # filter idc for inter-layer deblocking (0: on, 1: off,
                                                # 2: on except for slice boundaries,
                                                # 3: two stage. slice boundaries on in second stage
-                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)  
+                                               # 4: Luma on but Chroma off in enh. layer (w.r.t. idc=0)
                                                # 5: Luma on except on slice boundaries, but Chroma off in enh. layer (w.r.t. idc=2)
                                                # 6: Luma on in two stage. slice boundaries on in second stage, but Chroma off (w.r.t. idc=3)
 InterLayerLoopFilterAlphaC0Offset 0            # AlphaOffset for inter-layer deblocking
@@ -51,7 +51,7 @@
 
 #============================== LONG TERM REFERENCE CONTROL ==============================
 EnableLongTermReference             1              # Enable Long Term Reference (1: enable, 0: disable)
-LtrMarkPeriod                       30             # Long Term Reference Marking Period 
+LtrMarkPeriod                       30             # Long Term Reference Marking Period
 
 #============================== LAYER DEFINITION ==============================
 PrefixNALAddingCtrl		0						# Control flag of adding prefix unit (0: off, 1: on)